textutils 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt CHANGED
@@ -16,11 +16,13 @@ lib/textutils/helper/title_helper.rb
16
16
  lib/textutils/helper/unicode_helper.rb
17
17
  lib/textutils/helper/value_helper.rb
18
18
  lib/textutils/helper/xml_helper.rb
19
+ lib/textutils/patterns.rb
19
20
  lib/textutils/reader/code_reader.rb
20
21
  lib/textutils/reader/fixture_reader.rb
21
22
  lib/textutils/reader/hash_reader.rb
22
23
  lib/textutils/reader/line_reader.rb
23
24
  lib/textutils/reader/values_reader.rb
25
+ lib/textutils/sanitizier.rb
24
26
  lib/textutils/utils.rb
25
27
  lib/textutils/version.rb
26
28
  test/helper.rb
data/README.markdown CHANGED
@@ -4,7 +4,7 @@
4
4
  * bugs :: [github.com/rubylibs/textutils/issues](https://github.com/rubylibs/textutils/issues)
5
5
  * gem :: [rubygems.org/gems/textutils](https://rubygems.org/gems/textutils)
6
6
  * rdoc :: [rubydoc.info/gems/textutils](http://rubydoc.info/gems/textutils)
7
- * forum :: [groups.google.com/group/webslideshow](https://groups.google.com/group/webslideshow)
7
+ * forum :: [ruby-talk@ruby-lang.org](https://www.ruby-lang.org/en/community/mailing-lists/)
8
8
 
9
9
 
10
10
  ## Filters
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'textutils' do
11
11
  self.urls = ['https://github.com/rubylibs/textutils']
12
12
 
13
13
  self.author = 'Gerald Bauer'
14
- self.email = 'webslideshow@googlegroups.com'
14
+ self.email = 'ruby-talk@ruby-lang.org'
15
15
 
16
16
  # switch extension to .markdown for github formatting
17
17
  self.readme_file = 'README.markdown'
@@ -8,15 +8,28 @@ def strip_tags( ht )
8
8
  ### to be done
9
9
  ## strip markup tags; return plain text; use brute force for now
10
10
  # check at least for presence of required a-z+ tag names
11
-
11
+ #
12
+ # note: make sure we cover h1/h2/h3/h4/h5/h6 tag w/ number!!
13
+
12
14
  ### ht.gsub( /<[^>]+>/, '' ) - old simple
13
15
 
14
16
  ## todo: add strip comments e.g. <!-- xxxx --> ???
15
17
  ## or use new strip_comments( ht )
16
18
 
17
- ht = ht.gsub( /<([a-z]+)\s*\/>/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
18
- ht = ht.gsub( /<([a-z]+)(\s+[^>]*)?>/i, '' ) # opening tag <p>
19
- ht = ht.gsub( /<\/([a-z]+)\s*>/i, '' ) # closing tag e.g. </p>
19
+
20
+ ## note: follow official xml spec
21
+ ## - allows for first char: (Letter | '_' | ':')
22
+ ## - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
23
+
24
+ tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"
25
+
26
+ empty_tag_pattern = "<#{tag_name_pattern}\\s*/>"
27
+ opening_tag_pattern = "<#{tag_name_pattern}(\\s+[^>]*)?>"
28
+ closing_tag_pattern = "</#{tag_name_pattern}\\s*>"
29
+
30
+ ht = ht.gsub( /#{empty_tag_pattern}/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
31
+ ht = ht.gsub( /#{opening_tag_pattern}/i, '' ) # opening tag <p>
32
+ ht = ht.gsub( /#{closing_tag_pattern}/i, '' ) # closing tag e.g. </p>
20
33
  ht
21
34
  end
22
35
 
@@ -52,8 +65,8 @@ def whitelist( ht, tags, opts={} )
52
65
  # -- note: will NOT strip comments for now e.g. <!-- -->
53
66
  ht = strip_tags( ht )
54
67
 
55
- pp ht # fix: debugging indo - remove
56
-
68
+ ## pp ht # fix: debugging info - remove
69
+
57
70
  ############################################
58
71
  # step three - restore whitelisted tags
59
72
 
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+ module TextUtils
4
+
5
+ # collection of regex patterns for reuse
6
+
7
+ ### todo: add a patterns.md page to github ??
8
+ ## - add regexper pics??
9
+
10
+ ############
11
+ # about ruby regexps
12
+ #
13
+ # try the rubular - Ruby regular expression editor and tester
14
+ # -> http://rubular.com
15
+ # code -> ?? by ??
16
+ #
17
+ #
18
+ # Jeff Avallone's Regexper - Shows State-Automata Diagrams
19
+ # try -> http://regexper.com
20
+ # code -> https://github.com/javallone/regexper
21
+ #
22
+ #
23
+ # Regular Expressions | The Bastards Book of Ruby by Dan Nguyen
24
+ # http://ruby.bastardsbook.com/chapters/regexes/
25
+ #
26
+ # move to notes regex|patterns on geraldb.github.io ??
27
+ #
28
+
29
+ EMPTY_LINE_PATTERN = '^\s*$'
30
+
31
+ #################################
32
+ ### Start of Line Comment Patterns
33
+
34
+ COMMENT_LINE_PATTERN = '^\s*#' # e.g. Ruby/Shell style starting w/ # this is a comment
35
+
36
+ COMMENT_LINE_HASKELL_PATTERN = '^\s*--' # e.g. Haskell/Ada? style starting w/ --
37
+ COMMENT_LINE_ALT_PATTERN = COMMENT_LINE_HASKELL_PATTERN
38
+
39
+ COMMENT_LINE_TEX_PATTERN = '^\s*%' # e.g. TeX/LaTeX style starting w/ %
40
+ COMMENT_LINE_ALT_II_PATTERN = COMMENT_LINE_TEX_PATTERN
41
+
42
+ #############################
43
+ ### End of Line (EOL) Comment Patterns
44
+
45
+ EOL_COMMENT_PATTERN = '\s+#.+$' # fix: use \b word boundary instead of \s - why? why not?
46
+ # why \b - matches next to everything but a-z0-9, that is, spaces but also includes umlauts, special chars etc.
47
+
48
+ ##############
49
+ ## Dates
50
+ #
51
+ # some info at www.regular-expressions.info/dates.html
52
+
53
+ YYYY_STRICT_19_20_PATTERN = '(?:19|20)\d\d' # four-digit year starting w/ 19 or 20
54
+ YYYY_STRICT_20_PATTERN = '20\d\d' # four-digit year starting w/ 20
55
+
56
+ MM_STRICT_PATTERN = '(?:0[1-9]|1[012])' # 01-12; note: (?:) group keeps the | inside when embedded in a bigger regex
57
+ M_STRICT_PATTERN = '(?:0?[1-9]|1[012])' # 1-12 w/ optional leading zero; grouped (see above)
58
+
59
+ DD_STRICT_PATTERN = '(?:0[1-9]|[12][0-9]|3[01])' # 01-31; grouped (see above)
60
+ D_STRICT_PATTERN = '(?:0?[1-9]|[12][0-9]|3[01])' # 1-31 w/ optional leading zero; grouped (see above)
61
+
62
+ ######
63
+ ## Time
64
+
65
+
66
+ end # TextUtils
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
+ module TextUtils
4
+
5
+ class Sanitizier
6
+
7
+ include LogUtils::Logging
8
+
9
+ @@ignore_tags = %w{ head script style }
10
+ @@inline_tags = %w{ span b i u }
11
+ @@block_tags = %w{ p div ul ol }
12
+
13
+
14
+ def initialize( ht )
15
+ @ht = ht # hypertext (html source)
16
+ end
17
+
18
+ def to_plain_text
19
+
20
+ ht = @ht
21
+ ht = handle_ignore_tags( ht )
22
+
23
+ ## handle_pre_tags ?? - special rule for preformatted (keep whitespace)
24
+
25
+ ht = handle_inline_tags( ht )
26
+ ht = handle_block_tags( ht )
27
+ ht = handle_other_tags( ht ) # rules for remain/left over tags
28
+
29
+ ht = handle_entities( ht )
30
+
31
+ ht
32
+ end
33
+
34
+ def handle_entities( ht )
35
+ ## unescape entities
36
+ # - check if it also works for generic entities like &#20; etc.
37
+ # or only for &gt; &lt; etc.
38
+ ht = CGI.unescapeHTML( ht )
39
+ end
40
+
41
+ def tag_regex( tag )
42
+ # note use non-greedy .*? for content
43
+
44
+ /<#{tag}[^>]*>(.*?)<\/#{tag}>/mi
45
+ end
46
+
47
+ def handle_ignore_tags( ht )
48
+ @@ignore_tags.each do |tag|
49
+ ht.gsub!( tag_regex(tag), '' )
50
+ end
51
+ ht
52
+ end
53
+
54
+ def handle_inline_tags( ht )
55
+ @@inline_tags.each do |tag|
56
+ # add a space after
57
+ ht.gsub!( tag_regex(tag), '\1 ' )
58
+ end
59
+ ht
60
+ end
61
+
62
+ def handle_block_tags( ht )
63
+ @@block_tags.each do |tag|
64
+ ht.gsub!( tag_regex(tag), "\n\1\n" )
65
+ end
66
+ ht
67
+ end
68
+
69
+
70
+ end # class Sanitizier
71
+
72
+ end # module TextUtils
@@ -39,8 +39,14 @@ def find_data_path_from_gemfile_gitref( name )
39
39
 
40
40
  # escape chars for regex e.g. . becomes \.
41
41
  name_esc = name.gsub( '.', '\.' )
42
- name_regex = /\/(#{name_esc}-[a-z0-9]+)|(#{name_esc})\/lib$/ # e.g. /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
43
42
 
43
+
44
+ # note:
45
+ # - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
46
+
47
+ # e.g. match /\/((beer\.db-[a-z0-9]{12})|(beer\.db))\/lib$/
48
+
49
+ name_regex = /\/((#{name_esc}-[a-z0-9]{12})|(#{name_esc}))\/lib$/
44
50
  candidates = []
45
51
  $LOAD_PATH.each do |path|
46
52
  if path =~ name_regex
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.7.0'
4
+ VERSION = '0.7.1'
5
5
 
6
6
  end # module TextUtils
data/lib/textutils.rb CHANGED
@@ -16,6 +16,9 @@ require 'logutils'
16
16
 
17
17
  require 'textutils/version'
18
18
 
19
+ require 'textutils/patterns' # regex patterns for reuse
20
+ require 'textutils/sanitizier'
21
+
19
22
  require 'textutils/filter/code_filter'
20
23
  require 'textutils/filter/comment_filter'
21
24
  require 'textutils/filter/erb_django_filter'
@@ -13,6 +13,44 @@ class TestHypertextHelper < MiniTest::Unit::TestCase
13
13
 
14
14
  include TextUtils::HypertextHelper # lets us use textify, etc.
15
15
 
16
+ def test_strip_tags # strip_tags must remove empty, opening and closing tags and keep the text in between
17
+ ## empty tags (xml-style) w/ and w/o space before the slash
18
+ assert_equal '', strip_tags( '<hr />' )
19
+ assert_equal '', strip_tags( '<hr/>' )
20
+ assert_equal '', strip_tags( '<my-emtpy/>' )
21
+ assert_equal '', strip_tags( '<my-emtpy />' )
22
+
23
+ assert_equal 'hello', strip_tags( '<h1>hello</h1>' ) # opening/closing pairs - note: h1/h2 tag names w/ digit
24
+ assert_equal 'hello', strip_tags( '<h2>hello</h2>' )
25
+ assert_equal 'hello', strip_tags( '<p>hello</p>' )
26
+ assert_equal 'hello', strip_tags( '<div>hello</div>' )
27
+ assert_equal 'hello', strip_tags( '<my-header>hello</my-header>' ) # tag name w/ dash
28
+
29
+ assert_equal 'hello', strip_tags( '<h1 id="test">hello</h1>' ) # opening tags w/ attribute
30
+ assert_equal 'hello', strip_tags( '<p id="test">hello</p>' )
31
+ assert_equal 'hello', strip_tags( '<div id="test">hello</div>' )
32
+ assert_equal 'hello', strip_tags( '<my-header id="test">hello</my-header>' )
33
+
34
+ ## check case-insensitive - same checks as above w/ upper and mixed case tag names
35
+ assert_equal '', strip_tags( '<HR />' )
36
+ assert_equal '', strip_tags( '<hR />' )
37
+ assert_equal '', strip_tags( '<Hr />' )
38
+ assert_equal '', strip_tags( '<HR/>' )
39
+ assert_equal '', strip_tags( '<My-EmTpY/>' )
40
+ assert_equal '', strip_tags( '<My-EmTpY />' )
41
+
42
+ assert_equal 'hello', strip_tags( '<H1>hello</H1>' )
43
+ assert_equal 'hello', strip_tags( '<H2>hello</h2>' ) # opening and closing tag may differ in case
44
+ assert_equal 'hello', strip_tags( '<P>hello</P>' )
45
+ assert_equal 'hello', strip_tags( '<DiV>hello</dIv>' )
46
+ assert_equal 'hello', strip_tags( '<mY-hEaDer>hello</MY-HEADER>' )
47
+
48
+ assert_equal 'hello', strip_tags( '<H1 ID="test">hello</h1>' ) # attribute names in upper/mixed case too
49
+ assert_equal 'hello', strip_tags( '<P id="test">hello</p>' )
50
+ assert_equal 'hello', strip_tags( '<DIV Id="test">hello</dIV>' )
51
+ assert_equal 'hello', strip_tags( '<MY-HEADER iD="test">hello</mY-hEaDeR>' )
52
+ end
53
+
16
54
 
17
55
  def test_stylesheet_link_tag
18
56
  hyout = "<link rel='stylesheet' type='text/css' href='hello.css'>"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-10-11 00:00:00.000000000 Z
12
+ date: 2013-12-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &72211900 !ruby/object:Gem::Requirement
16
+ requirement: &21849324 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,31 +21,31 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *72211900
24
+ version_requirements: *21849324
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &72228610 !ruby/object:Gem::Requirement
27
+ requirement: &21848640 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
31
31
  - !ruby/object:Gem::Version
32
- version: '3.10'
32
+ version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *72228610
35
+ version_requirements: *21848640
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &72228310 !ruby/object:Gem::Requirement
38
+ requirement: &21848040 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
42
42
  - !ruby/object:Gem::Version
43
- version: '3.3'
43
+ version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *72228310
46
+ version_requirements: *21848040
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
- email: webslideshow@googlegroups.com
48
+ email: ruby-talk@ruby-lang.org
49
49
  executables: []
50
50
  extensions: []
51
51
  extra_rdoc_files:
@@ -69,11 +69,13 @@ files:
69
69
  - lib/textutils/helper/unicode_helper.rb
70
70
  - lib/textutils/helper/value_helper.rb
71
71
  - lib/textutils/helper/xml_helper.rb
72
+ - lib/textutils/patterns.rb
72
73
  - lib/textutils/reader/code_reader.rb
73
74
  - lib/textutils/reader/fixture_reader.rb
74
75
  - lib/textutils/reader/hash_reader.rb
75
76
  - lib/textutils/reader/line_reader.rb
76
77
  - lib/textutils/reader/values_reader.rb
78
+ - lib/textutils/sanitizier.rb
77
79
  - lib/textutils/utils.rb
78
80
  - lib/textutils/version.rb
79
81
  - test/helper.rb
@@ -104,11 +106,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
106
  version: '0'
105
107
  requirements: []
106
108
  rubyforge_project: textutils
107
- rubygems_version: 1.8.17
109
+ rubygems_version: 1.8.16
108
110
  signing_key:
109
111
  specification_version: 3
110
112
  summary: textutils - Text Filters, Helpers, Readers and More
111
113
  test_files:
112
- - test/test_unicode_helper.rb
113
114
  - test/test_hypertext_helper.rb
114
115
  - test/test_title_helper.rb
116
+ - test/test_unicode_helper.rb