textutils 0.7.0 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -16,11 +16,13 @@ lib/textutils/helper/title_helper.rb
16
16
  lib/textutils/helper/unicode_helper.rb
17
17
  lib/textutils/helper/value_helper.rb
18
18
  lib/textutils/helper/xml_helper.rb
19
+ lib/textutils/patterns.rb
19
20
  lib/textutils/reader/code_reader.rb
20
21
  lib/textutils/reader/fixture_reader.rb
21
22
  lib/textutils/reader/hash_reader.rb
22
23
  lib/textutils/reader/line_reader.rb
23
24
  lib/textutils/reader/values_reader.rb
25
+ lib/textutils/sanitizier.rb
24
26
  lib/textutils/utils.rb
25
27
  lib/textutils/version.rb
26
28
  test/helper.rb
data/README.markdown CHANGED
@@ -4,7 +4,7 @@
4
4
  * bugs :: [github.com/rubylibs/textutils/issues](https://github.com/rubylibs/textutils/issues)
5
5
  * gem :: [rubygems.org/gems/textutils](https://rubygems.org/gems/textutils)
6
6
  * rdoc :: [rubydoc.info/gems/textutils](http://rubydoc.info/gems/textutils)
7
- * forum :: [groups.google.com/group/webslideshow](https://groups.google.com/group/webslideshow)
7
+ * forum :: [ruby-talk@ruby-lang.org](https://www.ruby-lang.org/en/community/mailing-lists/)
8
8
 
9
9
 
10
10
  ## Filters
data/Rakefile CHANGED
@@ -11,7 +11,7 @@ Hoe.spec 'textutils' do
11
11
  self.urls = ['https://github.com/rubylibs/textutils']
12
12
 
13
13
  self.author = 'Gerald Bauer'
14
- self.email = 'webslideshow@googlegroups.com'
14
+ self.email = 'ruby-talk@ruby-lang.org'
15
15
 
16
16
  # switch extension to .markdown for github formatting
17
17
  self.readme_file = 'README.markdown'
@@ -8,15 +8,28 @@ def strip_tags( ht )
8
8
  ### to be done
9
9
  ## strip markup tags; return plain text; use brute force for now
10
10
  # check at least for presence of required a-z+ tag names
11
-
11
+ #
12
+ # note: make sure we cover h1/h2/h3/h4/h5/h6 tag w/ number!!
13
+
12
14
  ### ht.gsub( /<[^>]+>/, '' ) - old simple
13
15
 
14
16
  ## todo: add strip comments e.g. <!-- xxxx --> ???
15
17
  ## or use new strip_comments( ht )
16
18
 
17
- ht = ht.gsub( /<([a-z]+)\s*\/>/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
18
- ht = ht.gsub( /<([a-z]+)(\s+[^>]*)?>/i, '' ) # opening tag <p>
19
- ht = ht.gsub( /<\/([a-z]+)\s*>/i, '' ) # closing tag e.g. </p>
19
+
20
+ ## note: follow official xml spec
21
+ ## - allows for first char: (Letter | '_' | ':')
22
+ ## - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
23
+
24
+ tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"
25
+
26
+ empty_tag_pattern = "<#{tag_name_pattern}\\s*/>"
27
+ opening_tag_pattern = "<#{tag_name_pattern}(\\s+[^>]*)?>"
28
+ closing_tag_pattern = "</#{tag_name_pattern}\\s*>"
29
+
30
+ ht = ht.gsub( /#{empty_tag_pattern}/i, '' ) # remove xml-style empty tags eg. <br /> or <br/>
31
+ ht = ht.gsub( /#{opening_tag_pattern}/i, '' ) # opening tag <p>
32
+ ht = ht.gsub( /#{closing_tag_pattern}/i, '' ) # closing tag e.g. </p>
20
33
  ht
21
34
  end
22
35
 
@@ -52,8 +65,8 @@ def whitelist( ht, tags, opts={} )
52
65
  # -- note: will NOT strip comments for now e.g. <!-- -->
53
66
  ht = strip_tags( ht )
54
67
 
55
- pp ht # fix: debugging info - remove
56
-
68
+ ## pp ht # fix: debugging info - remove
69
+
57
70
  ############################################
58
71
  # step three - restore whitelisted tags
59
72
 
@@ -0,0 +1,66 @@
1
+ # encoding: utf-8
2
+
3
+ module TextUtils
4
+
5
+ # collection of regex patterns for reuse
6
+
7
+ ### todo: add a patterns.md page to github ??
8
+ ## - add regexper pics??
9
+
10
+ ############
11
+ # about ruby regexps
12
+ #
13
+ # try the rubular - Ruby regular expression editor and tester
14
+ # -> http://rubular.com
15
+ # code -> ?? by ??
16
+ #
17
+ #
18
+ # Jeff Avallone's Regexper - Shows State-Automata Diagrams
19
+ # try -> http://regexper.com
20
+ # code -> https://github.com/javallone/regexper
21
+ #
22
+ #
23
+ # Regular Expressions | The Bastards Book of Ruby by Dan Nguyen
24
+ # http://ruby.bastardsbook.com/chapters/regexes/
25
+ #
26
+ # move to notes regex|patterns on geraldb.github.io ??
27
+ #
28
+
29
+ EMPTY_LINE_PATTERN = '^\s*$'
30
+
31
+ #################################
32
+ ### Start of Line Comment Patterns
33
+
34
+ COMMENT_LINE_PATTERN = '^\s*#' # e.g. Ruby/Shell style starting w/ # this is a comment
35
+
36
+ COMMENT_LINE_HASKELL_PATTERN = '^\s*--' # e.g. Haskell/Ada? style starting w/ --
37
+ COMMENT_LINE_ALT_PATTERN = COMMENT_LINE_HASKELL_PATTERN
38
+
39
+ COMMENT_LINE_TEX_PATTERN = '^\s*%' # e.g. TeX/LaTeX style starting w/ %
40
+ COMMENT_LINE_ALT_II_PATTERN = COMMENT_LINE_TEX_PATTERN
41
+
42
+ #############################
43
+ ### End of Line (EOL) Comment Patterns
44
+
45
+ EOL_COMMENT_PATTERN = '\s+#.+$' # fix: use \b word boundary instead of \s - why? why not?
46
+ # why \b - everything but a-z0-9, that is, spaces but also includes umlauts, special chars etc.
47
+
48
+ ##############
49
+ ## Dates
50
+ #
51
+ # some info at www.regular-expressions.info/dates.html
52
+
53
+ YYYY_STRICT_19_20_PATTERN = '(?:19|20)\d\d'
54
+ YYYY_STRICT_20_PATTERN = '20\d\d'
55
+
56
+ MM_STRICT_PATTERN = '0[1-9]|1[012]'
57
+ M_STRICT_PATTERN = '0?[1-9]|1[012]'
58
+
59
+ DD_STRICT_PATTERN = '0[1-9]|[12][0-9]|3[01]'
60
+ D_STRICT_PATTERN = '0?[1-9]|[12][0-9]|3[01]'
61
+
62
+ ######
63
+ ## Time
64
+
65
+
66
+ end # TextUtils
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
+
3
+ module TextUtils
4
+
5
+ class Sanitizier
6
+
7
+ include LogUtils::Logging
8
+
9
+ @@ignore_tags = %w{ head script style }
10
+ @@inline_tags = %w{ span b i u }
11
+ @@block_tags = %w{ p div ul ol }
12
+
13
+
14
+ def initialize( ht )
15
+ @ht = ht # hypertext (html source)
16
+ end
17
+
18
+ def to_plain_text
19
+
20
+ ht = @ht
21
+ ht = handle_ignore_tags( ht )
22
+
23
+ ## handle_pre_tags ?? - special rule for preformatted (keep whitespace)
24
+
25
+ ht = handle_inline_tags( ht )
26
+ ht = handle_block_tags( ht )
27
+ ht = handle_other_tags( ht ) # rules for remaining/leftover tags
28
+
29
+ ht = handle_entities( ht )
30
+
31
+ ht
32
+ end
33
+
34
+ def handle_entities( ht )
35
+ ## unescape entities
36
+ # - check if it also works for generic entities like &#20; etc.
37
+ # or only for &gt; &lt; etc.
38
+ ht = CGI.unescapeHTML( ht )
39
+ end
40
+
41
+ def tag_regex( tag )
42
+ # note use non-greedy .*? for content
43
+
44
+ /<#{tag}[^>]*>(.*?)<\/#{tag}>/mi
45
+ end
46
+
47
+ def handle_ignore_tags( ht )
48
+ @@ignore_tags.each do |tag|
49
+ ht.gsub!( tag_regex(tag), '' )
50
+ end
51
+ ht
52
+ end
53
+
54
+ def handle_inline_tags( ht )
55
+ @@inline_tags.each do |tag|
56
+ # add a space after
57
+ ht.gsub!( tag_regex(tag), '\1 ' )
58
+ end
59
+ ht
60
+ end
61
+
62
+ def handle_block_tags( ht )
63
+ @@block_tags.each do |tag|
64
+ ht.gsub!( tag_regex(tag), "\n\1\n" )
65
+ end
66
+ ht
67
+ end
68
+
69
+
70
+ end # class Sanitizier
71
+
72
+ end # module TextUtils
@@ -39,8 +39,14 @@ def find_data_path_from_gemfile_gitref( name )
39
39
 
40
40
  # escape chars for regex e.g. . becomes \.
41
41
  name_esc = name.gsub( '.', '\.' )
42
- name_regex = /\/(#{name_esc}-[a-z0-9]+)|(#{name_esc})\/lib$/ # e.g. /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
43
42
 
43
+
44
+ # note:
45
+ # - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
46
+
47
+ # e.g. match /\/((beer\.db-[a-z0-9]{12})|(beer\.db))\/lib$/
48
+
49
+ name_regex = /\/((#{name_esc}-[a-z0-9]{12})|(#{name_esc}))\/lib$/
44
50
  candidates = []
45
51
  $LOAD_PATH.each do |path|
46
52
  if path =~ name_regex
@@ -1,6 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.7.0'
4
+ VERSION = '0.7.1'
5
5
 
6
6
  end # module TextUtils
data/lib/textutils.rb CHANGED
@@ -16,6 +16,9 @@ require 'logutils'
16
16
 
17
17
  require 'textutils/version'
18
18
 
19
+ require 'textutils/patterns' # regex patterns for reuse
20
+ require 'textutils/sanitizier'
21
+
19
22
  require 'textutils/filter/code_filter'
20
23
  require 'textutils/filter/comment_filter'
21
24
  require 'textutils/filter/erb_django_filter'
@@ -13,6 +13,44 @@ class TestHypertextHelper < MiniTest::Unit::TestCase
13
13
 
14
14
  include TextUtils::HypertextHelper # lets us use textify, etc.
15
15
 
16
+ def test_strip_tags
17
+ ## empty tags
18
+ assert_equal '', strip_tags( '<hr />' )
19
+ assert_equal '', strip_tags( '<hr/>' )
20
+ assert_equal '', strip_tags( '<my-emtpy/>' )
21
+ assert_equal '', strip_tags( '<my-emtpy />' )
22
+
23
+ assert_equal 'hello', strip_tags( '<h1>hello</h1>' )
24
+ assert_equal 'hello', strip_tags( '<h2>hello</h2>' )
25
+ assert_equal 'hello', strip_tags( '<p>hello</p>' )
26
+ assert_equal 'hello', strip_tags( '<div>hello</div>' )
27
+ assert_equal 'hello', strip_tags( '<my-header>hello</my-header>' )
28
+
29
+ assert_equal 'hello', strip_tags( '<h1 id="test">hello</h1>' )
30
+ assert_equal 'hello', strip_tags( '<p id="test">hello</p>' )
31
+ assert_equal 'hello', strip_tags( '<div id="test">hello</div>' )
32
+ assert_equal 'hello', strip_tags( '<my-header id="test">hello</my-header>' )
33
+
34
+ ## check case in-sensitive
35
+ assert_equal '', strip_tags( '<HR />' )
36
+ assert_equal '', strip_tags( '<hR />' )
37
+ assert_equal '', strip_tags( '<Hr />' )
38
+ assert_equal '', strip_tags( '<HR/>' )
39
+ assert_equal '', strip_tags( '<My-EmTpY/>' )
40
+ assert_equal '', strip_tags( '<My-EmTpY />' )
41
+
42
+ assert_equal 'hello', strip_tags( '<H1>hello</H1>' )
43
+ assert_equal 'hello', strip_tags( '<H2>hello</h2>' )
44
+ assert_equal 'hello', strip_tags( '<P>hello</P>' )
45
+ assert_equal 'hello', strip_tags( '<DiV>hello</dIv>' )
46
+ assert_equal 'hello', strip_tags( '<mY-hEaDer>hello</MY-HEADER>' )
47
+
48
+ assert_equal 'hello', strip_tags( '<H1 ID="test">hello</h1>' )
49
+ assert_equal 'hello', strip_tags( '<P id="test">hello</p>' )
50
+ assert_equal 'hello', strip_tags( '<DIV Id="test">hello</dIV>' )
51
+ assert_equal 'hello', strip_tags( '<MY-HEADER iD="test">hello</mY-hEaDeR>' )
52
+ end
53
+
16
54
 
17
55
  def test_stylesheet_link_tag
18
56
  hyout = "<link rel='stylesheet' type='text/css' href='hello.css'>"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-10-11 00:00:00.000000000 Z
12
+ date: 2013-12-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &72211900 !ruby/object:Gem::Requirement
16
+ requirement: &21849324 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,31 +21,31 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *72211900
24
+ version_requirements: *21849324
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &72228610 !ruby/object:Gem::Requirement
27
+ requirement: &21848640 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
31
31
  - !ruby/object:Gem::Version
32
- version: '3.10'
32
+ version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *72228610
35
+ version_requirements: *21848640
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &72228310 !ruby/object:Gem::Requirement
38
+ requirement: &21848040 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
42
42
  - !ruby/object:Gem::Version
43
- version: '3.3'
43
+ version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *72228310
46
+ version_requirements: *21848040
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
- email: webslideshow@googlegroups.com
48
+ email: ruby-talk@ruby-lang.org
49
49
  executables: []
50
50
  extensions: []
51
51
  extra_rdoc_files:
@@ -69,11 +69,13 @@ files:
69
69
  - lib/textutils/helper/unicode_helper.rb
70
70
  - lib/textutils/helper/value_helper.rb
71
71
  - lib/textutils/helper/xml_helper.rb
72
+ - lib/textutils/patterns.rb
72
73
  - lib/textutils/reader/code_reader.rb
73
74
  - lib/textutils/reader/fixture_reader.rb
74
75
  - lib/textutils/reader/hash_reader.rb
75
76
  - lib/textutils/reader/line_reader.rb
76
77
  - lib/textutils/reader/values_reader.rb
78
+ - lib/textutils/sanitizier.rb
77
79
  - lib/textutils/utils.rb
78
80
  - lib/textutils/version.rb
79
81
  - test/helper.rb
@@ -104,11 +106,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
106
  version: '0'
105
107
  requirements: []
106
108
  rubyforge_project: textutils
107
- rubygems_version: 1.8.17
109
+ rubygems_version: 1.8.16
108
110
  signing_key:
109
111
  specification_version: 3
110
112
  summary: textutils - Text Filters, Helpers, Readers and More
111
113
  test_files:
112
- - test/test_unicode_helper.rb
113
114
  - test/test_hypertext_helper.rb
114
115
  - test/test_title_helper.rb
116
+ - test/test_unicode_helper.rb