RubyGems - textutils - Versions diffs - 0.7.0 → 0.7.1 - Mend

textutils 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

data/Manifest.txt +2 -0
data/README.markdown +1 -1
data/Rakefile +1 -1
data/lib/textutils/helper/hypertext_helper.rb +19 -6
data/lib/textutils/patterns.rb +66 -0
data/lib/textutils/sanitizier.rb +72 -0
data/lib/textutils/utils.rb +7 -1
data/lib/textutils/version.rb +1 -1
data/lib/textutils.rb +3 -0
data/test/test_hypertext_helper.rb +38 -0
metadata +15 -13

data/Manifest.txt CHANGED Viewed

@@ -16,11 +16,13 @@ lib/textutils/helper/title_helper.rb
 lib/textutils/helper/unicode_helper.rb
 lib/textutils/helper/value_helper.rb
 lib/textutils/helper/xml_helper.rb
+lib/textutils/patterns.rb
 lib/textutils/reader/code_reader.rb
 lib/textutils/reader/fixture_reader.rb
 lib/textutils/reader/hash_reader.rb
 lib/textutils/reader/line_reader.rb
 lib/textutils/reader/values_reader.rb
+lib/textutils/sanitizier.rb
 lib/textutils/utils.rb
 lib/textutils/version.rb
 test/helper.rb

data/README.markdown CHANGED Viewed

@@ -4,7 +4,7 @@
 * bugs  :: [github.com/rubylibs/textutils/issues](https://github.com/rubylibs/textutils/issues)
 * gem   :: [rubygems.org/gems/textutils](https://rubygems.org/gems/textutils)
 * rdoc  :: [rubydoc.info/gems/textutils](http://rubydoc.info/gems/textutils)
-* forum :: [groups.google.com/group/webslideshow](https://groups.google.com/group/webslideshow)
+* forum :: [ruby-talk@ruby-lang.org](www.ruby-lang.org/en/community/mailing-lists/)
 ## Filters

data/Rakefile CHANGED Viewed

@@ -11,7 +11,7 @@ Hoe.spec 'textutils' do
   self.urls    = ['https://github.com/rubylibs/textutils']
   self.author  = 'Gerald Bauer'
-  self.email   = 'webslideshow@googlegroups.com'
+  self.email   = 'ruby-talk@ruby-lang.org'
   # switch extension to .markdown for gihub formatting
   self.readme_file  = 'README.markdown'

data/lib/textutils/helper/hypertext_helper.rb CHANGED Viewed

@@ -8,15 +8,28 @@ def strip_tags( ht )
   ### to be done
   ## strip markup tags; return plain text; use brute force for now
   # check at least for presence of required a-z+ tag names
+  #
+  #  note: make sure we cover h1/h2/h3/h4/h5/h6  tag w/ number!!
   ### ht.gsub( /<[^>]+>/, '' ) - old simple
   ## todo: add strip comments e.g. <!-- xxxx --> ???
   ##  or use new strip_comments( ht )
-  ht = ht.gsub( /<([a-z]+)\s*\/>/i, '' )       # remove xml-style empty tags eg. <br /> or <br/>
-  ht = ht.gsub( /<([a-z]+)(\s+[^>]*)?>/i, '' ) # opening tag <p>
-  ht = ht.gsub( /<\/([a-z]+)\s*>/i, '' )       # closing tag e.g. </p>
+  ## note: follow offical xml spec
+  ##  - allows for first char:  (Letter | '_' | ':')
+  ##  - allows for followup chars: (Letter | Digit | '_' | ':' | '.' | '-')
+  tag_name_pattern = "[a-z_:][a-z0-9_:.\\-]*"
+  empty_tag_pattern   =  "<#{tag_name_pattern}\\s*/>"
+  opening_tag_pattern =  "<#{tag_name_pattern}(\\s+[^>]*)?>"
+  closing_tag_pattern =  "</#{tag_name_pattern}\\s*>"
+  ht = ht.gsub( /#{empty_tag_pattern}/i, '' )    # remove xml-style empty tags eg. <br /> or <br/>
+  ht = ht.gsub( /#{opening_tag_pattern}/i, '' )  # opening tag <p>
+  ht = ht.gsub( /#{closing_tag_pattern}/i, '' )  # closing tag e.g. </p>
   ht
 end
@@ -52,8 +65,8 @@ def whitelist( ht, tags, opts={} )
   #  -- note: will NOT strip comments for now e.g. <!-- -->
   ht = strip_tags( ht )
-  pp ht  # fix: debugging indo - remove
+  ## pp ht  # fix: debugging indo - remove
   ############################################
   # step three - restore whitelisted tags

data/lib/textutils/patterns.rb ADDED Viewed

@@ -0,0 +1,66 @@
+# encoding: utf-8
+module TextUtils
+# collection of regex patterns for reuse
+### todo: add a patterns.md page to  github ??
+##  - add regexper pics??
+############
+# about ruby regexps
+#
+# try the rubular - Ruby regular expression editor and tester
+#  -> http://rubular.com
+#   code -> ??  by ??
+#
+#
+# Jeff Avallone's Regexper - Shows State-Automata Diagrams
+#  try -> http://regexper.com
+#    code -> https://github.com/javallone/regexper
+#
+#
+#  Regular Expressions | The Bastards Book of Ruby by Dan Nguyen
+# http://ruby.bastardsbook.com/chapters/regexes/
+#
+# move to notes  regex|patterns on  geraldb.github.io ??
+#
+  EMPTY_LINE_PATTERN = '^\s*$'
+  #################################
+  ### Start of Line Comment Patterns
+  COMMENT_LINE_PATTERN = '^\s*#'   # e.g. Ruby/Shell style  starting w/  # this is a comment
+  COMMENT_LINE_HASKELL_PATTERN = '^\s*--'   # e.g. Haskell/Ada? style starting w/ --
+  COMMENT_LINE_ALT_PATTERN = COMMENT_LINE_HASKELL_PATTERN
+  COMMENT_LINE_TEX_PATTERN = '^\s*%'   # e.g. TeX/LaTeX style starting w/ %
+  COMMENT_LINE_ALT_II_PATTERN = COMMENT_LINE_TEX_PATTERN
+  #############################
+  ### End of Line (EOL) Comment Patterns
+  EOL_COMMENT_PATTERN = '\s+#.+$'    # fix: use \b word boundry instead of \s - why why not?
+  # why /b  - everything but a-z0-9, that is, spaces but also includes umlauts, special chars etc.
+  ##############
+  ## Dates
+  #
+  # some info at www.regular-expressions.info/dates.html
+  YYYY_STRICT_19_20_PATTERN = '(?:19|20)\d\d'
+  YYYY_STRICT_20_PATTERN = '20\d\d'
+  MM_STRICT_PATTERN = '0[1-9]|1[012]'
+  M_STRICT_PATTERN =  '0?[1-9]|1[012]'
+  DD_STRICT_PATTERN = '0[1-9]|[12][0-9]|3[01]'
+  D_STRICT_PATTERN =  '0?[1-9]|[12][0-9]|3[01]'
+  ######
+  ## Time
+end # TextUtils

data/lib/textutils/sanitizier.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# encoding: utf-8
+module TextUtils
+class Sanitizier
+  include LogUtils::Logging
+  @@ignore_tags = %w{ head script style }
+  @@inline_tags = %w{ span b i u }
+  @@block_tags  = %w{ p div ul ol }
+  def initialize( ht )
+    @ht = ht  # hypertext (html source)
+  end
+  def to_plain_text
+    ht = @ht
+    ht = handle_ignore_tags( ht )
+## handle_pre_tags ??  - special rule for preformatted (keep whitespace)
+    ht = handle_inline_tags( ht )
+    ht = handle_block_tags( ht )
+    ht = handle_other_tags( ht )  # rules for remain/left over tags
+    ht = handle_entities( ht )
+    ht
+  end
+  def handle_entities( ht )
+    ## unescape entities
+    #  - check if it also works for generic entities like &#20; etc.
+    #  or only for &gt; &lt; etc.
+    ht = CGI.unescapeHTML( ht )
+  end
+  def tag_regex( tag )
+    # note use non-greedy .*? for content
+    /<#{tag}[^>]*>(.*?)<\/#{tag}>/mi
+  end
+  def handle_ignore_tags( ht )
+    @@ignore_tags.each do |tag|
+      ht.gsub!( tag_regex(tag), '' )
+    end
+    ht
+  end
+  def handle_inline_tags( ht )
+    @@inline_tags.each do |tag|
+      # add a space after
+      ht.gsub!( tag_regex(tag), '\1 ' )
+    end
+    ht
+  end
+  def handle_block_tags( ht )
+    @@block_tags.each do |tag|
+      ht.gsub!( tag_regex(tag), "\n\1\n" )
+    end
+    ht
+  end
+end # class Sanitizier
+end # module TextUtils

data/lib/textutils/utils.rb CHANGED Viewed

@@ -39,8 +39,14 @@ def find_data_path_from_gemfile_gitref( name )
   # escape chars for regex e.g. . becomes \.
   name_esc = name.gsub( '.', '\.' )
-  name_regex = /\/(#{name_esc}-[a-z0-9]+)|(#{name_esc})\/lib$/  # e.g. /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
+  # note:
+  #  - hexdigest must be 12 chars e.g. b7d1c9619a54 or similar
+  # e.g. match /\/(beer\.db-[a-z0-9]+)|(beer\.db)\//
+  name_regex = /\/((#{name_esc}-[a-z0-9]{12})|(#{name_esc}))\/lib$/
   candidates = []
   $LOAD_PATH.each do |path|
     if path =~ name_regex

data/lib/textutils/version.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module TextUtils
-  VERSION = '0.7.0'
+  VERSION = '0.7.1'
 end   # module TextUtils

data/lib/textutils.rb CHANGED Viewed

@@ -16,6 +16,9 @@ require 'logutils'
 require 'textutils/version'
+require 'textutils/patterns'   # regex patterns for reuse
+require 'textutils/sanitizier'
 require 'textutils/filter/code_filter'
 require 'textutils/filter/comment_filter'
 require 'textutils/filter/erb_django_filter'

data/test/test_hypertext_helper.rb CHANGED Viewed

@@ -13,6 +13,44 @@ class TestHypertextHelper < MiniTest::Unit::TestCase
   include TextUtils::HypertextHelper   #  lets us use textify, etc.
+  def test_strip_tags
+    ## empty tags
+    assert_equal '', strip_tags( '<hr />' )
+    assert_equal '', strip_tags( '<hr/>' )
+    assert_equal '', strip_tags( '<my-emtpy/>' )
+    assert_equal '', strip_tags( '<my-emtpy />' )
+    assert_equal 'hello', strip_tags( '<h1>hello</h1>' )
+    assert_equal 'hello', strip_tags( '<h2>hello</h2>' )
+    assert_equal 'hello', strip_tags( '<p>hello</p>' )
+    assert_equal 'hello', strip_tags( '<div>hello</div>' )
+    assert_equal 'hello', strip_tags( '<my-header>hello</my-header>' )
+    assert_equal 'hello', strip_tags( '<h1 id="test">hello</h1>' )
+    assert_equal 'hello', strip_tags( '<p id="test">hello</p>' )
+    assert_equal 'hello', strip_tags( '<div id="test">hello</div>' )
+    assert_equal 'hello', strip_tags( '<my-header id="test">hello</my-header>' )
+    ## check case in-sensitive
+    assert_equal '', strip_tags( '<HR />' )
+    assert_equal '', strip_tags( '<hR />' )
+    assert_equal '', strip_tags( '<Hr />' )
+    assert_equal '', strip_tags( '<HR/>' )
+    assert_equal '', strip_tags( '<My-EmTpY/>' )
+    assert_equal '', strip_tags( '<My-EmTpY />' )
+    assert_equal 'hello', strip_tags( '<H1>hello</H1>' )
+    assert_equal 'hello', strip_tags( '<H2>hello</h2>' )
+    assert_equal 'hello', strip_tags( '<P>hello</P>' )
+    assert_equal 'hello', strip_tags( '<DiV>hello</dIv>' )
+    assert_equal 'hello', strip_tags( '<mY-hEaDer>hello</MY-HEADER>' )
+    assert_equal 'hello', strip_tags( '<H1 ID="test">hello</h1>' )
+    assert_equal 'hello', strip_tags( '<P id="test">hello</p>' )
+    assert_equal 'hello', strip_tags( '<DIV Id="test">hello</dIV>' )
+    assert_equal 'hello', strip_tags( '<MY-HEADER iD="test">hello</mY-hEaDeR>' )
+  end
   def test_stylesheet_link_tag
     hyout = "<link rel='stylesheet' type='text/css' href='hello.css'>"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: textutils
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.7.1
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-10-11 00:00:00.000000000 Z
+date: 2013-12-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: logutils
-  requirement: &72211900 !ruby/object:Gem::Requirement
+  requirement: &21849324 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
@@ -21,31 +21,31 @@ dependencies:
         version: '0.5'
   type: :runtime
   prerelease: false
-  version_requirements: *72211900
+  version_requirements: *21849324
 - !ruby/object:Gem::Dependency
   name: rdoc
-  requirement: &72228610 !ruby/object:Gem::Requirement
+  requirement: &21848640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '3.10'
+        version: '4.0'
   type: :development
   prerelease: false
-  version_requirements: *72228610
+  version_requirements: *21848640
 - !ruby/object:Gem::Dependency
   name: hoe
-  requirement: &72228310 !ruby/object:Gem::Requirement
+  requirement: &21848040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: '3.3'
+        version: '3.7'
   type: :development
   prerelease: false
-  version_requirements: *72228310
+  version_requirements: *21848040
 description: textutils - Text Filters, Helpers, Readers and More
-email: webslideshow@googlegroups.com
+email: ruby-talk@ruby-lang.org
 executables: []
 extensions: []
 extra_rdoc_files:
@@ -69,11 +69,13 @@ files:
 - lib/textutils/helper/unicode_helper.rb
 - lib/textutils/helper/value_helper.rb
 - lib/textutils/helper/xml_helper.rb
+- lib/textutils/patterns.rb
 - lib/textutils/reader/code_reader.rb
 - lib/textutils/reader/fixture_reader.rb
 - lib/textutils/reader/hash_reader.rb
 - lib/textutils/reader/line_reader.rb
 - lib/textutils/reader/values_reader.rb
+- lib/textutils/sanitizier.rb
 - lib/textutils/utils.rb
 - lib/textutils/version.rb
 - test/helper.rb
@@ -104,11 +106,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project: textutils
-rubygems_version: 1.8.17
+rubygems_version: 1.8.16
 signing_key:
 specification_version: 3
 summary: textutils - Text Filters, Helpers, Readers and More
 test_files:
-- test/test_unicode_helper.rb
 - test/test_hypertext_helper.rb
 - test/test_title_helper.rb
+- test/test_unicode_helper.rb