RubyGems - iudex-html - Versions diffs - 1.0.0-java - Mend

iudex-html 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/History.rdoc +2 -0
data/Manifest.txt +24 -0
data/README.rdoc +25 -0
data/Rakefile +53 -0
data/build/HTML.java.erb +91 -0
data/build/attributes +82 -0
data/build/java_generate.rb +139 -0
data/build/tags +130 -0
data/lib/iudex-html.rb +56 -0
data/lib/iudex-html/base.rb +21 -0
data/lib/iudex-html/factory_helper.rb +95 -0
data/lib/iudex-html/iudex-html-1.0.0.jar +0 -0
data/pom.xml +51 -0
data/test/html_test_helper.rb +100 -0
data/test/setup.rb +38 -0
data/test/test_characters_normalizer.rb +81 -0
data/test/test_extract_filter.rb +165 -0
data/test/test_factory_helper.rb +51 -0
data/test/test_html_parser.rb +128 -0
data/test/test_other_filters.rb +51 -0
data/test/test_other_tree_filters.rb +124 -0
data/test/test_parse_filter.rb +72 -0
data/test/test_tree_walker.rb +94 -0
data/test/test_word_counters.rb +96 -0
metadata +162 -0

data/test/test_extract_filter.rb ADDED Viewed

@@ -0,0 +1,165 @@
+#!/usr/bin/env jruby
+# -*- coding: utf-8 -*-
+#.hashdot.profile += jruby-shortlived
+#--
+# Copyright (c) 2010-2011 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+require File.join( File.dirname( __FILE__ ), "setup" )
+class TestExtractFilter < MiniTest::Unit::TestCase
+  include HTMLTestHelper
+  include Iudex::HTML::Filters
+  include Iudex::Filter::KeyHelper
+  include Iudex::HTML::Tree::Filters
+  Order = HTMLTreeFilter::Order
+  def test_single_tree
+    htests = [ [ nil,                    nil ],
+               [ '<div></div>',          nil ],
+               [ '<div>too short</div>', nil ],
+               [ <<-'HTML', nil ],
+                 <div>
+                   <p>too short</p>
+                 </div>
+                 HTML
+               [ <<-'HTML', nil ],
+                 <div>a1 a2<br/>
+                      a3 a4</div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4" ],
+                 <div>a1 a2 a3 a4</div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4" ],
+                 <div>
+                   <p>a1 a2 a3 a4</p>
+                 </div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4" ],
+                 <div>a1 a2 a3 a4<br/>
+                      not part of extract</div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4" ],
+                 <div>not<br/>
+                      a1 a2 a3 a4
+                      </div>
+                 HTML
+               [ <<-'HTML', "A more substantive paragraph." ],
+                 <div>
+                   <p>Short junk</p>
+                   <hr/>
+                   <p>A more <i>substantive </i>paragraph.</p>
+                   <p>A similarly <i>substantive </i>paragraph.</p>
+                 </div>
+                 HTML
+               [ <<-'HTML', "A more substantive paragraph." ],
+                 <i>
+                   <p>Short junk</p>
+                   <i>
+                     <hr/>
+                     <p>A more <i>substantive </i>paragraph.</p>
+                   </i>
+                   <p>A similarly <i>substantive </i>paragraph.</p>
+                 </i>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
+                 <div>
+                   <p>a1 a2 a3</p>
+                   <p>a1 a2 a3 a4 a5</p>
+                   <p>a1 a2 a3 a4 a5 a6</p>
+                   <p>a1 a2 a3 a4 a5 a6 a7</p>
+                   <p>a1 a2 a3 a4 a5 a6 a7 a8</p>
+                   <p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
+                 </div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
+                 <div>
+                   <p>a1 a2 a3 a4 a5 a6 a7</p>
+                   <p>a1 a2 a3</p>
+                   <p>a1 a2 a3 a4 a5</p>
+                   <p>a1 a2 a3 a4 a5 a6</p>
+                   <div>
+                      <p>a1 a2 a3 a4 a5 a6 a7 a8</p>
+                         a1 a2 a3 a4 a5 a6 a7 a8 a9
+                   </div>
+                 </div>
+                 HTML
+               [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
+                 <div>
+                   <p>a1 a2 a3</p>
+                   <p>a1 a2 a3 a4 a5</p>
+                   <p>a1 a2 a3 a4 a5 a6</p>
+                   <p>a1 a2 a3 a4 a5 a6 a7</p>
+                   <div>
+                        a1 a2 a3 a4 a5 a6 a7 a8
+                     <p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
+                   </div>
+                 </div>
+                 HTML
+             ]
+    htests.each do | html, exp_extract |
+      map = ( content( html ) if html ) || UniMap.new
+      tfc = TreeFilterChain.new( [ CharactersNormalizer.new,
+                                   WordCounter.new ] )
+      fc = [ HTMLTreeFilter.new( :source_tree.to_k, tfc, Order::DEPTH_FIRST ),
+             ExtractFilter.new( [ :source_tree.to_k ] ) ]
+      chain = filter_chain( fc, :fragment )
+      assert( chain.filter( map ) )
+      assert_equal( exp_extract, map.extract && map.extract.to_s,
+                    "from:\n" + html.to_s )
+    end
+  end
+  def test_multi_tree
+    htests = [ [ nil, nil, nil ],
+               [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4' ],
+                  <div>too short</div>
+                 HTML1
+                  <div>a1 a2 a3 a4</div>
+                 HTML2
+               [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4' ],
+                  <div>a1 a2 a3 a4</div>
+                 HTML1
+                  <div>too short</div>
+                 HTML2
+               [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4 a5 a6 a7 a8' ],
+                  <div>a1 a2 a3 a4 a5 a6 a7 a8</div>
+                 HTML1
+                  <div>a1 a2 a3 a4 a5 a6 a7 a8 a9</div>
+                 HTML2
+             ]
+    htests.each do | summary, content, exp_extract |
+      map = UniMap.new
+      map.summary = summary
+      map.content = content
+      filters = [ html_clean_filters( :summary ),
+                  html_clean_filters( :content ),
+                  ExtractFilter.new( keys( :summary_tree, :content_tree ) ) ]
+      chain = FilterChain.new( "test", filters.flatten )
+      assert( chain.filter( map ) )
+      assert_equal( exp_extract, map.extract && map.extract.to_s,
+                    "summary: #{summary}\n" +
+                    "content: #{content}" )
+    end
+  end
+end

data/test/test_factory_helper.rb ADDED Viewed

@@ -0,0 +1,51 @@
+#!/usr/bin/env jruby
+# -*- coding: utf-8 -*-
+#.hashdot.profile += jruby-shortlived
+#--
+# Copyright (c) 2010-2011 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+require File.join( File.dirname( __FILE__ ), "setup" )
+require 'iudex-html'
+RJack::Logback.config_console( :stderr => true, :level => RJack::Logback::WARN  )
+require 'iudex-html/factory_helper'
+require 'iudex-filter/filter_chain_factory'
+class TestFactoryHelper < MiniTest::Unit::TestCase
+  include HTMLTestHelper
+  class TestFilterChainFactory < Iudex::Filter::Core::FilterChainFactory
+    include Iudex::HTML::Filters::FactoryHelper
+    def filters
+     [ html_clean_filters( :title, :title_tree ), # _tree optional arg
+       html_clean_filters( :summary ),            # implied :summary_tree
+        html_write_filter( :summary ) ].flatten
+    end
+  end
+  def test
+    fcf = TestFilterChainFactory.new( "test" )
+    fcf.open
+    assert( fcf.open? )
+    fcf.close
+  end
+end

data/test/test_html_parser.rb ADDED Viewed

@@ -0,0 +1,128 @@
+#!/usr/bin/env jruby
+# -*- coding: utf-8 -*-
+#.hashdot.profile += jruby-shortlived
+#--
+# Copyright (c) 2010-2011 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+require File.join( File.dirname( __FILE__ ), "setup" )
+class TestHTMLParser < MiniTest::Unit::TestCase
+  include HTMLTestHelper
+  HTML_META = <<HTML
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+  <title>Iūdex</title>
+ </head>
+ <body>
+  <p>Iūdex test.</p>
+ </body>
+</html>
+HTML
+  def test_charset_same
+    assert_doc( HTML_META, parse( HTML_META, "UTF-8" ) )
+  end
+  def test_charset_rerun
+    assert_doc( HTML_META, parse( HTML_META, "ISO-8859-1" ) )
+  end
+  def test_charset_bogus
+    alt = HTML_META.sub( /utf-8/, 'bogus' )
+    assert_doc( alt, parse( alt, "UTF-8" ) )
+  end
+  HTML_SKIP_TAGS = <<HTML
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+  <style>style me</style>
+ </head>
+ <body>
+  <unknown_empty/>
+  <p>normal text.</p>
+  <not_empty><p>foo</p><br/></not_empty>
+  <nostyle><p>foo</p><br/></nostyle>
+ </body>
+</html>
+HTML
+  HTML_SKIP_TAGS_SKIPPED = <<HTML
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head/>
+ <body>
+  <p>normal text.</p>
+ </body>
+</html>
+HTML
+  def test_skip_tags
+    assert_doc( HTML_SKIP_TAGS_SKIPPED, parse( HTML_SKIP_TAGS, "ISO-8859-1" ) )
+  end
+  HTML_OUTSIDE = <<HTML
+before
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head/>
+ <body>
+  <p>normal text.</p>
+ </body>
+</html>
+after
+HTML
+  HTML_INSIDE = <<HTML
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head/>
+ <body>before
+  <p>normal text.</p>after</body>
+</html>
+HTML
+  def test_outer_text
+    assert_doc( HTML_INSIDE, parse( HTML_OUTSIDE, "ISO-8859-1" ) )
+  end
+  HTML_FRAG = {
+    :in  =>      "one<p>two</p><br/> three",
+    :out => "<div>one<p>two</p><br/> three</div>" }
+  def test_parse_fragment
+    tree = parseFragment( HTML_FRAG[ :in ] )
+    assert_fragment( HTML_FRAG[ :out ], tree )
+  end
+  HTML_CDATA = {
+    :in  => "<p><![CDATA[two]]></p>",
+    :out => "<p/>" }
+  # By default (incl HTML browsers) CDATA sections are dropped.
+  def test_cdata
+    tree = parseFragment( HTML_CDATA[ :in ] )
+    assert_fragment( HTML_CDATA[ :out ], tree )
+  end
+  # Neko doesn't ban/reorder blocks in inline elements.
+  def test_inline_nest
+    html = { :in  => "<div><i>begin <p>block</p> end.</i></div>",
+             :out => "<div><i>begin <p>block</p> end.</i></div>" }
+    tree = parseFragment( html[ :in ] )
+    assert_fragment( html[ :out ], tree )
+  end
+end

data/test/test_other_filters.rb ADDED Viewed

@@ -0,0 +1,51 @@
+#!/usr/bin/env jruby
+# -*- coding: utf-8 -*-
+#.hashdot.profile += jruby-shortlived
+#--
+# Copyright (c) 2010-2011 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+require File.join( File.dirname( __FILE__ ), "setup" )
+require 'iudex-html'
+class TestOtherFilters < MiniTest::Unit::TestCase
+  include HTMLTestHelper
+  include Iudex::Core
+  include Iudex::HTML
+  include Iudex::HTML::Filters
+  def test_title_extractor
+    html = <<HTML
+<html>
+ <head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+  <title>I&#363;dex</title>
+  <style>style</style>
+ </head>
+ <body>
+  <p>Iūdex test.</p>
+ </body>
+</html>
+HTML
+    map = content( html )
+    chain = filter_chain( TitleExtractor.new )
+    assert( chain.filter( map ) )
+    assert_equal( 'Iūdex', map.title.to_s )
+  end
+end

data/test/test_other_tree_filters.rb ADDED Viewed

@@ -0,0 +1,124 @@
+#!/usr/bin/env jruby
+#.hashdot.profile += jruby-shortlived
+#--
+# Copyright (c) 2010-2011 David Kellum
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you
+# may not use this file except in compliance with the License.  You
+# may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.  See the License for the specific language governing
+# permissions and limitations under the License.
+#++
+require File.join( File.dirname( __FILE__ ), "setup" )
+class TestOtherTreeFilters < MiniTest::Unit::TestCase
+  include HTMLTestHelper
+  include Iudex::HTML::Tree
+  include Iudex::HTML::Tree::Filters
+  def test_non_html_atts_dropped
+    # Bogus is dropped already by parser
+    html = {}
+    html[ :in ] = <<HTML
+<div bogus="not html">
+ <p>test.</p>
+</div>
+HTML
+    html[ :out ] = cut_atts( html[ :in ], 'bogus' )
+    assert_transform( html ) #identity
+  end
+  def test_attribute_cleaner
+    html = {}
+    html[ :in ] = <<HTML
+<div style="font:big">
+ <a href=".." style="drop" rel="foo">link text</a>
+ <img src=".." alt="foo" height="33" width="44" align="left"/>
+</div>
+HTML
+    html[ :out ] = cut_atts( html[ :in ], 'style', 'align' )
+    assert_transform( html, AttributeCleaner.new )
+  end
+  def test_empty_inline_remover
+    hs = [ { :in  => "<div><b> keep </b></div>",
+             :out => "<div><b> keep </b></div>" },
+           { :in  => '<div><b><img src="keep"/></b></div>',
+             :out => '<div><b><img src="keep"/></b></div>' },
+           { :in  => "<div>first<span/></div>",
+             :out => "<div>first~~~~~~~</div>" },
+           { :in  => "<div>first<b> </b></div>",
+             :out => "<div>first~~~ ~~~~</div>" },
+           { :in  => "<div><b><span/></b>last</div>",
+             :out => "<div>~~~~~~~~~~~~~~last</div>" },
+           { :in  => "<div><b><span/> </b>last</div>",
+             :out => "<div>~~~~~~~~~~ ~~~~last</div>" },
+           { :in  => "<div><b> <br/> </b>last</div>",
+             :out => "<div>~~~ <br/> ~~~~last</div>" } ]
+    hs.each do |html|
+      assert_transform( html, EmptyInlineRemover.new )
+    end
+  end
+  def test_css_display_filter_pattern
+    f = CSSDisplayFilter.new
+    assert( f.has_display_none( 'display: none' ) )
+    assert( f.has_display_none( '{display: none}' ) ) #lenient
+    assert( f.has_display_none( 'other:foo; DISPLAY:NONE;' ) )
+    assert( ! f.has_display_none( 'display: block' ) )
+    assert( ! f.has_display_none( 'other-display: none' ) )
+    assert( ! f.has_display_none( 'display: nonetheless' ) )
+  end
+  def test_css_display_filter
+    html = {}
+    html[ :in ] = <<HTML
+<div>
+ <b>keep</b>
+ <div style="display:none"><b>drop</b> me</div>
+</div>
+HTML
+    html[ :out ] = <<HTML
+<div>
+ <b>keep</b>
+</div>
+HTML
+    assert_transform( html, CSSDisplayFilter.new )
+  end
+  def test_xmp_to_pre_converter
+    html = { :in  => "<div><xmp> <i>keep</i> </xmp></div>",
+             :out => "<div><pre> &lt;i>keep&lt;/i> </pre></div>" }
+    assert_transform( html, XmpToPreConverter.new )
+  end
+  def cut_atts( html, *atts )
+    atts.each do |att|
+      html = html.gsub( / #{att}="[^"]+"/, '' )
+    end
+    html
+  end
+end