iudex-html 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestExtractFilter < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+ include Iudex::HTML::Filters
26
+ include Iudex::Filter::KeyHelper
27
+
28
+ include Iudex::HTML::Tree::Filters
29
+ Order = HTMLTreeFilter::Order
30
+
31
# Runs a word-counting tree filter followed by ExtractFilter over a single
# parsed fragment (:source_tree) and checks the resulting map.extract.
# Each case is a pair [ html, expected ]: html may be nil (an empty UniMap
# is filtered instead) and expected is nil when no extract should be set.
def test_single_tree
  cases = []

  # No content, empty content or only short text: no extract expected.
  cases << [ nil, nil ]
  cases << [ '<div></div>', nil ]
  cases << [ '<div>too short</div>', nil ]
  cases << [ <<-'FRAG', nil ]
<div>
<p>too short</p>
</div>
FRAG

  # The same four words split by a <br/>: no extract expected.
  cases << [ <<-'FRAG', nil ]
<div>a1 a2<br/>
a3 a4</div>
FRAG

  # Four consecutive words, directly in the div or in a nested paragraph.
  cases << [ <<-'FRAG', "a1 a2 a3 a4" ]
<div>a1 a2 a3 a4</div>
FRAG
  cases << [ <<-'FRAG', "a1 a2 a3 a4" ]
<div>
<p>a1 a2 a3 a4</p>
</div>
FRAG

  # Text on the other side of a <br/> is not included in the extract.
  cases << [ <<-'FRAG', "a1 a2 a3 a4" ]
<div>a1 a2 a3 a4<br/>
not part of extract</div>
FRAG
  cases << [ <<-'FRAG', "a1 a2 a3 a4" ]
<div>not<br/>
a1 a2 a3 a4
</div>
FRAG

  # Inline markup inside the chosen paragraph is reduced to plain text,
  # whether the container is a block (div) or an inline (i) element.
  cases << [ <<-'FRAG', "A more substantive paragraph." ]
<div>
<p>Short junk</p>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
<p>A similarly <i>substantive </i>paragraph.</p>
</div>
FRAG
  cases << [ <<-'FRAG', "A more substantive paragraph." ]
<i>
<p>Short junk</p>
<i>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
</i>
<p>A similarly <i>substantive </i>paragraph.</p>
</i>
FRAG

  # Paragraphs of growing length: the eight word text is the expected
  # extract in each arrangement, even though a nine word text follows.
  cases << [ <<-'FRAG', "a1 a2 a3 a4 a5 a6 a7 a8" ]
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
FRAG
  cases << [ <<-'FRAG', "a1 a2 a3 a4 a5 a6 a7 a8" ]
<div>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<div>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
a1 a2 a3 a4 a5 a6 a7 a8 a9
</div>
</div>
FRAG
  cases << [ <<-'FRAG', "a1 a2 a3 a4 a5 a6 a7 a8" ]
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<div>
a1 a2 a3 a4 a5 a6 a7 a8
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
</div>
FRAG

  cases.each do | html, exp_extract |
    # nil html means "no content at all": filter an empty map instead.
    map = html && content( html )
    map ||= UniMap.new

    tree_filters = TreeFilterChain.new( [ CharactersNormalizer.new,
                                          WordCounter.new ] )
    tree_stage = HTMLTreeFilter.new( :source_tree.to_k, tree_filters,
                                     Order::DEPTH_FIRST )
    extract_stage = ExtractFilter.new( [ :source_tree.to_k ] )
    chain = filter_chain( [ tree_stage, extract_stage ], :fragment )

    assert( chain.filter( map ) )
    assert_equal( exp_extract, map.extract && map.extract.to_s,
                  "from:\n#{html}" )
  end
end
127
+
128
# Runs ExtractFilter over two trees (:summary_tree and :content_tree), each
# produced by html_clean_filters, and checks the resulting map.extract.
# Each case is [ summary html, content html, expected extract ].
def test_multi_tree
  cases = []

  # Nothing set on either key: no extract expected.
  cases << [ nil, nil, nil ]

  # Only one of the two sources has the four word text; it is expected
  # whichever key carries it.
  cases << [ <<-'SUMMARY', <<-'CONTENT', 'a1 a2 a3 a4' ]
<div>too short</div>
SUMMARY
<div>a1 a2 a3 a4</div>
CONTENT
  cases << [ <<-'SUMMARY', <<-'CONTENT', 'a1 a2 a3 a4' ]
<div>a1 a2 a3 a4</div>
SUMMARY
<div>too short</div>
CONTENT

  # Both sources qualify: the summary's eight word text is expected.
  cases << [ <<-'SUMMARY', <<-'CONTENT', 'a1 a2 a3 a4 a5 a6 a7 a8' ]
<div>a1 a2 a3 a4 a5 a6 a7 a8</div>
SUMMARY
<div>a1 a2 a3 a4 a5 a6 a7 a8 a9</div>
CONTENT

  cases.each do | summary_html, content_html, exp_extract |
    map = UniMap.new
    map.summary = summary_html
    map.content = content_html

    # html_clean_filters may return several filters per key, so flatten
    # everything into a single list before building the chain.
    stages = [ html_clean_filters( :summary ),
               html_clean_filters( :content ),
               ExtractFilter.new( keys( :summary_tree, :content_tree ) ) ].flatten

    chain = FilterChain.new( "test", stages )
    assert( chain.filter( map ) )
    assert_equal( exp_extract, map.extract && map.extract.to_s,
                  "summary: #{summary_html}\ncontent: #{content_html}" )
  end
end
164
+
165
+ end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ require 'iudex-html'
24
+
25
+ RJack::Logback.config_console( :stderr => true, :level => RJack::Logback::WARN )
26
+
27
+ require 'iudex-html/factory_helper'
28
+
29
+ require 'iudex-filter/filter_chain_factory'
30
+
31
+ class TestFactoryHelper < MiniTest::Unit::TestCase
32
+ include HTMLTestHelper
33
+
34
# FilterChainFactory subclass used only by this test: it mixes in
# FactoryHelper and builds its filter list from the helper's methods.
class TestFilterChainFactory < Iudex::Filter::Core::FilterChainFactory
  include Iudex::HTML::Filters::FactoryHelper

  # Returns the flattened list of filters for the chain: clean filters for
  # :title and :summary, followed by the write filter for :summary.
  def filters
    title_stage   = html_clean_filters( :title, :title_tree ) # _tree optional arg
    summary_stage = html_clean_filters( :summary )            # implied :summary_tree
    write_stage   = html_write_filter( :summary )

    [ title_stage, summary_stage, write_stage ].flatten
  end
end
43
+
44
# Builds the helper-based TestFilterChainFactory, opens it and checks that
# it reports itself as open.
def test
  fcf = TestFilterChainFactory.new( "test" )
  fcf.open
  begin
    assert( fcf.open? )
  ensure
    # Close even when the assertion above fails, so that a failing run
    # does not leave the opened factory behind.
    fcf.close
  end
end
50
+
51
+ end
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestHTMLParser < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+
26
+ HTML_META = <<HTML
27
+ <html xmlns="http://www.w3.org/1999/xhtml">
28
+ <head>
29
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
30
+ <title>Iūdex</title>
31
+ </head>
32
+ <body>
33
+ <p>Iūdex test.</p>
34
+ </body>
35
+ </html>
36
+ HTML
37
+
38
# Parses HTML_META with "UTF-8" passed as the charset and expects the
# parsed document to match HTML_META itself.
def test_charset_same
  doc = parse( HTML_META, "UTF-8" )
  assert_doc( HTML_META, doc )
end
41
+
42
# Parses HTML_META with "ISO-8859-1" passed as the charset and still expects
# the parsed document to match HTML_META.
# NOTE(review): the test name suggests the parser re-runs using the charset
# declared in the document's meta tag -- confirm against the parse helper.
def test_charset_rerun
  doc = parse( HTML_META, "ISO-8859-1" )
  assert_doc( HTML_META, doc )
end
45
+
46
# Replaces the first "utf-8" in HTML_META with "bogus", parses the result
# with "UTF-8" passed as the charset, and expects the parsed document to
# match the altered markup.
def test_charset_bogus
  bogus_doc = HTML_META.sub( /utf-8/, 'bogus' )
  parsed = parse( bogus_doc, "UTF-8" )
  assert_doc( bogus_doc, parsed )
end
50
+
51
+ HTML_SKIP_TAGS = <<HTML
52
+ <html xmlns="http://www.w3.org/1999/xhtml">
53
+ <head>
54
+ <style>style me</style>
55
+ </head>
56
+ <body>
57
+ <unknown_empty/>
58
+ <p>normal text.</p>
59
+ <not_empty><p>foo</p><br/></not_empty>
60
+ <nostyle><p>foo</p><br/></nostyle>
61
+ </body>
62
+ </html>
63
+ HTML
64
+
65
+ HTML_SKIP_TAGS_SKIPPED = <<HTML
66
+ <html xmlns="http://www.w3.org/1999/xhtml">
67
+ <head/>
68
+ <body>
69
+ <p>normal text.</p>
70
+ </body>
71
+ </html>
72
+ HTML
73
+
74
# Parses HTML_SKIP_TAGS and expects the result to match
# HTML_SKIP_TAGS_SKIPPED, the reduced form of the same document.
def test_skip_tags
  parsed = parse( HTML_SKIP_TAGS, "ISO-8859-1" )
  assert_doc( HTML_SKIP_TAGS_SKIPPED, parsed )
end
77
+
78
+ HTML_OUTSIDE = <<HTML
79
+ before
80
+ <html xmlns="http://www.w3.org/1999/xhtml">
81
+ <head/>
82
+ <body>
83
+ <p>normal text.</p>
84
+ </body>
85
+ </html>
86
+ after
87
+ HTML
88
+
89
+ HTML_INSIDE = <<HTML
90
+ <html xmlns="http://www.w3.org/1999/xhtml">
91
+ <head/>
92
+ <body>before
93
+ <p>normal text.</p>after</body>
94
+ </html>
95
+ HTML
96
+
97
# Parses HTML_OUTSIDE (text before and after the html element) and expects
# the result to match HTML_INSIDE, where that text sits inside the body.
def test_outer_text
  parsed = parse( HTML_OUTSIDE, "ISO-8859-1" )
  assert_doc( HTML_INSIDE, parsed )
end
100
+
101
+ HTML_FRAG = {
102
+ :in => "one<p>two</p><br/> three",
103
+ :out => "<div>one<p>two</p><br/> three</div>" }
104
+
105
# Parses the HTML_FRAG input as a fragment and expects the tree to match
# the HTML_FRAG output markup.
def test_parse_fragment
  input, expected = HTML_FRAG.values_at( :in, :out )
  assert_fragment( expected, parseFragment( input ) )
end
109
+
110
+ HTML_CDATA = {
111
+ :in => "<p><![CDATA[two]]></p>",
112
+ :out => "<p/>" }
113
+ # By default (incl HTML browsers) CDATA sections are dropped.
114
+
115
# Parses the HTML_CDATA input as a fragment and expects the tree to match
# the HTML_CDATA output markup.
def test_cdata
  parsed = parseFragment( HTML_CDATA[ :in ] )
  expected = HTML_CDATA[ :out ]
  assert_fragment( expected, parsed )
end
119
+
120
# Neko doesn't ban/reorder blocks in inline elements: a <p> nested inside
# an <i> is expected to come back from the fragment parser unchanged.
def test_inline_nest
  input    = "<div><i>begin <p>block</p> end.</i></div>"
  expected = "<div><i>begin <p>block</p> end.</i></div>"

  assert_fragment( expected, parseFragment( input ) )
end
127
+
128
+ end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+ require 'iudex-html'
23
+
24
+ class TestOtherFilters < MiniTest::Unit::TestCase
25
+ include HTMLTestHelper
26
+
27
+ include Iudex::Core
28
+ include Iudex::HTML
29
+ include Iudex::HTML::Filters
30
+
31
# Runs TitleExtractor over a document whose <title> spells the long u as a
# numeric character reference (&#363;) and expects map.title to hold the
# decoded text.
def test_title_extractor
  page = <<PAGE
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>I&#363;dex</title>
<style>style</style>
</head>
<body>
<p>Iūdex test.</p>
</body>
</html>
PAGE

  map = content( page )
  extractor_chain = filter_chain( TitleExtractor.new )

  assert( extractor_chain.filter( map ) )
  assert_equal( 'Iūdex', map.title.to_s )
end
50
+
51
+ end
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ class TestOtherTreeFilters < MiniTest::Unit::TestCase
23
+ include HTMLTestHelper
24
+ include Iudex::HTML::Tree
25
+ include Iudex::HTML::Tree::Filters
26
+
27
# Bogus is dropped already by parser: with no tree filter given, the
# expected output is simply the input minus its bogus attribute.
def test_non_html_atts_dropped
  markup = <<MARKUP
<div bogus="not html">
<p>test.</p>
</div>
MARKUP
  expected = cut_atts( markup, 'bogus' )

  assert_transform( { :in => markup, :out => expected } ) #identity
end
39
+
40
# Expects AttributeCleaner to yield the input with its style and align
# attributes removed; the expected markup is derived from the input with
# cut_atts, so all other attributes are expected to remain.
def test_attribute_cleaner
  markup = <<MARKUP
<div style="font:big">
<a href=".." style="drop" rel="foo">link text</a>
<img src=".." alt="foo" height="33" width="44" align="left"/>
</div>
MARKUP
  expected = cut_atts( markup, 'style', 'align' )

  assert_transform( { :in => markup, :out => expected }, AttributeCleaner.new )
end
53
+
54
# Checks EmptyInlineRemover against input/expected pairs. In the expected
# markup a run of '~' stands in, character for character, for the tags that
# were present in the input but are expected to be gone from the output.
# NOTE(review): how '~' is interpreted is up to assert_transform -- confirm.
def test_empty_inline_remover
  pairs = [
    # Inline elements with text or an image inside are kept.
    [ "<div><b> keep </b></div>",
      "<div><b> keep </b></div>" ],
    [ '<div><b><img src="keep"/></b></div>',
      '<div><b><img src="keep"/></b></div>' ],

    # Inline elements with no content, or only whitespace, are removed.
    [ "<div>first<span/></div>",
      "<div>first~~~~~~~</div>" ],
    [ "<div>first<b> </b></div>",
      "<div>first~~~ ~~~~</div>" ],
    [ "<div><b><span/></b>last</div>",
      "<div>~~~~~~~~~~~~~~last</div>" ],
    [ "<div><b><span/> </b>last</div>",
      "<div>~~~~~~~~~~ ~~~~last</div>" ],

    # The <b> goes but the <br/> it wrapped stays.
    [ "<div><b> <br/> </b>last</div>",
      "<div>~~~ <br/> ~~~~last</div>" ]
  ]

  pairs.each do | input, expected |
    assert_transform( { :in => input, :out => expected },
                      EmptyInlineRemover.new )
  end
end
82
+
83
# Exercises CSSDisplayFilter#has_display_none directly on style strings:
# first those that must be recognised, then near misses that must not be.
def test_css_display_filter_pattern
  filter = CSSDisplayFilter.new

  [ 'display: none',
    '{display: none}', #lenient
    'other:foo; DISPLAY:NONE;' ].each do |style|
    assert( filter.has_display_none( style ) )
  end

  [ 'display: block',
    'other-display: none',
    'display: nonetheless' ].each do |style|
    assert( ! filter.has_display_none( style ) )
  end
end
93
+
94
# Expects CSSDisplayFilter to drop the element styled "display:none"
# together with everything inside it, keeping the rest of the tree.
def test_css_display_filter
  html = { :in => <<BEFORE, :out => <<AFTER }
<div>
<b>keep</b>
<div style="display:none"><b>drop</b> me</div>
</div>
BEFORE
<div>
<b>keep</b>
</div>
AFTER

  assert_transform( html, CSSDisplayFilter.new )
end
109
+
110
# Expects XmpToPreConverter to turn an <xmp> element into <pre>, with the
# markup that was inside the <xmp> showing up as escaped text.
def test_xmp_to_pre_converter
  input    = "<div><xmp> <i>keep</i> </xmp></div>"
  expected = "<div><pre> &lt;i>keep&lt;/i> </pre></div>"

  assert_transform( { :in => input, :out => expected }, XmpToPreConverter.new )
end
116
+
117
# Returns a copy of html with every occurrence of ` name="value"` (leading
# space included) removed, for each attribute name given in atts. The html
# argument itself is not modified.
#
# Names are matched literally: regexp metacharacters in a name are escaped
# rather than interpreted. Attributes with an empty value (name="") are
# removed as well.
def cut_atts( html, *atts )
  atts.inject( html ) do |markup, att|
    markup.gsub( / #{Regexp.escape( att.to_s )}="[^"]*"/, '' )
  end
end
123
+
124
+ end