iudex-html 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,165 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestExtractFilter < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+ include Iudex::HTML::Filters
26
+ include Iudex::Filter::KeyHelper
27
+
28
+ include Iudex::HTML::Tree::Filters
29
+ Order = HTMLTreeFilter::Order
30
+
31
# Feeds single HTML fragments through an HTMLTreeFilter (CharactersNormalizer
# and WordCounter, depth first) followed by an ExtractFilter on :source_tree,
# then compares map.extract (as a String) with the expected text. Rows whose
# expectation is nil require that no extract is present; a nil fragment is run
# against an empty UniMap.
def test_single_tree
  cases = [
    [ nil, nil ],
    [ '<div></div>', nil ],
    [ '<div>too short</div>', nil ],
    [ <<-'HTML', nil ],
<div>
<p>too short</p>
</div>
    HTML
    [ <<-'HTML', nil ],
<div>a1 a2<br/>
a3 a4</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>a1 a2 a3 a4</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>
<p>a1 a2 a3 a4</p>
</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>a1 a2 a3 a4<br/>
not part of extract</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>not<br/>
a1 a2 a3 a4
</div>
    HTML
    [ <<-'HTML', "A more substantive paragraph." ],
<div>
<p>Short junk</p>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
<p>A similarly <i>substantive </i>paragraph.</p>
</div>
    HTML
    [ <<-'HTML', "A more substantive paragraph." ],
<i>
<p>Short junk</p>
<i>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
</i>
<p>A similarly <i>substantive </i>paragraph.</p>
</i>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<div>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
a1 a2 a3 a4 a5 a6 a7 a8 a9
</div>
</div>
    HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<div>
a1 a2 a3 a4 a5 a6 a7 a8
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
</div>
    HTML
  ]

  cases.each do | html, expected |
    # Fall back to an empty map when there is no fragment (or no content map).
    map = html && content( html )
    map ||= UniMap.new

    # A fresh filter stack is built for every case, as in the table order.
    tree_chain = TreeFilterChain.new( [ CharactersNormalizer.new,
                                        WordCounter.new ] )
    tree_filter = HTMLTreeFilter.new( :source_tree.to_k, tree_chain,
                                      Order::DEPTH_FIRST )
    extractor = ExtractFilter.new( [ :source_tree.to_k ] )
    chain = filter_chain( [ tree_filter, extractor ], :fragment )

    assert( chain.filter( map ) )

    actual = map.extract && map.extract.to_s
    assert_equal( expected, actual, "from:\n#{html}" )
  end
end
127
+
128
# Sets both summary and content on a UniMap, runs the html_clean_filters for
# each field followed by an ExtractFilter over :summary_tree and
# :content_tree, and compares map.extract (as a String) with the expectation.
# Each row is [ summary HTML, content HTML, expected extract ].
def test_multi_tree
  cases = [
    [ nil, nil, nil ],
    [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4' ],
<div>too short</div>
    HTML1
<div>a1 a2 a3 a4</div>
    HTML2
    [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4' ],
<div>a1 a2 a3 a4</div>
    HTML1
<div>too short</div>
    HTML2
    [ <<-'HTML1', <<-'HTML2', 'a1 a2 a3 a4 a5 a6 a7 a8' ],
<div>a1 a2 a3 a4 a5 a6 a7 a8</div>
    HTML1
<div>a1 a2 a3 a4 a5 a6 a7 a8 a9</div>
    HTML2
  ]

  cases.each do | summary, body, expected |
    map = UniMap.new
    map.summary = summary
    map.content = body

    # Collected in order, then flattened into a single filter list.
    filters = []
    filters << html_clean_filters( :summary )
    filters << html_clean_filters( :content )
    filters << ExtractFilter.new( keys( :summary_tree, :content_tree ) )

    chain = FilterChain.new( "test", filters.flatten )
    assert( chain.filter( map ) )

    actual = map.extract && map.extract.to_s
    assert_equal( expected, actual,
                  "summary: #{summary}\ncontent: #{body}" )
  end
end
164
+
165
+ end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ require 'iudex-html'
24
+
25
+ RJack::Logback.config_console( :stderr => true, :level => RJack::Logback::WARN )
26
+
27
+ require 'iudex-html/factory_helper'
28
+
29
+ require 'iudex-filter/filter_chain_factory'
30
+
31
+ class TestFactoryHelper < MiniTest::Unit::TestCase
32
+ include HTMLTestHelper
33
+
34
# FilterChainFactory subclass used only by this test; mixes in FactoryHelper
# and supplies a filter list built from its helper methods.
class TestFilterChainFactory < Iudex::Filter::Core::FilterChainFactory
  include Iudex::HTML::Filters::FactoryHelper

  # Returns the flattened list of filters produced by the FactoryHelper
  # methods for the :title and :summary fields.
  def filters
    list = []
    list << html_clean_filters( :title, :title_tree ) # _tree optional arg
    list << html_clean_filters( :summary )            # implied :summary_tree
    list << html_write_filter( :summary )
    list.flatten
  end
end
43
+
44
# Opens a TestFilterChainFactory, asserts that it reports itself open, and
# closes it again. The close runs in an ensure clause so a failing assertion
# does not leave the factory open for the rest of the test run.
def test
  fcf = TestFilterChainFactory.new( "test" )
  fcf.open
  begin
    assert( fcf.open? )
  ensure
    fcf.close
  end
end
50
+
51
+ end
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestHTMLParser < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+
26
+ HTML_META = <<HTML
27
+ <html xmlns="http://www.w3.org/1999/xhtml">
28
+ <head>
29
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
30
+ <title>Iūdex</title>
31
+ </head>
32
+ <body>
33
+ <p>Iūdex test.</p>
34
+ </body>
35
+ </html>
36
+ HTML
37
+
38
# Parses HTML_META with "UTF-8" as the supplied charset and expects the
# resulting document to match HTML_META itself.
def test_charset_same
  doc = parse( HTML_META, "UTF-8" )
  assert_doc( HTML_META, doc )
end
41
+
42
# Parses HTML_META with "ISO-8859-1" as the supplied charset, although its
# meta tag declares utf-8, and still expects the document to match HTML_META.
# NOTE(review): the name suggests the parser re-runs with the declared
# charset; confirm against the parse helper.
def test_charset_rerun
  doc = parse( HTML_META, "ISO-8859-1" )
  assert_doc( HTML_META, doc )
end
45
+
46
# Replaces the first "utf-8" in HTML_META with "bogus", parses the result as
# "UTF-8", and expects the parsed document to match that altered input.
def test_charset_bogus
  bogus_html = HTML_META.sub( /utf-8/, 'bogus' )
  doc = parse( bogus_html, "UTF-8" )
  assert_doc( bogus_html, doc )
end
50
+
51
+ HTML_SKIP_TAGS = <<HTML
52
+ <html xmlns="http://www.w3.org/1999/xhtml">
53
+ <head>
54
+ <style>style me</style>
55
+ </head>
56
+ <body>
57
+ <unknown_empty/>
58
+ <p>normal text.</p>
59
+ <not_empty><p>foo</p><br/></not_empty>
60
+ <nostyle><p>foo</p><br/></nostyle>
61
+ </body>
62
+ </html>
63
+ HTML
64
+
65
+ HTML_SKIP_TAGS_SKIPPED = <<HTML
66
+ <html xmlns="http://www.w3.org/1999/xhtml">
67
+ <head/>
68
+ <body>
69
+ <p>normal text.</p>
70
+ </body>
71
+ </html>
72
+ HTML
73
+
74
# Parses HTML_SKIP_TAGS (supplied charset "ISO-8859-1") and expects the
# document to equal HTML_SKIP_TAGS_SKIPPED, in which only the normal
# paragraph remains inside the body.
def test_skip_tags
  doc = parse( HTML_SKIP_TAGS, "ISO-8859-1" )
  assert_doc( HTML_SKIP_TAGS_SKIPPED, doc )
end
77
+
78
+ HTML_OUTSIDE = <<HTML
79
+ before
80
+ <html xmlns="http://www.w3.org/1999/xhtml">
81
+ <head/>
82
+ <body>
83
+ <p>normal text.</p>
84
+ </body>
85
+ </html>
86
+ after
87
+ HTML
88
+
89
+ HTML_INSIDE = <<HTML
90
+ <html xmlns="http://www.w3.org/1999/xhtml">
91
+ <head/>
92
+ <body>before
93
+ <p>normal text.</p>after</body>
94
+ </html>
95
+ HTML
96
+
97
# Parses HTML_OUTSIDE, which has text before and after the html element, and
# expects the document to equal HTML_INSIDE, where that text sits in the body.
def test_outer_text
  doc = parse( HTML_OUTSIDE, "ISO-8859-1" )
  assert_doc( HTML_INSIDE, doc )
end
100
+
101
+ HTML_FRAG = {
102
+ :in => "one<p>two</p><br/> three",
103
+ :out => "<div>one<p>two</p><br/> three</div>" }
104
+
105
# Parses the HTML_FRAG input as a fragment and expects the tree to match the
# HTML_FRAG output, i.e. the same content wrapped in a div.
def test_parse_fragment
  assert_fragment( HTML_FRAG[ :out ], parseFragment( HTML_FRAG[ :in ] ) )
end
109
+
110
+ HTML_CDATA = {
111
+ :in => "<p><![CDATA[two]]></p>",
112
+ :out => "<p/>" }
113
+ # By default (incl HTML browsers) CDATA sections are dropped.
114
+
115
# Parses the HTML_CDATA input as a fragment and expects the HTML_CDATA
# output, an empty paragraph: the CDATA section does not survive parsing.
def test_cdata
  assert_fragment( HTML_CDATA[ :out ], parseFragment( HTML_CDATA[ :in ] ) )
end
119
+
120
# Neko doesn't ban/reorder blocks in inline elements: a paragraph nested in
# an <i> element is expected to come back from the fragment parser unchanged.
def test_inline_nest
  markup = "<div><i>begin <p>block</p> end.</i></div>"
  tree = parseFragment( markup )
  assert_fragment( markup, tree )
end
127
+
128
+ end
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+ require 'iudex-html'
23
+
24
+ class TestOtherFilters < MiniTest::Unit::TestCase
25
+ include HTMLTestHelper
26
+
27
+ include Iudex::Core
28
+ include Iudex::HTML
29
+ include Iudex::HTML::Filters
30
+
31
# Runs TitleExtractor over a page whose title is written with a numeric
# character reference (I&#363;dex) and expects map.title to read 'Iūdex'.
def test_title_extractor
  page = <<HTML
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>I&#363;dex</title>
<style>style</style>
</head>
<body>
<p>Iūdex test.</p>
</body>
</html>
HTML

  map = content( page )
  assert( filter_chain( TitleExtractor.new ).filter( map ) )
  assert_equal( 'Iūdex', map.title.to_s )
end
50
+
51
+ end
@@ -0,0 +1,124 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ class TestOtherTreeFilters < MiniTest::Unit::TestCase
23
+ include HTMLTestHelper
24
+ include Iudex::HTML::Tree
25
+ include Iudex::HTML::Tree::Filters
26
+
27
# Checks that a non-HTML attribute ("bogus") is already gone after parsing:
# assert_transform is called without any filter, and the expected output is
# the input with the bogus attribute cut.
def test_non_html_atts_dropped
  source = <<HTML
<div bogus="not html">
<p>test.</p>
</div>
HTML

  # No filter is passed: apart from the dropped attribute this is identity.
  html = { :in => source, :out => cut_atts( source, 'bogus' ) }
  assert_transform( html )
end
39
+
40
# Applies AttributeCleaner and expects the input with its style and align
# attributes removed; href, rel, src, alt, height and width are expected to
# remain.
def test_attribute_cleaner
  source = <<HTML
<div style="font:big">
<a href=".." style="drop" rel="foo">link text</a>
<img src=".." alt="foo" height="33" width="44" align="left"/>
</div>
HTML

  html = { :in => source, :out => cut_atts( source, 'style', 'align' ) }
  assert_transform( html, AttributeCleaner.new )
end
53
+
54
# Runs EmptyInlineRemover over a table of [ input, expected ] pairs. In the
# expected strings, runs of '~' sit where removed markup was in the input.
# NOTE(review): presumably assert_transform strips the '~' padding before
# comparing; confirm in HTMLTestHelper.
def test_empty_inline_remover
  pairs = [
    [ "<div><b> keep </b></div>",
      "<div><b> keep </b></div>" ],

    [ '<div><b><img src="keep"/></b></div>',
      '<div><b><img src="keep"/></b></div>' ],

    [ "<div>first<span/></div>",
      "<div>first~~~~~~~</div>" ],

    [ "<div>first<b> </b></div>",
      "<div>first~~~ ~~~~</div>" ],

    [ "<div><b><span/></b>last</div>",
      "<div>~~~~~~~~~~~~~~last</div>" ],

    [ "<div><b><span/> </b>last</div>",
      "<div>~~~~~~~~~~ ~~~~last</div>" ],

    [ "<div><b> <br/> </b>last</div>",
      "<div>~~~ <br/> ~~~~last</div>" ]
  ]

  pairs.each do | input, expected |
    html = { :in => input, :out => expected }
    assert_transform( html, EmptyInlineRemover.new )
  end
end
82
+
83
# Exercises CSSDisplayFilter#has_display_none directly against style strings
# that should and should not be recognised as "display: none".
def test_css_display_filter_pattern
  f = CSSDisplayFilter.new

  matching = [ 'display: none',
               '{display: none}', # lenient
               'other:foo; DISPLAY:NONE;' ]

  non_matching = [ 'display: block',
                   'other-display: none',
                   'display: nonetheless' ]

  matching.each do |style|
    assert( f.has_display_none( style ) )
  end

  non_matching.each do |style|
    assert( ! f.has_display_none( style ) )
  end
end
93
+
94
# Applies CSSDisplayFilter and expects the inner div styled "display:none"
# to be gone, together with its children, while the sibling <b> is kept.
def test_css_display_filter
  html = { :in => <<IN_HTML, :out => <<OUT_HTML }
<div>
<b>keep</b>
<div style="display:none"><b>drop</b> me</div>
</div>
IN_HTML
<div>
<b>keep</b>
</div>
OUT_HTML

  assert_transform( html, CSSDisplayFilter.new )
end
109
+
110
# Applies XmpToPreConverter and expects the <xmp> element to become <pre>,
# with the markup it contained appearing as escaped text (&lt;i>...).
def test_xmp_to_pre_converter
  assert_transform( { :in  => "<div><xmp> <i>keep</i> </xmp></div>",
                      :out => "<div><pre> &lt;i>keep&lt;/i> </pre></div>" },
                    XmpToPreConverter.new )
end
116
+
117
# Returns a copy of html with every ` name="value"` occurrence removed, for
# each attribute name given in atts. With no names, html is returned as is.
#
# The name is matched literally (regex metacharacters such as '.' are
# escaped) and the value may be empty, so ` alt=""` is removed as well.
# Only double-quoted values preceded by a single space are matched.
def cut_atts( html, *atts )
  atts.inject( html ) do |out, att|
    out.gsub( / #{Regexp.escape( att.to_s )}="[^"]*"/, '' )
  end
end
123
+
124
+ end