iudex-html 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +24 -0
- data/README.rdoc +25 -0
- data/Rakefile +53 -0
- data/build/HTML.java.erb +91 -0
- data/build/attributes +82 -0
- data/build/java_generate.rb +139 -0
- data/build/tags +130 -0
- data/lib/iudex-html.rb +56 -0
- data/lib/iudex-html/base.rb +21 -0
- data/lib/iudex-html/factory_helper.rb +95 -0
- data/lib/iudex-html/iudex-html-1.0.0.jar +0 -0
- data/pom.xml +51 -0
- data/test/html_test_helper.rb +100 -0
- data/test/setup.rb +38 -0
- data/test/test_characters_normalizer.rb +81 -0
- data/test/test_extract_filter.rb +165 -0
- data/test/test_factory_helper.rb +51 -0
- data/test/test_html_parser.rb +128 -0
- data/test/test_other_filters.rb +51 -0
- data/test/test_other_tree_filters.rb +124 -0
- data/test/test_parse_filter.rb +72 -0
- data/test/test_tree_walker.rb +94 -0
- data/test/test_word_counters.rb +96 -0
- metadata +162 -0
@@ -0,0 +1,165 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
|
23
|
+
class TestExtractFilter < MiniTest::Unit::TestCase
|
24
|
+
include HTMLTestHelper
|
25
|
+
include Iudex::HTML::Filters
|
26
|
+
include Iudex::Filter::KeyHelper
|
27
|
+
|
28
|
+
include Iudex::HTML::Tree::Filters
|
29
|
+
Order = HTMLTreeFilter::Order
|
30
|
+
|
31
|
+
# Runs each single-fragment HTML case through a depth-first HTMLTreeFilter
# (CharactersNormalizer then WordCounter) followed by an ExtractFilter over
# the source tree, and compares map.extract (as a String) with the expected
# value. An expected value of nil means no extract should be set.
def test_single_tree
  cases = [
    [ nil, nil ],
    [ '<div></div>', nil ],
    [ '<div>too short</div>', nil ],
    [ <<-'HTML', nil ],
<div>
<p>too short</p>
</div>
HTML
    [ <<-'HTML', nil ],
<div>a1 a2<br/>
a3 a4</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>a1 a2 a3 a4</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>
<p>a1 a2 a3 a4</p>
</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>a1 a2 a3 a4<br/>
not part of extract</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4" ],
<div>not<br/>
a1 a2 a3 a4
</div>
HTML
    [ <<-'HTML', "A more substantive paragraph." ],
<div>
<p>Short junk</p>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
<p>A similarly <i>substantive </i>paragraph.</p>
</div>
HTML
    [ <<-'HTML', "A more substantive paragraph." ],
<i>
<p>Short junk</p>
<i>
<hr/>
<p>A more <i>substantive </i>paragraph.</p>
</i>
<p>A similarly <i>substantive </i>paragraph.</p>
</i>
HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<div>
<p>a1 a2 a3 a4 a5 a6 a7 a8</p>
a1 a2 a3 a4 a5 a6 a7 a8 a9
</div>
</div>
HTML
    [ <<-'HTML', "a1 a2 a3 a4 a5 a6 a7 a8" ],
<div>
<p>a1 a2 a3</p>
<p>a1 a2 a3 a4 a5</p>
<p>a1 a2 a3 a4 a5 a6</p>
<p>a1 a2 a3 a4 a5 a6 a7</p>
<div>
a1 a2 a3 a4 a5 a6 a7 a8
<p>a1 a2 a3 a4 a5 a6 a7 a8 a9</p>
</div>
</div>
HTML
  ]

  cases.each do |markup, expected|
    # With no markup (or no parsed content) fall back to an empty UniMap.
    map = markup && content( markup )
    map ||= UniMap.new

    # A fresh set of filters is built for every case.
    tree_chain = TreeFilterChain.new( [ CharactersNormalizer.new,
                                        WordCounter.new ] )
    tree_filter = HTMLTreeFilter.new( :source_tree.to_k, tree_chain,
                                      Order::DEPTH_FIRST )
    extractor = ExtractFilter.new( [ :source_tree.to_k ] )
    chain = filter_chain( [ tree_filter, extractor ], :fragment )

    assert( chain.filter( map ) )

    extract = map.extract
    assert_equal( expected, extract && extract.to_s, "from:\n#{markup}" )
  end
end
|
127
|
+
|
128
|
+
# Sets summary and content on a fresh UniMap, runs html_clean_filters for
# both fields followed by an ExtractFilter over the summary and content tree
# keys, and compares map.extract (as a String) with the expected value.
# An expected value of nil means no extract should be set.
def test_multi_tree
  # Each row: summary markup, content markup, expected extract.
  cases = [
    [ nil, nil, nil ],
    [ "<div>too short</div>\n",
      "<div>a1 a2 a3 a4</div>\n",
      'a1 a2 a3 a4' ],
    [ "<div>a1 a2 a3 a4</div>\n",
      "<div>too short</div>\n",
      'a1 a2 a3 a4' ],
    [ "<div>a1 a2 a3 a4 a5 a6 a7 a8</div>\n",
      "<div>a1 a2 a3 a4 a5 a6 a7 a8 a9</div>\n",
      'a1 a2 a3 a4 a5 a6 a7 a8' ]
  ]

  cases.each do |summary, body, expected|
    map = UniMap.new
    map.summary = summary
    map.content = body

    stages = []
    stages << html_clean_filters( :summary )
    stages << html_clean_filters( :content )
    stages << ExtractFilter.new( keys( :summary_tree, :content_tree ) )

    chain = FilterChain.new( "test", stages.flatten )
    assert( chain.filter( map ) )

    extract = map.extract
    assert_equal( expected, extract && extract.to_s,
                  "summary: #{summary}\ncontent: #{body}" )
  end

end
|
164
|
+
|
165
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
|
23
|
+
require 'iudex-html'
|
24
|
+
|
25
|
+
RJack::Logback.config_console( :stderr => true, :level => RJack::Logback::WARN )
|
26
|
+
|
27
|
+
require 'iudex-html/factory_helper'
|
28
|
+
|
29
|
+
require 'iudex-filter/filter_chain_factory'
|
30
|
+
|
31
|
+
# Checks that a FilterChainFactory whose filters are built with the
# Iudex::HTML::Filters::FactoryHelper methods can be opened and closed.
class TestFactoryHelper < MiniTest::Unit::TestCase
  include HTMLTestHelper

  # Factory under test: its filter list is assembled purely from the
  # FactoryHelper methods.
  class TestFilterChainFactory < Iudex::Filter::Core::FilterChainFactory
    include Iudex::HTML::Filters::FactoryHelper

    # Returns the flattened list of filters for this factory.
    def filters
      [ html_clean_filters( :title, :title_tree ), # _tree optional arg
        html_clean_filters( :summary ),            # implied :summary_tree
        html_write_filter( :summary ) ].flatten
    end
  end

  # Opens the factory, asserts it reports open, and always closes it.
  def test
    fcf = TestFilterChainFactory.new( "test" )
    fcf.open
    begin
      assert( fcf.open? )
    ensure
      # Close even when the assertion fails so the factory is not left open.
      fcf.close
    end
  end

end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
|
23
|
+
class TestHTMLParser < MiniTest::Unit::TestCase
|
24
|
+
include HTMLTestHelper
|
25
|
+
|
26
|
+
# XHTML document whose meta element declares charset=utf-8 and whose text
# contains a non-ASCII character.
HTML_META = <<HTML
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Iūdex</title>
</head>
<body>
<p>Iūdex test.</p>
</body>
</html>
HTML

# Parsing with the same charset the document declares yields the document.
def test_charset_same
  parsed = parse( HTML_META, "UTF-8" )
  assert_doc( HTML_META, parsed )
end
|
41
|
+
|
42
|
+
# Parsing HTML_META with ISO-8859-1 (the document's meta element declares
# utf-8) must still yield the original document.
# NOTE(review): the name suggests the parser re-runs using the declared
# charset; confirm against the parser implementation.
def test_charset_rerun
  parsed = parse( HTML_META, "ISO-8859-1" )
  assert_doc( HTML_META, parsed )
end
|
45
|
+
|
46
|
+
# Replaces the first "utf-8" in HTML_META with "bogus" and checks that
# parsing the result as UTF-8 still yields that altered document.
def test_charset_bogus
  bogus_doc = HTML_META.sub( /utf-8/, 'bogus' )
  parsed = parse( bogus_doc, "UTF-8" )
  assert_doc( bogus_doc, parsed )
end
|
50
|
+
|
51
|
+
# Input containing a style element and several unrecognized elements.
HTML_SKIP_TAGS = <<HTML
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<style>style me</style>
</head>
<body>
<unknown_empty/>
<p>normal text.</p>
<not_empty><p>foo</p><br/></not_empty>
<nostyle><p>foo</p><br/></nostyle>
</body>
</html>
HTML

# Expected parse of HTML_SKIP_TAGS: only the normal paragraph remains.
HTML_SKIP_TAGS_SKIPPED = <<HTML
<html xmlns="http://www.w3.org/1999/xhtml">
<head/>
<body>
<p>normal text.</p>
</body>
</html>
HTML

# Parsing HTML_SKIP_TAGS must produce HTML_SKIP_TAGS_SKIPPED.
def test_skip_tags
  parsed = parse( HTML_SKIP_TAGS, "ISO-8859-1" )
  assert_doc( HTML_SKIP_TAGS_SKIPPED, parsed )
end
|
77
|
+
|
78
|
+
# Input with text placed before and after the html element.
HTML_OUTSIDE = <<HTML
before
<html xmlns="http://www.w3.org/1999/xhtml">
<head/>
<body>
<p>normal text.</p>
</body>
</html>
after
HTML

# Expected parse of HTML_OUTSIDE: the outer text appears inside body.
HTML_INSIDE = <<HTML
<html xmlns="http://www.w3.org/1999/xhtml">
<head/>
<body>before
<p>normal text.</p>after</body>
</html>
HTML

# Parsing HTML_OUTSIDE must produce HTML_INSIDE.
def test_outer_text
  parsed = parse( HTML_OUTSIDE, "ISO-8859-1" )
  assert_doc( HTML_INSIDE, parsed )
end
|
100
|
+
|
101
|
+
# Fragment input and its expected parse, which is wrapped in a div.
HTML_FRAG = {
  :in  => "one<p>two</p><br/> three",
  :out => "<div>one<p>two</p><br/> three</div>"
}

# parseFragment of HTML_FRAG[:in] must match HTML_FRAG[:out].
def test_parse_fragment
  input, expected = HTML_FRAG.values_at( :in, :out )
  fragment = parseFragment( input )
  assert_fragment( expected, fragment )
end
|
109
|
+
|
110
|
+
# By default (incl HTML browsers) CDATA sections are dropped.
HTML_CDATA = {
  :in  => "<p><![CDATA[two]]></p>",
  :out => "<p/>"
}

# parseFragment of HTML_CDATA[:in] must match HTML_CDATA[:out].
def test_cdata
  input, expected = HTML_CDATA.values_at( :in, :out )
  fragment = parseFragment( input )
  assert_fragment( expected, fragment )
end
|
119
|
+
|
120
|
+
# Neko doesn't ban/reorder blocks in inline elements.
# A block element (p) nested in an inline element (i) must parse unchanged.
def test_inline_nest
  input    = "<div><i>begin <p>block</p> end.</i></div>"
  expected = "<div><i>begin <p>block</p> end.</i></div>"

  fragment = parseFragment( input )
  assert_fragment( expected, fragment )
end
|
127
|
+
|
128
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
require 'iudex-html'
|
23
|
+
|
24
|
+
# Tests for HTML filters that do not operate on a parsed tree.
class TestOtherFilters < MiniTest::Unit::TestCase
  include HTMLTestHelper

  include Iudex::Core
  include Iudex::HTML
  include Iudex::HTML::Filters

  # TitleExtractor must set map.title to the text of the title element.
  def test_title_extractor
    page = <<HTML
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Iūdex</title>
<style>style</style>
</head>
<body>
<p>Iūdex test.</p>
</body>
</html>
HTML

    map = content( page )
    extractor_chain = filter_chain( TitleExtractor.new )

    passed = extractor_chain.filter( map )
    assert( passed )
    assert_equal( 'Iūdex', map.title.to_s )
  end

end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
class TestOtherTreeFilters < MiniTest::Unit::TestCase
|
23
|
+
include HTMLTestHelper
|
24
|
+
include Iudex::HTML::Tree
|
25
|
+
include Iudex::HTML::Tree::Filters
|
26
|
+
|
27
|
+
# With no tree filters applied, the non-HTML "bogus" attribute must already
# be absent from the output.
def test_non_html_atts_dropped
  # Bogus is dropped already by parser
  input = <<HTML
<div bogus="not html">
<p>test.</p>
</div>
HTML

  html = { :in  => input,
           :out => cut_atts( input, 'bogus' ) }

  assert_transform( html ) #identity
end
|
39
|
+
|
40
|
+
# AttributeCleaner must remove the style and align attributes while leaving
# the other attributes in this sample in place.
def test_attribute_cleaner
  input = <<HTML
<div style="font:big">
<a href=".." style="drop" rel="foo">link text</a>
<img src=".." alt="foo" height="33" width="44" align="left"/>
</div>
HTML

  html = { :in  => input,
           :out => cut_atts( input, 'style', 'align' ) }

  assert_transform( html, AttributeCleaner.new )
end
|
53
|
+
|
54
|
+
# Checks EmptyInlineRemover against input/expected pairs, using a new filter
# instance for every pair.
# NOTE(review): the "~" runs in the expected strings presumably mark the
# removed markup and are handled by assert_transform; confirm in
# HTMLTestHelper.
def test_empty_inline_remover

  # Each row: input markup, expected output.
  pairs = [
    [ "<div><b> keep </b></div>",
      "<div><b> keep </b></div>" ],

    [ '<div><b><img src="keep"/></b></div>',
      '<div><b><img src="keep"/></b></div>' ],

    [ "<div>first<span/></div>",
      "<div>first~~~~~~~</div>" ],

    [ "<div>first<b> </b></div>",
      "<div>first~~~ ~~~~</div>" ],

    [ "<div><b><span/></b>last</div>",
      "<div>~~~~~~~~~~~~~~last</div>" ],

    [ "<div><b><span/> </b>last</div>",
      "<div>~~~~~~~~~~ ~~~~last</div>" ],

    [ "<div><b> <br/> </b>last</div>",
      "<div>~~~ <br/> ~~~~last</div>" ]
  ]

  pairs.each do |input, expected|
    html = { :in => input, :out => expected }
    assert_transform( html, EmptyInlineRemover.new )
  end

end
|
82
|
+
|
83
|
+
# Checks which style strings CSSDisplayFilter#has_display_none accepts and
# which it rejects.
def test_css_display_filter_pattern
  f = CSSDisplayFilter.new

  matching = [ 'display: none',
               '{display: none}', #lenient
               'other:foo; DISPLAY:NONE;' ]

  rejected = [ 'display: block',
               'other-display: none',
               'display: nonetheless' ]

  matching.each do |style|
    assert( f.has_display_none( style ) )
  end

  rejected.each do |style|
    assert( ! f.has_display_none( style ) )
  end
end
|
93
|
+
|
94
|
+
# CSSDisplayFilter must remove the element styled display:none together
# with its contents, keeping the rest.
def test_css_display_filter
  input = <<HTML
<div>
<b>keep</b>
<div style="display:none"><b>drop</b> me</div>
</div>
HTML
  expected = <<HTML
<div>
<b>keep</b>
</div>
HTML

  html = { :in => input, :out => expected }
  assert_transform( html, CSSDisplayFilter.new )
end
|
109
|
+
|
110
|
+
# XmpToPreConverter must turn an xmp element into a pre element, leaving
# its contents as they are.
def test_xmp_to_pre_converter
  input    = "<div><xmp> <i>keep</i> </xmp></div>"
  expected = "<div><pre> <i>keep</i> </pre></div>"

  html = { :in => input, :out => expected }
  assert_transform( html, XmpToPreConverter.new )
end
|
116
|
+
|
117
|
+
# Returns html with every occurrence of the named attributes removed.
#
# An occurrence is a single leading space followed by name="value"; the
# value may be empty. Attribute names are matched literally, so regular
# expression metacharacters in a name (e.g. ".") have no special meaning.
#
# html:: markup String to strip (not modified)
# atts:: attribute names (String or Symbol) to remove
#
# With no attribute names, html itself is returned.
def cut_atts( html, *atts )
  atts.inject( html ) do |markup, att|
    markup.gsub( / #{Regexp.escape( att.to_s )}="[^"]*"/, '' )
  end
end
|
123
|
+
|
124
|
+
end
|