iudex-html 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ class TestParseFilter < MiniTest::Unit::TestCase
23
+ include HTMLTestHelper
24
+ include Gravitext::HTMap
25
+ include Iudex::Core
26
+ include Iudex::HTML
27
+ include Iudex::HTML::Filters
28
+
29
+ def setup
30
+ @filter = html_parse_filter( :title )
31
+ @filter.min_parse = 0
32
+ end
33
+
34
+ def test_marked
35
+ assert( ! marked?( "" ) )
36
+ assert( ! marked?( "simple" ) )
37
+ assert( ! marked?( "<simple" ) )
38
+ assert( ! marked?( "x < y" ) )
39
+ assert( ! marked?( "AT&T" ) )
40
+ assert( ! marked?( "AT & T;" ) )
41
+
42
+ assert( marked?( "<a>" ) )
43
+ assert( marked?( "Words &copy; 2010" ) )
44
+ assert( marked?( "&#xf43e;" ) )
45
+ assert( marked?( "&#2028;" ) )
46
+ assert( marked?( "<![CDATA[simple]]>" ) )
47
+ assert( marked?( "<![CDATA[simple" ) )
48
+ assert( marked?( "<!-- comment -->" ) )
49
+ end
50
+
51
+ def marked?( text )
52
+ @filter.text_marked( text )
53
+ end
54
+
55
+ def test_markup
56
+ tests = [ [ "simple", 0, "simple" ],
57
+ [ "<i>inner</i>", 1, nil ],
58
+ [ "&lt;i>inner&lt;/i>", 2, nil ],
59
+ [ "<!--ignore-->text", 1, "text" ],
60
+ [ "&lt;!--ignore-->text", 2, "text" ],
61
+ [ "&lt;", 1, "<" ],
62
+ [ "&amp;lt;", 2, "<" ] ]
63
+
64
+ tests.each do | input, count, out |
65
+ map = UniMap.new
66
+ map.title = input
67
+ assert_equal( count, @filter.parse_loop( map ), input )
68
+ assert_equal( out, map.title && map.title.to_s )
69
+ end
70
+ end
71
+
72
+ end
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ class TestTreeWalker < MiniTest::Unit::TestCase
23
+ include HTMLTestHelper
24
+
25
+ import 'iudex.html.tree.TreeFilter'
26
+ Action = TreeFilter::Action
27
+
28
+ DROP_HTML = {
29
+ :in => "<div>one<p>foo</p><br/> two</div>",
30
+ :out => "<div>one~~~~~~~~~~<br/> two</div>" }
31
+ # Note: ~~~ is padding that is removed before the comparison
32
+
33
+ def test_drop
34
+ filter = TagFilter.new( HTML::P, Action::DROP )
35
+ [ :walk_depth_first, :walk_breadth_first ].each do |order|
36
+ assert_transform( DROP_HTML, filter, order )
37
+ end
38
+ end
39
+
40
+ SKIP_HTML = {
41
+ :in => "<div>first<b>drop</b><span><b>not dropped</b></span></div>",
42
+ :out => "<div>first~~~~~~~~~~~<span><b>not dropped</b></span></div>" }
43
+
44
+ def test_skip
45
+ chain = TreeFilterChain.new( [ TagFilter.new( HTML::SPAN, Action::SKIP ),
46
+ TagFilter.new( HTML::B, Action::DROP ) ] )
47
+ assert_transform( SKIP_HTML, chain, :walk_breadth_first )
48
+ end
49
+
50
+ TERM_HTML = {
51
+ :in => "<div><span>first</span><b>term</b><span><b>not</b></span></div>",
52
+ :out => "<div>~~~~~~~~~~~~~~~~~~<b>term</b><span><b>not</b></span></div>" }
53
+
54
+ def test_terminate
55
+ chain = TreeFilterChain.new( [ TagFilter.new( HTML::B, Action::TERMINATE ),
56
+ TagFilter.new( HTML::SPAN, Action::DROP ) ] )
57
+ [ :walk_depth_first, :walk_breadth_first ].each do |order|
58
+ assert_equal( Action::TERMINATE,
59
+ assert_transform( TERM_HTML, chain, order ) )
60
+ end
61
+ end
62
+
63
+ FOLD_HTML = {
64
+ :in => "<div>first <b>drop</b> <span> remain <b>drop</b> </span> </div>",
65
+ :out => "<div>first ~~~~~~~~~~~ ~~~~~~ remain ~~~~~~~~~~~ ~~~~~~~ </div>" }
66
+
67
+ def test_fold
68
+ chain = TreeFilterChain.new( [ TagFilter.new( HTML::SPAN, Action::FOLD ),
69
+ TagFilter.new( HTML::B, Action::DROP ) ] )
70
+
71
+ [ :walk_breadth_first, :walk_depth_first ].each do |order|
72
+ assert_transform( FOLD_HTML, chain, order )
73
+ end
74
+ end
75
+
76
+ class TagFilter
77
+ include TreeFilter
78
+
79
+ def initialize( tag, action )
80
+ @tag = tag
81
+ @action = action
82
+ end
83
+
84
+ def filter( node )
85
+ elm = node.as_element
86
+ if( elm && elm.tag == @tag )
87
+ @action
88
+ else
89
+ Action::CONTINUE
90
+ end
91
+ end
92
+ end
93
+
94
+ end
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestWordCounters < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+ include Iudex::HTML::Tree
26
+ include Iudex::HTML::Tree::Filters
27
+
28
+ def test_counts
29
+ tset = [ [ "", 0, 0 ],
30
+ [ "<div><span> </span></div>", 0, 0 ],
31
+
32
+ [ "a b", 2, 2 ],
33
+ [ "<div>a b</div>", 2, 2 ],
34
+ [ "<div><span>a b</span></div>", 2, 2 ],
35
+ [ "<div><span>a b</span><span>c d</span></div>", 4, 4 ],
36
+
37
+ [ "<div>a b <span><a name='foo'>c</a> d</span> e f</div>", 6, 6 ],
38
+
39
+ [ "<div><a href='foo'>a b</a></div>",
40
+ 2, 2 * 0.25 ],
41
+
42
+ [ "<div><div>a b</div><div>c d</div></div>",
43
+ 4, ( 2*2 + 2*2 ) / 4.0 ],
44
+
45
+ [ "<div><div>a b</div><div>c d</div><div> </div></div>",
46
+ 4, ( 2*2 + 2*2 + 0*0 ) / 4.0 ],
47
+
48
+ [ "<div>a <div>a b</div><div>c d</div></div>",
49
+ 5, 1.0 + ( 2*2 + 2*2 ) / 5.0 ],
50
+
51
+ [ "<div>a <div>a b c</div><div>c d</div></div>",
52
+ 6, 1.0 + ( 3*3 + 2*2 ) / 6.0 ],
53
+
54
+ [ "<div><p>a b</p><p>c d</p></div><div>e f g</div>",
55
+ 7, ( 2*4 + 3*3 ) / 7.0 ] ]
56
+
57
+ chain = TreeFilterChain.new( [ WordCounter.new, WordyCounter.new ] )
58
+
59
+ tset.each do |html, word_count, wordiness|
60
+ tree = parse( html )
61
+ TreeWalker::walk_depth_first( chain, tree )
62
+
63
+ assert_equal( word_count,
64
+ tree.get( HTMLTreeKeys::WORD_COUNT ),
65
+ "word_count for: " + html )
66
+
67
+ assert_in_delta( wordiness,
68
+ tree.get( HTMLTreeKeys::WORDINESS ),
69
+ 1e-4,
70
+ " wordiness for: " + html )
71
+ end
72
+ end
73
+
74
+ def test_doc
75
+ html = <<HTML
76
+ <html xmlns="http://www.w3.org/1999/xhtml">
77
+ <head>
78
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
79
+ <title>Iūdex</title>
80
+ <style>style</style>
81
+ </head>
82
+ <body>
83
+ <p>Iūdex test.</p>
84
+ </body>
85
+ </html>
86
+ HTML
87
+ tree = parse( html, "UTF-8" )
88
+ chain = TreeFilterChain.new( [ MetaSkipFilter.new,
89
+ WordCounter.new,
90
+ WordyCounter.new ] )
91
+ TreeWalker::walk_depth_first( chain, tree )
92
+ assert_equal( 2, tree.get( HTMLTreeKeys::WORD_COUNT ) );
93
+ assert_equal( 2, tree.get( HTMLTreeKeys::WORDINESS ) );
94
+ end
95
+
96
+ end
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-html
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-core
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rjack-nekohtml
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.9.14
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: gravitext-xmlprod
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.4.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: minitest
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: 1.7.1
58
+ - - <
59
+ - !ruby/object:Gem::Version
60
+ version: "2.1"
61
+ type: :development
62
+ version_requirements: *id004
63
+ - !ruby/object:Gem::Dependency
64
+ name: rjack-logback
65
+ prerelease: false
66
+ requirement: &id005 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ~>
70
+ - !ruby/object:Gem::Version
71
+ version: "1.0"
72
+ type: :development
73
+ version_requirements: *id005
74
+ - !ruby/object:Gem::Dependency
75
+ name: rjack-tarpit
76
+ prerelease: false
77
+ requirement: &id006 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 1.3.0
83
+ type: :development
84
+ version_requirements: *id006
85
+ description: |-
86
+ Iudex is a general purpose web crawler and feed processor in
87
+ ruby/java. The iudex-html gem contains filters for HTML parsing,
88
+ filtering, extracting text and links.
89
+ email:
90
+ - dek-oss@gravitext.com
91
+ executables: []
92
+
93
+ extensions: []
94
+
95
+ extra_rdoc_files:
96
+ - Manifest.txt
97
+ - History.rdoc
98
+ - README.rdoc
99
+ files:
100
+ - History.rdoc
101
+ - Manifest.txt
102
+ - README.rdoc
103
+ - Rakefile
104
+ - pom.xml
105
+ - build/HTML.java.erb
106
+ - build/attributes
107
+ - build/java_generate.rb
108
+ - build/tags
109
+ - lib/iudex-html/base.rb
110
+ - lib/iudex-html.rb
111
+ - lib/iudex-html/factory_helper.rb
112
+ - test/html_test_helper.rb
113
+ - test/setup.rb
114
+ - test/test_characters_normalizer.rb
115
+ - test/test_extract_filter.rb
116
+ - test/test_factory_helper.rb
117
+ - test/test_html_parser.rb
118
+ - test/test_other_filters.rb
119
+ - test/test_other_tree_filters.rb
120
+ - test/test_parse_filter.rb
121
+ - test/test_tree_walker.rb
122
+ - test/test_word_counters.rb
123
+ - lib/iudex-html/iudex-html-1.0.0.jar
124
+ has_rdoc: true
125
+ homepage: http://github.com/dekellum/iudex
126
+ licenses: []
127
+
128
+ post_install_message:
129
+ rdoc_options:
130
+ - --main
131
+ - README.rdoc
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ none: false
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: "0"
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: "0"
146
+ requirements: []
147
+
148
+ rubyforge_project: iudex-html
149
+ rubygems_version: 1.5.1
150
+ signing_key:
151
+ specification_version: 3
152
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
153
+ test_files:
154
+ - test/test_factory_helper.rb
155
+ - test/test_other_filters.rb
156
+ - test/test_characters_normalizer.rb
157
+ - test/test_word_counters.rb
158
+ - test/test_extract_filter.rb
159
+ - test/test_tree_walker.rb
160
+ - test/test_other_tree_filters.rb
161
+ - test/test_html_parser.rb
162
+ - test/test_parse_filter.rb