iudex-html 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
# Tests for the HTML parse filter configured on the :title field:
# markup detection through text_marked, and parsing through parse_loop.
class TestParseFilter < MiniTest::Unit::TestCase
  include HTMLTestHelper
  include Gravitext::HTMap
  include Iudex::Core
  include Iudex::HTML
  include Iudex::HTML::Filters

  # Build a fresh filter for the :title key before every test.
  # min_parse is set to 0 here; presumably this lets even very short
  # inputs be considered for parsing -- confirm against the filter.
  def setup
    @filter = html_parse_filter( :title )
    @filter.min_parse = 0
  end

  # text_marked must answer false for plain text (including stray '<'
  # and '&' characters) and true for tags, entity or character
  # references, CDATA sections (even unterminated) and comments.
  # Each assertion names the offending input so a failure is
  # immediately traceable.
  def test_marked
    plain = [ "",
              "simple",
              "<simple",
              "x < y",
              "AT&T",
              "AT & T;" ]

    marked = [ "<a>",
               "Words &copy; 2010",
               "&#xf43e;",
               "&#2028;",
               "<![CDATA[simple]]>",
               "<![CDATA[simple",
               "<!-- comment -->" ]

    plain.each do |text|
      assert( ! marked?( text ), "should not be marked: #{text.inspect}" )
    end

    marked.each do |text|
      assert( marked?( text ), "should be marked: #{text.inspect}" )
    end
  end

  # Shorthand for asking the filter under test whether text is marked.
  def marked?( text )
    @filter.text_marked( text )
  end

  # Each row is [ input title, expected parse_loop return value,
  # expected title afterwards (nil when no title remains) ].
  def test_markup
    tests = [ [ "simple",               0, "simple" ],
              [ "<i>inner</i>",         1, nil ],
              [ "&lt;i>inner&lt;/i>",   2, nil ],
              [ "<!--ignore-->text",    1, "text" ],
              [ "&lt;!--ignore-->text", 2, "text" ],
              [ "&lt;",                 1, "<" ],
              [ "&amp;lt;",             2, "<" ] ]

    tests.each do | input, count, out |
      map = UniMap.new
      map.title = input
      assert_equal( count, @filter.parse_loop( map ), input )
      # Pass the input as the message here too, so a wrong resulting
      # title identifies which row failed.
      assert_equal( out, map.title && map.title.to_s, input )
    end
  end

end
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
# Exercises tree walking with filters that return each of the DROP,
# SKIP, TERMINATE and FOLD actions, comparing the transformed tree
# against an expected serialization.
class TestTreeWalker < MiniTest::Unit::TestCase
  include HTMLTestHelper

  import 'iudex.html.tree.TreeFilter'
  Action = TreeFilter::Action

  # In the :out strings below, runs of ~ are padding that is removed
  # in the compare; they only keep :in and :out visually aligned.

  DROP_HTML = {
    :in  => "<div>one<p>foo</p><br/> two</div>",
    :out => "<div>one~~~~~~~~~~<br/> two</div>" }

  # A <p> answered with DROP disappears in both walk orders.
  def test_drop
    p_dropper = TagFilter.new( HTML::P, Action::DROP )
    [ :walk_depth_first, :walk_breadth_first ].each { |walk|
      assert_transform( DROP_HTML, p_dropper, walk )
    }
  end

  SKIP_HTML = {
    :in  => "<div>first<b>drop</b><span><b>not dropped</b></span></div>",
    :out => "<div>first~~~~~~~~~~~<span><b>not dropped</b></span></div>" }

  # With <span> answered by SKIP, the <b> inside the span is expected
  # to survive while the <b> outside it is dropped (breadth first only).
  def test_skip
    span_skipper = TagFilter.new( HTML::SPAN, Action::SKIP )
    b_dropper    = TagFilter.new( HTML::B, Action::DROP )
    chain = TreeFilterChain.new( [ span_skipper, b_dropper ] )
    assert_transform( SKIP_HTML, chain, :walk_breadth_first )
  end

  TERM_HTML = {
    :in  => "<div><span>first</span><b>term</b><span><b>not</b></span></div>",
    :out => "<div>~~~~~~~~~~~~~~~~~~<b>term</b><span><b>not</b></span></div>" }

  # Only the <span> preceding the first <b> is expected to be dropped;
  # the value returned by assert_transform must be TERMINATE.
  def test_terminate
    b_terminator = TagFilter.new( HTML::B, Action::TERMINATE )
    span_dropper = TagFilter.new( HTML::SPAN, Action::DROP )
    chain = TreeFilterChain.new( [ b_terminator, span_dropper ] )
    [ :walk_depth_first, :walk_breadth_first ].each { |walk|
      result = assert_transform( TERM_HTML, chain, walk )
      assert_equal( Action::TERMINATE, result )
    }
  end

  FOLD_HTML = {
    :in  => "<div>first <b>drop</b> <span> remain <b>drop</b> </span> </div>",
    :out => "<div>first ~~~~~~~~~~~ ~~~~~~ remain ~~~~~~~~~~~ ~~~~~~~ </div>" }

  # With <span> answered by FOLD, the span tags vanish but its text
  # remains, while every <b> is dropped, in both walk orders.
  def test_fold
    span_folder = TagFilter.new( HTML::SPAN, Action::FOLD )
    b_dropper   = TagFilter.new( HTML::B, Action::DROP )
    chain = TreeFilterChain.new( [ span_folder, b_dropper ] )

    [ :walk_breadth_first, :walk_depth_first ].each { |walk|
      assert_transform( FOLD_HTML, chain, walk )
    }
  end

  # Minimal TreeFilter used by the tests: answers with a fixed action
  # for elements of one particular tag, and CONTINUE for anything else.
  class TagFilter
    include TreeFilter

    # tag::    the element tag to match
    # action:: the Action returned when a node matches
    def initialize( tag, action )
      @tag = tag
      @action = action
    end

    # Returns the configured action when node is an element carrying
    # the configured tag; otherwise Action::CONTINUE.
    def filter( node )
      element = node.as_element
      return Action::CONTINUE unless element && element.tag == @tag
      @action
    end
  end

end
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2010-2011 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
# Tests the WordCounter and WordyCounter tree filters by walking parsed
# HTML depth first and checking the WORD_COUNT and WORDINESS values
# found on the tree returned by parse.
class TestWordCounters < MiniTest::Unit::TestCase
  include HTMLTestHelper
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters

  # Each row is [ html, expected word count, expected wordiness ].
  # The wordiness expressions are left unevaluated as written by the
  # original author to show how each expected value was derived.
  def test_counts
    tset = [ [ "", 0, 0 ],
             [ "<div><span> </span></div>", 0, 0 ],

             [ "a b", 2, 2 ],
             [ "<div>a b</div>", 2, 2 ],
             [ "<div><span>a b</span></div>", 2, 2 ],
             [ "<div><span>a b</span><span>c d</span></div>", 4, 4 ],

             [ "<div>a b <span><a name='foo'>c</a> d</span> e f</div>", 6, 6 ],

             [ "<div><a href='foo'>a b</a></div>",
               2, 2 * 0.25 ],

             [ "<div><div>a b</div><div>c d</div></div>",
               4, ( 2*2 + 2*2 ) / 4.0 ],

             [ "<div><div>a b</div><div>c d</div><div> </div></div>",
               4, ( 2*2 + 2*2 + 0*0 ) / 4.0 ],

             [ "<div>a <div>a b</div><div>c d</div></div>",
               5, 1.0 + ( 2*2 + 2*2 ) / 5.0 ],

             [ "<div>a <div>a b c</div><div>c d</div></div>",
               6, 1.0 + ( 3*3 + 2*2 ) / 6.0 ],

             [ "<div><p>a b</p><p>c d</p></div><div>e f g</div>",
               7, ( 2*4 + 3*3 ) / 7.0 ] ]

    chain = TreeFilterChain.new( [ WordCounter.new, WordyCounter.new ] )

    tset.each do |html, word_count, wordiness|
      tree = parse( html )
      TreeWalker::walk_depth_first( chain, tree )

      assert_equal( word_count,
                    tree.get( HTMLTreeKeys::WORD_COUNT ),
                    "word_count for: " + html )

      # Wordiness is fractional, so compare within a tolerance.
      assert_in_delta( wordiness,
                       tree.get( HTMLTreeKeys::WORDINESS ),
                       1e-4,
                       " wordiness for: " + html )
    end
  end

  # Full document: with MetaSkipFilter first in the chain, only the
  # two words of the body paragraph are expected to be counted (the
  # <title> and <style> text are not).
  def test_doc
    html = <<HTML
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Iūdex</title>
<style>style</style>
</head>
<body>
<p>Iūdex test.</p>
</body>
</html>
HTML
    tree = parse( html, "UTF-8" )
    chain = TreeFilterChain.new( [ MetaSkipFilter.new,
                                   WordCounter.new,
                                   WordyCounter.new ] )
    TreeWalker::walk_depth_first( chain, tree )

    assert_equal( 2, tree.get( HTMLTreeKeys::WORD_COUNT ) )

    # Same tolerance-based comparison as test_counts, rather than exact
    # equality on a floating point value.
    assert_in_delta( 2.0, tree.get( HTMLTreeKeys::WORDINESS ), 1e-4 )
  end

end
metadata ADDED
@@ -0,0 +1,162 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: iudex-html
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 1.0.0
6
+ platform: java
7
+ authors:
8
+ - David Kellum
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-04-04 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: iudex-core
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.0
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rjack-nekohtml
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 1.9.14
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: gravitext-xmlprod
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - ~>
45
+ - !ruby/object:Gem::Version
46
+ version: 1.4.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ - !ruby/object:Gem::Dependency
50
+ name: minitest
51
+ prerelease: false
52
+ requirement: &id004 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: 1.7.1
58
+ - - <
59
+ - !ruby/object:Gem::Version
60
+ version: "2.1"
61
+ type: :development
62
+ version_requirements: *id004
63
+ - !ruby/object:Gem::Dependency
64
+ name: rjack-logback
65
+ prerelease: false
66
+ requirement: &id005 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ~>
70
+ - !ruby/object:Gem::Version
71
+ version: "1.0"
72
+ type: :development
73
+ version_requirements: *id005
74
+ - !ruby/object:Gem::Dependency
75
+ name: rjack-tarpit
76
+ prerelease: false
77
+ requirement: &id006 !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: 1.3.0
83
+ type: :development
84
+ version_requirements: *id006
85
+ description: |-
86
+ Iudex is a general purpose web crawler and feed processor in
87
+ ruby/java. The iudex-html gem contains filters for HTML parsing,
88
+ filtering, extracting text and links.
89
+ email:
90
+ - dek-oss@gravitext.com
91
+ executables: []
92
+
93
+ extensions: []
94
+
95
+ extra_rdoc_files:
96
+ - Manifest.txt
97
+ - History.rdoc
98
+ - README.rdoc
99
+ files:
100
+ - History.rdoc
101
+ - Manifest.txt
102
+ - README.rdoc
103
+ - Rakefile
104
+ - pom.xml
105
+ - build/HTML.java.erb
106
+ - build/attributes
107
+ - build/java_generate.rb
108
+ - build/tags
109
+ - lib/iudex-html/base.rb
110
+ - lib/iudex-html.rb
111
+ - lib/iudex-html/factory_helper.rb
112
+ - test/html_test_helper.rb
113
+ - test/setup.rb
114
+ - test/test_characters_normalizer.rb
115
+ - test/test_extract_filter.rb
116
+ - test/test_factory_helper.rb
117
+ - test/test_html_parser.rb
118
+ - test/test_other_filters.rb
119
+ - test/test_other_tree_filters.rb
120
+ - test/test_parse_filter.rb
121
+ - test/test_tree_walker.rb
122
+ - test/test_word_counters.rb
123
+ - lib/iudex-html/iudex-html-1.0.0.jar
124
+ has_rdoc: true
125
+ homepage: http://github.com/dekellum/iudex
126
+ licenses: []
127
+
128
+ post_install_message:
129
+ rdoc_options:
130
+ - --main
131
+ - README.rdoc
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ none: false
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: "0"
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: "0"
146
+ requirements: []
147
+
148
+ rubyforge_project: iudex-html
149
+ rubygems_version: 1.5.1
150
+ signing_key:
151
+ specification_version: 3
152
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java
153
+ test_files:
154
+ - test/test_factory_helper.rb
155
+ - test/test_other_filters.rb
156
+ - test/test_characters_normalizer.rb
157
+ - test/test_word_counters.rb
158
+ - test/test_extract_filter.rb
159
+ - test/test_tree_walker.rb
160
+ - test/test_other_tree_filters.rb
161
+ - test/test_html_parser.rb
162
+ - test/test_parse_filter.rb