iudex-html 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +24 -0
- data/README.rdoc +25 -0
- data/Rakefile +53 -0
- data/build/HTML.java.erb +91 -0
- data/build/attributes +82 -0
- data/build/java_generate.rb +139 -0
- data/build/tags +130 -0
- data/lib/iudex-html.rb +56 -0
- data/lib/iudex-html/base.rb +21 -0
- data/lib/iudex-html/factory_helper.rb +95 -0
- data/lib/iudex-html/iudex-html-1.0.0.jar +0 -0
- data/pom.xml +51 -0
- data/test/html_test_helper.rb +100 -0
- data/test/setup.rb +38 -0
- data/test/test_characters_normalizer.rb +81 -0
- data/test/test_extract_filter.rb +165 -0
- data/test/test_factory_helper.rb +51 -0
- data/test/test_html_parser.rb +128 -0
- data/test/test_other_filters.rb +51 -0
- data/test/test_other_tree_filters.rb +124 -0
- data/test/test_parse_filter.rb +72 -0
- data/test/test_tree_walker.rb +94 -0
- data/test/test_word_counters.rb +96 -0
- metadata +162 -0
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
class TestParseFilter < MiniTest::Unit::TestCase
|
23
|
+
include HTMLTestHelper
|
24
|
+
include Gravitext::HTMap
|
25
|
+
include Iudex::Core
|
26
|
+
include Iudex::HTML
|
27
|
+
include Iudex::HTML::Filters
|
28
|
+
|
29
|
+
def setup
|
30
|
+
@filter = html_parse_filter( :title )
|
31
|
+
@filter.min_parse = 0
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_marked
|
35
|
+
assert( ! marked?( "" ) )
|
36
|
+
assert( ! marked?( "simple" ) )
|
37
|
+
assert( ! marked?( "<simple" ) )
|
38
|
+
assert( ! marked?( "x < y" ) )
|
39
|
+
assert( ! marked?( "AT&T" ) )
|
40
|
+
assert( ! marked?( "AT & T;" ) )
|
41
|
+
|
42
|
+
assert( marked?( "<a>" ) )
|
43
|
+
assert( marked?( "Words &copy; 2010" ) )
|
44
|
+
assert( marked?( "&#xf43;" ) )
|
45
|
+
assert( marked?( "&#2028;" ) )
|
46
|
+
assert( marked?( "<![CDATA[simple]]>" ) )
|
47
|
+
assert( marked?( "<![CDATA[simple" ) )
|
48
|
+
assert( marked?( "<!-- comment -->" ) )
|
49
|
+
end
|
50
|
+
|
51
|
+
def marked?( text )
|
52
|
+
@filter.text_marked( text )
|
53
|
+
end
|
54
|
+
|
55
|
+
def test_markup
|
56
|
+
tests = [ [ "simple", 0, "simple" ],
|
57
|
+
[ "<i>inner</i>", 1, nil ],
|
58
|
+
[ "&lt;i&gt;inner&lt;/i&gt;", 2, nil ],
|
59
|
+
[ "<!--ignore-->text", 1, "text" ],
|
60
|
+
[ "&lt;!--ignore--&gt;text", 2, "text" ],
|
61
|
+
[ "&lt;", 1, "<" ],
|
62
|
+
[ "&amp;lt;", 2, "<" ] ]
|
63
|
+
|
64
|
+
tests.each do | input, count, out |
|
65
|
+
map = UniMap.new
|
66
|
+
map.title = input
|
67
|
+
assert_equal( count, @filter.parse_loop( map ), input )
|
68
|
+
assert_equal( out, map.title && map.title.to_s )
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
class TestTreeWalker < MiniTest::Unit::TestCase
|
23
|
+
include HTMLTestHelper
|
24
|
+
|
25
|
+
import 'iudex.html.tree.TreeFilter'
|
26
|
+
Action = TreeFilter::Action
|
27
|
+
|
28
|
+
DROP_HTML = {
|
29
|
+
:in => "<div>one<p>foo</p><br/> two</div>",
|
30
|
+
:out => "<div>one~~~~~~~~~~<br/> two</div>" }
|
31
|
+
# Note: ~~~ is padding removed in compare
|
32
|
+
|
33
|
+
def test_drop
|
34
|
+
filter = TagFilter.new( HTML::P, Action::DROP )
|
35
|
+
[ :walk_depth_first, :walk_breadth_first ].each do |order|
|
36
|
+
assert_transform( DROP_HTML, filter, order )
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
SKIP_HTML = {
|
41
|
+
:in => "<div>first<b>drop</b><span><b>not dropped</b></span></div>",
|
42
|
+
:out => "<div>first~~~~~~~~~~~<span><b>not dropped</b></span></div>" }
|
43
|
+
|
44
|
+
def test_skip
|
45
|
+
chain = TreeFilterChain.new( [ TagFilter.new( HTML::SPAN, Action::SKIP ),
|
46
|
+
TagFilter.new( HTML::B, Action::DROP ) ] )
|
47
|
+
assert_transform( SKIP_HTML, chain, :walk_breadth_first )
|
48
|
+
end
|
49
|
+
|
50
|
+
TERM_HTML = {
|
51
|
+
:in => "<div><span>first</span><b>term</b><span><b>not</b></span></div>",
|
52
|
+
:out => "<div>~~~~~~~~~~~~~~~~~~<b>term</b><span><b>not</b></span></div>" }
|
53
|
+
|
54
|
+
def test_terminate
|
55
|
+
chain = TreeFilterChain.new( [ TagFilter.new( HTML::B, Action::TERMINATE ),
|
56
|
+
TagFilter.new( HTML::SPAN, Action::DROP ) ] )
|
57
|
+
[ :walk_depth_first, :walk_breadth_first ].each do |order|
|
58
|
+
assert_equal( Action::TERMINATE,
|
59
|
+
assert_transform( TERM_HTML, chain, order ) )
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
FOLD_HTML = {
|
64
|
+
:in => "<div>first <b>drop</b> <span> remain <b>drop</b> </span> </div>",
|
65
|
+
:out => "<div>first ~~~~~~~~~~~ ~~~~~~ remain ~~~~~~~~~~~ ~~~~~~~ </div>" }
|
66
|
+
|
67
|
+
def test_fold
|
68
|
+
chain = TreeFilterChain.new( [ TagFilter.new( HTML::SPAN, Action::FOLD ),
|
69
|
+
TagFilter.new( HTML::B, Action::DROP ) ] )
|
70
|
+
|
71
|
+
[ :walk_breadth_first, :walk_depth_first ].each do |order|
|
72
|
+
assert_transform( FOLD_HTML, chain, order )
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
class TagFilter
|
77
|
+
include TreeFilter
|
78
|
+
|
79
|
+
def initialize( tag, action )
|
80
|
+
@tag = tag
|
81
|
+
@action = action
|
82
|
+
end
|
83
|
+
|
84
|
+
def filter( node )
|
85
|
+
elm = node.as_element
|
86
|
+
if( elm && elm.tag == @tag )
|
87
|
+
@action
|
88
|
+
else
|
89
|
+
Action::CONTINUE
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2010-2011 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
|
23
|
+
class TestWordCounters < MiniTest::Unit::TestCase
|
24
|
+
include HTMLTestHelper
|
25
|
+
include Iudex::HTML::Tree
|
26
|
+
include Iudex::HTML::Tree::Filters
|
27
|
+
|
28
|
+
def test_counts
|
29
|
+
tset = [ [ "", 0, 0 ],
|
30
|
+
[ "<div><span> </span></div>", 0, 0 ],
|
31
|
+
|
32
|
+
[ "a b", 2, 2 ],
|
33
|
+
[ "<div>a b</div>", 2, 2 ],
|
34
|
+
[ "<div><span>a b</span></div>", 2, 2 ],
|
35
|
+
[ "<div><span>a b</span><span>c d</span></div>", 4, 4 ],
|
36
|
+
|
37
|
+
[ "<div>a b <span><a name='foo'>c</a> d</span> e f</div>", 6, 6 ],
|
38
|
+
|
39
|
+
[ "<div><a href='foo'>a b</a></div>",
|
40
|
+
2, 2 * 0.25 ],
|
41
|
+
|
42
|
+
[ "<div><div>a b</div><div>c d</div></div>",
|
43
|
+
4, ( 2*2 + 2*2 ) / 4.0 ],
|
44
|
+
|
45
|
+
[ "<div><div>a b</div><div>c d</div><div> </div></div>",
|
46
|
+
4, ( 2*2 + 2*2 + 0*0 ) / 4.0 ],
|
47
|
+
|
48
|
+
[ "<div>a <div>a b</div><div>c d</div></div>",
|
49
|
+
5, 1.0 + ( 2*2 + 2*2 ) / 5.0 ],
|
50
|
+
|
51
|
+
[ "<div>a <div>a b c</div><div>c d</div></div>",
|
52
|
+
6, 1.0 + ( 3*3 + 2*2 ) / 6.0 ],
|
53
|
+
|
54
|
+
[ "<div><p>a b</p><p>c d</p></div><div>e f g</div>",
|
55
|
+
7, ( 2*4 + 3*3 ) / 7.0 ] ]
|
56
|
+
|
57
|
+
chain = TreeFilterChain.new( [ WordCounter.new, WordyCounter.new ] )
|
58
|
+
|
59
|
+
tset.each do |html, word_count, wordiness|
|
60
|
+
tree = parse( html )
|
61
|
+
TreeWalker::walk_depth_first( chain, tree )
|
62
|
+
|
63
|
+
assert_equal( word_count,
|
64
|
+
tree.get( HTMLTreeKeys::WORD_COUNT ),
|
65
|
+
"word_count for: " + html )
|
66
|
+
|
67
|
+
assert_in_delta( wordiness,
|
68
|
+
tree.get( HTMLTreeKeys::WORDINESS ),
|
69
|
+
1e-4,
|
70
|
+
" wordiness for: " + html )
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_doc
|
75
|
+
html = <<HTML
|
76
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
77
|
+
<head>
|
78
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
79
|
+
<title>Iūdex</title>
|
80
|
+
<style>style</style>
|
81
|
+
</head>
|
82
|
+
<body>
|
83
|
+
<p>Iūdex test.</p>
|
84
|
+
</body>
|
85
|
+
</html>
|
86
|
+
HTML
|
87
|
+
tree = parse( html, "UTF-8" )
|
88
|
+
chain = TreeFilterChain.new( [ MetaSkipFilter.new,
|
89
|
+
WordCounter.new,
|
90
|
+
WordyCounter.new ] )
|
91
|
+
TreeWalker::walk_depth_first( chain, tree )
|
92
|
+
assert_equal( 2, tree.get( HTMLTreeKeys::WORD_COUNT ) );
|
93
|
+
assert_equal( 2, tree.get( HTMLTreeKeys::WORDINESS ) );
|
94
|
+
end
|
95
|
+
|
96
|
+
end
|
metadata
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: iudex-html
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- David Kellum
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-04-04 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: iudex-core
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.0
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rjack-nekohtml
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ~>
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.9.14
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: gravitext-xmlprod
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - ~>
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.4.0
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id003
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: minitest
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: 1.7.1
|
58
|
+
- - <
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "2.1"
|
61
|
+
type: :development
|
62
|
+
version_requirements: *id004
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: rjack-logback
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ~>
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: "1.0"
|
72
|
+
type: :development
|
73
|
+
version_requirements: *id005
|
74
|
+
- !ruby/object:Gem::Dependency
|
75
|
+
name: rjack-tarpit
|
76
|
+
prerelease: false
|
77
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 1.3.0
|
83
|
+
type: :development
|
84
|
+
version_requirements: *id006
|
85
|
+
description: |-
|
86
|
+
Iudex is a general purpose web crawler and feed processor in
|
87
|
+
ruby/java. The iudex-html gem contains filters for HTML parsing,
|
88
|
+
filtering, extracting text and links.
|
89
|
+
email:
|
90
|
+
- dek-oss@gravitext.com
|
91
|
+
executables: []
|
92
|
+
|
93
|
+
extensions: []
|
94
|
+
|
95
|
+
extra_rdoc_files:
|
96
|
+
- Manifest.txt
|
97
|
+
- History.rdoc
|
98
|
+
- README.rdoc
|
99
|
+
files:
|
100
|
+
- History.rdoc
|
101
|
+
- Manifest.txt
|
102
|
+
- README.rdoc
|
103
|
+
- Rakefile
|
104
|
+
- pom.xml
|
105
|
+
- build/HTML.java.erb
|
106
|
+
- build/attributes
|
107
|
+
- build/java_generate.rb
|
108
|
+
- build/tags
|
109
|
+
- lib/iudex-html/base.rb
|
110
|
+
- lib/iudex-html.rb
|
111
|
+
- lib/iudex-html/factory_helper.rb
|
112
|
+
- test/html_test_helper.rb
|
113
|
+
- test/setup.rb
|
114
|
+
- test/test_characters_normalizer.rb
|
115
|
+
- test/test_extract_filter.rb
|
116
|
+
- test/test_factory_helper.rb
|
117
|
+
- test/test_html_parser.rb
|
118
|
+
- test/test_other_filters.rb
|
119
|
+
- test/test_other_tree_filters.rb
|
120
|
+
- test/test_parse_filter.rb
|
121
|
+
- test/test_tree_walker.rb
|
122
|
+
- test/test_word_counters.rb
|
123
|
+
- lib/iudex-html/iudex-html-1.0.0.jar
|
124
|
+
has_rdoc: true
|
125
|
+
homepage: http://github.com/dekellum/iudex
|
126
|
+
licenses: []
|
127
|
+
|
128
|
+
post_install_message:
|
129
|
+
rdoc_options:
|
130
|
+
- --main
|
131
|
+
- README.rdoc
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
none: false
|
136
|
+
requirements:
|
137
|
+
- - ">="
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: "0"
|
140
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
141
|
+
none: false
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: "0"
|
146
|
+
requirements: []
|
147
|
+
|
148
|
+
rubyforge_project: iudex-html
|
149
|
+
rubygems_version: 1.5.1
|
150
|
+
signing_key:
|
151
|
+
specification_version: 3
|
152
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
153
|
+
test_files:
|
154
|
+
- test/test_factory_helper.rb
|
155
|
+
- test/test_other_filters.rb
|
156
|
+
- test/test_characters_normalizer.rb
|
157
|
+
- test/test_word_counters.rb
|
158
|
+
- test/test_extract_filter.rb
|
159
|
+
- test/test_tree_walker.rb
|
160
|
+
- test/test_other_tree_filters.rb
|
161
|
+
- test/test_html_parser.rb
|
162
|
+
- test/test_parse_filter.rb
|