iudex-html 1.1.0-java → 1.2.b.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +11 -0
- data/Manifest.txt +5 -1
- data/README.rdoc +1 -1
- data/Rakefile +2 -36
- data/bin/iudex-html-clean +58 -0
- data/bin/iudex-html-perftest +59 -0
- data/build/HTML.java.erb +1 -1
- data/build/attributes +1 -1
- data/build/java_generate.rb +1 -1
- data/build/tags +1 -1
- data/lib/iudex-html/base.rb +2 -2
- data/lib/iudex-html/factory_helper.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.1.0.jar → iudex-html-1.2.b.0.jar} +0 -0
- data/lib/iudex-html.rb +12 -2
- data/pom.xml +4 -4
- data/test/html_test_helper.rb +7 -6
- data/test/reddit.xhtml +557 -0
- data/test/setup.rb +18 -16
- data/test/test_characters_normalizer.rb +1 -1
- data/test/test_extract_filter.rb +1 -1
- data/test/test_factory_helper.rb +1 -1
- data/test/test_html_parser.rb +21 -1
- data/test/test_other_filters.rb +1 -1
- data/test/test_other_tree_filters.rb +1 -1
- data/test/test_parse_filter.rb +1 -1
- data/test/test_stax_parser.rb +78 -0
- data/test/test_tree_walker.rb +1 -1
- data/test/test_word_counters.rb +1 -1
- metadata +43 -48
- data/.gemtest +0 -0
data/test/setup.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c)
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -14,25 +14,27 @@
|
|
14
14
|
# permissions and limitations under the License.
|
15
15
|
#++
|
16
16
|
|
17
|
-
#### General test setup
|
18
|
-
|
19
|
-
test_dir = File.dirname( __FILE__ )
|
20
|
-
|
21
|
-
ldir = File.join( test_dir, "..", "lib" )
|
22
|
-
$LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
|
17
|
+
#### General test setup, logging, console output ####
|
23
18
|
|
24
19
|
require 'rubygems'
|
25
|
-
require '
|
26
|
-
RJack::Logback.config_console( :stderr => true )
|
20
|
+
require 'bundler/setup'
|
27
21
|
|
28
22
|
require 'minitest/unit'
|
29
23
|
require 'minitest/autorun'
|
30
24
|
|
31
|
-
require
|
25
|
+
require 'rjack-logback'
|
26
|
+
|
27
|
+
module TestSetup
|
28
|
+
include RJack
|
29
|
+
Logback.config_console( :stderr => true, :thread => true )
|
30
|
+
|
31
|
+
if ( ARGV & %w[ -v --verbose --debug ] ).empty?
|
32
|
+
Logback.root.level = Logback::INFO
|
33
|
+
else
|
34
|
+
Logback.root.level = Logback::DEBUG
|
35
|
+
end
|
36
|
+
|
37
|
+
ARGV.delete( '--debug' )
|
38
|
+
end
|
32
39
|
|
33
|
-
|
34
|
-
# class TestOut
|
35
|
-
# def print( *a ); $stdout.puts( *a ); end
|
36
|
-
# def puts( *a ); $stdout.puts( *a ); end
|
37
|
-
# end
|
38
|
-
# MiniTest::Unit.output = TestOut.new
|
40
|
+
require File.join( File.dirname( __FILE__ ), 'html_test_helper.rb' )
|
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c)
|
5
|
+
# Copyright (c) 2008-2012 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_extract_filter.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_factory_helper.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_html_parser.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
@@ -75,6 +75,26 @@ HTML
|
|
75
75
|
assert_doc( HTML_SKIP_TAGS_SKIPPED, parse( HTML_SKIP_TAGS, "ISO-8859-1" ) )
|
76
76
|
end
|
77
77
|
|
78
|
+
def test_attr_duplicates
|
79
|
+
input = <<-HTML
|
80
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
81
|
+
<head/>
|
82
|
+
<body>
|
83
|
+
<p class="foo" class="bar">hello</p>
|
84
|
+
</body>
|
85
|
+
</html>
|
86
|
+
HTML
|
87
|
+
output = <<-HTML
|
88
|
+
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
89
|
+
<head/>
|
90
|
+
<body>
|
91
|
+
<p class="bar">hello</p>
|
92
|
+
</body>
|
93
|
+
</html>
|
94
|
+
HTML
|
95
|
+
assert_doc( output, parse( input ) )
|
96
|
+
end
|
97
|
+
|
78
98
|
HTML_OUTSIDE = <<HTML
|
79
99
|
before
|
80
100
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
data/test/test_other_filters.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_parse_filter.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c)
|
5
|
+
# Copyright (c) 2008-2012 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
#.hashdot.profile += jruby-shortlived
|
4
|
+
|
5
|
+
#--
|
6
|
+
# Copyright (c) 2012 David Kellum
|
7
|
+
#
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
|
+
# may not use this file except in compliance with the License. You
|
10
|
+
# may obtain a copy of the License at
|
11
|
+
#
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
13
|
+
#
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
17
|
+
# implied. See the License for the specific language governing
|
18
|
+
# permissions and limitations under the License.
|
19
|
+
#++
|
20
|
+
|
21
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
22
|
+
|
23
|
+
class TestStAXParser < MiniTest::Unit::TestCase
|
24
|
+
include HTMLTestHelper
|
25
|
+
|
26
|
+
import 'javax.xml.stream.XMLStreamException'
|
27
|
+
|
28
|
+
HTML_FULL = <<HTML
|
29
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
30
|
+
<head>
|
31
|
+
<title>Iūdex</title>
|
32
|
+
</head>
|
33
|
+
<body>
|
34
|
+
<p>Iūdex test.</p>
|
35
|
+
<n:bogus xmlns:n="bogo">truely</n:bogus>
|
36
|
+
</body>
|
37
|
+
</html>
|
38
|
+
HTML
|
39
|
+
|
40
|
+
def test_with_non_html
|
41
|
+
root = parse( HTML_FULL, "UTF-8" )
|
42
|
+
assert( root.find( 'body' ).find( 'bogus' ).tag.banned? )
|
43
|
+
assert_doc( HTML_FULL, root )
|
44
|
+
end
|
45
|
+
|
46
|
+
HTML_CDATA = {
|
47
|
+
:in => "<p><![CDATA[two]]></p>",
|
48
|
+
:out => "<p>two</p>" }
|
49
|
+
# Note: HTML parsers drop this instead.
|
50
|
+
|
51
|
+
def test_cdata
|
52
|
+
tree = parse( HTML_CDATA[ :in ] )
|
53
|
+
assert_fragment( HTML_CDATA[ :out ], tree )
|
54
|
+
end
|
55
|
+
|
56
|
+
def test_inline_nest
|
57
|
+
html = { :in => "<div><i>begin <p>block</p> end.</i></div>",
|
58
|
+
:out => "<div><i>begin <p>block</p> end.</i></div>" }
|
59
|
+
tree = parse( html[ :in ] )
|
60
|
+
assert_fragment( html[ :out ], tree )
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_invalid_error
|
64
|
+
assert_raises( XMLStreamException ) do
|
65
|
+
Iudex::HTML::Tree.parse( "" )
|
66
|
+
end
|
67
|
+
|
68
|
+
assert_raises( XMLStreamException ) do
|
69
|
+
Iudex::HTML::Tree.parse( "<doc><open></doc>" )
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Helper overrides
|
74
|
+
def parse( html, charset = "UTF-8" )
|
75
|
+
Iudex::HTML::Tree.parse( compress( html ) )
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
data/test/test_tree_walker.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
3
3
|
|
4
4
|
#--
|
5
|
-
# Copyright (c)
|
5
|
+
# Copyright (c) 2008-2012 David Kellum
|
6
6
|
#
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_word_counters.rb
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
4
4
|
|
5
5
|
#--
|
6
|
-
# Copyright (c)
|
6
|
+
# Copyright (c) 2008-2012 David Kellum
|
7
7
|
#
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
9
9
|
# may not use this file except in compliance with the License. You
|
metadata
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: iudex-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
version: 1.
|
4
|
+
prerelease: 4
|
5
|
+
version: 1.2.b.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,86 +10,83 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date:
|
13
|
+
date: 2012-03-05 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: iudex-core
|
17
|
-
|
18
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
17
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
19
18
|
none: false
|
20
19
|
requirements:
|
21
20
|
- - ~>
|
22
21
|
- !ruby/object:Gem::Version
|
23
|
-
version: 1.
|
22
|
+
version: 1.2.b
|
23
|
+
requirement: *id001
|
24
|
+
prerelease: false
|
24
25
|
type: :runtime
|
25
|
-
version_requirements: *id001
|
26
26
|
- !ruby/object:Gem::Dependency
|
27
27
|
name: rjack-nekohtml
|
28
|
-
|
29
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
28
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
30
29
|
none: false
|
31
30
|
requirements:
|
32
31
|
- - ~>
|
33
32
|
- !ruby/object:Gem::Version
|
34
33
|
version: 1.9.14
|
34
|
+
requirement: *id002
|
35
|
+
prerelease: false
|
35
36
|
type: :runtime
|
36
|
-
version_requirements: *id002
|
37
37
|
- !ruby/object:Gem::Dependency
|
38
38
|
name: gravitext-xmlprod
|
39
|
-
|
40
|
-
requirement: &id003 !ruby/object:Gem::Requirement
|
39
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
41
40
|
none: false
|
42
41
|
requirements:
|
43
42
|
- - ~>
|
44
43
|
- !ruby/object:Gem::Version
|
45
|
-
version: 1.
|
44
|
+
version: 1.5.b
|
45
|
+
requirement: *id003
|
46
|
+
prerelease: false
|
46
47
|
type: :runtime
|
47
|
-
version_requirements: *id003
|
48
48
|
- !ruby/object:Gem::Dependency
|
49
49
|
name: minitest
|
50
|
-
|
51
|
-
requirement: &id004 !ruby/object:Gem::Requirement
|
50
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
52
51
|
none: false
|
53
52
|
requirements:
|
54
53
|
- - ~>
|
55
54
|
- !ruby/object:Gem::Version
|
56
55
|
version: "2.3"
|
56
|
+
requirement: *id004
|
57
|
+
prerelease: false
|
57
58
|
type: :development
|
58
|
-
version_requirements: *id004
|
59
59
|
- !ruby/object:Gem::Dependency
|
60
60
|
name: rjack-logback
|
61
|
-
|
62
|
-
requirement: &id005 !ruby/object:Gem::Requirement
|
61
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
63
62
|
none: false
|
64
63
|
requirements:
|
65
64
|
- - ~>
|
66
65
|
- !ruby/object:Gem::Version
|
67
66
|
version: "1.0"
|
67
|
+
requirement: *id005
|
68
|
+
prerelease: false
|
68
69
|
type: :development
|
69
|
-
version_requirements: *id005
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
71
|
name: rjack-tarpit
|
72
|
-
|
73
|
-
requirement: &id006 !ruby/object:Gem::Requirement
|
72
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
74
73
|
none: false
|
75
74
|
requirements:
|
76
75
|
- - ~>
|
77
76
|
- !ruby/object:Gem::Version
|
78
|
-
version:
|
77
|
+
version: "2.0"
|
78
|
+
requirement: *id006
|
79
|
+
prerelease: false
|
79
80
|
type: :development
|
80
|
-
|
81
|
-
description: |-
|
82
|
-
Iudex is a general purpose web crawler and feed processor in
|
83
|
-
ruby/java. The iudex-html gem contains filters for HTML parsing,
|
84
|
-
filtering, exracting text and links.
|
81
|
+
description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-html gem contains filters for HTML parsing, filtering, exracting text and links.
|
85
82
|
email:
|
86
83
|
- dek-oss@gravitext.com
|
87
|
-
executables:
|
88
|
-
|
84
|
+
executables:
|
85
|
+
- iudex-html-clean
|
86
|
+
- iudex-html-perftest
|
89
87
|
extensions: []
|
90
88
|
|
91
89
|
extra_rdoc_files:
|
92
|
-
- Manifest.txt
|
93
90
|
- History.rdoc
|
94
91
|
- README.rdoc
|
95
92
|
files:
|
@@ -98,6 +95,8 @@ files:
|
|
98
95
|
- README.rdoc
|
99
96
|
- Rakefile
|
100
97
|
- pom.xml
|
98
|
+
- bin/iudex-html-clean
|
99
|
+
- bin/iudex-html-perftest
|
101
100
|
- build/HTML.java.erb
|
102
101
|
- build/attributes
|
103
102
|
- build/java_generate.rb
|
@@ -106,6 +105,7 @@ files:
|
|
106
105
|
- lib/iudex-html.rb
|
107
106
|
- lib/iudex-html/factory_helper.rb
|
108
107
|
- test/html_test_helper.rb
|
108
|
+
- test/reddit.xhtml
|
109
109
|
- test/setup.rb
|
110
110
|
- test/test_characters_normalizer.rb
|
111
111
|
- test/test_extract_filter.rb
|
@@ -114,10 +114,10 @@ files:
|
|
114
114
|
- test/test_other_filters.rb
|
115
115
|
- test/test_other_tree_filters.rb
|
116
116
|
- test/test_parse_filter.rb
|
117
|
+
- test/test_stax_parser.rb
|
117
118
|
- test/test_tree_walker.rb
|
118
119
|
- test/test_word_counters.rb
|
119
|
-
- lib/iudex-html/iudex-html-1.
|
120
|
-
- .gemtest
|
120
|
+
- lib/iudex-html/iudex-html-1.2.b.0.jar
|
121
121
|
homepage: http://github.com/dekellum/iudex
|
122
122
|
licenses: []
|
123
123
|
|
@@ -132,27 +132,22 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
132
132
|
requirements:
|
133
133
|
- - ">="
|
134
134
|
- !ruby/object:Gem::Version
|
135
|
+
hash: 2
|
136
|
+
segments:
|
137
|
+
- 0
|
135
138
|
version: "0"
|
136
139
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
140
|
none: false
|
138
141
|
requirements:
|
139
|
-
- - "
|
142
|
+
- - ">"
|
140
143
|
- !ruby/object:Gem::Version
|
141
|
-
version:
|
144
|
+
version: 1.3.1
|
142
145
|
requirements: []
|
143
146
|
|
144
|
-
rubyforge_project:
|
145
|
-
rubygems_version: 1.8.
|
147
|
+
rubyforge_project:
|
148
|
+
rubygems_version: 1.8.15
|
146
149
|
signing_key:
|
147
150
|
specification_version: 3
|
148
|
-
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|
149
|
-
test_files:
|
150
|
-
|
151
|
-
- test/test_other_filters.rb
|
152
|
-
- test/test_characters_normalizer.rb
|
153
|
-
- test/test_word_counters.rb
|
154
|
-
- test/test_extract_filter.rb
|
155
|
-
- test/test_tree_walker.rb
|
156
|
-
- test/test_other_tree_filters.rb
|
157
|
-
- test/test_html_parser.rb
|
158
|
-
- test/test_parse_filter.rb
|
151
|
+
summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
|
152
|
+
test_files: []
|
153
|
+
|
data/.gemtest
DELETED
File without changes
|