iudex-html 1.1.0-java → 1.2.b.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/setup.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -14,25 +14,27 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- #### General test setup: LOAD_PATH, logging, console output ####
18
-
19
- test_dir = File.dirname( __FILE__ )
20
-
21
- ldir = File.join( test_dir, "..", "lib" )
22
- $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
17
+ #### General test setup, logging, console output ####
23
18
 
24
19
  require 'rubygems'
25
- require 'rjack-logback'
26
- RJack::Logback.config_console( :stderr => true )
20
+ require 'bundler/setup'
27
21
 
28
22
  require 'minitest/unit'
29
23
  require 'minitest/autorun'
30
24
 
31
- require File.join( test_dir, 'html_test_helper.rb' )
25
+ require 'rjack-logback'
26
+
27
+ module TestSetup
28
+ include RJack
29
+ Logback.config_console( :stderr => true, :thread => true )
30
+
31
+ if ( ARGV & %w[ -v --verbose --debug ] ).empty?
32
+ Logback.root.level = Logback::INFO
33
+ else
34
+ Logback.root.level = Logback::DEBUG
35
+ end
36
+
37
+ ARGV.delete( '--debug' )
38
+ end
32
39
 
33
- # Make test output logging compatible: no partial lines.
34
- # class TestOut
35
- # def print( *a ); $stdout.puts( *a ); end
36
- # def puts( *a ); $stdout.puts( *a ); end
37
- # end
38
- # MiniTest::Unit.output = TestOut.new
40
+ require File.join( File.dirname( __FILE__ ), 'html_test_helper.rb' )
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -75,6 +75,26 @@ HTML
75
75
  assert_doc( HTML_SKIP_TAGS_SKIPPED, parse( HTML_SKIP_TAGS, "ISO-8859-1" ) )
76
76
  end
77
77
 
78
+ def test_attr_duplicates
79
+ input = <<-HTML
80
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
81
+ <head/>
82
+ <body>
83
+ <p class="foo" class="bar">hello</p>
84
+ </body>
85
+ </html>
86
+ HTML
87
+ output = <<-HTML
88
+ <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
89
+ <head/>
90
+ <body>
91
+ <p class="bar">hello</p>
92
+ </body>
93
+ </html>
94
+ HTML
95
+ assert_doc( output, parse( input ) )
96
+ end
97
+
78
98
  HTML_OUTSIDE = <<HTML
79
99
  before
80
100
  <html xmlns="http://www.w3.org/1999/xhtml">
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2012 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestStAXParser < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+
26
+ import 'javax.xml.stream.XMLStreamException'
27
+
28
+ HTML_FULL = <<HTML
29
+ <html xmlns="http://www.w3.org/1999/xhtml">
30
+ <head>
31
+ <title>Iūdex</title>
32
+ </head>
33
+ <body>
34
+ <p>Iūdex test.</p>
35
+ <n:bogus xmlns:n="bogo">truely</n:bogus>
36
+ </body>
37
+ </html>
38
+ HTML
39
+
40
+ def test_with_non_html
41
+ root = parse( HTML_FULL, "UTF-8" )
42
+ assert( root.find( 'body' ).find( 'bogus' ).tag.banned? )
43
+ assert_doc( HTML_FULL, root )
44
+ end
45
+
46
+ HTML_CDATA = {
47
+ :in => "<p><![CDATA[two]]></p>",
48
+ :out => "<p>two</p>" }
49
+ # Note: HTML parsers drop this instead.
50
+
51
+ def test_cdata
52
+ tree = parse( HTML_CDATA[ :in ] )
53
+ assert_fragment( HTML_CDATA[ :out ], tree )
54
+ end
55
+
56
+ def test_inline_nest
57
+ html = { :in => "<div><i>begin <p>block</p> end.</i></div>",
58
+ :out => "<div><i>begin <p>block</p> end.</i></div>" }
59
+ tree = parse( html[ :in ] )
60
+ assert_fragment( html[ :out ], tree )
61
+ end
62
+
63
+ def test_invalid_error
64
+ assert_raises( XMLStreamException ) do
65
+ Iudex::HTML::Tree.parse( "" )
66
+ end
67
+
68
+ assert_raises( XMLStreamException ) do
69
+ Iudex::HTML::Tree.parse( "<doc><open></doc>" )
70
+ end
71
+ end
72
+
73
+ # Helper overrides
74
+ def parse( html, charset = "UTF-8" )
75
+ Iudex::HTML::Tree.parse( compress( html ) )
76
+ end
77
+
78
+ end
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
metadata CHANGED
@@ -1,8 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.1.0
4
+ prerelease: 4
5
+ version: 1.2.b.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,86 +10,83 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-11-13 00:00:00 Z
13
+ date: 2012-03-05 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: iudex-core
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ version_requirements: &id001 !ruby/object:Gem::Requirement
19
18
  none: false
20
19
  requirements:
21
20
  - - ~>
22
21
  - !ruby/object:Gem::Version
23
- version: 1.1.0
22
+ version: 1.2.b
23
+ requirement: *id001
24
+ prerelease: false
24
25
  type: :runtime
25
- version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: rjack-nekohtml
28
- prerelease: false
29
- requirement: &id002 !ruby/object:Gem::Requirement
28
+ version_requirements: &id002 !ruby/object:Gem::Requirement
30
29
  none: false
31
30
  requirements:
32
31
  - - ~>
33
32
  - !ruby/object:Gem::Version
34
33
  version: 1.9.14
34
+ requirement: *id002
35
+ prerelease: false
35
36
  type: :runtime
36
- version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: gravitext-xmlprod
39
- prerelease: false
40
- requirement: &id003 !ruby/object:Gem::Requirement
39
+ version_requirements: &id003 !ruby/object:Gem::Requirement
41
40
  none: false
42
41
  requirements:
43
42
  - - ~>
44
43
  - !ruby/object:Gem::Version
45
- version: 1.4.0
44
+ version: 1.5.b
45
+ requirement: *id003
46
+ prerelease: false
46
47
  type: :runtime
47
- version_requirements: *id003
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: minitest
50
- prerelease: false
51
- requirement: &id004 !ruby/object:Gem::Requirement
50
+ version_requirements: &id004 !ruby/object:Gem::Requirement
52
51
  none: false
53
52
  requirements:
54
53
  - - ~>
55
54
  - !ruby/object:Gem::Version
56
55
  version: "2.3"
56
+ requirement: *id004
57
+ prerelease: false
57
58
  type: :development
58
- version_requirements: *id004
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: rjack-logback
61
- prerelease: false
62
- requirement: &id005 !ruby/object:Gem::Requirement
61
+ version_requirements: &id005 !ruby/object:Gem::Requirement
63
62
  none: false
64
63
  requirements:
65
64
  - - ~>
66
65
  - !ruby/object:Gem::Version
67
66
  version: "1.0"
67
+ requirement: *id005
68
+ prerelease: false
68
69
  type: :development
69
- version_requirements: *id005
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: rjack-tarpit
72
- prerelease: false
73
- requirement: &id006 !ruby/object:Gem::Requirement
72
+ version_requirements: &id006 !ruby/object:Gem::Requirement
74
73
  none: false
75
74
  requirements:
76
75
  - - ~>
77
76
  - !ruby/object:Gem::Version
78
- version: 1.4.0
77
+ version: "2.0"
78
+ requirement: *id006
79
+ prerelease: false
79
80
  type: :development
80
- version_requirements: *id006
81
- description: |-
82
- Iudex is a general purpose web crawler and feed processor in
83
- ruby/java. The iudex-html gem contains filters for HTML parsing,
84
- filtering, exracting text and links.
81
+ description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-html gem contains filters for HTML parsing, filtering, exracting text and links.
85
82
  email:
86
83
  - dek-oss@gravitext.com
87
- executables: []
88
-
84
+ executables:
85
+ - iudex-html-clean
86
+ - iudex-html-perftest
89
87
  extensions: []
90
88
 
91
89
  extra_rdoc_files:
92
- - Manifest.txt
93
90
  - History.rdoc
94
91
  - README.rdoc
95
92
  files:
@@ -98,6 +95,8 @@ files:
98
95
  - README.rdoc
99
96
  - Rakefile
100
97
  - pom.xml
98
+ - bin/iudex-html-clean
99
+ - bin/iudex-html-perftest
101
100
  - build/HTML.java.erb
102
101
  - build/attributes
103
102
  - build/java_generate.rb
@@ -106,6 +105,7 @@ files:
106
105
  - lib/iudex-html.rb
107
106
  - lib/iudex-html/factory_helper.rb
108
107
  - test/html_test_helper.rb
108
+ - test/reddit.xhtml
109
109
  - test/setup.rb
110
110
  - test/test_characters_normalizer.rb
111
111
  - test/test_extract_filter.rb
@@ -114,10 +114,10 @@ files:
114
114
  - test/test_other_filters.rb
115
115
  - test/test_other_tree_filters.rb
116
116
  - test/test_parse_filter.rb
117
+ - test/test_stax_parser.rb
117
118
  - test/test_tree_walker.rb
118
119
  - test/test_word_counters.rb
119
- - lib/iudex-html/iudex-html-1.1.0.jar
120
- - .gemtest
120
+ - lib/iudex-html/iudex-html-1.2.b.0.jar
121
121
  homepage: http://github.com/dekellum/iudex
122
122
  licenses: []
123
123
 
@@ -132,27 +132,22 @@ required_ruby_version: !ruby/object:Gem::Requirement
132
132
  requirements:
133
133
  - - ">="
134
134
  - !ruby/object:Gem::Version
135
+ hash: 2
136
+ segments:
137
+ - 0
135
138
  version: "0"
136
139
  required_rubygems_version: !ruby/object:Gem::Requirement
137
140
  none: false
138
141
  requirements:
139
- - - ">="
142
+ - - ">"
140
143
  - !ruby/object:Gem::Version
141
- version: "0"
144
+ version: 1.3.1
142
145
  requirements: []
143
146
 
144
- rubyforge_project: iudex-html
145
- rubygems_version: 1.8.9
147
+ rubyforge_project:
148
+ rubygems_version: 1.8.15
146
149
  signing_key:
147
150
  specification_version: 3
148
- summary: Iudex is a general purpose web crawler and feed processor in ruby/java
149
- test_files:
150
- - test/test_factory_helper.rb
151
- - test/test_other_filters.rb
152
- - test/test_characters_normalizer.rb
153
- - test/test_word_counters.rb
154
- - test/test_extract_filter.rb
155
- - test/test_tree_walker.rb
156
- - test/test_other_tree_filters.rb
157
- - test/test_html_parser.rb
158
- - test/test_parse_filter.rb
151
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
152
+ test_files: []
153
+
data/.gemtest DELETED
File without changes