iudex-html 1.1.0-java → 1.2.b.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/test/setup.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -14,25 +14,27 @@
14
14
  # permissions and limitations under the License.
15
15
  #++
16
16
 
17
- #### General test setup: LOAD_PATH, logging, console output ####
18
-
19
- test_dir = File.dirname( __FILE__ )
20
-
21
- ldir = File.join( test_dir, "..", "lib" )
22
- $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir )
17
+ #### General test setup, logging, console output ####
23
18
 
24
19
  require 'rubygems'
25
- require 'rjack-logback'
26
- RJack::Logback.config_console( :stderr => true )
20
+ require 'bundler/setup'
27
21
 
28
22
  require 'minitest/unit'
29
23
  require 'minitest/autorun'
30
24
 
31
- require File.join( test_dir, 'html_test_helper.rb' )
25
+ require 'rjack-logback'
26
+
27
+ module TestSetup
28
+ include RJack
29
+ Logback.config_console( :stderr => true, :thread => true )
30
+
31
+ if ( ARGV & %w[ -v --verbose --debug ] ).empty?
32
+ Logback.root.level = Logback::INFO
33
+ else
34
+ Logback.root.level = Logback::DEBUG
35
+ end
36
+
37
+ ARGV.delete( '--debug' )
38
+ end
32
39
 
33
- # Make test output logging compatible: no partial lines.
34
- # class TestOut
35
- # def print( *a ); $stdout.puts( *a ); end
36
- # def puts( *a ); $stdout.puts( *a ); end
37
- # end
38
- # MiniTest::Unit.output = TestOut.new
40
+ require File.join( File.dirname( __FILE__ ), 'html_test_helper.rb' )
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -75,6 +75,26 @@ HTML
75
75
  assert_doc( HTML_SKIP_TAGS_SKIPPED, parse( HTML_SKIP_TAGS, "ISO-8859-1" ) )
76
76
  end
77
77
 
78
+ def test_attr_duplicates
79
+ input = <<-HTML
80
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
81
+ <head/>
82
+ <body>
83
+ <p class="foo" class="bar">hello</p>
84
+ </body>
85
+ </html>
86
+ HTML
87
+ output = <<-HTML
88
+ <html xmlns="http://www.w3.org/1999/xhtml" lang="en">
89
+ <head/>
90
+ <body>
91
+ <p class="bar">hello</p>
92
+ </body>
93
+ </html>
94
+ HTML
95
+ assert_doc( output, parse( input ) )
96
+ end
97
+
78
98
  HTML_OUTSIDE = <<HTML
79
99
  before
80
100
  <html xmlns="http://www.w3.org/1999/xhtml">
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
3
+ #.hashdot.profile += jruby-shortlived
4
+
5
+ #--
6
+ # Copyright (c) 2012 David Kellum
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
9
+ # may not use this file except in compliance with the License. You
10
+ # may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
17
+ # implied. See the License for the specific language governing
18
+ # permissions and limitations under the License.
19
+ #++
20
+
21
+ require File.join( File.dirname( __FILE__ ), "setup" )
22
+
23
+ class TestStAXParser < MiniTest::Unit::TestCase
24
+ include HTMLTestHelper
25
+
26
+ import 'javax.xml.stream.XMLStreamException'
27
+
28
+ HTML_FULL = <<HTML
29
+ <html xmlns="http://www.w3.org/1999/xhtml">
30
+ <head>
31
+ <title>Iūdex</title>
32
+ </head>
33
+ <body>
34
+ <p>Iūdex test.</p>
35
+ <n:bogus xmlns:n="bogo">truely</n:bogus>
36
+ </body>
37
+ </html>
38
+ HTML
39
+
40
+ def test_with_non_html
41
+ root = parse( HTML_FULL, "UTF-8" )
42
+ assert( root.find( 'body' ).find( 'bogus' ).tag.banned? )
43
+ assert_doc( HTML_FULL, root )
44
+ end
45
+
46
+ HTML_CDATA = {
47
+ :in => "<p><![CDATA[two]]></p>",
48
+ :out => "<p>two</p>" }
49
+ # Note: HTML parsers drop this instead.
50
+
51
+ def test_cdata
52
+ tree = parse( HTML_CDATA[ :in ] )
53
+ assert_fragment( HTML_CDATA[ :out ], tree )
54
+ end
55
+
56
+ def test_inline_nest
57
+ html = { :in => "<div><i>begin <p>block</p> end.</i></div>",
58
+ :out => "<div><i>begin <p>block</p> end.</i></div>" }
59
+ tree = parse( html[ :in ] )
60
+ assert_fragment( html[ :out ], tree )
61
+ end
62
+
63
+ def test_invalid_error
64
+ assert_raises( XMLStreamException ) do
65
+ Iudex::HTML::Tree.parse( "" )
66
+ end
67
+
68
+ assert_raises( XMLStreamException ) do
69
+ Iudex::HTML::Tree.parse( "<doc><open></doc>" )
70
+ end
71
+ end
72
+
73
+ # Helper overrides
74
+ def parse( html, charset = "UTF-8" )
75
+ Iudex::HTML::Tree.parse( compress( html ) )
76
+ end
77
+
78
+ end
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2010-2011 David Kellum
6
+ # Copyright (c) 2008-2012 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
metadata CHANGED
@@ -1,8 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 1.1.0
4
+ prerelease: 4
5
+ version: 1.2.b.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,86 +10,83 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-11-13 00:00:00 Z
13
+ date: 2012-03-05 00:00:00 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: iudex-core
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
17
+ version_requirements: &id001 !ruby/object:Gem::Requirement
19
18
  none: false
20
19
  requirements:
21
20
  - - ~>
22
21
  - !ruby/object:Gem::Version
23
- version: 1.1.0
22
+ version: 1.2.b
23
+ requirement: *id001
24
+ prerelease: false
24
25
  type: :runtime
25
- version_requirements: *id001
26
26
  - !ruby/object:Gem::Dependency
27
27
  name: rjack-nekohtml
28
- prerelease: false
29
- requirement: &id002 !ruby/object:Gem::Requirement
28
+ version_requirements: &id002 !ruby/object:Gem::Requirement
30
29
  none: false
31
30
  requirements:
32
31
  - - ~>
33
32
  - !ruby/object:Gem::Version
34
33
  version: 1.9.14
34
+ requirement: *id002
35
+ prerelease: false
35
36
  type: :runtime
36
- version_requirements: *id002
37
37
  - !ruby/object:Gem::Dependency
38
38
  name: gravitext-xmlprod
39
- prerelease: false
40
- requirement: &id003 !ruby/object:Gem::Requirement
39
+ version_requirements: &id003 !ruby/object:Gem::Requirement
41
40
  none: false
42
41
  requirements:
43
42
  - - ~>
44
43
  - !ruby/object:Gem::Version
45
- version: 1.4.0
44
+ version: 1.5.b
45
+ requirement: *id003
46
+ prerelease: false
46
47
  type: :runtime
47
- version_requirements: *id003
48
48
  - !ruby/object:Gem::Dependency
49
49
  name: minitest
50
- prerelease: false
51
- requirement: &id004 !ruby/object:Gem::Requirement
50
+ version_requirements: &id004 !ruby/object:Gem::Requirement
52
51
  none: false
53
52
  requirements:
54
53
  - - ~>
55
54
  - !ruby/object:Gem::Version
56
55
  version: "2.3"
56
+ requirement: *id004
57
+ prerelease: false
57
58
  type: :development
58
- version_requirements: *id004
59
59
  - !ruby/object:Gem::Dependency
60
60
  name: rjack-logback
61
- prerelease: false
62
- requirement: &id005 !ruby/object:Gem::Requirement
61
+ version_requirements: &id005 !ruby/object:Gem::Requirement
63
62
  none: false
64
63
  requirements:
65
64
  - - ~>
66
65
  - !ruby/object:Gem::Version
67
66
  version: "1.0"
67
+ requirement: *id005
68
+ prerelease: false
68
69
  type: :development
69
- version_requirements: *id005
70
70
  - !ruby/object:Gem::Dependency
71
71
  name: rjack-tarpit
72
- prerelease: false
73
- requirement: &id006 !ruby/object:Gem::Requirement
72
+ version_requirements: &id006 !ruby/object:Gem::Requirement
74
73
  none: false
75
74
  requirements:
76
75
  - - ~>
77
76
  - !ruby/object:Gem::Version
78
- version: 1.4.0
77
+ version: "2.0"
78
+ requirement: *id006
79
+ prerelease: false
79
80
  type: :development
80
- version_requirements: *id006
81
- description: |-
82
- Iudex is a general purpose web crawler and feed processor in
83
- ruby/java. The iudex-html gem contains filters for HTML parsing,
84
- filtering, exracting text and links.
81
+ description: Iudex is a general purpose web crawler and feed processor in ruby/java. The iudex-html gem contains filters for HTML parsing, filtering, extracting text and links.
85
82
  email:
86
83
  - dek-oss@gravitext.com
87
- executables: []
88
-
84
+ executables:
85
+ - iudex-html-clean
86
+ - iudex-html-perftest
89
87
  extensions: []
90
88
 
91
89
  extra_rdoc_files:
92
- - Manifest.txt
93
90
  - History.rdoc
94
91
  - README.rdoc
95
92
  files:
@@ -98,6 +95,8 @@ files:
98
95
  - README.rdoc
99
96
  - Rakefile
100
97
  - pom.xml
98
+ - bin/iudex-html-clean
99
+ - bin/iudex-html-perftest
101
100
  - build/HTML.java.erb
102
101
  - build/attributes
103
102
  - build/java_generate.rb
@@ -106,6 +105,7 @@ files:
106
105
  - lib/iudex-html.rb
107
106
  - lib/iudex-html/factory_helper.rb
108
107
  - test/html_test_helper.rb
108
+ - test/reddit.xhtml
109
109
  - test/setup.rb
110
110
  - test/test_characters_normalizer.rb
111
111
  - test/test_extract_filter.rb
@@ -114,10 +114,10 @@ files:
114
114
  - test/test_other_filters.rb
115
115
  - test/test_other_tree_filters.rb
116
116
  - test/test_parse_filter.rb
117
+ - test/test_stax_parser.rb
117
118
  - test/test_tree_walker.rb
118
119
  - test/test_word_counters.rb
119
- - lib/iudex-html/iudex-html-1.1.0.jar
120
- - .gemtest
120
+ - lib/iudex-html/iudex-html-1.2.b.0.jar
121
121
  homepage: http://github.com/dekellum/iudex
122
122
  licenses: []
123
123
 
@@ -132,27 +132,22 @@ required_ruby_version: !ruby/object:Gem::Requirement
132
132
  requirements:
133
133
  - - ">="
134
134
  - !ruby/object:Gem::Version
135
+ hash: 2
136
+ segments:
137
+ - 0
135
138
  version: "0"
136
139
  required_rubygems_version: !ruby/object:Gem::Requirement
137
140
  none: false
138
141
  requirements:
139
- - - ">="
142
+ - - ">"
140
143
  - !ruby/object:Gem::Version
141
- version: "0"
144
+ version: 1.3.1
142
145
  requirements: []
143
146
 
144
- rubyforge_project: iudex-html
145
- rubygems_version: 1.8.9
147
+ rubyforge_project:
148
+ rubygems_version: 1.8.15
146
149
  signing_key:
147
150
  specification_version: 3
148
- summary: Iudex is a general purpose web crawler and feed processor in ruby/java
149
- test_files:
150
- - test/test_factory_helper.rb
151
- - test/test_other_filters.rb
152
- - test/test_characters_normalizer.rb
153
- - test/test_word_counters.rb
154
- - test/test_extract_filter.rb
155
- - test/test_tree_walker.rb
156
- - test/test_other_tree_filters.rb
157
- - test/test_html_parser.rb
158
- - test/test_parse_filter.rb
151
+ summary: Iudex is a general purpose web crawler and feed processor in ruby/java.
152
+ test_files: []
153
+
data/.gemtest DELETED
File without changes