iudex-html 1.1.0-java → 1.2.b.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +11 -0
- data/Manifest.txt +5 -1
- data/README.rdoc +1 -1
- data/Rakefile +2 -36
- data/bin/iudex-html-clean +58 -0
- data/bin/iudex-html-perftest +59 -0
- data/build/HTML.java.erb +1 -1
- data/build/attributes +1 -1
- data/build/java_generate.rb +1 -1
- data/build/tags +1 -1
- data/lib/iudex-html/base.rb +2 -2
- data/lib/iudex-html/factory_helper.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.1.0.jar → iudex-html-1.2.b.0.jar} +0 -0
- data/lib/iudex-html.rb +12 -2
- data/pom.xml +4 -4
- data/test/html_test_helper.rb +7 -6
- data/test/reddit.xhtml +557 -0
- data/test/setup.rb +18 -16
- data/test/test_characters_normalizer.rb +1 -1
- data/test/test_extract_filter.rb +1 -1
- data/test/test_factory_helper.rb +1 -1
- data/test/test_html_parser.rb +21 -1
- data/test/test_other_filters.rb +1 -1
- data/test/test_other_tree_filters.rb +1 -1
- data/test/test_parse_filter.rb +1 -1
- data/test/test_stax_parser.rb +78 -0
- data/test/test_tree_walker.rb +1 -1
- data/test/test_word_counters.rb +1 -1
- metadata +43 -48
- data/.gemtest +0 -0
data/History.rdoc
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 1.2.b.0 (2012-3-4)
|
2
|
+
* Upgrade to gravitext-xmlprod ~> 1.5.b
|
3
|
+
* Fix duplicate attributes from Neko, last value wins.
|
4
|
+
* Use Element.to_xml( :implied_ns ) from xmlprod 1.5.b.2 in tests.
|
5
|
+
* Add StAX-based HTMLStAXConsumer for faster parsing of pre-cleaned,
|
6
|
+
trusted XHTML. Add Tree.parse ruby utility. Unrecognized tags are
|
7
|
+
constructed as BANNED.
|
8
|
+
* Add iudex-html-perftest (neko vs stax)
|
9
|
+
* Add basic iudex-html-clean utility for ad hoc testing
|
10
|
+
* Upgrade to tarpit ~> 2.0, bundler Gemfile, gemspec (dev)
|
11
|
+
|
1
12
|
=== 1.1.0 (2011-11-13)
|
2
13
|
* Update to iudex-core ~> 1.1.0
|
3
14
|
* Add NekoHTMLParser charset expansion and ContentSource encoding
|
data/Manifest.txt
CHANGED
@@ -3,6 +3,8 @@ Manifest.txt
|
|
3
3
|
README.rdoc
|
4
4
|
Rakefile
|
5
5
|
pom.xml
|
6
|
+
bin/iudex-html-clean
|
7
|
+
bin/iudex-html-perftest
|
6
8
|
build/HTML.java.erb
|
7
9
|
build/attributes
|
8
10
|
build/java_generate.rb
|
@@ -11,6 +13,7 @@ lib/iudex-html/base.rb
|
|
11
13
|
lib/iudex-html.rb
|
12
14
|
lib/iudex-html/factory_helper.rb
|
13
15
|
test/html_test_helper.rb
|
16
|
+
test/reddit.xhtml
|
14
17
|
test/setup.rb
|
15
18
|
test/test_characters_normalizer.rb
|
16
19
|
test/test_extract_filter.rb
|
@@ -19,6 +22,7 @@ test/test_html_parser.rb
|
|
19
22
|
test/test_other_filters.rb
|
20
23
|
test/test_other_tree_filters.rb
|
21
24
|
test/test_parse_filter.rb
|
25
|
+
test/test_stax_parser.rb
|
22
26
|
test/test_tree_walker.rb
|
23
27
|
test/test_word_counters.rb
|
24
|
-
lib/iudex-html/iudex-html-1.1.0.jar
|
28
|
+
lib/iudex-html/iudex-html-1.2.b.0.jar
|
data/README.rdoc
CHANGED
@@ -10,7 +10,7 @@ filtering, exracting text and links.
|
|
10
10
|
|
11
11
|
== License
|
12
12
|
|
13
|
-
Copyright (c)
|
13
|
+
Copyright (c) 2008-2012 David Kellum
|
14
14
|
|
15
15
|
Licensed under the Apache License, Version 2.0 (the "License"); you
|
16
16
|
may not use this file except in compliance with the License. You
|
data/Rakefile
CHANGED
@@ -1,42 +1,10 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
|
3
|
-
$LOAD_PATH << './lib'
|
4
|
-
require 'iudex-html/base'
|
5
|
-
|
6
3
|
require 'rubygems'
|
7
|
-
|
4
|
+
require 'bundler/setup'
|
8
5
|
require 'rjack-tarpit'
|
9
6
|
|
10
|
-
|
11
|
-
Iudex::HTML::VERSION,
|
12
|
-
:no_assembly, :java_platform )
|
13
|
-
|
14
|
-
t.specify do |h|
|
15
|
-
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
16
|
-
h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
|
17
|
-
[ 'rjack-nekohtml', '~> 1.9.14' ],
|
18
|
-
[ 'gravitext-xmlprod', '~> 1.4.0' ] ]
|
19
|
-
|
20
|
-
h.testlib = :minitest
|
21
|
-
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
22
|
-
[ 'rjack-logback', '~> 1.0' ] ]
|
23
|
-
end
|
24
|
-
|
25
|
-
file 'Manifest.txt' => [ 'pom.xml' ]
|
26
|
-
|
27
|
-
task :check_pom_version do
|
28
|
-
t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
|
29
|
-
end
|
30
|
-
task :check_history_version do
|
31
|
-
t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
|
32
|
-
end
|
33
|
-
task :check_history_date do
|
34
|
-
t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
|
35
|
-
end
|
36
|
-
|
37
|
-
task :gem => [ :check_pom_version, :check_history_version ]
|
38
|
-
task :tag => [ :check_pom_version, :check_history_version, :check_history_date ]
|
39
|
-
task :push => [ :check_history_date ]
|
7
|
+
RJack::TarPit.new( 'iudex-html' ).define_tasks
|
40
8
|
|
41
9
|
file 'target/.tarpit' => [ 'src/main/java/iudex/html/HTML.java' ]
|
42
10
|
|
@@ -49,5 +17,3 @@ end
|
|
49
17
|
task :clean do
|
50
18
|
rm_f 'src/main/java/iudex/html/HTML.java'
|
51
19
|
end
|
52
|
-
|
53
|
-
t.define_tasks
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2012 David Kellum
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
|
+
# may not use this file except in compliance with the License. You may
|
8
|
+
# obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15
|
+
# implied. See the License for the specific language governing
|
16
|
+
# permissions and limitations under the License.
|
17
|
+
#++
|
18
|
+
|
19
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'rjack-logback'
|
23
|
+
|
24
|
+
require 'iudex-html'
|
25
|
+
require 'iudex-filter/key_helper'
|
26
|
+
|
27
|
+
require 'gravitext-xmlprod/extensions'
|
28
|
+
|
29
|
+
require 'java'
|
30
|
+
|
31
|
+
class HTMLCleaner
|
32
|
+
include Iudex::HTML
|
33
|
+
include Iudex::HTML::Tree
|
34
|
+
include Iudex::HTML::Filters
|
35
|
+
include Iudex::HTML::Tree::Filters
|
36
|
+
|
37
|
+
import 'iudex.html.HTMLUtils'
|
38
|
+
import 'iudex.html.tree.TreeWalker'
|
39
|
+
|
40
|
+
def run( input = ARGV.first )
|
41
|
+
source = HTMLUtils::source( IO.read( input ).to_java_bytes, "UTF-8" )
|
42
|
+
tree = HTMLUtils::parse( source )
|
43
|
+
|
44
|
+
tfc = TreeFilterChain.new( [ XmpToPreConverter.new,
|
45
|
+
CSSDisplayFilter.new,
|
46
|
+
AttributeCleaner.new,
|
47
|
+
MojiBakeCleaner.new,
|
48
|
+
CharactersNormalizer.new,
|
49
|
+
EmptyInlineRemover.new ] )
|
50
|
+
|
51
|
+
TreeWalker.walk_depth_first( tfc, tree )
|
52
|
+
|
53
|
+
puts tree.to_xml
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
HTMLCleaner.new.run
|
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
#--
|
4
|
+
# Copyright (c) 2012 David Kellum
|
5
|
+
#
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
7
|
+
# may not use this file except in compliance with the License. You may
|
8
|
+
# obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
15
|
+
# implied. See the License for the specific language governing
|
16
|
+
# permissions and limitations under the License.
|
17
|
+
#++
|
18
|
+
|
19
|
+
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
|
+
|
21
|
+
require 'rubygems'
|
22
|
+
require 'rjack-logback'
|
23
|
+
|
24
|
+
require 'iudex-html'
|
25
|
+
|
26
|
+
require 'gravitext-util'
|
27
|
+
require 'gravitext-util/perftest'
|
28
|
+
|
29
|
+
require 'java'
|
30
|
+
|
31
|
+
class HTMLPerfTest
|
32
|
+
include Gravitext
|
33
|
+
include Gravitext::Concurrent
|
34
|
+
|
35
|
+
import 'iudex.html.tree.HTMLStAXUtils'
|
36
|
+
import 'iudex.html.HTMLUtils'
|
37
|
+
|
38
|
+
def run
|
39
|
+
PerfTest::Harness.new( create_tests ).execute
|
40
|
+
end
|
41
|
+
|
42
|
+
def create_tests
|
43
|
+
input = ARGV.first ||
|
44
|
+
File.join( File.dirname( __FILE__ ), '../test/reddit.xhtml' )
|
45
|
+
html = IO.read( input )
|
46
|
+
|
47
|
+
[ BlockTestFactory.new( 'neko' ) do
|
48
|
+
source = HTMLUtils::source( html.to_java_bytes, 'UTF-8' )
|
49
|
+
HTMLUtils::parse( source ).children.length
|
50
|
+
end,
|
51
|
+
BlockTestFactory.new( 'stax' ) do
|
52
|
+
Iudex::HTML::Tree.parse( html ).children.length
|
53
|
+
end
|
54
|
+
]
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
58
|
+
|
59
|
+
HTMLPerfTest.new.run
|
data/build/HTML.java.erb
CHANGED
data/build/attributes
CHANGED
data/build/java_generate.rb
CHANGED
data/build/tags
CHANGED
data/lib/iudex-html/base.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c)
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You may
|
@@ -16,6 +16,6 @@
|
|
16
16
|
|
17
17
|
module Iudex
|
18
18
|
module HTML
|
19
|
-
VERSION = '1.1.0'
|
19
|
+
VERSION = '1.2.b.0'
|
20
20
|
end
|
21
21
|
end
|
Binary file
|
data/lib/iudex-html.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c)
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You may
|
@@ -16,9 +16,10 @@
|
|
16
16
|
|
17
17
|
require 'iudex-core'
|
18
18
|
require 'gravitext-xmlprod'
|
19
|
+
require 'gravitext-xmlprod/extensions'
|
19
20
|
require 'rjack-nekohtml'
|
20
21
|
|
21
|
-
require 'iudex-html/base
|
22
|
+
require 'iudex-html/base'
|
22
23
|
|
23
24
|
require 'java'
|
24
25
|
|
@@ -39,6 +40,15 @@ module Iudex
|
|
39
40
|
module Tree
|
40
41
|
import 'iudex.html.tree.HTMLTreeKeys'
|
41
42
|
import 'iudex.html.tree.TreeFilterChain'
|
43
|
+
import 'iudex.html.tree.HTMLStAXConsumer'
|
44
|
+
import 'javax.xml.stream.XMLStreamException'
|
45
|
+
|
46
|
+
# Parse the input String using HTMLStAXConsumer, HTMLTag.
|
47
|
+
# Raises XMLStreamException on parse error
|
48
|
+
def self.parse( input )
|
49
|
+
Gravitext::XMLProd::XMLHelper.
|
50
|
+
stax_parse_string( input, HTMLStAXConsumer.new )
|
51
|
+
end
|
42
52
|
|
43
53
|
module Filters
|
44
54
|
import 'iudex.html.tree.filters.AttributeCleaner'
|
data/pom.xml
CHANGED
@@ -3,13 +3,13 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-html</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.1.0</version>
|
6
|
+
<version>1.2.b.0</version>
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
8
|
|
9
9
|
<parent>
|
10
10
|
<groupId>iudex</groupId>
|
11
11
|
<artifactId>iudex-parent</artifactId>
|
12
|
-
<version>1.
|
12
|
+
<version>1.2.b.0</version>
|
13
13
|
<relativePath>..</relativePath>
|
14
14
|
</parent>
|
15
15
|
|
@@ -18,13 +18,13 @@
|
|
18
18
|
<dependency>
|
19
19
|
<groupId>iudex</groupId>
|
20
20
|
<artifactId>iudex-core</artifactId>
|
21
|
-
<version>[1.
|
21
|
+
<version>[1.2,1.2.9999)</version>
|
22
22
|
</dependency>
|
23
23
|
|
24
24
|
<dependency>
|
25
25
|
<groupId>com.gravitext</groupId>
|
26
26
|
<artifactId>gravitext-xmlprod</artifactId>
|
27
|
-
<version>[1.
|
27
|
+
<version>[1.5,1.5.9999)</version>
|
28
28
|
</dependency>
|
29
29
|
|
30
30
|
<dependency>
|
data/test/html_test_helper.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
#--
|
2
|
-
# Copyright (c)
|
2
|
+
# Copyright (c) 2008-2012 David Kellum
|
3
3
|
#
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
5
|
# may not use this file except in compliance with the License. You
|
@@ -19,16 +19,19 @@ require 'iudex-html/factory_helper'
|
|
19
19
|
|
20
20
|
require 'iudex-filter/key_helper'
|
21
21
|
|
22
|
+
require 'gravitext-xmlprod/extensions'
|
23
|
+
|
22
24
|
module HTMLTestHelper
|
23
25
|
|
24
26
|
include Gravitext::HTMap
|
25
27
|
UniMap.define_accessors
|
26
28
|
|
29
|
+
include Gravitext::XMLProd
|
30
|
+
|
27
31
|
include Iudex::Filter::Core
|
28
32
|
include Iudex::HTML::Filters::FactoryHelper
|
29
33
|
|
30
34
|
import 'com.gravitext.xml.tree.TreeUtils'
|
31
|
-
import 'com.gravitext.xml.producer.Indentor'
|
32
35
|
|
33
36
|
import 'iudex.html.HTML'
|
34
37
|
import 'iudex.html.HTMLUtils'
|
@@ -54,8 +57,7 @@ module HTMLTestHelper
|
|
54
57
|
|
55
58
|
def assert_doc( html, root )
|
56
59
|
html = compress( html )
|
57
|
-
assert_equal( html,
|
58
|
-
TreeUtils::produceString( root, Indentor::COMPRESSED ) )
|
60
|
+
assert_equal( html, root.to_xml )
|
59
61
|
end
|
60
62
|
|
61
63
|
def assert_fragment( html, root, remove_padding = false )
|
@@ -64,8 +66,7 @@ module HTMLTestHelper
|
|
64
66
|
|
65
67
|
def assert_fragment_ws( html, root, remove_padding = false )
|
66
68
|
html = html.gsub( /~+/, '' ) if remove_padding
|
67
|
-
assert_equal( html,
|
68
|
-
HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
|
69
|
+
assert_equal( html, root.to_xml( :implied_ns => [ HTML::NS_XHTML ] ) )
|
69
70
|
end
|
70
71
|
|
71
72
|
def assert_transform( html, filter = nil, func = :walk_depth_first )
|