iudex-html 1.1.0-java → 1.2.b.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.rdoc CHANGED
@@ -1,3 +1,14 @@
1
+ === 1.2.b.0 (2012-3-4)
2
+ * Upgrade to gravitext-xmlprod ~> 1.5.b
3
+ * Fix duplicate attributes from Neko, last value wins.
4
+ * Use Element.to_xml( :implied_ns ) from xmlprod 1.5.b.2 in tests.
5
+ * Add StAX-based HTMLStAXConsumer for faster parsing of pre-cleaned,
6
+ trusted XHTML. Add Tree.parse ruby utility. Unrecognized tags are
7
+ constructed as BANNED.
8
+ * Add iudex-html-perftest (neko vs stax)
9
+ * Add basic iudex-html-clean utility for ad hoc testing
10
+ * Upgrade to tarpit ~> 2.0, bundler Gemfile, gemspec (dev)
11
+
1
12
  === 1.1.0 (2011-11-13)
2
13
  * Update to iudex-core ~> 1.1.0
3
14
  * Add NekoHTMLParser charset expansion and ContentSource encoding
data/Manifest.txt CHANGED
@@ -3,6 +3,8 @@ Manifest.txt
3
3
  README.rdoc
4
4
  Rakefile
5
5
  pom.xml
6
+ bin/iudex-html-clean
7
+ bin/iudex-html-perftest
6
8
  build/HTML.java.erb
7
9
  build/attributes
8
10
  build/java_generate.rb
@@ -11,6 +13,7 @@ lib/iudex-html/base.rb
11
13
  lib/iudex-html.rb
12
14
  lib/iudex-html/factory_helper.rb
13
15
  test/html_test_helper.rb
16
+ test/reddit.xhtml
14
17
  test/setup.rb
15
18
  test/test_characters_normalizer.rb
16
19
  test/test_extract_filter.rb
@@ -19,6 +22,7 @@ test/test_html_parser.rb
19
22
  test/test_other_filters.rb
20
23
  test/test_other_tree_filters.rb
21
24
  test/test_parse_filter.rb
25
+ test/test_stax_parser.rb
22
26
  test/test_tree_walker.rb
23
27
  test/test_word_counters.rb
24
- lib/iudex-html/iudex-html-1.1.0.jar
28
+ lib/iudex-html/iudex-html-1.2.b.0.jar
data/README.rdoc CHANGED
@@ -10,7 +10,7 @@ filtering, exracting text and links.
10
10
 
11
11
  == License
12
12
 
13
- Copyright (c) 2010-2011 David Kellum
13
+ Copyright (c) 2008-2012 David Kellum
14
14
 
15
15
  Licensed under the Apache License, Version 2.0 (the "License"); you
16
16
  may not use this file except in compliance with the License. You
data/Rakefile CHANGED
@@ -1,42 +1,10 @@
1
1
  # -*- ruby -*-
2
2
 
3
- $LOAD_PATH << './lib'
4
- require 'iudex-html/base'
5
-
6
3
  require 'rubygems'
7
- gem 'rjack-tarpit', '~> 1.4'
4
+ require 'bundler/setup'
8
5
  require 'rjack-tarpit'
9
6
 
10
- t = RJack::TarPit.new( 'iudex-html',
11
- Iudex::HTML::VERSION,
12
- :no_assembly, :java_platform )
13
-
14
- t.specify do |h|
15
- h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
- h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
17
- [ 'rjack-nekohtml', '~> 1.9.14' ],
18
- [ 'gravitext-xmlprod', '~> 1.4.0' ] ]
19
-
20
- h.testlib = :minitest
21
- h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
22
- [ 'rjack-logback', '~> 1.0' ] ]
23
- end
24
-
25
- file 'Manifest.txt' => [ 'pom.xml' ]
26
-
27
- task :check_pom_version do
28
- t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
29
- end
30
- task :check_history_version do
31
- t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
32
- end
33
- task :check_history_date do
34
- t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
35
- end
36
-
37
- task :gem => [ :check_pom_version, :check_history_version ]
38
- task :tag => [ :check_pom_version, :check_history_version, :check_history_date ]
39
- task :push => [ :check_history_date ]
7
+ RJack::TarPit.new( 'iudex-html' ).define_tasks
40
8
 
41
9
  file 'target/.tarpit' => [ 'src/main/java/iudex/html/HTML.java' ]
42
10
 
@@ -49,5 +17,3 @@ end
49
17
  task :clean do
50
18
  rm_f 'src/main/java/iudex/html/HTML.java'
51
19
  end
52
-
53
- t.define_tasks
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2012 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+
24
+ require 'iudex-html'
25
+ require 'iudex-filter/key_helper'
26
+
27
+ require 'gravitext-xmlprod/extensions'
28
+
29
+ require 'java'
30
+
31
+ class HTMLCleaner
32
+ include Iudex::HTML
33
+ include Iudex::HTML::Tree
34
+ include Iudex::HTML::Filters
35
+ include Iudex::HTML::Tree::Filters
36
+
37
+ import 'iudex.html.HTMLUtils'
38
+ import 'iudex.html.tree.TreeWalker'
39
+
40
+ def run( input = ARGV.first )
41
+ source = HTMLUtils::source( IO.read( input ).to_java_bytes, "UTF-8" )
42
+ tree = HTMLUtils::parse( source )
43
+
44
+ tfc = TreeFilterChain.new( [ XmpToPreConverter.new,
45
+ CSSDisplayFilter.new,
46
+ AttributeCleaner.new,
47
+ MojiBakeCleaner.new,
48
+ CharactersNormalizer.new,
49
+ EmptyInlineRemover.new ] )
50
+
51
+ TreeWalker.walk_depth_first( tfc, tree )
52
+
53
+ puts tree.to_xml
54
+ end
55
+
56
+ end
57
+
58
+ HTMLCleaner.new.run
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2012 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+
24
+ require 'iudex-html'
25
+
26
+ require 'gravitext-util'
27
+ require 'gravitext-util/perftest'
28
+
29
+ require 'java'
30
+
31
+ class HTMLPerfTest
32
+ include Gravitext
33
+ include Gravitext::Concurrent
34
+
35
+ import 'iudex.html.tree.HTMLStAXUtils'
36
+ import 'iudex.html.HTMLUtils'
37
+
38
+ def run
39
+ PerfTest::Harness.new( create_tests ).execute
40
+ end
41
+
42
+ def create_tests
43
+ input = ARGV.first ||
44
+ File.join( File.dirname( __FILE__ ), '../test/reddit.xhtml' )
45
+ html = IO.read( input )
46
+
47
+ [ BlockTestFactory.new( 'neko' ) do
48
+ source = HTMLUtils::source( html.to_java_bytes, 'UTF-8' )
49
+ HTMLUtils::parse( source ).children.length
50
+ end,
51
+ BlockTestFactory.new( 'stax' ) do
52
+ Iudex::HTML::Tree.parse( html ).children.length
53
+ end
54
+ ]
55
+ end
56
+
57
+ end
58
+
59
+ HTMLPerfTest.new.run
data/build/HTML.java.erb CHANGED
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2010-2011 David Kellum
2
+ * Copyright (c) 2008-2012 David Kellum
3
3
  *
4
4
  * Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  * may not use this file except in compliance with the License. You may
data/build/attributes CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Attributes
2
2
  #
3
- # Copyright (c) 2010-2011 David Kellum
3
+ # Copyright (c) 2008-2012 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  # -*- ruby -*-
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
data/build/tags CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Tags
2
2
  #
3
- # Copyright (c) 2010-2011 David Kellum
3
+ # Copyright (c) 2008-2012 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.1.0'
19
+ VERSION = '1.2.b.0'
20
20
  end
21
21
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
data/lib/iudex-html.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,9 +16,10 @@
16
16
 
17
17
  require 'iudex-core'
18
18
  require 'gravitext-xmlprod'
19
+ require 'gravitext-xmlprod/extensions'
19
20
  require 'rjack-nekohtml'
20
21
 
21
- require 'iudex-html/base.rb'
22
+ require 'iudex-html/base'
22
23
 
23
24
  require 'java'
24
25
 
@@ -39,6 +40,15 @@ module Iudex
39
40
  module Tree
40
41
  import 'iudex.html.tree.HTMLTreeKeys'
41
42
  import 'iudex.html.tree.TreeFilterChain'
43
+ import 'iudex.html.tree.HTMLStAXConsumer'
44
+ import 'javax.xml.stream.XMLStreamException'
45
+
46
+ # Parse the input String using HTMLStAXConsumer, HTMLTag.
47
+ # Raises XMLStreamException on parse error
48
+ def self.parse( input )
49
+ Gravitext::XMLProd::XMLHelper.
50
+ stax_parse_string( input, HTMLStAXConsumer.new )
51
+ end
42
52
 
43
53
  module Filters
44
54
  import 'iudex.html.tree.filters.AttributeCleaner'
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.1.0</version>
6
+ <version>1.2.b.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.1</version>
12
+ <version>1.2.b.0</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,13 +18,13 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.1,1.2)</version>
21
+ <version>[1.2,1.2.9999)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
25
25
  <groupId>com.gravitext</groupId>
26
26
  <artifactId>gravitext-xmlprod</artifactId>
27
- <version>[1.4,1.5)</version>
27
+ <version>[1.5,1.5.9999)</version>
28
28
  </dependency>
29
29
 
30
30
  <dependency>
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -19,16 +19,19 @@ require 'iudex-html/factory_helper'
19
19
 
20
20
  require 'iudex-filter/key_helper'
21
21
 
22
+ require 'gravitext-xmlprod/extensions'
23
+
22
24
  module HTMLTestHelper
23
25
 
24
26
  include Gravitext::HTMap
25
27
  UniMap.define_accessors
26
28
 
29
+ include Gravitext::XMLProd
30
+
27
31
  include Iudex::Filter::Core
28
32
  include Iudex::HTML::Filters::FactoryHelper
29
33
 
30
34
  import 'com.gravitext.xml.tree.TreeUtils'
31
- import 'com.gravitext.xml.producer.Indentor'
32
35
 
33
36
  import 'iudex.html.HTML'
34
37
  import 'iudex.html.HTMLUtils'
@@ -54,8 +57,7 @@ module HTMLTestHelper
54
57
 
55
58
  def assert_doc( html, root )
56
59
  html = compress( html )
57
- assert_equal( html,
58
- TreeUtils::produceString( root, Indentor::COMPRESSED ) )
60
+ assert_equal( html, root.to_xml )
59
61
  end
60
62
 
61
63
  def assert_fragment( html, root, remove_padding = false )
@@ -64,8 +66,7 @@ module HTMLTestHelper
64
66
 
65
67
  def assert_fragment_ws( html, root, remove_padding = false )
66
68
  html = html.gsub( /~+/, '' ) if remove_padding
67
- assert_equal( html,
68
- HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
69
+ assert_equal( html, root.to_xml( :implied_ns => [ HTML::NS_XHTML ] ) )
69
70
  end
70
71
 
71
72
  def assert_transform( html, filter = nil, func = :walk_depth_first )