iudex-html 1.1.0-java → 1.2.b.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/History.rdoc CHANGED
@@ -1,3 +1,14 @@
1
+ === 1.2.b.0 (2012-3-4)
2
+ * Upgrade to gravitext-xmlprod ~> 1.5.b
3
+ * Fix duplicate attributes from Neko, last value wins.
4
+ * Use Element.to_xml( :implied_ns ) from xmlprod 1.5.b.2 in tests.
5
+ * Add StAX-based HTMLStAXConsumer for faster parsing of pre-cleaned,
6
+ trusted XHTML. Add Tree.parse ruby utility. Unrecognized tags are
7
+ constructed as BANNED.
8
+ * Add iudex-html-perftest (neko vs stax)
9
+ * Add basic iudex-html-clean utility for ad hoc testing
10
+ * Upgrade to tarpit ~> 2.0, bundler Gemfile, gemspec (dev)
11
+
1
12
  === 1.1.0 (2011-11-13)
2
13
  * Update to iudex-core ~> 1.1.0
3
14
  * Add NekoHTMLParser charset expansion and ContentSource encoding
data/Manifest.txt CHANGED
@@ -3,6 +3,8 @@ Manifest.txt
3
3
  README.rdoc
4
4
  Rakefile
5
5
  pom.xml
6
+ bin/iudex-html-clean
7
+ bin/iudex-html-perftest
6
8
  build/HTML.java.erb
7
9
  build/attributes
8
10
  build/java_generate.rb
@@ -11,6 +13,7 @@ lib/iudex-html/base.rb
11
13
  lib/iudex-html.rb
12
14
  lib/iudex-html/factory_helper.rb
13
15
  test/html_test_helper.rb
16
+ test/reddit.xhtml
14
17
  test/setup.rb
15
18
  test/test_characters_normalizer.rb
16
19
  test/test_extract_filter.rb
@@ -19,6 +22,7 @@ test/test_html_parser.rb
19
22
  test/test_other_filters.rb
20
23
  test/test_other_tree_filters.rb
21
24
  test/test_parse_filter.rb
25
+ test/test_stax_parser.rb
22
26
  test/test_tree_walker.rb
23
27
  test/test_word_counters.rb
24
- lib/iudex-html/iudex-html-1.1.0.jar
28
+ lib/iudex-html/iudex-html-1.2.b.0.jar
data/README.rdoc CHANGED
@@ -10,7 +10,7 @@ filtering, extracting text and links.
10
10
 
11
11
  == License
12
12
 
13
- Copyright (c) 2010-2011 David Kellum
13
+ Copyright (c) 2008-2012 David Kellum
14
14
 
15
15
  Licensed under the Apache License, Version 2.0 (the "License"); you
16
16
  may not use this file except in compliance with the License. You
data/Rakefile CHANGED
@@ -1,42 +1,10 @@
1
1
  # -*- ruby -*-
2
2
 
3
- $LOAD_PATH << './lib'
4
- require 'iudex-html/base'
5
-
6
3
  require 'rubygems'
7
- gem 'rjack-tarpit', '~> 1.4'
4
+ require 'bundler/setup'
8
5
  require 'rjack-tarpit'
9
6
 
10
- t = RJack::TarPit.new( 'iudex-html',
11
- Iudex::HTML::VERSION,
12
- :no_assembly, :java_platform )
13
-
14
- t.specify do |h|
15
- h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
- h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
17
- [ 'rjack-nekohtml', '~> 1.9.14' ],
18
- [ 'gravitext-xmlprod', '~> 1.4.0' ] ]
19
-
20
- h.testlib = :minitest
21
- h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
22
- [ 'rjack-logback', '~> 1.0' ] ]
23
- end
24
-
25
- file 'Manifest.txt' => [ 'pom.xml' ]
26
-
27
- task :check_pom_version do
28
- t.test_line_match( 'pom.xml', /<version>/, /#{t.version}/ )
29
- end
30
- task :check_history_version do
31
- t.test_line_match( 'History.rdoc', /^==/, / #{t.version} / )
32
- end
33
- task :check_history_date do
34
- t.test_line_match( 'History.rdoc', /^==/, /\([0-9\-]+\)$/ )
35
- end
36
-
37
- task :gem => [ :check_pom_version, :check_history_version ]
38
- task :tag => [ :check_pom_version, :check_history_version, :check_history_date ]
39
- task :push => [ :check_history_date ]
7
+ RJack::TarPit.new( 'iudex-html' ).define_tasks
40
8
 
41
9
  file 'target/.tarpit' => [ 'src/main/java/iudex/html/HTML.java' ]
42
10
 
@@ -49,5 +17,3 @@ end
49
17
  task :clean do
50
18
  rm_f 'src/main/java/iudex/html/HTML.java'
51
19
  end
52
-
53
- t.define_tasks
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2012 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+
24
+ require 'iudex-html'
25
+ require 'iudex-filter/key_helper'
26
+
27
+ require 'gravitext-xmlprod/extensions'
28
+
29
+ require 'java'
30
+
31
+ class HTMLCleaner
32
+ include Iudex::HTML
33
+ include Iudex::HTML::Tree
34
+ include Iudex::HTML::Filters
35
+ include Iudex::HTML::Tree::Filters
36
+
37
+ import 'iudex.html.HTMLUtils'
38
+ import 'iudex.html.tree.TreeWalker'
39
+
40
+ def run( input = ARGV.first )
41
+ source = HTMLUtils::source( IO.read( input ).to_java_bytes, "UTF-8" )
42
+ tree = HTMLUtils::parse( source )
43
+
44
+ tfc = TreeFilterChain.new( [ XmpToPreConverter.new,
45
+ CSSDisplayFilter.new,
46
+ AttributeCleaner.new,
47
+ MojiBakeCleaner.new,
48
+ CharactersNormalizer.new,
49
+ EmptyInlineRemover.new ] )
50
+
51
+ TreeWalker.walk_depth_first( tfc, tree )
52
+
53
+ puts tree.to_xml
54
+ end
55
+
56
+ end
57
+
58
+ HTMLCleaner.new.run
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env jruby
2
+ # -*- ruby -*-
3
+ #--
4
+ # Copyright (c) 2012 David Kellum
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+ #++
18
+
19
+ $LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
20
+
21
+ require 'rubygems'
22
+ require 'rjack-logback'
23
+
24
+ require 'iudex-html'
25
+
26
+ require 'gravitext-util'
27
+ require 'gravitext-util/perftest'
28
+
29
+ require 'java'
30
+
31
+ class HTMLPerfTest
32
+ include Gravitext
33
+ include Gravitext::Concurrent
34
+
35
+ import 'iudex.html.tree.HTMLStAXUtils'
36
+ import 'iudex.html.HTMLUtils'
37
+
38
+ def run
39
+ PerfTest::Harness.new( create_tests ).execute
40
+ end
41
+
42
+ def create_tests
43
+ input = ARGV.first ||
44
+ File.join( File.dirname( __FILE__ ), '../test/reddit.xhtml' )
45
+ html = IO.read( input )
46
+
47
+ [ BlockTestFactory.new( 'neko' ) do
48
+ source = HTMLUtils::source( html.to_java_bytes, 'UTF-8' )
49
+ HTMLUtils::parse( source ).children.length
50
+ end,
51
+ BlockTestFactory.new( 'stax' ) do
52
+ Iudex::HTML::Tree.parse( html ).children.length
53
+ end
54
+ ]
55
+ end
56
+
57
+ end
58
+
59
+ HTMLPerfTest.new.run
data/build/HTML.java.erb CHANGED
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2010-2011 David Kellum
2
+ * Copyright (c) 2008-2012 David Kellum
3
3
  *
4
4
  * Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  * may not use this file except in compliance with the License. You may
data/build/attributes CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Attributes
2
2
  #
3
- # Copyright (c) 2010-2011 David Kellum
3
+ # Copyright (c) 2008-2012 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  # -*- ruby -*-
3
3
 
4
4
  #--
5
- # Copyright (c) 2010-2011 David Kellum
5
+ # Copyright (c) 2008-2012 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
data/build/tags CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Tags
2
2
  #
3
- # Copyright (c) 2010-2011 David Kellum
3
+ # Copyright (c) 2008-2012 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.1.0'
19
+ VERSION = '1.2.b.0'
20
20
  end
21
21
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
data/lib/iudex-html.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,9 +16,10 @@
16
16
 
17
17
  require 'iudex-core'
18
18
  require 'gravitext-xmlprod'
19
+ require 'gravitext-xmlprod/extensions'
19
20
  require 'rjack-nekohtml'
20
21
 
21
- require 'iudex-html/base.rb'
22
+ require 'iudex-html/base'
22
23
 
23
24
  require 'java'
24
25
 
@@ -39,6 +40,15 @@ module Iudex
39
40
  module Tree
40
41
  import 'iudex.html.tree.HTMLTreeKeys'
41
42
  import 'iudex.html.tree.TreeFilterChain'
43
+ import 'iudex.html.tree.HTMLStAXConsumer'
44
+ import 'javax.xml.stream.XMLStreamException'
45
+
46
+ # Parse the input String using HTMLStAXConsumer, HTMLTag.
47
+ # Raises XMLStreamException on parse error
48
+ def self.parse( input )
49
+ Gravitext::XMLProd::XMLHelper.
50
+ stax_parse_string( input, HTMLStAXConsumer.new )
51
+ end
42
52
 
43
53
  module Filters
44
54
  import 'iudex.html.tree.filters.AttributeCleaner'
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.1.0</version>
6
+ <version>1.2.b.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.1</version>
12
+ <version>1.2.b.0</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,13 +18,13 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.1,1.2)</version>
21
+ <version>[1.2,1.2.9999)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
25
25
  <groupId>com.gravitext</groupId>
26
26
  <artifactId>gravitext-xmlprod</artifactId>
27
- <version>[1.4,1.5)</version>
27
+ <version>[1.5,1.5.9999)</version>
28
28
  </dependency>
29
29
 
30
30
  <dependency>
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2010-2011 David Kellum
2
+ # Copyright (c) 2008-2012 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -19,16 +19,19 @@ require 'iudex-html/factory_helper'
19
19
 
20
20
  require 'iudex-filter/key_helper'
21
21
 
22
+ require 'gravitext-xmlprod/extensions'
23
+
22
24
  module HTMLTestHelper
23
25
 
24
26
  include Gravitext::HTMap
25
27
  UniMap.define_accessors
26
28
 
29
+ include Gravitext::XMLProd
30
+
27
31
  include Iudex::Filter::Core
28
32
  include Iudex::HTML::Filters::FactoryHelper
29
33
 
30
34
  import 'com.gravitext.xml.tree.TreeUtils'
31
- import 'com.gravitext.xml.producer.Indentor'
32
35
 
33
36
  import 'iudex.html.HTML'
34
37
  import 'iudex.html.HTMLUtils'
@@ -54,8 +57,7 @@ module HTMLTestHelper
54
57
 
55
58
  def assert_doc( html, root )
56
59
  html = compress( html )
57
- assert_equal( html,
58
- TreeUtils::produceString( root, Indentor::COMPRESSED ) )
60
+ assert_equal( html, root.to_xml )
59
61
  end
60
62
 
61
63
  def assert_fragment( html, root, remove_padding = false )
@@ -64,8 +66,7 @@ module HTMLTestHelper
64
66
 
65
67
  def assert_fragment_ws( html, root, remove_padding = false )
66
68
  html = html.gsub( /~+/, '' ) if remove_padding
67
- assert_equal( html,
68
- HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
69
+ assert_equal( html, root.to_xml( :implied_ns => [ HTML::NS_XHTML ] ) )
69
70
  end
70
71
 
71
72
  def assert_transform( html, filter = nil, func = :walk_depth_first )