iudex-html 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +2 -0
- data/Manifest.txt +24 -0
- data/README.rdoc +25 -0
- data/Rakefile +53 -0
- data/build/HTML.java.erb +91 -0
- data/build/attributes +82 -0
- data/build/java_generate.rb +139 -0
- data/build/tags +130 -0
- data/lib/iudex-html.rb +56 -0
- data/lib/iudex-html/base.rb +21 -0
- data/lib/iudex-html/factory_helper.rb +95 -0
- data/lib/iudex-html/iudex-html-1.0.0.jar +0 -0
- data/pom.xml +51 -0
- data/test/html_test_helper.rb +100 -0
- data/test/setup.rb +38 -0
- data/test/test_characters_normalizer.rb +81 -0
- data/test/test_extract_filter.rb +165 -0
- data/test/test_factory_helper.rb +51 -0
- data/test/test_html_parser.rb +128 -0
- data/test/test_other_filters.rb +51 -0
- data/test/test_other_tree_filters.rb +124 -0
- data/test/test_parse_filter.rb +72 -0
- data/test/test_tree_walker.rb +94 -0
- data/test/test_word_counters.rb +96 -0
- metadata +162 -0
data/lib/iudex-html.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
require 'gravitext-xmlprod'
|
19
|
+
require 'rjack-nekohtml'
|
20
|
+
|
21
|
+
require 'iudex-html/base.rb'
|
22
|
+
|
23
|
+
require 'java'
|
24
|
+
|
25
|
+
module Iudex
  module HTML
    # Load the bundled jar; its file name embeds this gem's VERSION.
    require "iudex-html/iudex-html-#{VERSION}.jar"

    import 'iudex.html.HTMLKeys'

    # Java filter classes from iudex.html.filters
    module Filters
      [ 'iudex.html.filters.ExtractFilter',
        'iudex.html.filters.HTMLParseFilter',
        'iudex.html.filters.HTMLTreeFilter',
        'iudex.html.filters.HTMLWriteFilter',
        'iudex.html.filters.TitleExtractor' ].each do |java_class|
        import java_class
      end
    end

    # Java classes from iudex.html.tree
    module Tree
      [ 'iudex.html.tree.HTMLTreeKeys',
        'iudex.html.tree.TreeFilterChain' ].each do |java_class|
        import java_class
      end

      # Java tree filter classes from iudex.html.tree.filters
      module Filters
        [ 'iudex.html.tree.filters.AttributeCleaner',
          'iudex.html.tree.filters.CSSDisplayFilter',
          'iudex.html.tree.filters.CharactersNormalizer',
          'iudex.html.tree.filters.EmptyInlineRemover',
          'iudex.html.tree.filters.MetaSkipFilter',
          'iudex.html.tree.filters.WordCounter',
          'iudex.html.tree.filters.WordyCounter',
          'iudex.html.tree.filters.XmpToPreConverter' ].each do |java_class|
          import java_class
        end
      end
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
  module HTML
    # Gem version; also interpolated into the bundled jar file name.
    # Frozen so the shared constant cannot be mutated in place.
    VERSION = '1.0.0'.freeze
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
require 'iudex-filter/key_helper'
|
19
|
+
|
20
|
+
module Iudex
|
21
|
+
module HTML
|
22
|
+
module Filters
|
23
|
+
module FactoryHelper
|
24
|
+
include Iudex::Core # ContentSource
|
25
|
+
include Iudex::HTML::Tree
|
26
|
+
include Iudex::HTML::Tree::Filters
|
27
|
+
|
28
|
+
# Create html parse and clean filters.
#
# src_key::  key of the field to parse (anything responding to to_k)
# tree_key:: key receiving the parsed tree; when omitted it is
#            derived as "<src_key>_tree"
#
# Returns an Array of two filters: the parse filter, followed by an
# HTMLTreeFilter applying html_tree_filters in DEPTH_FIRST order.
#
# Expected usage:
#   PAGE: html_clean_filters( :source )
#   FEED: html_clean_filters( :title )
#   FEED: html_clean_filters( :summary )
#   FEED: html_clean_filters( :content )
#
def html_clean_filters( src_key, tree_key = nil )

  tree_key ||= "#{src_key}_tree".to_sym
  src_key  = src_key.to_k
  tree_key = tree_key.to_k

  parser = html_parse_filter( src_key, tree_key )

  #FIXME: PAGE: filters << TitleExtractor.new, or after?

  # FIXME: if src is text, last filter
  # filters << TextCtrlWSFilter.new( ContentKeys::TITLE )

  cleaners = TreeFilterChain.new( html_tree_filters )
  walker   = HTMLTreeFilter.new( tree_key, cleaners,
                                 HTMLTreeFilter::Order::DEPTH_FIRST )

  #FIXME: First block extractor back to text key?

  [ parser, walker ]
end
|
57
|
+
|
58
|
+
# Returns a new Array holding one fresh instance of each default tree
# filter, in the order they are listed below.
def html_tree_filters
  filter_classes = [
    XmpToPreConverter,    # Before CharactersNormalizer
    CSSDisplayFilter,     # Before AttributeCleaner
    AttributeCleaner,
    CharactersNormalizer,
    EmptyInlineRemover,   # Depth
    WordCounter,          # Depth; only for count deps?
    WordyCounter          # Depth; only with cleaners/simhash?
  ]

  filter_classes.map { |filter_class| filter_class.new }
end
|
67
|
+
|
68
|
+
# Create the HTMLParseFilter reading src_key and writing tree_key.
#
# src_key::  key of the field to parse (anything responding to to_k)
# tree_key:: key receiving the parsed tree; when omitted it is
#            derived as "<src_key>_tree"
#
# When the source key's value type is ContentSource, the
# three-argument constructor is used with nil as its middle argument;
# otherwise the two-argument constructor is used.
def html_parse_filter( src_key, tree_key = nil )

  tree_key ||= "#{src_key}_tree".to_sym
  src_key  = src_key.to_k
  tree_key = tree_key.to_k

  from_source = ( src_key.value_type == ContentSource.java_class )
  return HTMLParseFilter.new( src_key, nil, tree_key ) if from_source

  HTMLParseFilter.new( src_key, tree_key )
end
|
79
|
+
|
80
|
+
# Create an HTMLWriteFilter.
#
# With two keys, key1 is the tree key and key2 the output key. With
# only key1, the tree key is derived as "<key1>_tree" and key1 itself
# is the output key.
#
# Expected usage:
#   FEED: html_write_filter( :summary )
def html_write_filter( key1, key2 = nil )

  if key2
    tree_key = key1
    out_key  = key2
  else
    tree_key = "#{key1}_tree".to_sym
    out_key  = key1
  end

  HTMLWriteFilter.new( tree_key.to_k, out_key.to_k )
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
Binary file
|
data/pom.xml
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
2
|
+
<modelVersion>4.0.0</modelVersion>
|
3
|
+
<groupId>iudex</groupId>
|
4
|
+
<artifactId>iudex-html</artifactId>
|
5
|
+
<packaging>jar</packaging>
|
6
|
+
<version>1.0.0</version>
|
7
|
+
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
|
+
|
9
|
+
<parent>
|
10
|
+
<groupId>iudex</groupId>
|
11
|
+
<artifactId>iudex-parent</artifactId>
|
12
|
+
<version>1.0</version>
|
13
|
+
<relativePath>..</relativePath>
|
14
|
+
</parent>
|
15
|
+
|
16
|
+
<dependencies>
|
17
|
+
|
18
|
+
<dependency>
|
19
|
+
<groupId>iudex</groupId>
|
20
|
+
<artifactId>iudex-core</artifactId>
|
21
|
+
<version>[1.0,1.1)</version>
|
22
|
+
</dependency>
|
23
|
+
|
24
|
+
<dependency>
|
25
|
+
<groupId>com.gravitext</groupId>
|
26
|
+
<artifactId>gravitext-xmlprod</artifactId>
|
27
|
+
<version>[1.4,1.5)</version>
|
28
|
+
</dependency>
|
29
|
+
|
30
|
+
<dependency>
|
31
|
+
<groupId>net.sourceforge.nekohtml</groupId>
|
32
|
+
<artifactId>nekohtml</artifactId>
|
33
|
+
<version>1.9.14</version>
|
34
|
+
</dependency>
|
35
|
+
|
36
|
+
</dependencies>
|
37
|
+
|
38
|
+
<build>
|
39
|
+
<plugins>
|
40
|
+
<plugin>
|
41
|
+
<!-- Parent settings -->
|
42
|
+
<artifactId>maven-compiler-plugin</artifactId>
|
43
|
+
</plugin>
|
44
|
+
<plugin>
|
45
|
+
<!-- Parent settings -->
|
46
|
+
<artifactId>maven-source-plugin</artifactId>
|
47
|
+
</plugin>
|
48
|
+
</plugins>
|
49
|
+
</build>
|
50
|
+
|
51
|
+
</project>
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
require 'iudex-html/factory_helper'
|
19
|
+
|
20
|
+
require 'iudex-filter/key_helper'
|
21
|
+
|
22
|
+
module HTMLTestHelper
|
23
|
+
|
24
|
+
include Gravitext::HTMap
|
25
|
+
UniMap.define_accessors
|
26
|
+
|
27
|
+
include Iudex::Filter::Core
|
28
|
+
include Iudex::HTML::Filters::FactoryHelper
|
29
|
+
|
30
|
+
import 'com.gravitext.xml.tree.TreeUtils'
|
31
|
+
import 'com.gravitext.xml.producer.Indentor'
|
32
|
+
|
33
|
+
import 'iudex.html.HTML'
|
34
|
+
import 'iudex.html.HTMLUtils'
|
35
|
+
import 'iudex.html.tree.TreeFilterChain'
|
36
|
+
import 'iudex.html.tree.TreeWalker'
|
37
|
+
|
38
|
+
# Wrap html via source (using the given charset label) and return the
# result of HTMLUtils.parse on it.
def parse( html, charset = "UTF-8" )
  input = source( html, charset )
  HTMLUtils.parse( input )
end
|
41
|
+
|
42
|
+
# Wrap html via source, parse it with HTMLUtils.parseFragment and
# return the result passed through inner.
def parseFragment( html, charset = "UTF-8" )
  input = source( html, charset )
  fragment = HTMLUtils.parseFragment( input )
  inner( fragment )
end
|
45
|
+
|
46
|
+
# Return the sole child of tree when tree has exactly one child and
# that child is an element; otherwise return tree itself.
def inner( tree )
  kids = tree.children
  return tree unless kids.size == 1

  only = kids[0]
  only.element? ? only : tree
end
|
54
|
+
|
55
|
+
# Assert that root, serialized by TreeUtils.produceString with
# Indentor::COMPRESSED, equals html after compress.
def assert_doc( html, root )
  expected = compress( html )
  actual = TreeUtils.produceString( root, Indentor::COMPRESSED )
  assert_equal( expected, actual )
end
|
60
|
+
|
61
|
+
# Same as assert_fragment_ws, but html is first passed through
# compress.
def assert_fragment( html, root, remove_padding = false )
  expected = compress( html )
  assert_fragment_ws( expected, root, remove_padding )
end
|
64
|
+
|
65
|
+
# Assert that root, serialized by HTMLUtils.produceFragmentString
# with Indentor::COMPRESSED, equals html. When remove_padding is
# true, every run of '~' is first removed from html.
def assert_fragment_ws( html, root, remove_padding = false )
  expected = remove_padding ? html.gsub( /~+/, '' ) : html
  actual = HTMLUtils.produceFragmentString( root, Indentor::COMPRESSED )
  assert_equal( expected, actual )
end
|
70
|
+
|
71
|
+
# Parse html[ :in ] as a fragment, optionally apply filter to it by
# sending func to TreeWalker, then assert the tree matches
# html[ :out ] with '~' padding removed.
#
# Returns the value of the TreeWalker call, or nil when either filter
# or func is not given.
def assert_transform( html, filter = nil, func = :walk_depth_first )
  tree = parseFragment( html[ :in ] )

  action = nil
  if filter && func
    action = TreeWalker.send( func, filter, tree )
  end

  assert_fragment( html[ :out ], tree, true )
  action
end
|
77
|
+
|
78
|
+
# Return HTMLUtils.source for the bytes of html (after compress),
# labelled with charset.
def source( html, charset = "UTF-8" )
  bytes = compress( html ).to_java_bytes
  HTMLUtils.source( bytes, charset )
end
|
81
|
+
|
82
|
+
# Return a copy of html with each newline, and any whitespace
# directly following it, removed.
def compress( html )
  newline_and_indent = /\n\s*/
  html.gsub( newline_and_indent, '' )
end
|
85
|
+
|
86
|
+
# Build a FilterChain named "test" consisting of an html parse filter
# for :source followed by the given filters.
#
# filters:: a single filter or an Array of filters
# mode::    when :fragment, the parse filter's parse_as_fragment is
#           set to true
#
# A new Array is built for the chain, so an Array passed in by the
# caller is left unmodified.
def filter_chain( filters, mode = :whole )
  pf = html_parse_filter( :source )
  pf.parse_as_fragment = true if mode == :fragment

  FilterChain.new( "test", [ pf ] + Array( filters ) )
end
|
93
|
+
|
94
|
+
# Return a new UniMap whose source is HTMLUtils.source for the bytes
# of html, labelled with charset.
#
# html::    markup String; its bytes are passed on as-is (no compress)
# charset:: charset name passed to HTMLUtils.source (previously
#           ignored in favor of a hard-coded "UTF-8")
def content( html, charset = "UTF-8" )
  map = UniMap.new
  map.source = HTMLUtils::source( html.to_java_bytes, charset )
  map
end
|
99
|
+
|
100
|
+
end
|
data/test/setup.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
here = File.dirname( __FILE__ )

# Put the sibling lib directory first on the load path, at most once.
lib_dir = File.join( here, "..", "lib" )
unless $LOAD_PATH.include?( lib_dir )
  $LOAD_PATH.unshift( lib_dir )
end

# Logging goes to the console via stderr.
require 'rubygems'
require 'rjack-logback'
RJack::Logback.config_console( :stderr => true )

require 'minitest/unit'
require 'minitest/autorun'

# Shared helpers for the HTML tests.
require File.join( here, 'html_test_helper.rb' )
|
32
|
+
|
33
|
+
# Make test output logging compatible: no partial lines.
|
34
|
+
# class TestOut
|
35
|
+
# def print( *a ); $stdout.puts( *a ); end
|
36
|
+
# def puts( *a ); $stdout.puts( *a ); end
|
37
|
+
# end
|
38
|
+
# MiniTest::Unit.output = TestOut.new
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
class TestCharactersNormalizer < MiniTest::Unit::TestCase
|
23
|
+
include HTMLTestHelper
|
24
|
+
|
25
|
+
include Iudex::HTML
|
26
|
+
include Iudex::HTML::Filters
|
27
|
+
include Iudex::HTML::Tree
|
28
|
+
include Iudex::HTML::Tree::Filters
|
29
|
+
include Iudex::Core
|
30
|
+
|
31
|
+
Order = HTMLTreeFilter::Order
|
32
|
+
|
33
|
+
# Normalization of text directly inside a block element.
def test_simple_block
  # Note: '~' is padding removed in compare
  assert_normalize( :in  => "<p> x y </p>",
                    :out => "<p>~x ~y~</p>" )
end
|
41
|
+
|
42
|
+
# Normalization of text directly inside an inline element.
def test_simple_inline
  assert_normalize( :in  => "<i> x y </i>",
                    :out => "<i> x ~y </i>" )
end
|
48
|
+
|
49
|
+
# Normalization of a block element containing text and an inline
# element.
def test_mixed_inline
  assert_normalize( :in  => "<p> x y <i>z </i> </p>",
                    :out => "<p>~x ~y <i>z </i>~</p>" )
end
|
55
|
+
|
56
|
+
# Normalization of whitespace-only content in and between blocks.
def test_empty
  assert_normalize( :in  => "<div><p> </p> <p>foo</p> </div>",
                    :out => "<div><p/>~~~~~~<p>foo</p>~</div>" )
end
|
62
|
+
|
63
|
+
# Normalization around and inside a <pre> element, with a NUL
# character and a newline in the input.
def test_pre
  assert_normalize( :in  => "<div> x <pre> \0x\n <a> y </a></pre> </div>",
                    :out => "<div>~x~<pre> ~ x\n <a> y </a></pre>~</div>" )
end
|
69
|
+
|
70
|
+
# For each traversal order (BREADTH_FIRST, then DEPTH_FIRST): parse
# html[ :in ] as a fragment, run a CharactersNormalizer over the
# tree, and assert the result matches html[ :out ] with '~' padding
# removed.
def assert_normalize( html )
  orders = [ Order::BREADTH_FIRST, Order::DEPTH_FIRST ]

  orders.each do |order|
    record = content( html[ :in ] )

    normalizers = TreeFilterChain.new( [ CharactersNormalizer.new ] )
    tree_filter = HTMLTreeFilter.new( :source_tree.to_k, normalizers, order )
    chain = filter_chain( tree_filter, :fragment )

    assert( chain.filter( record ) )

    result = inner( record.source_tree )
    assert_fragment_ws( html[ :out ], result, true )
  end
end
|
80
|
+
|
81
|
+
end
|