iudex-html 1.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +2 -0
- data/Manifest.txt +24 -0
- data/README.rdoc +25 -0
- data/Rakefile +53 -0
- data/build/HTML.java.erb +91 -0
- data/build/attributes +82 -0
- data/build/java_generate.rb +139 -0
- data/build/tags +130 -0
- data/lib/iudex-html.rb +56 -0
- data/lib/iudex-html/base.rb +21 -0
- data/lib/iudex-html/factory_helper.rb +95 -0
- data/lib/iudex-html/iudex-html-1.0.0.jar +0 -0
- data/pom.xml +51 -0
- data/test/html_test_helper.rb +100 -0
- data/test/setup.rb +38 -0
- data/test/test_characters_normalizer.rb +81 -0
- data/test/test_extract_filter.rb +165 -0
- data/test/test_factory_helper.rb +51 -0
- data/test/test_html_parser.rb +128 -0
- data/test/test_other_filters.rb +51 -0
- data/test/test_other_tree_filters.rb +124 -0
- data/test/test_parse_filter.rb +72 -0
- data/test/test_tree_walker.rb +94 -0
- data/test/test_word_counters.rb +96 -0
- metadata +162 -0
data/lib/iudex-html.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-core'
|
18
|
+
require 'gravitext-xmlprod'
|
19
|
+
require 'rjack-nekohtml'
|
20
|
+
|
21
|
+
require 'iudex-html/base.rb'
|
22
|
+
|
23
|
+
require 'java'
|
24
|
+
|
25
|
+
# Root namespace shared by the iudex gems.
module Iudex

  # HTML support: loads the jar shipped alongside this file and exposes
  # its Java classes as constants of the Ruby modules below.
  module HTML

    # The jar file name embeds VERSION (defined in iudex-html/base.rb,
    # which is required above).
    require File.join( 'iudex-html', "iudex-html-#{VERSION}.jar" )

    import 'iudex.html.HTMLKeys'

    # Classes imported from the Java package iudex.html.filters.
    module Filters
      import 'iudex.html.filters.ExtractFilter'
      import 'iudex.html.filters.HTMLParseFilter'
      import 'iudex.html.filters.HTMLTreeFilter'
      import 'iudex.html.filters.HTMLWriteFilter'
      import 'iudex.html.filters.TitleExtractor'
    end

    # Classes imported from the Java package iudex.html.tree.
    module Tree
      import 'iudex.html.tree.HTMLTreeKeys'
      import 'iudex.html.tree.TreeFilterChain'

      # Classes imported from the Java package iudex.html.tree.filters.
      module Filters
        import 'iudex.html.tree.filters.AttributeCleaner'
        import 'iudex.html.tree.filters.CSSDisplayFilter'
        import 'iudex.html.tree.filters.CharactersNormalizer'
        import 'iudex.html.tree.filters.EmptyInlineRemover'
        import 'iudex.html.tree.filters.MetaSkipFilter'
        import 'iudex.html.tree.filters.WordCounter'
        import 'iudex.html.tree.filters.WordyCounter'
        import 'iudex.html.tree.filters.XmpToPreConverter'
      end
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
module Iudex
  module HTML
    # Release version of the iudex-html gem. lib/iudex-html.rb interpolates
    # this value into the name of the jar it requires. Frozen so the shared
    # constant cannot be modified in place by any code that reads it.
    VERSION = '1.0.0'.freeze
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You may
|
6
|
+
# obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
require 'iudex-filter/key_helper'
|
19
|
+
|
20
|
+
module Iudex
  module HTML
    module Filters

      # Mixin of factory methods that build the HTML parse, tree and
      # write filters from content keys. Keys are converted with #to_k
      # (presumably provided by iudex-filter/key_helper, required above
      # -- confirm).
      module FactoryHelper
        include Iudex::Core # ContentSource
        include Iudex::HTML::Tree
        include Iudex::HTML::Tree::Filters

        # Create html parse and clean filters: a parse filter reading
        # src_key into tree_key, followed by an HTMLTreeFilter applying
        # html_tree_filters to tree_key in DEPTH_FIRST order.
        #
        # When tree_key is omitted it defaults to "<src_key>_tree".
        # Returns the two filters in an Array.
        #
        # Expected usage:
        # PAGE: html_clean_filters( :source )
        # FEED: html_clean_filters( :title )
        # FEED: html_clean_filters( :summary )
        # FEED: html_clean_filters( :content )
        #
        def html_clean_filters( src_key, tree_key = nil )
          tree_key ||= "#{src_key}_tree".to_sym
          src_key  = src_key.to_k
          tree_key = tree_key.to_k

          parser = html_parse_filter( src_key, tree_key )

          #FIXME: PAGE: filters << TitleExtractor.new, or after?

          # FIXME: if src is text, last filter
          # filters << TextCtrlWSFilter.new( ContentKeys::TITLE )

          chain  = TreeFilterChain.new( html_tree_filters )
          walker = HTMLTreeFilter.new( tree_key, chain,
                                       HTMLTreeFilter::Order::DEPTH_FIRST )

          #FIXME: First block extractor back to text key?

          [ parser, walker ]
        end

        # New instances of the default tree filters, in application
        # order. Ordering constraints and open questions:
        # * XmpToPreConverter: before CharactersNormalizer
        # * CSSDisplayFilter: before AttributeCleaner
        # * EmptyInlineRemover: Depth
        # * WordCounter: Depth; only for count deps?
        # * WordyCounter: Depth; only with cleaners/simhash?
        def html_tree_filters
          ordered = [ XmpToPreConverter,
                      CSSDisplayFilter,
                      AttributeCleaner,
                      CharactersNormalizer,
                      EmptyInlineRemover,
                      WordCounter,
                      WordyCounter ]

          ordered.map { |filter_class| filter_class.new }
        end

        # Create an HTMLParseFilter reading src_key and writing the
        # parsed tree to tree_key (default: "<src_key>_tree").
        #
        # When the source key's value type is ContentSource, the
        # three-argument constructor is used with nil in the middle
        # position; otherwise the two-argument form is used.
        def html_parse_filter( src_key, tree_key = nil )
          tree_key ||= "#{src_key}_tree".to_sym
          src_key  = src_key.to_k
          tree_key = tree_key.to_k

          unless src_key.value_type == ContentSource.java_class
            return HTMLParseFilter.new( src_key, tree_key )
          end

          HTMLParseFilter.new( src_key, nil, tree_key )
        end

        # Create an HTMLWriteFilter. With two keys they are taken as
        # ( tree_key, out_key ). With only key1, the tree is read from
        # "<key1>_tree" and the output is written to key1.
        #
        # Expected usage:
        # FEED: html_write_filter( :summary )
        def html_write_filter( key1, key2 = nil )
          if key2
            tree_key = key1
            out_key  = key2
          else
            tree_key = "#{key1}_tree".to_sym
            out_key  = key1
          end

          HTMLWriteFilter.new( tree_key.to_k, out_key.to_k )
        end

      end
    end
  end
end
|
Binary file
|
data/pom.xml
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of the jar built from this module -->
  <groupId>iudex</groupId>
  <artifactId>iudex-html</artifactId>
  <packaging>jar</packaging>
  <version>1.0.0</version>
  <name>Iudex HTML parsing/filtering and text extraction</name>

  <!-- Parent POM, resolved from the directory above this one -->
  <parent>
    <groupId>iudex</groupId>
    <artifactId>iudex-parent</artifactId>
    <version>1.0</version>
    <relativePath>..</relativePath>
  </parent>

  <dependencies>

    <!-- Range: 1.0 inclusive up to, but excluding, 1.1 -->
    <dependency>
      <groupId>iudex</groupId>
      <artifactId>iudex-core</artifactId>
      <version>[1.0,1.1)</version>
    </dependency>

    <!-- Range: 1.4 inclusive up to, but excluding, 1.5 -->
    <dependency>
      <groupId>com.gravitext</groupId>
      <artifactId>gravitext-xmlprod</artifactId>
      <version>[1.4,1.5)</version>
    </dependency>

    <!-- Pinned to an exact release -->
    <dependency>
      <groupId>net.sourceforge.nekohtml</groupId>
      <artifactId>nekohtml</artifactId>
      <version>1.9.14</version>
    </dependency>

  </dependencies>

  <build>
    <plugins>
      <plugin>
        <!-- Parent settings -->
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
      <plugin>
        <!-- Parent settings -->
        <artifactId>maven-source-plugin</artifactId>
      </plugin>
    </plugins>
  </build>

</project>
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
require 'iudex-html'
|
18
|
+
require 'iudex-html/factory_helper'
|
19
|
+
|
20
|
+
require 'iudex-filter/key_helper'
|
21
|
+
|
22
|
+
# Helpers shared by the iudex-html tests: parsing HTML strings into
# trees, asserting on serialized trees, and building filter chains that
# start with an HTML parse filter.
module HTMLTestHelper

  include Gravitext::HTMap
  UniMap.define_accessors

  include Iudex::Filter::Core
  include Iudex::HTML::Filters::FactoryHelper

  import 'com.gravitext.xml.tree.TreeUtils'
  import 'com.gravitext.xml.producer.Indentor'

  import 'iudex.html.HTML'
  import 'iudex.html.HTMLUtils'
  import 'iudex.html.tree.TreeFilterChain'
  import 'iudex.html.tree.TreeWalker'

  # Parse html (after #compress) as a whole document, declared as being
  # in the given charset.
  def parse( html, charset = "UTF-8" )
    HTMLUtils::parse( source( html, charset ) )
  end

  # Parse html (after #compress) as a fragment and unwrap the result
  # via #inner.
  def parseFragment( html, charset = "UTF-8" )
    inner( HTMLUtils::parseFragment( source( html, charset ) ) )
  end

  # Return the only child of tree when it has exactly one child and
  # that child is an element; otherwise return tree itself.
  def inner( tree )
    c = tree.children
    if ( c.size == 1 && c[0].element? )
      c[0]
    else
      tree
    end
  end

  # Assert that root, serialized as a document with COMPRESSED
  # indentation, equals the compressed form of html.
  def assert_doc( html, root )
    html = compress( html )
    assert_equal( html,
                  TreeUtils::produceString( root, Indentor::COMPRESSED ) )
  end

  # Same as #assert_fragment_ws, but html is compressed first.
  def assert_fragment( html, root, remove_padding = false )
    assert_fragment_ws( compress( html ), root, remove_padding )
  end

  # Assert that root, serialized as a fragment with COMPRESSED
  # indentation, equals html exactly. When remove_padding is true,
  # every run of '~' is stripped from html before comparing.
  def assert_fragment_ws( html, root, remove_padding = false )
    html = html.gsub( /~+/, '' ) if remove_padding
    assert_equal( html,
                  HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
  end

  # Parse html[ :in ] as a fragment, optionally walk it by calling the
  # TreeWalker method named by func with filter, then assert the tree
  # matches html[ :out ] with '~' padding removed.
  #
  # Returns the result of the walk, or nil when func or filter is
  # not given.
  def assert_transform( html, filter = nil, func = :walk_depth_first )
    tree = parseFragment( html[ :in ] )
    action = TreeWalker.send( func, filter, tree ) if func && filter
    assert_fragment( html[ :out ], tree, true )
    action
  end

  # Wrap the bytes of the compressed html in a source object declared
  # as being in charset.
  def source( html, charset = "UTF-8" )
    HTMLUtils::source( compress( html ).to_java_bytes, charset )
  end

  # Remove each newline together with any whitespace that follows it.
  def compress( html )
    html.gsub( /\n\s*/, '' )
  end

  # Build a FilterChain named "test" consisting of a parse filter for
  # :source followed by filters (a single filter or a collection).
  # With mode :fragment the parse filter is set to parse as a fragment.
  def filter_chain( filters, mode = :whole )
    pf = html_parse_filter( :source )
    pf.parse_as_fragment = true if mode == :fragment

    # Build a new array rather than unshift-ing, so an Array passed in
    # by the caller is left unmodified.
    filters = [ pf ] + Array( filters )
    FilterChain.new( "test", filters )
  end

  # Create a UniMap whose source is html (not compressed, unlike
  # #source) declared as being in charset.
  def content( html, charset = "UTF-8" )
    map = UniMap.new
    # Pass the caller's charset through instead of a fixed "UTF-8".
    map.source = HTMLUtils::source( html.to_java_bytes, charset )
    map
  end

end
|
data/test/setup.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2010-2011 David Kellum
|
3
|
+
#
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
5
|
+
# may not use this file except in compliance with the License. You
|
6
|
+
# may obtain a copy of the License at
|
7
|
+
#
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
#
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
13
|
+
# implied. See the License for the specific language governing
|
14
|
+
# permissions and limitations under the License.
|
15
|
+
#++
|
16
|
+
|
17
|
+
#### General test setup: LOAD_PATH, logging, console output ####
|
18
|
+
|
19
|
+
# Directory holding this setup file and the test helper.
test_dir = File.dirname( __FILE__ )

# Put the sibling lib directory first on the load path, once.
lib_dir = File.join( test_dir, "..", "lib" )
if !$LOAD_PATH.include?( lib_dir )
  $LOAD_PATH.unshift( lib_dir )
end

require 'rubygems'

# Send log output to the console on stderr.
require 'rjack-logback'
RJack::Logback.config_console( :stderr => true )

require 'minitest/unit'
require 'minitest/autorun'

helper_path = File.join( test_dir, 'html_test_helper.rb' )
require helper_path
|
32
|
+
|
33
|
+
# Make test output logging compatible: no partial lines.
|
34
|
+
# class TestOut
|
35
|
+
# def print( *a ); $stdout.puts( *a ); end
|
36
|
+
# def puts( *a ); $stdout.puts( *a ); end
|
37
|
+
# end
|
38
|
+
# MiniTest::Unit.output = TestOut.new
|
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
#.hashdot.profile += jruby-shortlived
|
3
|
+
|
4
|
+
#--
|
5
|
+
# Copyright (c) 2010-2011 David Kellum
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
8
|
+
# may not use this file except in compliance with the License. You
|
9
|
+
# may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
16
|
+
# implied. See the License for the specific language governing
|
17
|
+
# permissions and limitations under the License.
|
18
|
+
#++
|
19
|
+
|
20
|
+
require File.join( File.dirname( __FILE__ ), "setup" )
|
21
|
+
|
22
|
+
# Tests for CharactersNormalizer, run through an HTMLTreeFilter in both
# traversal orders.
#
# Note: '~' in the expected (:out) strings is padding removed in compare.
class TestCharactersNormalizer < MiniTest::Unit::TestCase
  include HTMLTestHelper

  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Core

  Order = HTMLTreeFilter::Order

  def test_simple_block
    assert_normalize( :in  => "<p> x y </p>",
                      :out => "<p>~x ~y~</p>" )
  end

  def test_simple_inline
    assert_normalize( :in  => "<i> x y </i>",
                      :out => "<i> x ~y </i>" )
  end

  def test_mixed_inline
    assert_normalize( :in  => "<p> x y <i>z </i> </p>",
                      :out => "<p>~x ~y <i>z </i>~</p>" )
  end

  def test_empty
    assert_normalize( :in  => "<div><p> </p> <p>foo</p> </div>",
                      :out => "<div><p/>~~~~~~<p>foo</p>~</div>" )
  end

  def test_pre
    assert_normalize( :in  => "<div> x <pre> \0x\n <a> y </a></pre> </div>",
                      :out => "<div>~x~<pre> ~ x\n <a> y </a></pre>~</div>" )
  end

  # Parse html[ :in ] as a fragment, apply a chain holding only a
  # CharactersNormalizer, and compare the resulting tree with
  # html[ :out ] after padding removal. Repeated for BREADTH_FIRST and
  # then DEPTH_FIRST order.
  def assert_normalize( html )
    [ Order::BREADTH_FIRST, Order::DEPTH_FIRST ].each do |order|
      doc         = content( html[ :in ] )
      normalizers = TreeFilterChain.new( [ CharactersNormalizer.new ] )
      tree_filter = HTMLTreeFilter.new( :source_tree.to_k, normalizers, order )
      chain       = filter_chain( tree_filter, :fragment )

      assert( chain.filter( doc ) )

      result = inner( doc.source_tree )
      assert_fragment_ws( html[ :out ], result, true )
    end
  end

end
|