iudex-html 1.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/iudex-html.rb ADDED
@@ -0,0 +1,56 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-core'
18
+ require 'gravitext-xmlprod'
19
+ require 'rjack-nekohtml'
20
+
21
+ require 'iudex-html/base.rb'
22
+
23
+ require 'java'
24
+
25
+ module Iudex
26
+ module HTML # Ruby namespace exposing the Java classes of the iudex.html packages imported below
27
+ require "iudex-html/iudex-html-#{VERSION}.jar" # load the jar whose file name embeds VERSION (NOTE(review): VERSION presumably comes from 'iudex-html/base.rb' required earlier - confirm)
28
+
29
+ import 'iudex.html.HTMLKeys'
30
+
31
+ module Filters # classes from Java package iudex.html.filters
32
+ import 'iudex.html.filters.ExtractFilter'
33
+ import 'iudex.html.filters.HTMLParseFilter'
34
+ import 'iudex.html.filters.HTMLTreeFilter'
35
+ import 'iudex.html.filters.HTMLWriteFilter'
36
+ import 'iudex.html.filters.TitleExtractor'
37
+ end
38
+
39
+ module Tree # classes from Java package iudex.html.tree
40
+ import 'iudex.html.tree.HTMLTreeKeys'
41
+ import 'iudex.html.tree.TreeFilterChain'
42
+
43
+ module Filters # classes from Java package iudex.html.tree.filters
44
+ import 'iudex.html.tree.filters.AttributeCleaner'
45
+ import 'iudex.html.tree.filters.CSSDisplayFilter'
46
+ import 'iudex.html.tree.filters.CharactersNormalizer'
47
+ import 'iudex.html.tree.filters.EmptyInlineRemover'
48
+ import 'iudex.html.tree.filters.MetaSkipFilter'
49
+ import 'iudex.html.tree.filters.WordCounter'
50
+ import 'iudex.html.tree.filters.WordyCounter'
51
+ import 'iudex.html.tree.filters.XmpToPreConverter'
52
+ end
53
+ end
54
+
55
+ end
56
+ end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ module Iudex
18
+ module HTML
19
+ VERSION = '1.0.0' # version string; interpolated into the "iudex-html-#{VERSION}.jar" file name that Iudex::HTML requires (NOTE(review): equals <version> in pom.xml - presumably must stay in sync)
20
+ end
21
+ end
@@ -0,0 +1,95 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+ require 'iudex-filter/key_helper'
19
+
20
+ module Iudex
21
+ module HTML
22
+ module Filters
23
+ module FactoryHelper
24
+ include Iudex::Core # ContentSource
25
+ include Iudex::HTML::Tree
26
+ include Iudex::HTML::Tree::Filters
27
+
28
+ # Create html parse and clean filters
29
+ # Expected usage:
30
+ # PAGE: html_clean_filters( :source )
31
+ # FEED: html_clean_filters( :title )
32
+ # FEED: html_clean_filters( :summary )
33
+ # FEED: html_clean_filters( :content )
34
+ #
35
+ def html_clean_filters( src_key, tree_key = nil )
36
+
37
+ tree_key = "#{src_key}_tree".to_sym unless tree_key
38
+ src_key, tree_key = src_key.to_k, tree_key.to_k
39
+
40
+ filters = []
41
+ filters << html_parse_filter( src_key, tree_key )
42
+
43
+ #FIXME: PAGE: filters << TitleExtractor.new, or after?
44
+
45
+ # FIXME: if src is text, last filter
46
+ # filters << TextCtrlWSFilter.new( ContentKeys::TITLE )
47
+
48
+ tfc = TreeFilterChain.new( html_tree_filters )
49
+
50
+ filters << HTMLTreeFilter.new( tree_key, tfc,
51
+ HTMLTreeFilter::Order::DEPTH_FIRST )
52
+
53
+ #FIXME: First block extractor back to text key?
54
+
55
+ filters
56
+ end
57
+
58
+ def html_tree_filters
59
+ [ XmpToPreConverter.new, # Before CharactersNormalizer
60
+ CSSDisplayFilter.new, # Before AttributeCleaner
61
+ AttributeCleaner.new,
62
+ CharactersNormalizer.new,
63
+ EmptyInlineRemover.new, # Depth
64
+ WordCounter.new, # Depth; only for count deps?
65
+ WordyCounter.new ] # Depth; only with cleaners/simhash?
66
+ end
67
+
68
+ def html_parse_filter( src_key, tree_key = nil )
69
+
70
+ tree_key = "#{src_key}_tree".to_sym unless tree_key
71
+ src_key, tree_key = src_key.to_k, tree_key.to_k
72
+
73
+ if( src_key.value_type == ContentSource.java_class )
74
+ HTMLParseFilter.new( src_key, nil, tree_key )
75
+ else
76
+ HTMLParseFilter.new( src_key, tree_key )
77
+ end
78
+ end
79
+
80
+ # Expected usage:
81
+ # FEED: html_write_filter( :summary )
82
+ def html_write_filter( key1, key2 = nil )
83
+
84
+ tree_key, out_key = if key2
85
+ [ key1, key2 ]
86
+ else
87
+ [ "#{key1}_tree".to_sym, key1 ]
88
+ end
89
+
90
+ HTMLWriteFilter.new( tree_key.to_k, out_key.to_k )
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
Binary file
data/pom.xml ADDED
@@ -0,0 +1,51 @@
1
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
2
+ <modelVersion>4.0.0</modelVersion>
3
+ <groupId>iudex</groupId>
4
+ <artifactId>iudex-html</artifactId>
5
+ <packaging>jar</packaging>
6
+ <version>1.0.0</version>
7
+ <name>Iudex HTML parsing/filtering and text extraction</name>
8
+
9
+ <parent>
10
+ <groupId>iudex</groupId>
11
+ <artifactId>iudex-parent</artifactId>
12
+ <version>1.0</version>
13
+ <relativePath>..</relativePath>
14
+ </parent>
15
+
16
+ <dependencies>
17
+
18
+ <dependency>
19
+ <groupId>iudex</groupId>
20
+ <artifactId>iudex-core</artifactId>
21
+ <version>[1.0,1.1)</version>
22
+ </dependency>
23
+
24
+ <dependency>
25
+ <groupId>com.gravitext</groupId>
26
+ <artifactId>gravitext-xmlprod</artifactId>
27
+ <version>[1.4,1.5)</version>
28
+ </dependency>
29
+
30
+ <dependency>
31
+ <groupId>net.sourceforge.nekohtml</groupId>
32
+ <artifactId>nekohtml</artifactId>
33
+ <version>1.9.14</version>
34
+ </dependency>
35
+
36
+ </dependencies>
37
+
38
+ <build>
39
+ <plugins>
40
+ <plugin>
41
+ <!-- Parent settings -->
42
+ <artifactId>maven-compiler-plugin</artifactId>
43
+ </plugin>
44
+ <plugin>
45
+ <!-- Parent settings -->
46
+ <artifactId>maven-source-plugin</artifactId>
47
+ </plugin>
48
+ </plugins>
49
+ </build>
50
+
51
+ </project>
@@ -0,0 +1,100 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+ require 'iudex-html/factory_helper'
19
+
20
+ require 'iudex-filter/key_helper'
21
+
22
+ module HTMLTestHelper
23
+
24
+ include Gravitext::HTMap
25
+ UniMap.define_accessors
26
+
27
+ include Iudex::Filter::Core
28
+ include Iudex::HTML::Filters::FactoryHelper
29
+
30
+ import 'com.gravitext.xml.tree.TreeUtils'
31
+ import 'com.gravitext.xml.producer.Indentor'
32
+
33
+ import 'iudex.html.HTML'
34
+ import 'iudex.html.HTMLUtils'
35
+ import 'iudex.html.tree.TreeFilterChain'
36
+ import 'iudex.html.tree.TreeWalker'
37
+
38
+ def parse( html, charset = "UTF-8" ) # Parse an HTML string as a whole document; returns the result of HTMLUtils::parse
39
+ HTMLUtils::parse( source( html, charset ) ) # source() compresses html and wraps its bytes with the given charset
40
+ end
41
+
42
+ def parseFragment( html, charset = "UTF-8" ) # Parse an HTML string as a fragment and unwrap the result via inner
43
+ inner( HTMLUtils::parseFragment( source( html, charset ) ) ) # inner() returns the sole element child when there is exactly one, else the parsed tree
44
+ end
45
+
46
+ def inner( tree )
47
+ c = tree.children
48
+ if ( c.size == 1 && c[0].element? )
49
+ c[0]
50
+ else
51
+ tree
52
+ end
53
+ end
54
+
55
+ def assert_doc( html, root )
56
+ html = compress( html )
57
+ assert_equal( html,
58
+ TreeUtils::produceString( root, Indentor::COMPRESSED ) )
59
+ end
60
+
61
+ def assert_fragment( html, root, remove_padding = false ) # Like assert_fragment_ws, but compresses the expected html first
62
+ assert_fragment_ws( compress( html ), root, remove_padding ) # compress() drops each newline plus the whitespace that follows it
63
+ end
64
+
65
+ def assert_fragment_ws( html, root, remove_padding = false )
66
+ html = html.gsub( /~+/, '' ) if remove_padding
67
+ assert_equal( html,
68
+ HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
69
+ end
70
+
71
+ def assert_transform( html, filter = nil, func = :walk_depth_first )
72
+ tree = parseFragment( html[ :in ] )
73
+ action = TreeWalker.send( func, filter, tree ) if func && filter
74
+ assert_fragment( html[ :out ], tree, true )
75
+ action
76
+ end
77
+
78
+ def source( html, charset = "UTF-8" ) # Build an HTMLUtils source from the compressed html string's bytes and the given charset
79
+ HTMLUtils::source( compress( html ).to_java_bytes, charset ) # compress first, then convert to a Java byte[] for the Java API
80
+ end
81
+
82
+ def compress( html ) # Returns a copy of html with every newline, and any whitespace directly after it, removed
83
+ html.gsub( /\n\s*/, '' ) # whitespace not preceded by a newline is left untouched
84
+ end
85
+
86
+ def filter_chain( filters, mode = :whole )
87
+ pf = html_parse_filter( :source )
88
+ pf.parse_as_fragment = true if mode == :fragment
89
+ filters = Array( filters )
90
+ filters.unshift( pf )
91
+ FilterChain.new( "test", filters )
92
+ end
93
+
94
+ def content( html, charset = "UTF-8" )
95
+ map = UniMap.new
96
+ map.source = HTMLUtils::source( html.to_java_bytes, "UTF-8" )
97
+ map
98
+ end
99
+
100
+ end
data/test/setup.rb ADDED
@@ -0,0 +1,38 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
+ test_dir = File.dirname( __FILE__ ) # directory containing this setup file
20
+
21
+ ldir = File.join( test_dir, "..", "lib" ) # the lib directory next to the test directory
22
+ $LOAD_PATH.unshift( ldir ) unless $LOAD_PATH.include?( ldir ) # put lib at the front of the load path, once
23
+
24
+ require 'rubygems'
25
+ require 'rjack-logback'
26
+ RJack::Logback.config_console( :stderr => true ) # console logging with :stderr enabled (presumably sends log output to stderr - confirm against rjack-logback)
27
+
28
+ require 'minitest/unit'
29
+ require 'minitest/autorun'
30
+
31
+ require File.join( test_dir, 'html_test_helper.rb' ) # loaded by explicit path from the test directory (presumably defines HTMLTestHelper - confirm)
32
+
33
+ # Make test output logging compatible: no partial lines.
34
+ # class TestOut
35
+ # def print( *a ); $stdout.puts( *a ); end
36
+ # def puts( *a ); $stdout.puts( *a ); end
37
+ # end
38
+ # MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
+ class TestCharactersNormalizer < MiniTest::Unit::TestCase # Runs CharactersNormalizer over parsed fragments and compares the serialized result
23
+ include HTMLTestHelper
24
+
25
+ include Iudex::HTML
26
+ include Iudex::HTML::Filters
27
+ include Iudex::HTML::Tree
28
+ include Iudex::HTML::Tree::Filters
29
+ include Iudex::Core
30
+
31
+ Order = HTMLTreeFilter::Order # shorthand for the traversal-order constants used in assert_normalize
32
+
33
+ def test_simple_block # NOTE(review): the '~' counts suggest whitespace runs in these fixture literals were collapsed in this listing - confirm against the packaged file
34
+ # Note: '~' is padding removed in compare
35
+
36
+ html = { :in => "<p> x y </p>",
37
+ :out => "<p>~x ~y~</p>" }
38
+
39
+ assert_normalize( html )
40
+ end
41
+
42
+ def test_simple_inline # inline element case: expected output keeps the spaces at both edges of <i>
43
+ html = { :in => "<i> x y </i>",
44
+ :out => "<i> x ~y </i>" }
45
+
46
+ assert_normalize( html )
47
+ end
48
+
49
+ def test_mixed_inline # block element containing text and an inline child
50
+ html = { :in => "<p> x y <i>z </i> </p>",
51
+ :out => "<p>~x ~y <i>z </i>~</p>" }
52
+
53
+ assert_normalize( html )
54
+ end
55
+
56
+ def test_empty # whitespace-only <p> is expected to serialize as <p/>
57
+ html = { :in => "<div><p> </p> <p>foo</p> </div>",
58
+ :out => "<div><p/>~~~~~~<p>foo</p>~</div>" }
59
+
60
+ assert_normalize( html )
61
+ end
62
+
63
+ def test_pre # <pre> content: the input's \0 is expected to come out as a space; the newline stays
64
+ html = { :in => "<div> x <pre> \0x\n <a> y </a></pre> </div>",
65
+ :out => "<div>~x~<pre> ~ x\n <a> y </a></pre>~</div>" }
66
+
67
+ assert_normalize( html )
68
+ end
69
+
70
+ def assert_normalize( html ) # Filter html[:in] with CharactersNormalizer and compare against html[:out], once per traversal order
71
+ [ Order::BREADTH_FIRST, Order::DEPTH_FIRST ].each do |order|
72
+ map = content( html[ :in ] ) # fresh UniMap per order, with the raw input as source
73
+ tfc = TreeFilterChain.new( [ CharactersNormalizer.new ] )
74
+ tf = HTMLTreeFilter.new( :source_tree.to_k, tfc, order ) # :source_tree is the default tree key for the :source parse filter built by filter_chain
75
+ chain = filter_chain( tf, :fragment ) # parse as a fragment, then apply tf
76
+ assert( chain.filter( map ) ) # the chain must return a truthy result
77
+ assert_fragment_ws( html[ :out ], inner( map.source_tree ), true ) # true: strip '~' padding from the expected text before comparing
78
+ end
79
+ end
80
+
81
+ end