iudex-html 1.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/lib/iudex-html.rb ADDED
@@ -0,0 +1,56 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-core'
18
+ require 'gravitext-xmlprod'
19
+ require 'rjack-nekohtml'
20
+
21
+ require 'iudex-html/base.rb'
22
+
23
+ require 'java'
24
+
25
# Top-level Iudex namespace.
module Iudex
  # HTML support. Opening this module loads the gem's bundled jar and
  # imports its Java classes as constants of the nested Ruby modules.
  module HTML
    # The jar file name embeds VERSION, which must already be defined
    # (presumably by the earlier require of 'iudex-html/base.rb').
    require "iudex-html/iudex-html-#{VERSION}.jar"

    import 'iudex.html.HTMLKeys'

    # Classes of the Java package iudex.html.filters
    module Filters
      import 'iudex.html.filters.ExtractFilter'
      import 'iudex.html.filters.HTMLParseFilter'
      import 'iudex.html.filters.HTMLTreeFilter'
      import 'iudex.html.filters.HTMLWriteFilter'
      import 'iudex.html.filters.TitleExtractor'
    end

    # Classes of the Java package iudex.html.tree
    module Tree
      import 'iudex.html.tree.HTMLTreeKeys'
      import 'iudex.html.tree.TreeFilterChain'

      # Classes of the Java package iudex.html.tree.filters
      module Filters
        import 'iudex.html.tree.filters.AttributeCleaner'
        import 'iudex.html.tree.filters.CSSDisplayFilter'
        import 'iudex.html.tree.filters.CharactersNormalizer'
        import 'iudex.html.tree.filters.EmptyInlineRemover'
        import 'iudex.html.tree.filters.MetaSkipFilter'
        import 'iudex.html.tree.filters.WordCounter'
        import 'iudex.html.tree.filters.WordyCounter'
        import 'iudex.html.tree.filters.XmpToPreConverter'
      end
    end

  end
end
@@ -0,0 +1,21 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
module Iudex
  module HTML
    # Release version of the iudex-html gem. The main library file
    # interpolates this value into the name of the jar it requires.
    # Frozen so the shared constant cannot be mutated in place.
    VERSION = '1.0.0'.freeze
  end
end
@@ -0,0 +1,95 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You may
6
+ # obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+ require 'iudex-filter/key_helper'
19
+
20
module Iudex
  module HTML
    module Filters

      # Mixin of factory methods that build HTML parse, tree-cleaning and
      # write filters from key names.
      module FactoryHelper
        include Iudex::Core # ContentSource
        include Iudex::HTML::Tree
        include Iudex::HTML::Tree::Filters

        # Create html parse and clean filters
        # Expected usage:
        #   PAGE: html_clean_filters( :source )
        #   FEED: html_clean_filters( :title )
        #   FEED: html_clean_filters( :summary )
        #   FEED: html_clean_filters( :content )
        #
        # src_key::  key of the HTML input
        # tree_key:: key of the parsed tree; when omitted it is derived
        #            as "<src_key>_tree"
        #
        # Returns a two element Array: the parse filter followed by an
        # HTMLTreeFilter applying html_tree_filters depth first.
        def html_clean_filters( src_key, tree_key = nil )
          tree_key ||= :"#{src_key}_tree"
          src_key  = src_key.to_k
          tree_key = tree_key.to_k

          parser = html_parse_filter( src_key, tree_key )

          #FIXME: PAGE: filters << TitleExtractor.new, or after?

          # FIXME: if src is text, last filter
          # filters << TextCtrlWSFilter.new( ContentKeys::TITLE )

          chain   = TreeFilterChain.new( html_tree_filters )
          cleaner = HTMLTreeFilter.new( tree_key, chain,
                                        HTMLTreeFilter::Order::DEPTH_FIRST )

          #FIXME: First block extractor back to text key?

          [ parser, cleaner ]
        end

        # Returns a new Array holding one fresh instance of each default
        # tree filter, in the order they are to be applied.
        def html_tree_filters
          [ XmpToPreConverter,    # Before CharactersNormalizer
            CSSDisplayFilter,     # Before AttributeCleaner
            AttributeCleaner,
            CharactersNormalizer,
            EmptyInlineRemover,   # Depth
            WordCounter,          # Depth; only for count deps?
            WordyCounter          # Depth; only with cleaners/simhash?
          ].map { |filter_class| filter_class.new }
        end

        # Create an HTMLParseFilter reading src_key and writing tree_key
        # (derived as "<src_key>_tree" when omitted).
        #
        # When the source key's value type is ContentSource, the three
        # argument constructor is used with nil as the middle argument.
        # NOTE(review): the meaning of that middle argument is not
        # visible here; confirm against the HTMLParseFilter constructors.
        def html_parse_filter( src_key, tree_key = nil )
          tree_key ||= :"#{src_key}_tree"
          src_key  = src_key.to_k
          tree_key = tree_key.to_k

          unless src_key.value_type == ContentSource.java_class
            return HTMLParseFilter.new( src_key, tree_key )
          end

          HTMLParseFilter.new( src_key, nil, tree_key )
        end

        # Create an HTMLWriteFilter.
        # Expected usage:
        #   FEED: html_write_filter( :summary )
        #
        # With two keys, key1 is the tree key and key2 the output key.
        # With one key, key1 is the output key and the tree key is
        # derived as "<key1>_tree".
        def html_write_filter( key1, key2 = nil )
          if key2
            tree_key, out_key = key1, key2
          else
            tree_key, out_key = :"#{key1}_tree", key1
          end

          HTMLWriteFilter.new( tree_key.to_k, out_key.to_k )
        end
      end

    end
  end
end
Binary file
data/pom.xml ADDED
@@ -0,0 +1,51 @@
1
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>iudex</groupId>
  <artifactId>iudex-html</artifactId>
  <packaging>jar</packaging>
  <version>1.0.0</version>
  <name>Iudex HTML parsing/filtering and text extraction</name>

  <parent>
    <groupId>iudex</groupId>
    <artifactId>iudex-parent</artifactId>
    <version>1.0</version>
    <relativePath>..</relativePath>
  </parent>

  <dependencies>

    <dependency>
      <groupId>iudex</groupId>
      <artifactId>iudex-core</artifactId>
      <version>[1.0,1.1)</version>
    </dependency>

    <dependency>
      <groupId>com.gravitext</groupId>
      <artifactId>gravitext-xmlprod</artifactId>
      <version>[1.4,1.5)</version>
    </dependency>

    <dependency>
      <groupId>net.sourceforge.nekohtml</groupId>
      <artifactId>nekohtml</artifactId>
      <version>1.9.14</version>
    </dependency>

  </dependencies>

  <build>
    <plugins>
      <plugin>
        <!-- Settings come from the parent POM -->
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
      <plugin>
        <!-- Settings come from the parent POM -->
        <artifactId>maven-source-plugin</artifactId>
      </plugin>
    </plugins>
  </build>

</project>
@@ -0,0 +1,100 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ require 'iudex-html'
18
+ require 'iudex-html/factory_helper'
19
+
20
+ require 'iudex-filter/key_helper'
21
+
22
# Helpers shared by the iudex-html unit tests: parsing HTML strings into
# trees, asserting on serialized trees, and building filter chains and
# content maps. Intended to be included into test case classes.
module HTMLTestHelper

  include Gravitext::HTMap
  # Presumably generates the UniMap key accessors used by the tests
  # (e.g. map.source, map.source_tree).
  UniMap.define_accessors

  include Iudex::Filter::Core
  include Iudex::HTML::Filters::FactoryHelper

  import 'com.gravitext.xml.tree.TreeUtils'
  import 'com.gravitext.xml.producer.Indentor'

  import 'iudex.html.HTML'
  import 'iudex.html.HTMLUtils'
  import 'iudex.html.tree.TreeFilterChain'
  import 'iudex.html.tree.TreeWalker'

  # Parse html (after compress) as a complete document, with the bytes
  # declared to be in charset. Returns the result of HTMLUtils::parse.
  def parse( html, charset = "UTF-8" )
    HTMLUtils::parse( source( html, charset ) )
  end

  # Parse html (after compress) as a fragment and unwrap the result
  # with inner.
  def parseFragment( html, charset = "UTF-8" )
    inner( HTMLUtils::parseFragment( source( html, charset ) ) )
  end

  # Return the only child of tree when tree has exactly one child and
  # that child is an element; otherwise return tree itself.
  def inner( tree )
    c = tree.children
    if ( c.size == 1 && c[0].element? )
      c[0]
    else
      tree
    end
  end

  # Assert that root, serialized as a compressed document, equals the
  # compressed form of html.
  def assert_doc( html, root )
    html = compress( html )
    assert_equal( html,
                  TreeUtils::produceString( root, Indentor::COMPRESSED ) )
  end

  # As assert_fragment_ws, but html is first passed through compress.
  def assert_fragment( html, root, remove_padding = false )
    assert_fragment_ws( compress( html ), root, remove_padding )
  end

  # Assert that root, serialized as a compressed fragment, equals html.
  # When remove_padding is true, every run of '~' is stripped from the
  # expected html before comparing.
  def assert_fragment_ws( html, root, remove_padding = false )
    html = html.gsub( /~+/, '' ) if remove_padding
    assert_equal( html,
                  HTMLUtils::produceFragmentString( root, Indentor::COMPRESSED ) )
  end

  # Parse html[ :in ] as a fragment, optionally walk it with filter
  # using the TreeWalker method named by func, then assert the tree
  # serializes to html[ :out ] with '~' padding removed.
  #
  # Returns the value returned by the TreeWalker call, or nil when no
  # walk was performed (filter or func missing).
  def assert_transform( html, filter = nil, func = :walk_depth_first )
    tree = parseFragment( html[ :in ] )
    action = TreeWalker.send( func, filter, tree ) if func && filter
    assert_fragment( html[ :out ], tree, true )
    action
  end

  # Build a content source from the bytes of the compressed html,
  # declared to be in charset.
  def source( html, charset = "UTF-8" )
    HTMLUtils::source( compress( html ).to_java_bytes, charset )
  end

  # Remove every newline together with the whitespace that follows it.
  def compress( html )
    html.gsub( /\n\s*/, '' )
  end

  # Build a FilterChain named "test" consisting of an html parse filter
  # for the :source key followed by the given filter or filters.
  # With mode :fragment the parse filter is set to parse as a fragment.
  def filter_chain( filters, mode = :whole )
    pf = html_parse_filter( :source )
    pf.parse_as_fragment = true if mode == :fragment
    # Build a new list rather than unshift onto the argument, so that an
    # Array passed in by the caller is not modified.
    FilterChain.new( "test", [ pf ] + Array( filters ) )
  end

  # Return a new UniMap whose source is built from the bytes of html
  # (not compressed), declared to be in charset.
  def content( html, charset = "UTF-8" )
    map = UniMap.new
    # Honor the charset argument; it was previously ignored in favor of
    # a hard-coded "UTF-8".
    map.source = HTMLUtils::source( html.to_java_bytes, charset )
    map
  end

end
data/test/setup.rb ADDED
@@ -0,0 +1,38 @@
1
+ #--
2
+ # Copyright (c) 2010-2011 David Kellum
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
5
+ # may not use this file except in compliance with the License. You
6
+ # may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13
+ # implied. See the License for the specific language governing
14
+ # permissions and limitations under the License.
15
+ #++
16
+
17
+ #### General test setup: LOAD_PATH, logging, console output ####
18
+
19
# Directory holding this setup file and the other test sources.
here = File.dirname( __FILE__ )

# Put the sibling lib directory at the front of the load path, unless
# this exact path is already present.
lib_path = File.join( here, "..", "lib" )
unless $LOAD_PATH.include?( lib_path )
  $LOAD_PATH.unshift( lib_path )
end

# Logging: console output configured with :stderr => true.
require 'rubygems'
require 'rjack-logback'
RJack::Logback.config_console( :stderr => true )

require 'minitest/unit'
require 'minitest/autorun'

# Shared HTML test helpers.
require File.join( here, 'html_test_helper.rb' )
32
+
33
+ # Make test output logging compatible: no partial lines.
34
+ # class TestOut
35
+ # def print( *a ); $stdout.puts( *a ); end
36
+ # def puts( *a ); $stdout.puts( *a ); end
37
+ # end
38
+ # MiniTest::Unit.output = TestOut.new
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env jruby
2
+ #.hashdot.profile += jruby-shortlived
3
+
4
+ #--
5
+ # Copyright (c) 2010-2011 David Kellum
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
8
+ # may not use this file except in compliance with the License. You
9
+ # may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
16
+ # implied. See the License for the specific language governing
17
+ # permissions and limitations under the License.
18
+ #++
19
+
20
+ require File.join( File.dirname( __FILE__ ), "setup" )
21
+
22
# Tests of the CharactersNormalizer tree filter, run through a fragment
# parsing filter chain in both breadth first and depth first order.
class TestCharactersNormalizer < MiniTest::Unit::TestCase
  include HTMLTestHelper

  include Iudex::HTML
  include Iudex::HTML::Filters
  include Iudex::HTML::Tree
  include Iudex::HTML::Tree::Filters
  include Iudex::Core

  Order = HTMLTreeFilter::Order

  def test_simple_block
    # Note: '~' is padding removed in compare
    assert_normalize( :in  => "<p> x y </p>",
                      :out => "<p>~x ~y~</p>" )
  end

  def test_simple_inline
    assert_normalize( :in  => "<i> x y </i>",
                      :out => "<i> x ~y </i>" )
  end

  def test_mixed_inline
    assert_normalize( :in  => "<p> x y <i>z </i> </p>",
                      :out => "<p>~x ~y <i>z </i>~</p>" )
  end

  def test_empty
    assert_normalize( :in  => "<div><p> </p> <p>foo</p> </div>",
                      :out => "<div><p/>~~~~~~<p>foo</p>~</div>" )
  end

  def test_pre
    assert_normalize( :in  => "<div> x <pre> \0x\n <a> y </a></pre> </div>",
                      :out => "<div>~x~<pre> ~ x\n <a> y </a></pre>~</div>" )
  end

  # For each traversal order: parse html[ :in ] as a fragment, apply
  # CharactersNormalizer, and assert the resulting tree serializes to
  # html[ :out ] once the '~' padding is removed.
  def assert_normalize( html )
    [ Order::BREADTH_FIRST, Order::DEPTH_FIRST ].each do |order|
      map = content( html[ :in ] )
      normalizers = TreeFilterChain.new( [ CharactersNormalizer.new ] )
      tree_filter = HTMLTreeFilter.new( :source_tree.to_k, normalizers, order )
      assert( filter_chain( tree_filter, :fragment ).filter( map ) )
      assert_fragment_ws( html[ :out ], inner( map.source_tree ), true )
    end
  end

end