iudex-html 1.0.0-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.rdoc +7 -0
- data/Manifest.txt +1 -1
- data/Rakefile +3 -3
- data/lib/iudex-html.rb +15 -0
- data/lib/iudex-html/base.rb +1 -1
- data/lib/iudex-html/factory_helper.rb +1 -0
- data/lib/iudex-html/{iudex-html-1.0.0.jar → iudex-html-1.1.0.jar} +0 -0
- data/pom.xml +3 -3
- data/test/html_test_helper.rb +1 -1
- data/test/test_other_tree_filters.rb +11 -3
- data/test/test_parse_filter.rb +29 -13
- metadata +9 -13
data/.gemtest
ADDED
File without changes
|
data/History.rdoc
CHANGED
@@ -1,2 +1,9 @@
|
|
1
|
+
=== 1.1.0 (2011-11-13)
|
2
|
+
* Update to iudex-core ~> 1.1.0
|
3
|
+
* Add NekoHTMLParser charset expansion and ContentSource encoding
|
4
|
+
confidence support
|
5
|
+
* Add MojiBakeCleaner tree filter, helper support
|
6
|
+
* Update to minitest ~> 2.3
|
7
|
+
|
1
8
|
=== 1.0.0 (2011-04-04)
|
2
9
|
* Initial release.
|
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
|
|
4
4
|
require 'iudex-html/base'
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
gem 'rjack-tarpit', '~> 1.
|
7
|
+
gem 'rjack-tarpit', '~> 1.4'
|
8
8
|
require 'rjack-tarpit'
|
9
9
|
|
10
10
|
t = RJack::TarPit.new( 'iudex-html',
|
@@ -13,12 +13,12 @@ t = RJack::TarPit.new( 'iudex-html',
|
|
13
13
|
|
14
14
|
t.specify do |h|
|
15
15
|
h.developer( "David Kellum", "dek-oss@gravitext.com" )
|
16
|
-
h.extra_deps += [ [ 'iudex-core', '~> 1.
|
16
|
+
h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
|
17
17
|
[ 'rjack-nekohtml', '~> 1.9.14' ],
|
18
18
|
[ 'gravitext-xmlprod', '~> 1.4.0' ] ]
|
19
19
|
|
20
20
|
h.testlib = :minitest
|
21
|
-
h.extra_dev_deps += [ [ 'minitest', '
|
21
|
+
h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
|
22
22
|
[ 'rjack-logback', '~> 1.0' ] ]
|
23
23
|
end
|
24
24
|
|
data/lib/iudex-html.rb
CHANGED
@@ -46,9 +46,24 @@ module Iudex
|
|
46
46
|
import 'iudex.html.tree.filters.CharactersNormalizer'
|
47
47
|
import 'iudex.html.tree.filters.EmptyInlineRemover'
|
48
48
|
import 'iudex.html.tree.filters.MetaSkipFilter'
|
49
|
+
import 'iudex.html.tree.filters.MojiBakeCleaner'
|
49
50
|
import 'iudex.html.tree.filters.WordCounter'
|
50
51
|
import 'iudex.html.tree.filters.WordyCounter'
|
51
52
|
import 'iudex.html.tree.filters.XmpToPreConverter'
|
53
|
+
|
54
|
+
# Re-open iudex.html.tree.filters.MojiBakeCleaner to add config file
|
55
|
+
# based initialization.
|
56
|
+
class MojiBakeCleaner
|
57
|
+
include Iudex::Core
|
58
|
+
|
59
|
+
# Alt constructor taking a configuration file in `mojibake
|
60
|
+
# -t` format.
|
61
|
+
def initialize( config_file = :default )
|
62
|
+
args = Array( config_file ) - [ :default ]
|
63
|
+
super( *MojiBake.load_config( *args ) )
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
52
67
|
end
|
53
68
|
end
|
54
69
|
|
data/lib/iudex-html/base.rb
CHANGED
@@ -59,6 +59,7 @@ module Iudex
|
|
59
59
|
[ XmpToPreConverter.new, # Before CharactersNormalizer
|
60
60
|
CSSDisplayFilter.new, # Before AttributeCleaner
|
61
61
|
AttributeCleaner.new,
|
62
|
+
MojiBakeCleaner.new,
|
62
63
|
CharactersNormalizer.new,
|
63
64
|
EmptyInlineRemover.new, # Depth
|
64
65
|
WordCounter.new, # Depth; only for count deps?
|
Binary file
|
data/pom.xml
CHANGED
@@ -3,13 +3,13 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-html</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.
|
6
|
+
<version>1.1.0</version>
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
8
|
|
9
9
|
<parent>
|
10
10
|
<groupId>iudex</groupId>
|
11
11
|
<artifactId>iudex-parent</artifactId>
|
12
|
-
<version>1.
|
12
|
+
<version>1.1</version>
|
13
13
|
<relativePath>..</relativePath>
|
14
14
|
</parent>
|
15
15
|
|
@@ -18,7 +18,7 @@
|
|
18
18
|
<dependency>
|
19
19
|
<groupId>iudex</groupId>
|
20
20
|
<artifactId>iudex-core</artifactId>
|
21
|
-
<version>[1.
|
21
|
+
<version>[1.1,1.2)</version>
|
22
22
|
</dependency>
|
23
23
|
|
24
24
|
<dependency>
|
data/test/html_test_helper.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env jruby
|
2
|
+
# -*- coding: utf-8 -*-
|
2
3
|
#.hashdot.profile += jruby-shortlived
|
3
4
|
|
4
5
|
#--
|
@@ -86,9 +87,9 @@ HTML
|
|
86
87
|
assert( f.has_display_none( '{display: none}' ) ) #lenient
|
87
88
|
assert( f.has_display_none( 'other:foo; DISPLAY:NONE;' ) )
|
88
89
|
|
89
|
-
|
90
|
-
|
91
|
-
|
90
|
+
refute( f.has_display_none( 'display: block' ) )
|
91
|
+
refute( f.has_display_none( 'other-display: none' ) )
|
92
|
+
refute( f.has_display_none( 'display: nonetheless' ) )
|
92
93
|
end
|
93
94
|
|
94
95
|
def test_css_display_filter
|
@@ -114,6 +115,13 @@ HTML
|
|
114
115
|
assert_transform( html, XmpToPreConverter.new )
|
115
116
|
end
|
116
117
|
|
118
|
+
def test_mojibake_cleaner
|
119
|
+
html = { :in => "<div><p>ascii</p> ° </div>",
|
120
|
+
:out => "<div><p>ascii</p> ~° </div>" }
|
121
|
+
|
122
|
+
assert_transform( html, MojiBakeCleaner.new )
|
123
|
+
end
|
124
|
+
|
117
125
|
def cut_atts( html, *atts )
|
118
126
|
atts.each do |att|
|
119
127
|
html = html.gsub( / #{att}="[^"]+"/, '' )
|
data/test/test_parse_filter.rb
CHANGED
@@ -32,20 +32,20 @@ class TestParseFilter < MiniTest::Unit::TestCase
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def test_marked
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
refute( marked?( "" ) )
|
36
|
+
refute( marked?( "simple" ) )
|
37
|
+
refute( marked?( "<simple" ) )
|
38
|
+
refute( marked?( "x < y" ) )
|
39
|
+
refute( marked?( "AT&T" ) )
|
40
|
+
refute( marked?( "AT & T;" ) )
|
41
41
|
|
42
|
-
assert(
|
43
|
-
assert(
|
44
|
-
assert(
|
45
|
-
assert(
|
46
|
-
assert(
|
47
|
-
assert(
|
48
|
-
assert(
|
42
|
+
assert( marked?( "<a>" ) )
|
43
|
+
assert( marked?( "Words &copy; 2010" ) )
|
44
|
+
assert( marked?( "" ) )
|
45
|
+
assert( marked?( "&#2028;" ) )
|
46
|
+
assert( marked?( "<![CDATA[simple]]>" ) )
|
47
|
+
assert( marked?( "<![CDATA[simple" ) )
|
48
|
+
assert( marked?( "<!-- comment -->" ) )
|
49
49
|
end
|
50
50
|
|
51
51
|
def marked?( text )
|
@@ -54,6 +54,7 @@ class TestParseFilter < MiniTest::Unit::TestCase
|
|
54
54
|
|
55
55
|
def test_markup
|
56
56
|
tests = [ [ "simple", 0, "simple" ],
|
57
|
+
[ "AT&T", 0, "AT&T" ],
|
57
58
|
[ "<i>inner</i>", 1, nil ],
|
58
59
|
[ "<i>inner</i>", 2, nil ],
|
59
60
|
[ "<!--ignore-->text", 1, "text" ],
|
@@ -61,6 +62,21 @@ class TestParseFilter < MiniTest::Unit::TestCase
|
|
61
62
|
[ "&lt;", 1, "<" ],
|
62
63
|
[ "&lt;", 2, "<" ] ]
|
63
64
|
|
65
|
+
@filter.min_parse = 0
|
66
|
+
tests.each do | input, count, out |
|
67
|
+
map = UniMap.new
|
68
|
+
map.title = input
|
69
|
+
assert_equal( count, @filter.parse_loop( map ), input )
|
70
|
+
assert_equal( out, map.title && map.title.to_s )
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def test_markup_nochange
|
75
|
+
tests = [ [ "simple", 1, "simple" ],
|
76
|
+
[ "AT&T", 1, "AT&T" ],
|
77
|
+
[ "<i>AT&T</i>", 1, nil ] ]
|
78
|
+
|
79
|
+
@filter.min_parse = 1
|
64
80
|
tests.each do | input, count, out |
|
65
81
|
map = UniMap.new
|
66
82
|
map.title = input
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 1.
|
5
|
+
version: 1.1.0
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,8 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-11-13 00:00:00 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: iudex-core
|
@@ -21,7 +20,7 @@ dependencies:
|
|
21
20
|
requirements:
|
22
21
|
- - ~>
|
23
22
|
- !ruby/object:Gem::Version
|
24
|
-
version: 1.
|
23
|
+
version: 1.1.0
|
25
24
|
type: :runtime
|
26
25
|
version_requirements: *id001
|
27
26
|
- !ruby/object:Gem::Dependency
|
@@ -52,12 +51,9 @@ dependencies:
|
|
52
51
|
requirement: &id004 !ruby/object:Gem::Requirement
|
53
52
|
none: false
|
54
53
|
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
57
|
-
version: 1.7.1
|
58
|
-
- - <
|
54
|
+
- - ~>
|
59
55
|
- !ruby/object:Gem::Version
|
60
|
-
version: "2.
|
56
|
+
version: "2.3"
|
61
57
|
type: :development
|
62
58
|
version_requirements: *id004
|
63
59
|
- !ruby/object:Gem::Dependency
|
@@ -79,7 +75,7 @@ dependencies:
|
|
79
75
|
requirements:
|
80
76
|
- - ~>
|
81
77
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
78
|
+
version: 1.4.0
|
83
79
|
type: :development
|
84
80
|
version_requirements: *id006
|
85
81
|
description: |-
|
@@ -120,8 +116,8 @@ files:
|
|
120
116
|
- test/test_parse_filter.rb
|
121
117
|
- test/test_tree_walker.rb
|
122
118
|
- test/test_word_counters.rb
|
123
|
-
- lib/iudex-html/iudex-html-1.
|
124
|
-
|
119
|
+
- lib/iudex-html/iudex-html-1.1.0.jar
|
120
|
+
- .gemtest
|
125
121
|
homepage: http://github.com/dekellum/iudex
|
126
122
|
licenses: []
|
127
123
|
|
@@ -146,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
146
142
|
requirements: []
|
147
143
|
|
148
144
|
rubyforge_project: iudex-html
|
149
|
-
rubygems_version: 1.
|
145
|
+
rubygems_version: 1.8.9
|
150
146
|
signing_key:
|
151
147
|
specification_version: 3
|
152
148
|
summary: Iudex is a general purpose web crawler and feed processor in ruby/java
|