iudex-html 1.0.0-java → 1.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gemtest ADDED
File without changes
data/History.rdoc CHANGED
@@ -1,2 +1,9 @@
1
+ === 1.1.0 (2011-11-13)
2
+ * Update to iudex-core ~> 1.1.0
3
+ * Add NekoHTMLParser charset expansion and ContentSource encoding
4
+ confidence support
5
+ * Add MojiBakeCleaner tree filter, helper support
6
+ * Update to minitest ~> 2.3
7
+
1
8
  === 1.0.0 (2011-04-04)
2
9
  * Initial release.
data/Manifest.txt CHANGED
@@ -21,4 +21,4 @@ test/test_other_tree_filters.rb
21
21
  test/test_parse_filter.rb
22
22
  test/test_tree_walker.rb
23
23
  test/test_word_counters.rb
24
- lib/iudex-html/iudex-html-1.0.0.jar
24
+ lib/iudex-html/iudex-html-1.1.0.jar
data/Rakefile CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
4
4
  require 'iudex-html/base'
5
5
 
6
6
  require 'rubygems'
7
- gem 'rjack-tarpit', '~> 1.2'
7
+ gem 'rjack-tarpit', '~> 1.4'
8
8
  require 'rjack-tarpit'
9
9
 
10
10
  t = RJack::TarPit.new( 'iudex-html',
@@ -13,12 +13,12 @@ t = RJack::TarPit.new( 'iudex-html',
13
13
 
14
14
  t.specify do |h|
15
15
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
- h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
16
+ h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
17
17
  [ 'rjack-nekohtml', '~> 1.9.14' ],
18
18
  [ 'gravitext-xmlprod', '~> 1.4.0' ] ]
19
19
 
20
20
  h.testlib = :minitest
21
- h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
21
+ h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
22
22
  [ 'rjack-logback', '~> 1.0' ] ]
23
23
  end
24
24
 
data/lib/iudex-html.rb CHANGED
@@ -46,9 +46,24 @@ module Iudex
46
46
  import 'iudex.html.tree.filters.CharactersNormalizer'
47
47
  import 'iudex.html.tree.filters.EmptyInlineRemover'
48
48
  import 'iudex.html.tree.filters.MetaSkipFilter'
49
+ import 'iudex.html.tree.filters.MojiBakeCleaner'
49
50
  import 'iudex.html.tree.filters.WordCounter'
50
51
  import 'iudex.html.tree.filters.WordyCounter'
51
52
  import 'iudex.html.tree.filters.XmpToPreConverter'
53
+
54
+ # Re-open iudex.html.tree.filters.MojiBakeCleaner to add config file
55
+ # based initialization.
56
+ class MojiBakeCleaner
57
+ include Iudex::Core
58
+
59
+ # Alt constructor taking a configuration file in `mojibake
60
+ # -t` format.
61
+ def initialize( config_file = :default )
62
+ args = Array( config_file ) - [ :default ]
63
+ super( *MojiBake.load_config( *args ) )
64
+ end
65
+ end
66
+
52
67
  end
53
68
  end
54
69
 
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.0.0'
19
+ VERSION = '1.1.0'
20
20
  end
21
21
  end
@@ -59,6 +59,7 @@ module Iudex
59
59
  [ XmpToPreConverter.new, # Before CharactersNormalizer
60
60
  CSSDisplayFilter.new, # Before AttributeCleaner
61
61
  AttributeCleaner.new,
62
+ MojiBakeCleaner.new,
62
63
  CharactersNormalizer.new,
63
64
  EmptyInlineRemover.new, # Depth
64
65
  WordCounter.new, # Depth; only for count deps?
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.0.0</version>
6
+ <version>1.1.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.0</version>
12
+ <version>1.1</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,7 +18,7 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.0,1.1)</version>
21
+ <version>[1.1,1.2)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
@@ -93,7 +93,7 @@ module HTMLTestHelper
93
93
 
94
94
  def content( html, charset = "UTF-8" )
95
95
  map = UniMap.new
96
- map.source = HTMLUtils::source( html.to_java_bytes, "UTF-8" )
96
+ map.source = HTMLUtils::source( html.to_java_bytes, charset )
97
97
  map
98
98
  end
99
99
 
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
2
3
  #.hashdot.profile += jruby-shortlived
3
4
 
4
5
  #--
@@ -86,9 +87,9 @@ HTML
86
87
  assert( f.has_display_none( '{display: none}' ) ) #lenient
87
88
  assert( f.has_display_none( 'other:foo; DISPLAY:NONE;' ) )
88
89
 
89
- assert( ! f.has_display_none( 'display: block' ) )
90
- assert( ! f.has_display_none( 'other-display: none' ) )
91
- assert( ! f.has_display_none( 'display: nonetheless' ) )
90
+ refute( f.has_display_none( 'display: block' ) )
91
+ refute( f.has_display_none( 'other-display: none' ) )
92
+ refute( f.has_display_none( 'display: nonetheless' ) )
92
93
  end
93
94
 
94
95
  def test_css_display_filter
@@ -114,6 +115,13 @@ HTML
114
115
  assert_transform( html, XmpToPreConverter.new )
115
116
  end
116
117
 
118
+ def test_mojibake_cleaner
119
+ html = { :in => "<div><p>ascii</p> ° </div>",
120
+ :out => "<div><p>ascii</p> ~° </div>" }
121
+
122
+ assert_transform( html, MojiBakeCleaner.new )
123
+ end
124
+
117
125
  def cut_atts( html, *atts )
118
126
  atts.each do |att|
119
127
  html = html.gsub( / #{att}="[^"]+"/, '' )
@@ -32,20 +32,20 @@ class TestParseFilter < MiniTest::Unit::TestCase
32
32
  end
33
33
 
34
34
  def test_marked
35
- assert( ! marked?( "" ) )
36
- assert( ! marked?( "simple" ) )
37
- assert( ! marked?( "<simple" ) )
38
- assert( ! marked?( "x < y" ) )
39
- assert( ! marked?( "AT&T" ) )
40
- assert( ! marked?( "AT & T;" ) )
35
+ refute( marked?( "" ) )
36
+ refute( marked?( "simple" ) )
37
+ refute( marked?( "<simple" ) )
38
+ refute( marked?( "x < y" ) )
39
+ refute( marked?( "AT&T" ) )
40
+ refute( marked?( "AT & T;" ) )
41
41
 
42
- assert( marked?( "<a>" ) )
43
- assert( marked?( "Words &copy; 2010" ) )
44
- assert( marked?( "&#xf43e;" ) )
45
- assert( marked?( "&#2028;" ) )
46
- assert( marked?( "<![CDATA[simple]]>" ) )
47
- assert( marked?( "<![CDATA[simple" ) )
48
- assert( marked?( "<!-- comment -->" ) )
42
+ assert( marked?( "<a>" ) )
43
+ assert( marked?( "Words &copy; 2010" ) )
44
+ assert( marked?( "&#xf43e;" ) )
45
+ assert( marked?( "&#2028;" ) )
46
+ assert( marked?( "<![CDATA[simple]]>" ) )
47
+ assert( marked?( "<![CDATA[simple" ) )
48
+ assert( marked?( "<!-- comment -->" ) )
49
49
  end
50
50
 
51
51
  def marked?( text )
@@ -54,6 +54,7 @@ class TestParseFilter < MiniTest::Unit::TestCase
54
54
 
55
55
  def test_markup
56
56
  tests = [ [ "simple", 0, "simple" ],
57
+ [ "AT&T", 0, "AT&T" ],
57
58
  [ "<i>inner</i>", 1, nil ],
58
59
  [ "&lt;i>inner&lt;/i>", 2, nil ],
59
60
  [ "<!--ignore-->text", 1, "text" ],
@@ -61,6 +62,21 @@ class TestParseFilter < MiniTest::Unit::TestCase
61
62
  [ "&lt;", 1, "<" ],
62
63
  [ "&amp;lt;", 2, "<" ] ]
63
64
 
65
+ @filter.min_parse = 0
66
+ tests.each do | input, count, out |
67
+ map = UniMap.new
68
+ map.title = input
69
+ assert_equal( count, @filter.parse_loop( map ), input )
70
+ assert_equal( out, map.title && map.title.to_s )
71
+ end
72
+ end
73
+
74
+ def test_markup_nochange
75
+ tests = [ [ "simple", 1, "simple" ],
76
+ [ "AT&T", 1, "AT&T" ],
77
+ [ "<i>AT&T</i>", 1, nil ] ]
78
+
79
+ @filter.min_parse = 1
64
80
  tests.each do | input, count, out |
65
81
  map = UniMap.new
66
82
  map.title = input
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.0.0
5
+ version: 1.1.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,8 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-04-04 00:00:00 -07:00
14
- default_executable:
13
+ date: 2011-11-13 00:00:00 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: iudex-core
@@ -21,7 +20,7 @@ dependencies:
21
20
  requirements:
22
21
  - - ~>
23
22
  - !ruby/object:Gem::Version
24
- version: 1.0.0
23
+ version: 1.1.0
25
24
  type: :runtime
26
25
  version_requirements: *id001
27
26
  - !ruby/object:Gem::Dependency
@@ -52,12 +51,9 @@ dependencies:
52
51
  requirement: &id004 !ruby/object:Gem::Requirement
53
52
  none: false
54
53
  requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: 1.7.1
58
- - - <
54
+ - - ~>
59
55
  - !ruby/object:Gem::Version
60
- version: "2.1"
56
+ version: "2.3"
61
57
  type: :development
62
58
  version_requirements: *id004
63
59
  - !ruby/object:Gem::Dependency
@@ -79,7 +75,7 @@ dependencies:
79
75
  requirements:
80
76
  - - ~>
81
77
  - !ruby/object:Gem::Version
82
- version: 1.3.0
78
+ version: 1.4.0
83
79
  type: :development
84
80
  version_requirements: *id006
85
81
  description: |-
@@ -120,8 +116,8 @@ files:
120
116
  - test/test_parse_filter.rb
121
117
  - test/test_tree_walker.rb
122
118
  - test/test_word_counters.rb
123
- - lib/iudex-html/iudex-html-1.0.0.jar
124
- has_rdoc: true
119
+ - lib/iudex-html/iudex-html-1.1.0.jar
120
+ - .gemtest
125
121
  homepage: http://github.com/dekellum/iudex
126
122
  licenses: []
127
123
 
@@ -146,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
142
  requirements: []
147
143
 
148
144
  rubyforge_project: iudex-html
149
- rubygems_version: 1.5.1
145
+ rubygems_version: 1.8.9
150
146
  signing_key:
151
147
  specification_version: 3
152
148
  summary: Iudex is a general purpose web crawler and feed processor in ruby/java