iudex-html 1.0.0-java → 1.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/.gemtest ADDED
File without changes
data/History.rdoc CHANGED
@@ -1,2 +1,9 @@
1
+ === 1.1.0 (2011-11-13)
2
+ * Update to iudex-core ~> 1.1.0
3
+ * Add NekoHTMLParser charset expansion and ContentSource encoding
4
+ confidence support
5
+ * Add MojiBakeCleaner tree filter, helper support
6
+ * Update to minitest ~> 2.3
7
+
1
8
  === 1.0.0 (2011-04-04)
2
9
  * Initial release.
data/Manifest.txt CHANGED
@@ -21,4 +21,4 @@ test/test_other_tree_filters.rb
21
21
  test/test_parse_filter.rb
22
22
  test/test_tree_walker.rb
23
23
  test/test_word_counters.rb
24
- lib/iudex-html/iudex-html-1.0.0.jar
24
+ lib/iudex-html/iudex-html-1.1.0.jar
data/Rakefile CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH << './lib'
4
4
  require 'iudex-html/base'
5
5
 
6
6
  require 'rubygems'
7
- gem 'rjack-tarpit', '~> 1.2'
7
+ gem 'rjack-tarpit', '~> 1.4'
8
8
  require 'rjack-tarpit'
9
9
 
10
10
  t = RJack::TarPit.new( 'iudex-html',
@@ -13,12 +13,12 @@ t = RJack::TarPit.new( 'iudex-html',
13
13
 
14
14
  t.specify do |h|
15
15
  h.developer( "David Kellum", "dek-oss@gravitext.com" )
16
- h.extra_deps += [ [ 'iudex-core', '~> 1.0.0' ],
16
+ h.extra_deps += [ [ 'iudex-core', '~> 1.1.0' ],
17
17
  [ 'rjack-nekohtml', '~> 1.9.14' ],
18
18
  [ 'gravitext-xmlprod', '~> 1.4.0' ] ]
19
19
 
20
20
  h.testlib = :minitest
21
- h.extra_dev_deps += [ [ 'minitest', '>= 1.7.1', '< 2.1' ],
21
+ h.extra_dev_deps += [ [ 'minitest', '~> 2.3' ],
22
22
  [ 'rjack-logback', '~> 1.0' ] ]
23
23
  end
24
24
 
data/lib/iudex-html.rb CHANGED
@@ -46,9 +46,24 @@ module Iudex
46
46
  import 'iudex.html.tree.filters.CharactersNormalizer'
47
47
  import 'iudex.html.tree.filters.EmptyInlineRemover'
48
48
  import 'iudex.html.tree.filters.MetaSkipFilter'
49
+ import 'iudex.html.tree.filters.MojiBakeCleaner'
49
50
  import 'iudex.html.tree.filters.WordCounter'
50
51
  import 'iudex.html.tree.filters.WordyCounter'
51
52
  import 'iudex.html.tree.filters.XmpToPreConverter'
53
+
54
+ # Re-open iudex.html.tree.filters.MojiBakeCleaner to add config file
55
+ # based initialization.
56
+ class MojiBakeCleaner
57
+ include Iudex::Core
58
+
59
+ # Alt constructor taking a configuration file in `mojibake
60
+ # -t` format.
61
+ def initialize( config_file = :default )
62
+ args = Array( config_file ) - [ :default ]
63
+ super( *MojiBake.load_config( *args ) )
64
+ end
65
+ end
66
+
52
67
  end
53
68
  end
54
69
 
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.0.0'
19
+ VERSION = '1.1.0'
20
20
  end
21
21
  end
@@ -59,6 +59,7 @@ module Iudex
59
59
  [ XmpToPreConverter.new, # Before CharactersNormalizer
60
60
  CSSDisplayFilter.new, # Before AttributeCleaner
61
61
  AttributeCleaner.new,
62
+ MojiBakeCleaner.new,
62
63
  CharactersNormalizer.new,
63
64
  EmptyInlineRemover.new, # Depth
64
65
  WordCounter.new, # Depth; only for count deps?
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.0.0</version>
6
+ <version>1.1.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.0</version>
12
+ <version>1.1</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,7 +18,7 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.0,1.1)</version>
21
+ <version>[1.1,1.2)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
@@ -93,7 +93,7 @@ module HTMLTestHelper
93
93
 
94
94
  def content( html, charset = "UTF-8" )
95
95
  map = UniMap.new
96
- map.source = HTMLUtils::source( html.to_java_bytes, "UTF-8" )
96
+ map.source = HTMLUtils::source( html.to_java_bytes, charset )
97
97
  map
98
98
  end
99
99
 
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env jruby
2
+ # -*- coding: utf-8 -*-
2
3
  #.hashdot.profile += jruby-shortlived
3
4
 
4
5
  #--
@@ -86,9 +87,9 @@ HTML
86
87
  assert( f.has_display_none( '{display: none}' ) ) #lenient
87
88
  assert( f.has_display_none( 'other:foo; DISPLAY:NONE;' ) )
88
89
 
89
- assert( ! f.has_display_none( 'display: block' ) )
90
- assert( ! f.has_display_none( 'other-display: none' ) )
91
- assert( ! f.has_display_none( 'display: nonetheless' ) )
90
+ refute( f.has_display_none( 'display: block' ) )
91
+ refute( f.has_display_none( 'other-display: none' ) )
92
+ refute( f.has_display_none( 'display: nonetheless' ) )
92
93
  end
93
94
 
94
95
  def test_css_display_filter
@@ -114,6 +115,13 @@ HTML
114
115
  assert_transform( html, XmpToPreConverter.new )
115
116
  end
116
117
 
118
+ def test_mojibake_cleaner
119
+ html = { :in => "<div><p>ascii</p> ° </div>",
120
+ :out => "<div><p>ascii</p> ~° </div>" }
121
+
122
+ assert_transform( html, MojiBakeCleaner.new )
123
+ end
124
+
117
125
  def cut_atts( html, *atts )
118
126
  atts.each do |att|
119
127
  html = html.gsub( / #{att}="[^"]+"/, '' )
@@ -32,20 +32,20 @@ class TestParseFilter < MiniTest::Unit::TestCase
32
32
  end
33
33
 
34
34
  def test_marked
35
- assert( ! marked?( "" ) )
36
- assert( ! marked?( "simple" ) )
37
- assert( ! marked?( "<simple" ) )
38
- assert( ! marked?( "x < y" ) )
39
- assert( ! marked?( "AT&T" ) )
40
- assert( ! marked?( "AT & T;" ) )
35
+ refute( marked?( "" ) )
36
+ refute( marked?( "simple" ) )
37
+ refute( marked?( "<simple" ) )
38
+ refute( marked?( "x < y" ) )
39
+ refute( marked?( "AT&T" ) )
40
+ refute( marked?( "AT & T;" ) )
41
41
 
42
- assert( marked?( "<a>" ) )
43
- assert( marked?( "Words &copy; 2010" ) )
44
- assert( marked?( "&#xf43e;" ) )
45
- assert( marked?( "&#2028;" ) )
46
- assert( marked?( "<![CDATA[simple]]>" ) )
47
- assert( marked?( "<![CDATA[simple" ) )
48
- assert( marked?( "<!-- comment -->" ) )
42
+ assert( marked?( "<a>" ) )
43
+ assert( marked?( "Words &copy; 2010" ) )
44
+ assert( marked?( "&#xf43e;" ) )
45
+ assert( marked?( "&#2028;" ) )
46
+ assert( marked?( "<![CDATA[simple]]>" ) )
47
+ assert( marked?( "<![CDATA[simple" ) )
48
+ assert( marked?( "<!-- comment -->" ) )
49
49
  end
50
50
 
51
51
  def marked?( text )
@@ -54,6 +54,7 @@ class TestParseFilter < MiniTest::Unit::TestCase
54
54
 
55
55
  def test_markup
56
56
  tests = [ [ "simple", 0, "simple" ],
57
+ [ "AT&T", 0, "AT&T" ],
57
58
  [ "<i>inner</i>", 1, nil ],
58
59
  [ "&lt;i>inner&lt;/i>", 2, nil ],
59
60
  [ "<!--ignore-->text", 1, "text" ],
@@ -61,6 +62,21 @@ class TestParseFilter < MiniTest::Unit::TestCase
61
62
  [ "&lt;", 1, "<" ],
62
63
  [ "&amp;lt;", 2, "<" ] ]
63
64
 
65
+ @filter.min_parse = 0
66
+ tests.each do | input, count, out |
67
+ map = UniMap.new
68
+ map.title = input
69
+ assert_equal( count, @filter.parse_loop( map ), input )
70
+ assert_equal( out, map.title && map.title.to_s )
71
+ end
72
+ end
73
+
74
+ def test_markup_nochange
75
+ tests = [ [ "simple", 1, "simple" ],
76
+ [ "AT&T", 1, "AT&T" ],
77
+ [ "<i>AT&T</i>", 1, nil ] ]
78
+
79
+ @filter.min_parse = 1
64
80
  tests.each do | input, count, out |
65
81
  map = UniMap.new
66
82
  map.title = input
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.0.0
5
+ version: 1.1.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
@@ -10,8 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-04-04 00:00:00 -07:00
14
- default_executable:
13
+ date: 2011-11-13 00:00:00 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: iudex-core
@@ -21,7 +20,7 @@ dependencies:
21
20
  requirements:
22
21
  - - ~>
23
22
  - !ruby/object:Gem::Version
24
- version: 1.0.0
23
+ version: 1.1.0
25
24
  type: :runtime
26
25
  version_requirements: *id001
27
26
  - !ruby/object:Gem::Dependency
@@ -52,12 +51,9 @@ dependencies:
52
51
  requirement: &id004 !ruby/object:Gem::Requirement
53
52
  none: false
54
53
  requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: 1.7.1
58
- - - <
54
+ - - ~>
59
55
  - !ruby/object:Gem::Version
60
- version: "2.1"
56
+ version: "2.3"
61
57
  type: :development
62
58
  version_requirements: *id004
63
59
  - !ruby/object:Gem::Dependency
@@ -79,7 +75,7 @@ dependencies:
79
75
  requirements:
80
76
  - - ~>
81
77
  - !ruby/object:Gem::Version
82
- version: 1.3.0
78
+ version: 1.4.0
83
79
  type: :development
84
80
  version_requirements: *id006
85
81
  description: |-
@@ -120,8 +116,8 @@ files:
120
116
  - test/test_parse_filter.rb
121
117
  - test/test_tree_walker.rb
122
118
  - test/test_word_counters.rb
123
- - lib/iudex-html/iudex-html-1.0.0.jar
124
- has_rdoc: true
119
+ - lib/iudex-html/iudex-html-1.1.0.jar
120
+ - .gemtest
125
121
  homepage: http://github.com/dekellum/iudex
126
122
  licenses: []
127
123
 
@@ -146,7 +142,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
146
142
  requirements: []
147
143
 
148
144
  rubyforge_project: iudex-html
149
- rubygems_version: 1.5.1
145
+ rubygems_version: 1.8.9
150
146
  signing_key:
151
147
  specification_version: 3
152
148
  summary: Iudex is a general purpose web crawler and feed processor in ruby/java