iudex-html 1.2.b.1-java → 1.2.b.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +4 -0
- data/Manifest.txt +1 -1
- data/lib/iudex-html/base.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.2.b.1.jar → iudex-html-1.2.b.2.jar} +0 -0
- data/pom.xml +2 -2
- data/test/test_html_parser.rb +16 -0
- metadata +7 -4
data/History.rdoc
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
=== 1.2.b.2 (2012-6-23)
|
2
|
+
* Fix #8: Add Neko parser check for empty (malformed) attribute names.
|
3
|
+
* Extend/upgrade to gravitext-xmlprod [1.5.1,1.7)
|
4
|
+
|
1
5
|
=== 1.2.b.1 (2012-5-31)
|
2
6
|
* Add support for HTML 5 (draft) tags, attributes
|
3
7
|
* Neko parser support for HTML 5 <meta charset>
|
data/Manifest.txt
CHANGED
data/lib/iudex-html/base.rb
CHANGED
Binary file
|
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-html</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.2.b.1</version>
|
6
|
+
<version>1.2.b.2</version>
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
8
|
|
9
9
|
<parent>
|
@@ -24,7 +24,7 @@
|
|
24
24
|
<dependency>
|
25
25
|
<groupId>com.gravitext</groupId>
|
26
26
|
<artifactId>gravitext-xmlprod</artifactId>
|
27
|
-
<version>[1.5.1,1.
|
27
|
+
<version>[1.5.1,1.6.9999)</version>
|
28
28
|
</dependency>
|
29
29
|
|
30
30
|
<dependency>
|
data/test/test_html_parser.rb
CHANGED
@@ -150,4 +150,20 @@ HTML
|
|
150
150
|
assert_fragment( html[ :out ], tree )
|
151
151
|
end
|
152
152
|
|
153
|
+
import 'iudex.html.neko.NekoHTMLParser'
|
154
|
+
|
155
|
+
# Neko yields attributes with empty localName, given this invalid
|
156
|
+
# input (#8)
|
157
|
+
def test_invalid_attribute
|
158
|
+
html = { :in => '<div><img alt=""wns : next class="artwork" /></div>',
|
159
|
+
:out => '<div><img alt="" wns="" next="" class="artwork"/></div>' }
|
160
|
+
|
161
|
+
parser = NekoHTMLParser.new
|
162
|
+
parser.parse_as_fragment = true
|
163
|
+
parser.skip_banned = false # required to reproduce empty localName
|
164
|
+
|
165
|
+
tree = inner( parser.parse( source( html[ :in ], "UTF-8" ) ) )
|
166
|
+
assert_fragment( html[ :out ], tree )
|
167
|
+
end
|
168
|
+
|
153
169
|
end
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: 4
|
5
|
-
version: 1.2.b.1
|
5
|
+
version: 1.2.b.2
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-06-
|
13
|
+
date: 2012-06-23 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: iudex-core
|
@@ -39,9 +39,12 @@ dependencies:
|
|
39
39
|
version_requirements: &id003 !ruby/object:Gem::Requirement
|
40
40
|
none: false
|
41
41
|
requirements:
|
42
|
-
- -
|
42
|
+
- - ">="
|
43
43
|
- !ruby/object:Gem::Version
|
44
44
|
version: 1.5.1
|
45
|
+
- - <
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: "1.7"
|
45
48
|
requirement: *id003
|
46
49
|
prerelease: false
|
47
50
|
type: :runtime
|
@@ -117,7 +120,7 @@ files:
|
|
117
120
|
- test/test_stax_parser.rb
|
118
121
|
- test/test_tree_walker.rb
|
119
122
|
- test/test_word_counters.rb
|
120
|
-
- lib/iudex-html/iudex-html-1.2.b.1.jar
|
123
|
+
- lib/iudex-html/iudex-html-1.2.b.2.jar
|
121
124
|
homepage: http://github.com/dekellum/iudex
|
122
125
|
licenses: []
|
123
126
|
|