iudex-html 1.3.0-java → 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +8 -0
- data/Manifest.txt +1 -1
- data/README.rdoc +1 -1
- data/bin/iudex-html-clean +1 -1
- data/bin/iudex-html-perftest +1 -1
- data/build/HTML.java.erb +1 -1
- data/build/attributes +1 -1
- data/build/java_generate.rb +1 -1
- data/build/tags +1 -1
- data/lib/iudex-html.rb +1 -1
- data/lib/iudex-html/base.rb +2 -2
- data/lib/iudex-html/factory_helper.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.3.0.jar → iudex-html-1.4.0.jar} +0 -0
- data/pom.xml +4 -4
- data/test/html_test_helper.rb +1 -1
- data/test/setup.rb +1 -1
- data/test/test_characters_normalizer.rb +1 -1
- data/test/test_extract_filter.rb +1 -1
- data/test/test_factory_helper.rb +1 -1
- data/test/test_html_parser.rb +39 -1
- data/test/test_other_filters.rb +1 -1
- data/test/test_other_tree_filters.rb +1 -1
- data/test/test_parse_filter.rb +1 -1
- data/test/test_stax_parser.rb +1 -1
- data/test/test_tree_walker.rb +1 -1
- data/test/test_word_counters.rb +1 -1
- metadata +9 -9
data/History.rdoc
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
=== 1.4.0 (2013-10-29)
|
|
2
|
+
* Fix meta-tag charset parse restart handling for case of empty
|
|
3
|
+
charset and conflicting/multiple combinations of Content-Type or
|
|
4
|
+
charset meta.
|
|
5
|
+
* Upgrade to nekohtml ~> 1.9.18
|
|
6
|
+
* Upgrade to iudex-* ~> 1.4.0 dependencies
|
|
7
|
+
* Upgrade to minitest ~> 4.7.4 (dev)
|
|
8
|
+
|
|
1
9
|
=== 1.3.0 (2012-11-8)
|
|
2
10
|
* Upgrade to gravitext-xmlprod ~> 1.7.0 (and gravitext-util ~> 1.7.0)
|
|
3
11
|
* Upgrade to logback ~> 1.5 (dev)
|
data/Manifest.txt
CHANGED
data/README.rdoc
CHANGED
|
@@ -11,7 +11,7 @@ filtering, exracting text and links.
|
|
|
11
11
|
|
|
12
12
|
== License
|
|
13
13
|
|
|
14
|
-
Copyright (c) 2008-
|
|
14
|
+
Copyright (c) 2008-2013 David Kellum
|
|
15
15
|
|
|
16
16
|
Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
17
17
|
may not use this file except in compliance with the License. You
|
data/bin/iudex-html-clean
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env jruby
|
|
2
2
|
# -*- ruby -*-
|
|
3
3
|
#--
|
|
4
|
-
# Copyright (c)
|
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
7
7
|
# may not use this file except in compliance with the License. You may
|
data/bin/iudex-html-perftest
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env jruby
|
|
2
2
|
# -*- ruby -*-
|
|
3
3
|
#--
|
|
4
|
-
# Copyright (c)
|
|
4
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
5
5
|
#
|
|
6
6
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
7
7
|
# may not use this file except in compliance with the License. You may
|
data/build/HTML.java.erb
CHANGED
data/build/attributes
CHANGED
data/build/java_generate.rb
CHANGED
data/build/tags
CHANGED
data/lib/iudex-html.rb
CHANGED
data/lib/iudex-html/base.rb
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#--
|
|
2
|
-
# Copyright (c) 2008-
|
|
2
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
3
3
|
#
|
|
4
4
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
5
5
|
# may not use this file except in compliance with the License. You may
|
|
@@ -16,6 +16,6 @@
|
|
|
16
16
|
|
|
17
17
|
module Iudex
|
|
18
18
|
module HTML
|
|
19
|
-
VERSION = '1.3.0'
|
|
19
|
+
VERSION = '1.4.0'
|
|
20
20
|
end
|
|
21
21
|
end
|
|
Binary file
|
data/pom.xml
CHANGED
|
@@ -3,13 +3,13 @@
|
|
|
3
3
|
<groupId>iudex</groupId>
|
|
4
4
|
<artifactId>iudex-html</artifactId>
|
|
5
5
|
<packaging>jar</packaging>
|
|
6
|
-
<version>1.3.0</version>
|
|
6
|
+
<version>1.4.0</version>
|
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
|
8
8
|
|
|
9
9
|
<parent>
|
|
10
10
|
<groupId>iudex</groupId>
|
|
11
11
|
<artifactId>iudex-parent</artifactId>
|
|
12
|
-
<version>1.
|
|
12
|
+
<version>1.4.0</version>
|
|
13
13
|
<relativePath>..</relativePath>
|
|
14
14
|
</parent>
|
|
15
15
|
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<dependency>
|
|
19
19
|
<groupId>iudex</groupId>
|
|
20
20
|
<artifactId>iudex-core</artifactId>
|
|
21
|
-
<version>[1.
|
|
21
|
+
<version>[1.4.0,1.4.999)</version>
|
|
22
22
|
</dependency>
|
|
23
23
|
|
|
24
24
|
<dependency>
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
<dependency>
|
|
31
31
|
<groupId>net.sourceforge.nekohtml</groupId>
|
|
32
32
|
<artifactId>nekohtml</artifactId>
|
|
33
|
-
<version>1.9.
|
|
33
|
+
<version>[1.9.18,1.9.999)</version>
|
|
34
34
|
</dependency>
|
|
35
35
|
|
|
36
36
|
</dependencies>
|
data/test/html_test_helper.rb
CHANGED
data/test/setup.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
|
3
3
|
|
|
4
4
|
#--
|
|
5
|
-
# Copyright (c) 2008-
|
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
6
6
|
#
|
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_extract_filter.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_factory_helper.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_html_parser.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
|
@@ -22,6 +22,7 @@ require File.join( File.dirname( __FILE__ ), "setup" )
|
|
|
22
22
|
|
|
23
23
|
class TestHTMLParser < MiniTest::Unit::TestCase
|
|
24
24
|
include HTMLTestHelper
|
|
25
|
+
import 'iudex.util.Charsets'
|
|
25
26
|
|
|
26
27
|
HTML_META = <<HTML
|
|
27
28
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
@@ -48,11 +49,48 @@ HTML
|
|
|
48
49
|
assert_doc( alt, parse( alt, "UTF-8" ) )
|
|
49
50
|
end
|
|
50
51
|
|
|
52
|
+
def test_charset_missing
|
|
53
|
+
alt = HTML_META.sub( /; charset=utf-8/, '' )
|
|
54
|
+
assert_doc( alt, parse( alt, "UTF-8" ) )
|
|
55
|
+
end
|
|
56
|
+
|
|
51
57
|
def test_meta_charset_rerun
|
|
52
58
|
alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
|
|
53
59
|
assert_doc( alt, parse( alt, "ISO-8859-1" ) )
|
|
54
60
|
end
|
|
55
61
|
|
|
62
|
+
def test_meta_charset_conflict
|
|
63
|
+
alt = HTML_META.sub( /(<head>)/, '<head><meta charset="ISO-8859-1"/>' )
|
|
64
|
+
assert_doc( alt, parse( alt, "ISO-8859-1" ) )
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def test_meta_charset_conflict_2
|
|
68
|
+
alt = HTML_META.sub( /utf-8/, 'latin1' )
|
|
69
|
+
alt = HTML_META.sub( /(<\/head>)/, '<meta charset="UTF-8"/></head>' )
|
|
70
|
+
assert_doc( alt, parse( alt, "ISO-8859-1" ) )
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
HTML_META_2 = <<HTML
|
|
74
|
+
<!DOCTYPE html>
|
|
75
|
+
<head>
|
|
76
|
+
<title>Page with skipped head tags</title>
|
|
77
|
+
<meta name="description" content="">
|
|
78
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
|
79
|
+
<script type="text/javascript" src="../js/swfobject.js"></script>
|
|
80
|
+
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
|
81
|
+
</head>
|
|
82
|
+
<body>
|
|
83
|
+
<p>Iudex test.</p>
|
|
84
|
+
</body>
|
|
85
|
+
</html>
|
|
86
|
+
HTML
|
|
87
|
+
|
|
88
|
+
def test_charset_conflict_3
|
|
89
|
+
src = source( HTML_META_2, "Windows-1252" )
|
|
90
|
+
src.set_default_encoding( Charsets::UTF_8, 0.10 )
|
|
91
|
+
assert( HTMLUtils::parse( src ) )
|
|
92
|
+
end
|
|
93
|
+
|
|
56
94
|
HTML_SKIP_TAGS = <<HTML
|
|
57
95
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
58
96
|
<head>
|
data/test/test_other_filters.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_parse_filter.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
|
3
3
|
|
|
4
4
|
#--
|
|
5
|
-
# Copyright (c) 2008-
|
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
6
6
|
#
|
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_stax_parser.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c)
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
data/test/test_tree_walker.rb
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
#.hashdot.profile += jruby-shortlived
|
|
3
3
|
|
|
4
4
|
#--
|
|
5
|
-
# Copyright (c) 2008-
|
|
5
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
6
6
|
#
|
|
7
7
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
8
8
|
# may not use this file except in compliance with the License. You
|
data/test/test_word_counters.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#.hashdot.profile += jruby-shortlived
|
|
4
4
|
|
|
5
5
|
#--
|
|
6
|
-
# Copyright (c) 2008-
|
|
6
|
+
# Copyright (c) 2008-2013 David Kellum
|
|
7
7
|
#
|
|
8
8
|
# Licensed under the Apache License, Version 2.0 (the "License"); you
|
|
9
9
|
# may not use this file except in compliance with the License. You
|
metadata
CHANGED
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
name: iudex-html
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease:
|
|
5
|
-
version: 1.3.0
|
|
5
|
+
version: 1.4.0
|
|
6
6
|
platform: java
|
|
7
7
|
authors:
|
|
8
8
|
- David Kellum
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date:
|
|
12
|
+
date: 2013-10-30 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: iudex-core
|
|
@@ -17,13 +17,13 @@ dependencies:
|
|
|
17
17
|
requirements:
|
|
18
18
|
- - ~>
|
|
19
19
|
- !ruby/object:Gem::Version
|
|
20
|
-
version: 1.
|
|
20
|
+
version: 1.4.0
|
|
21
21
|
none: false
|
|
22
22
|
requirement: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - ~>
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 1.
|
|
26
|
+
version: 1.4.0
|
|
27
27
|
none: false
|
|
28
28
|
prerelease: false
|
|
29
29
|
type: :runtime
|
|
@@ -33,13 +33,13 @@ dependencies:
|
|
|
33
33
|
requirements:
|
|
34
34
|
- - ~>
|
|
35
35
|
- !ruby/object:Gem::Version
|
|
36
|
-
version: 1.9.
|
|
36
|
+
version: 1.9.18
|
|
37
37
|
none: false
|
|
38
38
|
requirement: !ruby/object:Gem::Requirement
|
|
39
39
|
requirements:
|
|
40
40
|
- - ~>
|
|
41
41
|
- !ruby/object:Gem::Version
|
|
42
|
-
version: 1.9.
|
|
42
|
+
version: 1.9.18
|
|
43
43
|
none: false
|
|
44
44
|
prerelease: false
|
|
45
45
|
type: :runtime
|
|
@@ -65,13 +65,13 @@ dependencies:
|
|
|
65
65
|
requirements:
|
|
66
66
|
- - ~>
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version:
|
|
68
|
+
version: 4.7.4
|
|
69
69
|
none: false
|
|
70
70
|
requirement: !ruby/object:Gem::Requirement
|
|
71
71
|
requirements:
|
|
72
72
|
- - ~>
|
|
73
73
|
- !ruby/object:Gem::Version
|
|
74
|
-
version:
|
|
74
|
+
version: 4.7.4
|
|
75
75
|
none: false
|
|
76
76
|
prerelease: false
|
|
77
77
|
type: :development
|
|
@@ -145,7 +145,7 @@ files:
|
|
|
145
145
|
- test/test_stax_parser.rb
|
|
146
146
|
- test/test_tree_walker.rb
|
|
147
147
|
- test/test_word_counters.rb
|
|
148
|
-
- lib/iudex-html/iudex-html-1.3.0.jar
|
|
148
|
+
- lib/iudex-html/iudex-html-1.4.0.jar
|
|
149
149
|
homepage: http://iudex.gravitext.com
|
|
150
150
|
licenses: []
|
|
151
151
|
post_install_message:
|