iudex-html 1.3.0-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,11 @@
1
+ === 1.4.0 (2013-10-29)
2
+ * Fix meta-tag charset parse restart handling for case of empty
3
+ charset and conflicting/multiple combinations of Content-Type or
4
+ charset meta.
5
+ * Upgrade to nekohtml ~> 1.9.18
6
+ * Upgrade to iudex-* ~> 1.4.0 dependencies
7
+ * Upgrade to minitest ~> 4.7.4 (dev)
8
+
1
9
  === 1.3.0 (2012-11-8)
2
10
  * Upgrade to gravitext-xmlprod ~> 1.7.0 (and gravitext-util ~> 1.7.0)
3
11
  * Upgrade to logback ~> 1.5 (dev)
@@ -25,4 +25,4 @@ test/test_parse_filter.rb
25
25
  test/test_stax_parser.rb
26
26
  test/test_tree_walker.rb
27
27
  test/test_word_counters.rb
28
- lib/iudex-html/iudex-html-1.3.0.jar
28
+ lib/iudex-html/iudex-html-1.4.0.jar
@@ -11,7 +11,7 @@ filtering, extracting text and links.
11
11
 
12
12
  == License
13
13
 
14
- Copyright (c) 2008-2012 David Kellum
14
+ Copyright (c) 2008-2013 David Kellum
15
15
 
16
16
  Licensed under the Apache License, Version 2.0 (the "License"); you
17
17
  may not use this file except in compliance with the License. You
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You may
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2008-2012 David Kellum
2
+ * Copyright (c) 2008-2013 David Kellum
3
3
  *
4
4
  * Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  * may not use this file except in compliance with the License. You may
@@ -1,6 +1,6 @@
1
1
  # HTML Attributes
2
2
  #
3
- # Copyright (c) 2008-2012 David Kellum
3
+ # Copyright (c) 2008-2013 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  # -*- ruby -*-
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
data/build/tags CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Tags
2
2
  #
3
- # Copyright (c) 2008-2012 David Kellum
3
+ # Copyright (c) 2008-2013 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.3.0'
19
+ VERSION = '1.4.0'
20
20
  end
21
21
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.3.0</version>
6
+ <version>1.4.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.3.0</version>
12
+ <version>1.4.0</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,7 +18,7 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.3.0,1.3.999)</version>
21
+ <version>[1.4.0,1.4.999)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
@@ -30,7 +30,7 @@
30
30
  <dependency>
31
31
  <groupId>net.sourceforge.nekohtml</groupId>
32
32
  <artifactId>nekohtml</artifactId>
33
- <version>1.9.14</version>
33
+ <version>[1.9.18,1.9.999)</version>
34
34
  </dependency>
35
35
 
36
36
  </dependencies>
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -22,6 +22,7 @@ require File.join( File.dirname( __FILE__ ), "setup" )
22
22
 
23
23
  class TestHTMLParser < MiniTest::Unit::TestCase
24
24
  include HTMLTestHelper
25
+ import 'iudex.util.Charsets'
25
26
 
26
27
  HTML_META = <<HTML
27
28
  <html xmlns="http://www.w3.org/1999/xhtml">
@@ -48,11 +49,48 @@ HTML
48
49
  assert_doc( alt, parse( alt, "UTF-8" ) )
49
50
  end
50
51
 
52
+ def test_charset_missing
53
+ alt = HTML_META.sub( /; charset=utf-8/, '' )
54
+ assert_doc( alt, parse( alt, "UTF-8" ) )
55
+ end
56
+
51
57
  def test_meta_charset_rerun
52
58
  alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
53
59
  assert_doc( alt, parse( alt, "ISO-8859-1" ) )
54
60
  end
55
61
 
62
+ def test_meta_charset_conflict
63
+ alt = HTML_META.sub( /(<head>)/, '<head><meta charset="ISO-8859-1"/>' )
64
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
65
+ end
66
+
67
+ def test_meta_charset_conflict_2
68
+ alt = HTML_META.sub( /utf-8/, 'latin1' )
69
+ alt = HTML_META.sub( /(<\/head>)/, '<meta charset="UTF-8"/></head>' )
70
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
71
+ end
72
+
73
+ HTML_META_2 = <<HTML
74
+ <!DOCTYPE html>
75
+ <head>
76
+ <title>Page with skipped head tags</title>
77
+ <meta name="description" content="">
78
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
79
+ <script type="text/javascript" src="../js/swfobject.js"></script>
80
+ <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
81
+ </head>
82
+ <body>
83
+ <p>Iudex test.</p>
84
+ </body>
85
+ </html>
86
+ HTML
87
+
88
+ def test_charset_conflict_3
89
+ src = source( HTML_META_2, "Windows-1252" )
90
+ src.set_default_encoding( Charsets::UTF_8, 0.10 )
91
+ assert( HTMLUtils::parse( src ) )
92
+ end
93
+
56
94
  HTML_SKIP_TAGS = <<HTML
57
95
  <html xmlns="http://www.w3.org/1999/xhtml">
58
96
  <head>
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.0
5
+ version: 1.4.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
12
+ date: 2013-10-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
@@ -17,13 +17,13 @@ dependencies:
17
17
  requirements:
18
18
  - - ~>
19
19
  - !ruby/object:Gem::Version
20
- version: 1.3.0
20
+ version: 1.4.0
21
21
  none: false
22
22
  requirement: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: 1.3.0
26
+ version: 1.4.0
27
27
  none: false
28
28
  prerelease: false
29
29
  type: :runtime
@@ -33,13 +33,13 @@ dependencies:
33
33
  requirements:
34
34
  - - ~>
35
35
  - !ruby/object:Gem::Version
36
- version: 1.9.14
36
+ version: 1.9.18
37
37
  none: false
38
38
  requirement: !ruby/object:Gem::Requirement
39
39
  requirements:
40
40
  - - ~>
41
41
  - !ruby/object:Gem::Version
42
- version: 1.9.14
42
+ version: 1.9.18
43
43
  none: false
44
44
  prerelease: false
45
45
  type: :runtime
@@ -65,13 +65,13 @@ dependencies:
65
65
  requirements:
66
66
  - - ~>
67
67
  - !ruby/object:Gem::Version
68
- version: '2.3'
68
+ version: 4.7.4
69
69
  none: false
70
70
  requirement: !ruby/object:Gem::Requirement
71
71
  requirements:
72
72
  - - ~>
73
73
  - !ruby/object:Gem::Version
74
- version: '2.3'
74
+ version: 4.7.4
75
75
  none: false
76
76
  prerelease: false
77
77
  type: :development
@@ -145,7 +145,7 @@ files:
145
145
  - test/test_stax_parser.rb
146
146
  - test/test_tree_walker.rb
147
147
  - test/test_word_counters.rb
148
- - lib/iudex-html/iudex-html-1.3.0.jar
148
+ - lib/iudex-html/iudex-html-1.4.0.jar
149
149
  homepage: http://iudex.gravitext.com
150
150
  licenses: []
151
151
  post_install_message: