iudex-html 1.3.0-java → 1.4.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,11 @@
1
+ === 1.4.0 (2013-10-29)
2
+ * Fix meta-tag charset parse restart handling for case of empty
3
+ charset and conflicting/multiple combinations of Content-Type or
4
+ charset meta.
5
+ * Upgrade to nekohtml ~> 1.9.18
6
+ * Upgrade to iudex-* ~> 1.4.0 dependencies
7
+ * Upgrade to minitest ~> 4.7.4 (dev)
8
+
1
9
  === 1.3.0 (2012-11-8)
2
10
  * Upgrade to gravitext-xmlprod ~> 1.7.0 (and gravitext-util ~> 1.7.0)
3
11
  * Upgrade to logback ~> 1.5 (dev)
@@ -25,4 +25,4 @@ test/test_parse_filter.rb
25
25
  test/test_stax_parser.rb
26
26
  test/test_tree_walker.rb
27
27
  test/test_word_counters.rb
28
- lib/iudex-html/iudex-html-1.3.0.jar
28
+ lib/iudex-html/iudex-html-1.4.0.jar
@@ -11,7 +11,7 @@ filtering, extracting text and links.
11
11
 
12
12
  == License
13
13
 
14
- Copyright (c) 2008-2012 David Kellum
14
+ Copyright (c) 2008-2013 David Kellum
15
15
 
16
16
  Licensed under the Apache License, Version 2.0 (the "License"); you
17
17
  may not use this file except in compliance with the License. You
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You may
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env jruby
2
2
  # -*- ruby -*-
3
3
  #--
4
- # Copyright (c) 2012 David Kellum
4
+ # Copyright (c) 2008-2013 David Kellum
5
5
  #
6
6
  # Licensed under the Apache License, Version 2.0 (the "License"); you
7
7
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2008-2012 David Kellum
2
+ * Copyright (c) 2008-2013 David Kellum
3
3
  *
4
4
  * Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  * may not use this file except in compliance with the License. You may
@@ -1,6 +1,6 @@
1
1
  # HTML Attributes
2
2
  #
3
- # Copyright (c) 2008-2012 David Kellum
3
+ # Copyright (c) 2008-2013 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -2,7 +2,7 @@
2
2
  # -*- ruby -*-
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You may
data/build/tags CHANGED
@@ -1,6 +1,6 @@
1
1
  # HTML Tags
2
2
  #
3
- # Copyright (c) 2008-2012 David Kellum
3
+ # Copyright (c) 2008-2013 David Kellum
4
4
  #
5
5
  # Licensed under the Apache License, Version 2.0 (the "License"); you
6
6
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
@@ -16,6 +16,6 @@
16
16
 
17
17
  module Iudex
18
18
  module HTML
19
- VERSION = '1.3.0'
19
+ VERSION = '1.4.0'
20
20
  end
21
21
  end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You may
data/pom.xml CHANGED
@@ -3,13 +3,13 @@
3
3
  <groupId>iudex</groupId>
4
4
  <artifactId>iudex-html</artifactId>
5
5
  <packaging>jar</packaging>
6
- <version>1.3.0</version>
6
+ <version>1.4.0</version>
7
7
  <name>Iudex HTML parsing/filtering and text extraction</name>
8
8
 
9
9
  <parent>
10
10
  <groupId>iudex</groupId>
11
11
  <artifactId>iudex-parent</artifactId>
12
- <version>1.3.0</version>
12
+ <version>1.4.0</version>
13
13
  <relativePath>..</relativePath>
14
14
  </parent>
15
15
 
@@ -18,7 +18,7 @@
18
18
  <dependency>
19
19
  <groupId>iudex</groupId>
20
20
  <artifactId>iudex-core</artifactId>
21
- <version>[1.3.0,1.3.999)</version>
21
+ <version>[1.4.0,1.4.999)</version>
22
22
  </dependency>
23
23
 
24
24
  <dependency>
@@ -30,7 +30,7 @@
30
30
  <dependency>
31
31
  <groupId>net.sourceforge.nekohtml</groupId>
32
32
  <artifactId>nekohtml</artifactId>
33
- <version>1.9.14</version>
33
+ <version>[1.9.18,1.9.999)</version>
34
34
  </dependency>
35
35
 
36
36
  </dependencies>
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright (c) 2008-2012 David Kellum
2
+ # Copyright (c) 2008-2013 David Kellum
3
3
  #
4
4
  # Licensed under the Apache License, Version 2.0 (the "License"); you
5
5
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -22,6 +22,7 @@ require File.join( File.dirname( __FILE__ ), "setup" )
22
22
 
23
23
  class TestHTMLParser < MiniTest::Unit::TestCase
24
24
  include HTMLTestHelper
25
+ import 'iudex.util.Charsets'
25
26
 
26
27
  HTML_META = <<HTML
27
28
  <html xmlns="http://www.w3.org/1999/xhtml">
@@ -48,11 +49,48 @@ HTML
48
49
  assert_doc( alt, parse( alt, "UTF-8" ) )
49
50
  end
50
51
 
52
+ def test_charset_missing
53
+ alt = HTML_META.sub( /; charset=utf-8/, '' )
54
+ assert_doc( alt, parse( alt, "UTF-8" ) )
55
+ end
56
+
51
57
  def test_meta_charset_rerun
52
58
  alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
53
59
  assert_doc( alt, parse( alt, "ISO-8859-1" ) )
54
60
  end
55
61
 
62
+ def test_meta_charset_conflict
63
+ alt = HTML_META.sub( /(<head>)/, '<head><meta charset="ISO-8859-1"/>' )
64
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
65
+ end
66
+
67
+ def test_meta_charset_conflict_2
68
+ alt = HTML_META.sub( /utf-8/, 'latin1' )
69
+ alt = HTML_META.sub( /(<\/head>)/, '<meta charset="UTF-8"/></head>' )
70
+ assert_doc( alt, parse( alt, "ISO-8859-1" ) )
71
+ end
72
+
73
+ HTML_META_2 = <<HTML
74
+ <!DOCTYPE html>
75
+ <head>
76
+ <title>Page with skipped head tags</title>
77
+ <meta name="description" content="">
78
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
79
+ <script type="text/javascript" src="../js/swfobject.js"></script>
80
+ <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
81
+ </head>
82
+ <body>
83
+ <p>Iudex test.</p>
84
+ </body>
85
+ </html>
86
+ HTML
87
+
88
+ def test_charset_conflict_3
89
+ src = source( HTML_META_2, "Windows-1252" )
90
+ src.set_default_encoding( Charsets::UTF_8, 0.10 )
91
+ assert( HTMLUtils::parse( src ) )
92
+ end
93
+
56
94
  HTML_SKIP_TAGS = <<HTML
57
95
  <html xmlns="http://www.w3.org/1999/xhtml">
58
96
  <head>
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
@@ -2,7 +2,7 @@
2
2
  #.hashdot.profile += jruby-shortlived
3
3
 
4
4
  #--
5
- # Copyright (c) 2008-2012 David Kellum
5
+ # Copyright (c) 2008-2013 David Kellum
6
6
  #
7
7
  # Licensed under the Apache License, Version 2.0 (the "License"); you
8
8
  # may not use this file except in compliance with the License. You
@@ -3,7 +3,7 @@
3
3
  #.hashdot.profile += jruby-shortlived
4
4
 
5
5
  #--
6
- # Copyright (c) 2008-2012 David Kellum
6
+ # Copyright (c) 2008-2013 David Kellum
7
7
  #
8
8
  # Licensed under the Apache License, Version 2.0 (the "License"); you
9
9
  # may not use this file except in compliance with the License. You
metadata CHANGED
@@ -2,14 +2,14 @@
2
2
  name: iudex-html
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 1.3.0
5
+ version: 1.4.0
6
6
  platform: java
7
7
  authors:
8
8
  - David Kellum
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-11-08 00:00:00.000000000 Z
12
+ date: 2013-10-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: iudex-core
@@ -17,13 +17,13 @@ dependencies:
17
17
  requirements:
18
18
  - - ~>
19
19
  - !ruby/object:Gem::Version
20
- version: 1.3.0
20
+ version: 1.4.0
21
21
  none: false
22
22
  requirement: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: 1.3.0
26
+ version: 1.4.0
27
27
  none: false
28
28
  prerelease: false
29
29
  type: :runtime
@@ -33,13 +33,13 @@ dependencies:
33
33
  requirements:
34
34
  - - ~>
35
35
  - !ruby/object:Gem::Version
36
- version: 1.9.14
36
+ version: 1.9.18
37
37
  none: false
38
38
  requirement: !ruby/object:Gem::Requirement
39
39
  requirements:
40
40
  - - ~>
41
41
  - !ruby/object:Gem::Version
42
- version: 1.9.14
42
+ version: 1.9.18
43
43
  none: false
44
44
  prerelease: false
45
45
  type: :runtime
@@ -65,13 +65,13 @@ dependencies:
65
65
  requirements:
66
66
  - - ~>
67
67
  - !ruby/object:Gem::Version
68
- version: '2.3'
68
+ version: 4.7.4
69
69
  none: false
70
70
  requirement: !ruby/object:Gem::Requirement
71
71
  requirements:
72
72
  - - ~>
73
73
  - !ruby/object:Gem::Version
74
- version: '2.3'
74
+ version: 4.7.4
75
75
  none: false
76
76
  prerelease: false
77
77
  type: :development
@@ -145,7 +145,7 @@ files:
145
145
  - test/test_stax_parser.rb
146
146
  - test/test_tree_walker.rb
147
147
  - test/test_word_counters.rb
148
- - lib/iudex-html/iudex-html-1.3.0.jar
148
+ - lib/iudex-html/iudex-html-1.4.0.jar
149
149
  homepage: http://iudex.gravitext.com
150
150
  licenses: []
151
151
  post_install_message: