nokogiri 1.10.8-java → 1.11.0.rc3-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +24 -22
  3. data/ext/java/nokogiri/HtmlDocument.java +34 -46
  4. data/ext/java/nokogiri/HtmlSaxParserContext.java +87 -57
  5. data/ext/java/nokogiri/NokogiriService.java +1 -1
  6. data/ext/java/nokogiri/XmlAttr.java +13 -20
  7. data/ext/java/nokogiri/XmlAttributeDecl.java +11 -12
  8. data/ext/java/nokogiri/XmlCdata.java +3 -4
  9. data/ext/java/nokogiri/XmlComment.java +1 -1
  10. data/ext/java/nokogiri/XmlDocument.java +148 -175
  11. data/ext/java/nokogiri/XmlDocumentFragment.java +13 -31
  12. data/ext/java/nokogiri/XmlDtd.java +5 -8
  13. data/ext/java/nokogiri/XmlElement.java +1 -20
  14. data/ext/java/nokogiri/XmlElementDecl.java +23 -28
  15. data/ext/java/nokogiri/XmlEntityDecl.java +23 -27
  16. data/ext/java/nokogiri/XmlEntityReference.java +2 -2
  17. data/ext/java/nokogiri/XmlNamespace.java +72 -89
  18. data/ext/java/nokogiri/XmlNode.java +300 -401
  19. data/ext/java/nokogiri/XmlNodeSet.java +72 -77
  20. data/ext/java/nokogiri/XmlReader.java +10 -11
  21. data/ext/java/nokogiri/XmlSaxParserContext.java +7 -7
  22. data/ext/java/nokogiri/XmlSchema.java +3 -3
  23. data/ext/java/nokogiri/XmlText.java +12 -9
  24. data/ext/java/nokogiri/XmlXpathContext.java +7 -7
  25. data/ext/java/nokogiri/XsltStylesheet.java +7 -15
  26. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +4 -10
  27. data/ext/java/nokogiri/internals/NokogiriHelpers.java +71 -135
  28. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +90 -58
  29. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +5 -4
  30. data/ext/java/nokogiri/internals/ParserContext.java +27 -73
  31. data/ext/java/nokogiri/internals/ReaderNode.java +2 -4
  32. data/ext/java/nokogiri/internals/XmlDomParserContext.java +17 -32
  33. data/ext/nokogiri/extconf.rb +50 -37
  34. data/ext/nokogiri/nokogiri.c +12 -6
  35. data/ext/nokogiri/nokogiri.h +13 -0
  36. data/ext/nokogiri/xml_document.c +16 -2
  37. data/ext/nokogiri/xml_io.c +8 -6
  38. data/ext/nokogiri/xml_node.c +20 -0
  39. data/ext/nokogiri/xml_reader.c +6 -17
  40. data/ext/nokogiri/xml_schema.c +29 -0
  41. data/ext/nokogiri/xslt_stylesheet.c +0 -4
  42. data/lib/nokogiri.rb +3 -20
  43. data/lib/nokogiri/css.rb +1 -0
  44. data/lib/nokogiri/css/node.rb +1 -0
  45. data/lib/nokogiri/css/parser.rb +61 -60
  46. data/lib/nokogiri/css/parser_extras.rb +39 -36
  47. data/lib/nokogiri/css/syntax_error.rb +1 -0
  48. data/lib/nokogiri/css/tokenizer.rb +1 -0
  49. data/lib/nokogiri/css/xpath_visitor.rb +3 -1
  50. data/lib/nokogiri/decorators/slop.rb +1 -0
  51. data/lib/nokogiri/html.rb +1 -0
  52. data/lib/nokogiri/html/builder.rb +1 -0
  53. data/lib/nokogiri/html/document.rb +1 -0
  54. data/lib/nokogiri/html/document_fragment.rb +1 -0
  55. data/lib/nokogiri/html/element_description.rb +1 -0
  56. data/lib/nokogiri/html/element_description_defaults.rb +1 -0
  57. data/lib/nokogiri/html/entity_lookup.rb +1 -0
  58. data/lib/nokogiri/html/sax/parser.rb +1 -0
  59. data/lib/nokogiri/html/sax/parser_context.rb +1 -0
  60. data/lib/nokogiri/html/sax/push_parser.rb +1 -0
  61. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  62. data/lib/nokogiri/nokogiri.jar +0 -0
  63. data/lib/nokogiri/syntax_error.rb +1 -0
  64. data/lib/nokogiri/version.rb +86 -45
  65. data/lib/nokogiri/xml.rb +1 -0
  66. data/lib/nokogiri/xml/attr.rb +1 -0
  67. data/lib/nokogiri/xml/attribute_decl.rb +1 -0
  68. data/lib/nokogiri/xml/builder.rb +3 -2
  69. data/lib/nokogiri/xml/cdata.rb +1 -0
  70. data/lib/nokogiri/xml/character_data.rb +1 -0
  71. data/lib/nokogiri/xml/document.rb +3 -8
  72. data/lib/nokogiri/xml/document_fragment.rb +1 -0
  73. data/lib/nokogiri/xml/dtd.rb +1 -0
  74. data/lib/nokogiri/xml/element_content.rb +1 -0
  75. data/lib/nokogiri/xml/element_decl.rb +1 -0
  76. data/lib/nokogiri/xml/entity_decl.rb +1 -0
  77. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  78. data/lib/nokogiri/xml/namespace.rb +1 -0
  79. data/lib/nokogiri/xml/node.rb +539 -224
  80. data/lib/nokogiri/xml/node/save_options.rb +1 -0
  81. data/lib/nokogiri/xml/node_set.rb +1 -0
  82. data/lib/nokogiri/xml/notation.rb +1 -0
  83. data/lib/nokogiri/xml/parse_options.rb +4 -3
  84. data/lib/nokogiri/xml/pp.rb +1 -0
  85. data/lib/nokogiri/xml/pp/character_data.rb +1 -0
  86. data/lib/nokogiri/xml/pp/node.rb +1 -0
  87. data/lib/nokogiri/xml/processing_instruction.rb +1 -0
  88. data/lib/nokogiri/xml/reader.rb +7 -3
  89. data/lib/nokogiri/xml/relax_ng.rb +1 -0
  90. data/lib/nokogiri/xml/sax.rb +1 -0
  91. data/lib/nokogiri/xml/sax/document.rb +1 -0
  92. data/lib/nokogiri/xml/sax/parser.rb +1 -0
  93. data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
  94. data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
  95. data/lib/nokogiri/xml/schema.rb +1 -0
  96. data/lib/nokogiri/xml/searchable.rb +22 -15
  97. data/lib/nokogiri/xml/syntax_error.rb +1 -0
  98. data/lib/nokogiri/xml/text.rb +1 -0
  99. data/lib/nokogiri/xml/xpath.rb +1 -0
  100. data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -0
  101. data/lib/nokogiri/xml/xpath_context.rb +1 -0
  102. data/lib/nokogiri/xslt.rb +1 -0
  103. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  104. data/lib/xsd/xmlparser/nokogiri.rb +1 -0
  105. metadata +53 -34
  106. data/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +0 -107
  107. data/ext/java/nokogiri/internals/UncloseableInputStream.java +0 -102
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4a128fdc76b3bc96899e22fed66f613f990e3c4736d4b0cc7c243a3a19182bf9
4
- data.tar.gz: 6f8b1afb5e1ac41f49740b77bd908a7b7405d6425b23b756dc3cbb816f4a8970
3
+ metadata.gz: dfc89c8aff7d9b973bf29d827f80c625594c131f6b8e2e389bb06557497ed387
4
+ data.tar.gz: e34b6e0854f4707d7c8ad1bcd277c39b625900467e87254b14f73f69d5e116ac
5
5
  SHA512:
6
- metadata.gz: 62b71a0ee6624bb44cb631f7ab887ad48c5c55bd25d9b040b2da553aa364a737fbc5fe1de61f2ef4fd86a8e80789df880dce056fc0423be56d57a968ae593172
7
- data.tar.gz: 3d0f696d04cf4ae54c9202e7e3575d0bb6e6ba6217a39d8cba714ad0ef395f1a1022b19d509c7efe96c48103badf9b5a355d04e93aeebefab4afe7767af1393e
6
+ metadata.gz: 1b5cb94d1d12baec54e71dac6ff3ab9a58bc50a0d6acd9b8fcc49f29178080655dce47284ad675be9e568bd057b19c243100b6eaadaaac4ccd3250c76c2e3604
7
+ data.tar.gz: e4d6eee384cc6000f6094a0f543c9a01b5c6eba5623e998ffcf264f493d2c387a2998d61ffab35e7714596edd1e2add2f52fc4374e2d64d008cb8eaefac98916
data/README.md CHANGED
@@ -11,7 +11,7 @@ or CSS3 selectors.
11
11
 
12
12
  * https://nokogiri.org
13
13
  * [Installation Help](https://nokogiri.org/tutorials/installing_nokogiri.html)
14
- * [Tutorials](https://nokogiri.org)
14
+ * [Tutorials](https://nokogiri.org/tutorials/toc.html)
15
15
  * [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
16
16
  * [GitHub](https://github.com/sparklemotion/nokogiri)
17
17
  * [Mailing List](https://groups.google.com/group/nokogiri-talk)
@@ -20,14 +20,14 @@ or CSS3 selectors.
20
20
 
21
21
  ## Status
22
22
 
23
- [![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/ruby-2.4-system/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
23
+ [![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/cruby-2.7/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
24
24
  [![Appveyor CI](https://ci.appveyor.com/api/projects/status/xj2pqwvlxwuwgr06/branch/master?svg=true)](https://ci.appveyor.com/project/flavorjones/nokogiri/branch/master)
25
25
  [![Code Climate](https://codeclimate.com/github/sparklemotion/nokogiri.svg)](https://codeclimate.com/github/sparklemotion/nokogiri)
26
26
  [![Test Coverage](https://api.codeclimate.com/v1/badges/59c67b0e8976027a45ad/test_coverage)](https://codeclimate.com/github/sparklemotion/nokogiri/test_coverage)
27
27
 
28
28
  [![Gem Version](https://badge.fury.io/rb/nokogiri.svg)](https://rubygems.org/gems/nokogiri)
29
- [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score.html?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)
30
- [![Tidelift dependencies](https://tidelift.com/badges/github/sparklemotion/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
29
+ [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
30
+ [![Tidelift dependencies](https://tidelift.com/badges/package/rubygems/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
31
31
 
32
32
 
33
33
  ## Features
@@ -71,7 +71,7 @@ help you directly and improve the documentation.
71
71
  Binary packages are available for some distributions.
72
72
 
73
73
  * Debian: https://packages.debian.org/sid/ruby-nokogiri
74
- * SuSE: https://download.opensuse.org/repositories/devel:/languages:/ruby:/extensions/
74
+ * openSUSE/SLE: https://download.opensuse.org/repositories/devel:/languages:/ruby:/extensions/
75
75
  * Fedora: http://s390.koji.fedoraproject.org/koji/packageinfo?packageID=6756
76
76
 
77
77
 
@@ -108,7 +108,7 @@ require 'nokogiri'
108
108
  require 'open-uri'
109
109
 
110
110
  # Fetch and parse HTML document
111
- doc = Nokogiri::HTML(open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
111
+ doc = Nokogiri::HTML(URI.open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
112
112
 
113
113
  puts "### Search for nodes by css"
114
114
  doc.css('nav ul.menu li a', 'article h2').each do |link|
@@ -129,27 +129,23 @@ end
129
129
 
130
130
  ## Requirements
131
131
 
132
- * Ruby 2.3.0 or higher, including any development packages necessary
133
- to compile native extensions.
132
+ Ruby 2.4.0 or higher, including any development packages necessary to compile native extensions.
134
133
 
135
- * In Nokogiri 1.6.0 and later libxml2 and libxslt are bundled with the
136
- gem, but if you want to use the system versions:
134
+ In Nokogiri 1.6.0 and later libxml2 and libxslt are bundled with the gem, but if you want to use the system versions:
137
135
 
138
- * First, check out [the long list](http://www.xmlsoft.org/news.html)
139
- of fixes and changes between releases before deciding to use any
140
- version older than is bundled with Nokogiri.
136
+ * First, check out [the long list](http://www.xmlsoft.org/news.html)
137
+ of fixes and changes between releases before deciding to use any
138
+ version older than is bundled with Nokogiri.
141
139
 
142
- * At install time, set the environment variable
143
- `NOKOGIRI_USE_SYSTEM_LIBRARIES` or else use the
144
- `--use-system-libraries` argument. (See
145
- https://nokogiri.org/tutorials/installing_nokogiri.html#install-with-system-libraries
146
- for specifics.)
140
+ * At install time, set the environment variable
141
+ `NOKOGIRI_USE_SYSTEM_LIBRARIES` or else use the
142
+ `--use-system-libraries` argument. (See
143
+ https://nokogiri.org/tutorials/installing_nokogiri.html#install-with-system-libraries
144
+ for specifics.)
147
145
 
148
- * libxml2 >=2.6.21 with iconv support
149
- (libxml2-dev/-devel is also required)
146
+ * libxml2 >=2.6.21 with iconv support (libxml2-dev/-devel is also required)
150
147
 
151
- * libxslt, built with and supported by the given libxml2
152
- (libxslt-dev/-devel is also required)
148
+ * libxslt, built with and supported by the given libxml2 (libxslt-dev/-devel is also required)
153
149
 
154
150
 
155
151
  ## Encoding
@@ -191,6 +187,12 @@ explicitly setting the encoding to EUC-JP on the parser:
191
187
  We've adopted the Contributor Covenant code of conduct, which you can read in full in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md).
192
188
 
193
189
 
190
+ ## Semantic Versioning
191
+
192
+ [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
193
+
194
+ Nokogiri follows [Semantic Versioning](https://semver.org/). See [`CHANGELOG.md`](CHANGELOG.md) for more details.
195
+
194
196
  ## License
195
197
 
196
198
  This project is licensed under the terms of the MIT license.
data/ext/java/nokogiri/HtmlDocument.java CHANGED
@@ -36,7 +36,6 @@ import org.jruby.Ruby;
36
36
  import org.jruby.RubyClass;
37
37
  import org.jruby.anno.JRubyClass;
38
38
  import org.jruby.anno.JRubyMethod;
39
- import org.jruby.runtime.Arity;
40
39
  import org.jruby.runtime.Helpers;
41
40
  import org.jruby.runtime.ThreadContext;
42
41
  import org.jruby.runtime.builtin.IRubyObject;
@@ -48,6 +47,8 @@ import org.w3c.dom.NodeList;
48
47
 
49
48
  import nokogiri.internals.HtmlDomParserContext;
50
49
 
50
+ import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
51
+
51
52
  /**
52
53
  * Class for Nokogiri::HTML::Document.
53
54
  *
@@ -65,21 +66,25 @@ public class HtmlDocument extends XmlDocument {
65
66
  public HtmlDocument(Ruby ruby, RubyClass klazz) {
66
67
  super(ruby, klazz);
67
68
  }
68
-
69
+
70
+ public HtmlDocument(Ruby runtime, Document document) {
71
+ this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
72
+ }
73
+
69
74
  public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
70
75
  super(ruby, klazz, doc);
71
76
  }
72
77
 
73
78
  @JRubyMethod(name="new", meta = true, rest = true, required=0)
74
- public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
75
- IRubyObject[] args) {
79
+ public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
80
+ final Ruby runtime = context.runtime;
76
81
  HtmlDocument htmlDocument;
77
82
  try {
78
- Document docNode = createNewDocument();
79
- htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz);
80
- htmlDocument.setDocumentNode(context, docNode);
83
+ Document docNode = createNewDocument(runtime);
84
+ htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz);
85
+ htmlDocument.setDocumentNode(context.runtime, docNode);
81
86
  } catch (Exception ex) {
82
- throw context.getRuntime().newRuntimeError("couldn't create document: " + ex);
87
+ throw asRuntimeError(runtime, "couldn't create document: ", ex);
83
88
  }
84
89
 
85
90
  Helpers.invoke(context, htmlDocument, "initialize", args);
@@ -109,46 +114,29 @@ public class HtmlDocument extends XmlDocument {
109
114
  return internalSubset;
110
115
  }
111
116
 
112
- public static IRubyObject do_parse(ThreadContext context,
113
- IRubyObject klass,
114
- IRubyObject[] args) {
115
- Ruby ruby = context.getRuntime();
116
- Arity.checkArgumentCount(ruby, args, 4, 4);
117
- HtmlDomParserContext ctx =
118
- new HtmlDomParserContext(ruby, args[2], args[3]);
119
- ctx.setInputSource(context, args[0], args[1]);
120
- return ctx.parse(context, klass, args[1]);
121
- }
122
-
123
- public void setDocumentNode(ThreadContext context, Node node) {
124
- super.setNode(context, node);
125
- Ruby runtime = context.getRuntime();
126
- if (node != null) {
127
- Document document = (Document)node;
128
- document.normalize();
129
- stabilzeAttrValue(document.getDocumentElement());
130
- }
117
+ @Override
118
+ void init(Ruby runtime, Document document) {
119
+ stabilizeTextContent(document);
120
+ document.normalize();
131
121
  setInstanceVariable("@decorators", runtime.getNil());
122
+ if (document.getDocumentElement() != null) {
123
+ stabilizeAttrs(document.getDocumentElement());
124
+ }
132
125
  }
133
-
134
- private void stabilzeAttrValue(Node node) {
135
- if (node == null) return;
126
+
127
+ private static void stabilizeAttrs(Node node) {
136
128
  if (node.hasAttributes()) {
137
129
  NamedNodeMap nodeMap = node.getAttributes();
138
130
  for (int i=0; i<nodeMap.getLength(); i++) {
139
131
  Node n = nodeMap.item(i);
140
132
  if (n instanceof Attr) {
141
- Attr attr = (Attr)n;
142
- String attrName = attr.getName();
143
- // not sure, but need to get value always before document is referred.
144
- // or lose attribute value
145
- String attrValue = attr.getValue(); // don't delete this line
133
+ stabilizeAttr((Attr) n);
146
134
  }
147
135
  }
148
136
  }
149
137
  NodeList children = node.getChildNodes();
150
138
  for (int i=0; i<children.getLength(); i++) {
151
- stabilzeAttrValue(children.item(i));
139
+ stabilizeAttrs(children.item(i));
152
140
  }
153
141
  }
154
142
 
@@ -167,11 +155,11 @@ public class HtmlDocument extends XmlDocument {
167
155
  * Read the HTML document from +io+ with given +url+, +encoding+,
168
156
  * and +options+. See Nokogiri::HTML.parse
169
157
  */
170
- @JRubyMethod(meta = true, rest = true)
171
- public static IRubyObject read_io(ThreadContext context,
172
- IRubyObject cls,
173
- IRubyObject[] args) {
174
- return do_parse(context, cls, args);
158
+ @JRubyMethod(meta = true, required = 4)
159
+ public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
160
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
161
+ ctx.setIOInputSource(context, args[0], args[1]);
162
+ return ctx.parse(context, (RubyClass) klass, args[1]);
175
163
  }
176
164
 
177
165
  /*
@@ -181,10 +169,10 @@ public class HtmlDocument extends XmlDocument {
181
169
  * Read the HTML document contained in +string+ with given +url+, +encoding+,
182
170
  * and +options+. See Nokogiri::HTML.parse
183
171
  */
184
- @JRubyMethod(meta = true, rest = true)
185
- public static IRubyObject read_memory(ThreadContext context,
186
- IRubyObject cls,
187
- IRubyObject[] args) {
188
- return do_parse(context, cls, args);
172
+ @JRubyMethod(meta = true, required = 4)
173
+ public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
174
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
175
+ ctx.setStringInputSource(context, args[0], args[1]);
176
+ return ctx.parse(context, (RubyClass) klass, args[1]);
189
177
  }
190
178
  }
data/ext/java/nokogiri/HtmlSaxParserContext.java CHANGED
@@ -32,28 +32,29 @@
32
32
 
33
33
  package nokogiri;
34
34
 
35
- import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
36
-
37
35
  import java.io.ByteArrayInputStream;
38
36
  import java.io.InputStream;
39
37
  import java.nio.charset.Charset;
40
38
  import java.nio.charset.IllegalCharsetNameException;
41
39
  import java.nio.charset.UnsupportedCharsetException;
42
- import java.util.EnumSet;
43
40
  import java.util.regex.Matcher;
44
41
  import java.util.regex.Pattern;
45
42
 
46
- import nokogiri.internals.NokogiriHandler;
47
-
48
43
  import org.apache.xerces.parsers.AbstractSAXParser;
49
44
  import org.cyberneko.html.parsers.SAXParser;
50
- import org.jruby.*;
45
+ import org.jruby.Ruby;
46
+ import org.jruby.RubyClass;
47
+ import org.jruby.RubyFixnum;
48
+ import org.jruby.RubyString;
51
49
  import org.jruby.anno.JRubyClass;
52
50
  import org.jruby.anno.JRubyMethod;
53
51
  import org.jruby.runtime.ThreadContext;
54
52
  import org.jruby.runtime.builtin.IRubyObject;
55
53
  import org.xml.sax.SAXException;
56
54
 
55
+ import nokogiri.internals.NokogiriHandler;
56
+ import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
57
+
57
58
  /**
58
59
  * Class for Nokogiri::HTML::SAX::ParserContext.
59
60
  *
@@ -65,10 +66,16 @@ import org.xml.sax.SAXException;
65
66
  @JRubyClass(name="Nokogiri::HTML::SAX::ParserContext", parent="Nokogiri::XML::SAX::ParserContext")
66
67
  public class HtmlSaxParserContext extends XmlSaxParserContext {
67
68
 
69
+ static HtmlSaxParserContext newInstance(final Ruby runtime, final RubyClass klazz) {
70
+ HtmlSaxParserContext instance = new HtmlSaxParserContext(runtime, klazz);
71
+ instance.initialize(runtime);
72
+ return instance;
73
+ }
74
+
68
75
  public HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass) {
69
76
  super(ruby, rubyClass);
70
77
  }
71
-
78
+
72
79
  @Override
73
80
  protected AbstractSAXParser createParser() throws SAXException {
74
81
  SAXParser parser = new SAXParser();
@@ -78,6 +85,11 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
78
85
  "http://cyberneko.org/html/properties/names/elems", "lower");
79
86
  parser.setProperty(
80
87
  "http://cyberneko.org/html/properties/names/attrs", "lower");
88
+
89
+ // NekoHTML should not try to guess the encoding based on the meta
90
+ // tags or other information in the document. This is already
91
+ // handled by the EncodingReader.
92
+ parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
81
93
  return parser;
82
94
  } catch(SAXException ex) {
83
95
  throw new SAXException(
@@ -90,18 +102,17 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
90
102
  IRubyObject klazz,
91
103
  IRubyObject data,
92
104
  IRubyObject encoding) {
93
- HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
94
- ctx.initialize(context.getRuntime());
95
- String javaEncoding = findEncoding(context, encoding);
105
+ HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
106
+ String javaEncoding = findEncodingName(context, encoding);
96
107
  if (javaEncoding != null) {
97
- String input = applyEncoding(rubyStringToString(data), javaEncoding);
98
- ByteArrayInputStream istream = new ByteArrayInputStream(input.getBytes());
108
+ CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
109
+ ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
99
110
  ctx.setInputSource(istream);
100
111
  ctx.getInputSource().setEncoding(javaEncoding);
101
112
  }
102
113
  return ctx;
103
114
  }
104
-
115
+
105
116
  public enum EncodingType {
106
117
  NONE(0, "NONE"),
107
118
  UTF_8(1, "UTF-8"),
@@ -142,23 +153,38 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
142
153
  public String toString() {
143
154
  return name;
144
155
  }
145
- }
146
-
147
- private static String findName(final int value) {
148
- for (EncodingType type : EncodingType.values()) {
149
- if (type.getValue() == value) return type.toString();
156
+
157
+ private static transient EncodingType[] values;
158
+
159
+ // NOTE: assuming ordinal == value
160
+ static EncodingType get(final int ordinal) {
161
+ EncodingType[] values = EncodingType.values;
162
+ if (values == null) {
163
+ values = EncodingType.values();
164
+ EncodingType.values = values;
165
+ }
166
+ if (ordinal >= 0 && ordinal < values.length) {
167
+ return values[ordinal];
168
+ }
169
+ return null;
150
170
  }
151
- return null;
171
+
152
172
  }
153
-
154
- private static String findEncoding(ThreadContext context, IRubyObject encoding) {
173
+
174
+ private static String findEncodingName(final int value) {
175
+ EncodingType type = EncodingType.get(value);
176
+ if (type == null) return null;
177
+ assert type.value == value;
178
+ return type.name;
179
+ }
180
+
181
+ private static String findEncodingName(ThreadContext context, IRubyObject encoding) {
155
182
  String rubyEncoding = null;
156
183
  if (encoding instanceof RubyString) {
157
- rubyEncoding = rubyStringToString(encoding);
184
+ rubyEncoding = rubyStringToString((RubyString) encoding);
158
185
  }
159
186
  else if (encoding instanceof RubyFixnum) {
160
- int value = RubyFixnum.fix2int((RubyFixnum) encoding);
161
- rubyEncoding = findName(value);
187
+ rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
162
188
  }
163
189
  if (rubyEncoding == null) return null;
164
190
  try {
@@ -172,35 +198,52 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
172
198
  }
173
199
  }
174
200
 
175
- private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+");
201
+ private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+", Pattern.CASE_INSENSITIVE);
176
202
 
177
- private static String applyEncoding(String input, String enc) {
178
- String str = input.toLowerCase();
179
- int start_pos = 0;
180
- int end_pos = 0;
181
- if (input.contains("meta") && input.contains("charset")) {
182
- Matcher m = CHARSET_PATTERN.matcher(str);
203
+ private static CharSequence applyEncoding(final String input, final String enc) {
204
+ int start_pos = 0; int end_pos = 0;
205
+ if (containsIgnoreCase(input, "charset")) {
206
+ Matcher m = CHARSET_PATTERN.matcher(input);
183
207
  while (m.find()) {
184
208
  start_pos = m.start();
185
209
  end_pos = m.end();
186
210
  }
187
211
  }
188
212
  if (start_pos != end_pos) {
189
- String substr = input.substring(start_pos, end_pos);
190
- input = input.replace(substr, "charset=" + enc);
213
+ return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
191
214
  }
192
215
  return input;
193
216
  }
194
217
 
218
+ private static boolean containsIgnoreCase(final String str, final String sub) {
219
+ final int len = sub.length();
220
+ final int max = str.length() - len;
221
+
222
+ if (len == 0) return true;
223
+ final char c0Lower = Character.toLowerCase(sub.charAt(0));
224
+ final char c0Upper = Character.toUpperCase(sub.charAt(0));
225
+
226
+ for (int i = 0; i <= max; i++) {
227
+ final char ch = str.charAt(i);
228
+ if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
229
+ continue; // first char doesn't match
230
+ }
231
+
232
+ if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
233
+ return true;
234
+ }
235
+ }
236
+ return false;
237
+ }
238
+
195
239
  @JRubyMethod(name="file", meta=true)
196
240
  public static IRubyObject parse_file(ThreadContext context,
197
- IRubyObject klazz,
241
+ IRubyObject klass,
198
242
  IRubyObject data,
199
243
  IRubyObject encoding) {
200
- HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
201
- ctx.initialize(context.getRuntime());
244
+ HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
202
245
  ctx.setInputSourceFile(context, data);
203
- String javaEncoding = findEncoding(context, encoding);
246
+ String javaEncoding = findEncodingName(context, encoding);
204
247
  if (javaEncoding != null) {
205
248
  ctx.getInputSource().setEncoding(javaEncoding);
206
249
  }
@@ -209,13 +252,12 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
209
252
 
210
253
  @JRubyMethod(name="io", meta=true)
211
254
  public static IRubyObject parse_io(ThreadContext context,
212
- IRubyObject klazz,
255
+ IRubyObject klass,
213
256
  IRubyObject data,
214
257
  IRubyObject encoding) {
215
- HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
216
- ctx.initialize(context.getRuntime());
217
- ctx.setInputSource(context, data, context.getRuntime().getNil());
218
- String javaEncoding = findEncoding(context, encoding);
258
+ HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
259
+ ctx.setIOInputSource(context, data, context.nil);
260
+ String javaEncoding = findEncodingName(context, encoding);
219
261
  if (javaEncoding != null) {
220
262
  ctx.getInputSource().setEncoding(javaEncoding);
221
263
  }
@@ -226,27 +268,15 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
226
268
  * Create a new parser context that will read from a raw input stream.
227
269
  * Meant to be run in a separate thread by HtmlSaxPushParser.
228
270
  */
229
- static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klazz, InputStream stream) {
230
- HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(runtime, klazz);
231
- ctx.initialize(runtime);
271
+ static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klass, InputStream stream) {
272
+ HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(runtime, klass);
232
273
  ctx.setInputSource(stream);
233
274
  return ctx;
234
275
  }
235
276
 
236
277
  @Override
237
278
  protected void preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) {
238
- // final String path = "Nokogiri::XML::FragmentHandler";
239
- // final String docFrag =
240
- // "http://cyberneko.org/html/features/balance-tags/document-fragment";
241
- // RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter();
242
- // IRubyObject doc = adapter.getInstanceVariable(handlerRuby, "@document");
243
- // RubyModule mod = runtime.getClassFromPath(path);
244
- // try {
245
- // if (doc != null && !doc.isNil() && adapter.isKindOf(doc, mod))
246
- // parser.setFeature(docFrag, true);
247
- // } catch (Exception e) {
248
- // // ignore
249
- // }
279
+ // this function is meant to be empty. It overrides the one in XmlSaxParserContext
250
280
  }
251
281
 
252
282
  }