nokogiri 1.10.9-java → 1.11.0.rc4-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/README.md +24 -22
- data/ext/java/nokogiri/HtmlDocument.java +34 -46
- data/ext/java/nokogiri/HtmlSaxParserContext.java +88 -58
- data/ext/java/nokogiri/HtmlSaxPushParser.java +1 -1
- data/ext/java/nokogiri/NokogiriService.java +1 -1
- data/ext/java/nokogiri/XmlAttr.java +13 -20
- data/ext/java/nokogiri/XmlAttributeDecl.java +11 -12
- data/ext/java/nokogiri/XmlCdata.java +3 -4
- data/ext/java/nokogiri/XmlComment.java +1 -1
- data/ext/java/nokogiri/XmlDocument.java +148 -175
- data/ext/java/nokogiri/XmlDocumentFragment.java +13 -31
- data/ext/java/nokogiri/XmlDtd.java +5 -8
- data/ext/java/nokogiri/XmlElement.java +1 -20
- data/ext/java/nokogiri/XmlElementDecl.java +23 -28
- data/ext/java/nokogiri/XmlEntityDecl.java +23 -27
- data/ext/java/nokogiri/XmlEntityReference.java +2 -2
- data/ext/java/nokogiri/XmlNamespace.java +72 -89
- data/ext/java/nokogiri/XmlNode.java +303 -406
- data/ext/java/nokogiri/XmlNodeSet.java +70 -76
- data/ext/java/nokogiri/XmlReader.java +12 -13
- data/ext/java/nokogiri/XmlRelaxng.java +10 -3
- data/ext/java/nokogiri/XmlSaxParserContext.java +15 -10
- data/ext/java/nokogiri/XmlSchema.java +87 -27
- data/ext/java/nokogiri/XmlSyntaxError.java +2 -6
- data/ext/java/nokogiri/XmlText.java +12 -9
- data/ext/java/nokogiri/XmlXpathContext.java +55 -25
- data/ext/java/nokogiri/XsltStylesheet.java +7 -15
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +52 -46
- data/ext/java/nokogiri/internals/NokogiriHandler.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +71 -135
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +90 -58
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +9 -2
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +67 -10
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +4 -2
- data/ext/java/nokogiri/internals/ParserContext.java +27 -73
- data/ext/java/nokogiri/internals/ReaderNode.java +2 -4
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +18 -33
- data/ext/nokogiri/depend +476 -357
- data/ext/nokogiri/extconf.rb +485 -352
- data/ext/nokogiri/html_document.c +79 -78
- data/ext/nokogiri/html_sax_parser_context.c +2 -2
- data/ext/nokogiri/nokogiri.c +34 -40
- data/ext/nokogiri/xml_document.c +18 -4
- data/ext/nokogiri/xml_io.c +8 -6
- data/ext/nokogiri/xml_node.c +21 -1
- data/ext/nokogiri/xml_node_set.c +1 -1
- data/ext/nokogiri/xml_reader.c +6 -17
- data/ext/nokogiri/xml_relax_ng.c +29 -11
- data/ext/nokogiri/xml_sax_parser.c +2 -7
- data/ext/nokogiri/xml_sax_parser_context.c +2 -2
- data/ext/nokogiri/xml_schema.c +55 -13
- data/ext/nokogiri/xml_xpath_context.c +80 -4
- data/ext/nokogiri/xslt_stylesheet.c +1 -8
- data/lib/nokogiri.rb +4 -21
- data/lib/nokogiri/css.rb +1 -0
- data/lib/nokogiri/css/node.rb +1 -0
- data/lib/nokogiri/css/parser.rb +63 -62
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +39 -36
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +1 -0
- data/lib/nokogiri/css/xpath_visitor.rb +73 -43
- data/lib/nokogiri/decorators/slop.rb +1 -0
- data/lib/nokogiri/html.rb +1 -0
- data/lib/nokogiri/html/builder.rb +1 -0
- data/lib/nokogiri/html/document.rb +13 -26
- data/lib/nokogiri/html/document_fragment.rb +1 -0
- data/lib/nokogiri/html/element_description.rb +1 -0
- data/lib/nokogiri/html/element_description_defaults.rb +1 -0
- data/lib/nokogiri/html/entity_lookup.rb +1 -0
- data/lib/nokogiri/html/sax/parser.rb +1 -0
- data/lib/nokogiri/html/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html/sax/push_parser.rb +1 -0
- data/lib/nokogiri/jruby/dependencies.rb +20 -0
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version.rb +3 -109
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +182 -0
- data/lib/nokogiri/xml.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +1 -0
- data/lib/nokogiri/xml/attribute_decl.rb +1 -0
- data/lib/nokogiri/xml/builder.rb +3 -2
- data/lib/nokogiri/xml/cdata.rb +1 -0
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +20 -15
- data/lib/nokogiri/xml/document_fragment.rb +5 -6
- data/lib/nokogiri/xml/dtd.rb +1 -0
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +1 -0
- data/lib/nokogiri/xml/node.rb +587 -249
- data/lib/nokogiri/xml/node/save_options.rb +1 -0
- data/lib/nokogiri/xml/node_set.rb +1 -0
- data/lib/nokogiri/xml/notation.rb +1 -0
- data/lib/nokogiri/xml/parse_options.rb +10 -3
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/pp/character_data.rb +1 -0
- data/lib/nokogiri/xml/pp/node.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +1 -0
- data/lib/nokogiri/xml/reader.rb +7 -3
- data/lib/nokogiri/xml/relax_ng.rb +7 -2
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +1 -0
- data/lib/nokogiri/xml/sax/parser.rb +1 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +13 -4
- data/lib/nokogiri/xml/searchable.rb +25 -16
- data/lib/nokogiri/xml/syntax_error.rb +1 -0
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -0
- data/lib/nokogiri/xml/xpath_context.rb +1 -0
- data/lib/nokogiri/xslt.rb +1 -0
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/xsd/xmlparser/nokogiri.rb +1 -0
- metadata +89 -96
- data/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +0 -107
- data/ext/java/nokogiri/internals/UncloseableInputStream.java +0 -102
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/nokogiri.h +0 -121
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
data/README.md
CHANGED
@@ -11,7 +11,7 @@ or CSS3 selectors.
|
|
11
11
|
|
12
12
|
* https://nokogiri.org
|
13
13
|
* [Installation Help](https://nokogiri.org/tutorials/installing_nokogiri.html)
|
14
|
-
* [Tutorials](https://nokogiri.org)
|
14
|
+
* [Tutorials](https://nokogiri.org/tutorials/toc.html)
|
15
15
|
* [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
|
16
16
|
* [GitHub](https://github.com/sparklemotion/nokogiri)
|
17
17
|
* [Mailing List](https://groups.google.com/group/nokogiri-talk)
|
@@ -20,14 +20,14 @@ or CSS3 selectors.
|
|
20
20
|
|
21
21
|
## Status
|
22
22
|
|
23
|
-
[![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/
|
23
|
+
[![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/cruby-2.7/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
|
24
24
|
[![Appveyor CI](https://ci.appveyor.com/api/projects/status/xj2pqwvlxwuwgr06/branch/master?svg=true)](https://ci.appveyor.com/project/flavorjones/nokogiri/branch/master)
|
25
25
|
[![Code Climate](https://codeclimate.com/github/sparklemotion/nokogiri.svg)](https://codeclimate.com/github/sparklemotion/nokogiri)
|
26
26
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/59c67b0e8976027a45ad/test_coverage)](https://codeclimate.com/github/sparklemotion/nokogiri/test_coverage)
|
27
27
|
|
28
28
|
[![Gem Version](https://badge.fury.io/rb/nokogiri.svg)](https://rubygems.org/gems/nokogiri)
|
29
|
-
[![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score
|
30
|
-
[![Tidelift dependencies](https://tidelift.com/badges/
|
29
|
+
[![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
|
30
|
+
[![Tidelift dependencies](https://tidelift.com/badges/package/rubygems/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
|
31
31
|
|
32
32
|
|
33
33
|
## Features
|
@@ -71,7 +71,7 @@ help you directly and improve the documentation.
|
|
71
71
|
Binary packages are available for some distributions.
|
72
72
|
|
73
73
|
* Debian: https://packages.debian.org/sid/ruby-nokogiri
|
74
|
-
*
|
74
|
+
* openSUSE/SLE: https://download.opensuse.org/repositories/devel:/languages:/ruby:/extensions/
|
75
75
|
* Fedora: http://s390.koji.fedoraproject.org/koji/packageinfo?packageID=6756
|
76
76
|
|
77
77
|
|
@@ -108,7 +108,7 @@ require 'nokogiri'
|
|
108
108
|
require 'open-uri'
|
109
109
|
|
110
110
|
# Fetch and parse HTML document
|
111
|
-
doc = Nokogiri::HTML(open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
|
111
|
+
doc = Nokogiri::HTML(URI.open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
|
112
112
|
|
113
113
|
puts "### Search for nodes by css"
|
114
114
|
doc.css('nav ul.menu li a', 'article h2').each do |link|
|
@@ -129,27 +129,23 @@ end
|
|
129
129
|
|
130
130
|
## Requirements
|
131
131
|
|
132
|
-
|
133
|
-
to compile native extensions.
|
132
|
+
Ruby 2.5.0 or higher, including any development packages necessary to compile native extensions.
|
134
133
|
|
135
|
-
|
136
|
-
gem, but if you want to use the system versions:
|
134
|
+
In Nokogiri 1.6.0 and later libxml2 and libxslt are bundled with the gem, but if you want to use the system versions:
|
137
135
|
|
138
|
-
|
139
|
-
|
140
|
-
|
136
|
+
* First, check out [the long list](http://www.xmlsoft.org/news.html)
|
137
|
+
of fixes and changes between releases before deciding to use any
|
138
|
+
version older than is bundled with Nokogiri.
|
141
139
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
140
|
+
* At install time, set the environment variable
|
141
|
+
`NOKOGIRI_USE_SYSTEM_LIBRARIES` or else use the
|
142
|
+
`--use-system-libraries` argument. (See
|
143
|
+
https://nokogiri.org/tutorials/installing_nokogiri.html#install-with-system-libraries
|
144
|
+
for specifics.)
|
147
145
|
|
148
|
-
|
149
|
-
(libxml2-dev/-devel is also required)
|
146
|
+
* libxml2 >=2.6.21 with iconv support (libxml2-dev/-devel is also required)
|
150
147
|
|
151
|
-
|
152
|
-
(libxslt-dev/-devel is also required)
|
148
|
+
* libxslt, built with and supported by the given libxml2 (libxslt-dev/-devel is also required)
|
153
149
|
|
154
150
|
|
155
151
|
## Encoding
|
@@ -191,6 +187,12 @@ explicitly setting the encoding to EUC-JP on the parser:
|
|
191
187
|
We've adopted the Contributor Covenant code of conduct, which you can read in full in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md).
|
192
188
|
|
193
189
|
|
190
|
+
## Semantic Versioning
|
191
|
+
|
192
|
+
[![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
|
193
|
+
|
194
|
+
Nokogiri follows [Semantic Versioning](https://semver.org/). See [`CHANGELOG.md`](CHANGELOG.md) for more details.
|
195
|
+
|
194
196
|
## License
|
195
197
|
|
196
198
|
This project is licensed under the terms of the MIT license.
|
@@ -36,7 +36,6 @@ import org.jruby.Ruby;
|
|
36
36
|
import org.jruby.RubyClass;
|
37
37
|
import org.jruby.anno.JRubyClass;
|
38
38
|
import org.jruby.anno.JRubyMethod;
|
39
|
-
import org.jruby.runtime.Arity;
|
40
39
|
import org.jruby.runtime.Helpers;
|
41
40
|
import org.jruby.runtime.ThreadContext;
|
42
41
|
import org.jruby.runtime.builtin.IRubyObject;
|
@@ -48,6 +47,8 @@ import org.w3c.dom.NodeList;
|
|
48
47
|
|
49
48
|
import nokogiri.internals.HtmlDomParserContext;
|
50
49
|
|
50
|
+
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
51
|
+
|
51
52
|
/**
|
52
53
|
* Class for Nokogiri::HTML::Document.
|
53
54
|
*
|
@@ -65,21 +66,25 @@ public class HtmlDocument extends XmlDocument {
|
|
65
66
|
public HtmlDocument(Ruby ruby, RubyClass klazz) {
|
66
67
|
super(ruby, klazz);
|
67
68
|
}
|
68
|
-
|
69
|
+
|
70
|
+
public HtmlDocument(Ruby runtime, Document document) {
|
71
|
+
this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
|
72
|
+
}
|
73
|
+
|
69
74
|
public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
|
70
75
|
super(ruby, klazz, doc);
|
71
76
|
}
|
72
77
|
|
73
78
|
@JRubyMethod(name="new", meta = true, rest = true, required=0)
|
74
|
-
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
|
75
|
-
|
79
|
+
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
|
80
|
+
final Ruby runtime = context.runtime;
|
76
81
|
HtmlDocument htmlDocument;
|
77
82
|
try {
|
78
|
-
Document docNode = createNewDocument();
|
79
|
-
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(
|
80
|
-
htmlDocument.setDocumentNode(context, docNode);
|
83
|
+
Document docNode = createNewDocument(runtime);
|
84
|
+
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz);
|
85
|
+
htmlDocument.setDocumentNode(context.runtime, docNode);
|
81
86
|
} catch (Exception ex) {
|
82
|
-
throw
|
87
|
+
throw asRuntimeError(runtime, "couldn't create document: ", ex);
|
83
88
|
}
|
84
89
|
|
85
90
|
Helpers.invoke(context, htmlDocument, "initialize", args);
|
@@ -109,46 +114,29 @@ public class HtmlDocument extends XmlDocument {
|
|
109
114
|
return internalSubset;
|
110
115
|
}
|
111
116
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
Arity.checkArgumentCount(ruby, args, 4, 4);
|
117
|
-
HtmlDomParserContext ctx =
|
118
|
-
new HtmlDomParserContext(ruby, args[2], args[3]);
|
119
|
-
ctx.setInputSource(context, args[0], args[1]);
|
120
|
-
return ctx.parse(context, klass, args[1]);
|
121
|
-
}
|
122
|
-
|
123
|
-
public void setDocumentNode(ThreadContext context, Node node) {
|
124
|
-
super.setNode(context, node);
|
125
|
-
Ruby runtime = context.getRuntime();
|
126
|
-
if (node != null) {
|
127
|
-
Document document = (Document)node;
|
128
|
-
document.normalize();
|
129
|
-
stabilzeAttrValue(document.getDocumentElement());
|
130
|
-
}
|
117
|
+
@Override
|
118
|
+
void init(Ruby runtime, Document document) {
|
119
|
+
stabilizeTextContent(document);
|
120
|
+
document.normalize();
|
131
121
|
setInstanceVariable("@decorators", runtime.getNil());
|
122
|
+
if (document.getDocumentElement() != null) {
|
123
|
+
stabilizeAttrs(document.getDocumentElement());
|
124
|
+
}
|
132
125
|
}
|
133
|
-
|
134
|
-
private void
|
135
|
-
if (node == null) return;
|
126
|
+
|
127
|
+
private static void stabilizeAttrs(Node node) {
|
136
128
|
if (node.hasAttributes()) {
|
137
129
|
NamedNodeMap nodeMap = node.getAttributes();
|
138
130
|
for (int i=0; i<nodeMap.getLength(); i++) {
|
139
131
|
Node n = nodeMap.item(i);
|
140
132
|
if (n instanceof Attr) {
|
141
|
-
|
142
|
-
String attrName = attr.getName();
|
143
|
-
// not sure, but need to get value always before document is referred.
|
144
|
-
// or lose attribute value
|
145
|
-
String attrValue = attr.getValue(); // don't delete this line
|
133
|
+
stabilizeAttr((Attr) n);
|
146
134
|
}
|
147
135
|
}
|
148
136
|
}
|
149
137
|
NodeList children = node.getChildNodes();
|
150
138
|
for (int i=0; i<children.getLength(); i++) {
|
151
|
-
|
139
|
+
stabilizeAttrs(children.item(i));
|
152
140
|
}
|
153
141
|
}
|
154
142
|
|
@@ -167,11 +155,11 @@ public class HtmlDocument extends XmlDocument {
|
|
167
155
|
* Read the HTML document from +io+ with given +url+, +encoding+,
|
168
156
|
* and +options+. See Nokogiri::HTML.parse
|
169
157
|
*/
|
170
|
-
@JRubyMethod(meta = true,
|
171
|
-
public static IRubyObject read_io(ThreadContext context,
|
172
|
-
|
173
|
-
|
174
|
-
return
|
158
|
+
@JRubyMethod(meta = true, required = 4)
|
159
|
+
public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
|
160
|
+
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
|
161
|
+
ctx.setIOInputSource(context, args[0], args[1]);
|
162
|
+
return ctx.parse(context, (RubyClass) klass, args[1]);
|
175
163
|
}
|
176
164
|
|
177
165
|
/*
|
@@ -181,10 +169,10 @@ public class HtmlDocument extends XmlDocument {
|
|
181
169
|
* Read the HTML document contained in +string+ with given +url+, +encoding+,
|
182
170
|
* and +options+. See Nokogiri::HTML.parse
|
183
171
|
*/
|
184
|
-
@JRubyMethod(meta = true,
|
185
|
-
public static IRubyObject read_memory(ThreadContext context,
|
186
|
-
|
187
|
-
|
188
|
-
return
|
172
|
+
@JRubyMethod(meta = true, required = 4)
|
173
|
+
public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
|
174
|
+
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
|
175
|
+
ctx.setStringInputSource(context, args[0], args[1]);
|
176
|
+
return ctx.parse(context, (RubyClass) klass, args[1]);
|
189
177
|
}
|
190
178
|
}
|
@@ -32,28 +32,29 @@
|
|
32
32
|
|
33
33
|
package nokogiri;
|
34
34
|
|
35
|
-
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
|
36
|
-
|
37
35
|
import java.io.ByteArrayInputStream;
|
38
36
|
import java.io.InputStream;
|
39
37
|
import java.nio.charset.Charset;
|
40
38
|
import java.nio.charset.IllegalCharsetNameException;
|
41
39
|
import java.nio.charset.UnsupportedCharsetException;
|
42
|
-
import java.util.EnumSet;
|
43
40
|
import java.util.regex.Matcher;
|
44
41
|
import java.util.regex.Pattern;
|
45
42
|
|
46
|
-
import nokogiri.internals.NokogiriHandler;
|
47
|
-
|
48
43
|
import org.apache.xerces.parsers.AbstractSAXParser;
|
49
44
|
import org.cyberneko.html.parsers.SAXParser;
|
50
|
-
import org.jruby
|
45
|
+
import org.jruby.Ruby;
|
46
|
+
import org.jruby.RubyClass;
|
47
|
+
import org.jruby.RubyFixnum;
|
48
|
+
import org.jruby.RubyString;
|
51
49
|
import org.jruby.anno.JRubyClass;
|
52
50
|
import org.jruby.anno.JRubyMethod;
|
53
51
|
import org.jruby.runtime.ThreadContext;
|
54
52
|
import org.jruby.runtime.builtin.IRubyObject;
|
55
53
|
import org.xml.sax.SAXException;
|
56
54
|
|
55
|
+
import nokogiri.internals.NokogiriHandler;
|
56
|
+
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
|
57
|
+
|
57
58
|
/**
|
58
59
|
* Class for Nokogiri::HTML::SAX::ParserContext.
|
59
60
|
*
|
@@ -65,10 +66,16 @@ import org.xml.sax.SAXException;
|
|
65
66
|
@JRubyClass(name="Nokogiri::HTML::SAX::ParserContext", parent="Nokogiri::XML::SAX::ParserContext")
|
66
67
|
public class HtmlSaxParserContext extends XmlSaxParserContext {
|
67
68
|
|
69
|
+
static HtmlSaxParserContext newInstance(final Ruby runtime, final RubyClass klazz) {
|
70
|
+
HtmlSaxParserContext instance = new HtmlSaxParserContext(runtime, klazz);
|
71
|
+
instance.initialize(runtime);
|
72
|
+
return instance;
|
73
|
+
}
|
74
|
+
|
68
75
|
public HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass) {
|
69
76
|
super(ruby, rubyClass);
|
70
77
|
}
|
71
|
-
|
78
|
+
|
72
79
|
@Override
|
73
80
|
protected AbstractSAXParser createParser() throws SAXException {
|
74
81
|
SAXParser parser = new SAXParser();
|
@@ -78,6 +85,11 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
78
85
|
"http://cyberneko.org/html/properties/names/elems", "lower");
|
79
86
|
parser.setProperty(
|
80
87
|
"http://cyberneko.org/html/properties/names/attrs", "lower");
|
88
|
+
|
89
|
+
// NekoHTML should not try to guess the encoding based on the meta
|
90
|
+
// tags or other information in the document. This is already
|
91
|
+
// handled by the EncodingReader.
|
92
|
+
parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
|
81
93
|
return parser;
|
82
94
|
} catch(SAXException ex) {
|
83
95
|
throw new SAXException(
|
@@ -90,18 +102,17 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
90
102
|
IRubyObject klazz,
|
91
103
|
IRubyObject data,
|
92
104
|
IRubyObject encoding) {
|
93
|
-
HtmlSaxParserContext ctx =
|
94
|
-
|
95
|
-
String javaEncoding = findEncoding(context, encoding);
|
105
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
|
106
|
+
String javaEncoding = findEncodingName(context, encoding);
|
96
107
|
if (javaEncoding != null) {
|
97
|
-
|
98
|
-
ByteArrayInputStream istream = new ByteArrayInputStream(input.getBytes());
|
108
|
+
CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
|
109
|
+
ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
|
99
110
|
ctx.setInputSource(istream);
|
100
111
|
ctx.getInputSource().setEncoding(javaEncoding);
|
101
112
|
}
|
102
113
|
return ctx;
|
103
114
|
}
|
104
|
-
|
115
|
+
|
105
116
|
public enum EncodingType {
|
106
117
|
NONE(0, "NONE"),
|
107
118
|
UTF_8(1, "UTF-8"),
|
@@ -142,23 +153,38 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
142
153
|
public String toString() {
|
143
154
|
return name;
|
144
155
|
}
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
156
|
+
|
157
|
+
private static transient EncodingType[] values;
|
158
|
+
|
159
|
+
// NOTE: assuming ordinal == value
|
160
|
+
static EncodingType get(final int ordinal) {
|
161
|
+
EncodingType[] values = EncodingType.values;
|
162
|
+
if (values == null) {
|
163
|
+
values = EncodingType.values();
|
164
|
+
EncodingType.values = values;
|
165
|
+
}
|
166
|
+
if (ordinal >= 0 && ordinal < values.length) {
|
167
|
+
return values[ordinal];
|
168
|
+
}
|
169
|
+
return null;
|
150
170
|
}
|
151
|
-
|
171
|
+
|
152
172
|
}
|
153
|
-
|
154
|
-
private static String
|
173
|
+
|
174
|
+
private static String findEncodingName(final int value) {
|
175
|
+
EncodingType type = EncodingType.get(value);
|
176
|
+
if (type == null) return null;
|
177
|
+
assert type.value == value;
|
178
|
+
return type.name;
|
179
|
+
}
|
180
|
+
|
181
|
+
private static String findEncodingName(ThreadContext context, IRubyObject encoding) {
|
155
182
|
String rubyEncoding = null;
|
156
183
|
if (encoding instanceof RubyString) {
|
157
|
-
rubyEncoding = rubyStringToString(encoding);
|
184
|
+
rubyEncoding = rubyStringToString((RubyString) encoding);
|
158
185
|
}
|
159
186
|
else if (encoding instanceof RubyFixnum) {
|
160
|
-
|
161
|
-
rubyEncoding = findName(value);
|
187
|
+
rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
|
162
188
|
}
|
163
189
|
if (rubyEncoding == null) return null;
|
164
190
|
try {
|
@@ -168,39 +194,56 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
168
194
|
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
|
169
195
|
}
|
170
196
|
catch (IllegalCharsetNameException e) {
|
171
|
-
throw context.getRuntime().
|
197
|
+
throw context.getRuntime().newEncodingError(e.getMessage());
|
172
198
|
}
|
173
199
|
}
|
174
200
|
|
175
|
-
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+");
|
201
|
+
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+", Pattern.CASE_INSENSITIVE);
|
176
202
|
|
177
|
-
private static
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
if (input.contains("meta") && input.contains("charset")) {
|
182
|
-
Matcher m = CHARSET_PATTERN.matcher(str);
|
203
|
+
private static CharSequence applyEncoding(final String input, final String enc) {
|
204
|
+
int start_pos = 0; int end_pos = 0;
|
205
|
+
if (containsIgnoreCase(input, "charset")) {
|
206
|
+
Matcher m = CHARSET_PATTERN.matcher(input);
|
183
207
|
while (m.find()) {
|
184
208
|
start_pos = m.start();
|
185
209
|
end_pos = m.end();
|
186
210
|
}
|
187
211
|
}
|
188
212
|
if (start_pos != end_pos) {
|
189
|
-
|
190
|
-
input = input.replace(substr, "charset=" + enc);
|
213
|
+
return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
|
191
214
|
}
|
192
215
|
return input;
|
193
216
|
}
|
194
217
|
|
218
|
+
private static boolean containsIgnoreCase(final String str, final String sub) {
|
219
|
+
final int len = sub.length();
|
220
|
+
final int max = str.length() - len;
|
221
|
+
|
222
|
+
if (len == 0) return true;
|
223
|
+
final char c0Lower = Character.toLowerCase(sub.charAt(0));
|
224
|
+
final char c0Upper = Character.toUpperCase(sub.charAt(0));
|
225
|
+
|
226
|
+
for (int i = 0; i <= max; i++) {
|
227
|
+
final char ch = str.charAt(i);
|
228
|
+
if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
|
229
|
+
continue; // first char doesn't match
|
230
|
+
}
|
231
|
+
|
232
|
+
if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
|
233
|
+
return true;
|
234
|
+
}
|
235
|
+
}
|
236
|
+
return false;
|
237
|
+
}
|
238
|
+
|
195
239
|
@JRubyMethod(name="file", meta=true)
|
196
240
|
public static IRubyObject parse_file(ThreadContext context,
|
197
|
-
IRubyObject
|
241
|
+
IRubyObject klass,
|
198
242
|
IRubyObject data,
|
199
243
|
IRubyObject encoding) {
|
200
|
-
HtmlSaxParserContext ctx =
|
201
|
-
ctx.initialize(context.getRuntime());
|
244
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
|
202
245
|
ctx.setInputSourceFile(context, data);
|
203
|
-
String javaEncoding =
|
246
|
+
String javaEncoding = findEncodingName(context, encoding);
|
204
247
|
if (javaEncoding != null) {
|
205
248
|
ctx.getInputSource().setEncoding(javaEncoding);
|
206
249
|
}
|
@@ -209,13 +252,12 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
209
252
|
|
210
253
|
@JRubyMethod(name="io", meta=true)
|
211
254
|
public static IRubyObject parse_io(ThreadContext context,
|
212
|
-
IRubyObject
|
255
|
+
IRubyObject klass,
|
213
256
|
IRubyObject data,
|
214
257
|
IRubyObject encoding) {
|
215
|
-
HtmlSaxParserContext ctx =
|
216
|
-
ctx.
|
217
|
-
|
218
|
-
String javaEncoding = findEncoding(context, encoding);
|
258
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
|
259
|
+
ctx.setIOInputSource(context, data, context.nil);
|
260
|
+
String javaEncoding = findEncodingName(context, encoding);
|
219
261
|
if (javaEncoding != null) {
|
220
262
|
ctx.getInputSource().setEncoding(javaEncoding);
|
221
263
|
}
|
@@ -226,27 +268,15 @@ public class HtmlSaxParserContext extends XmlSaxParserContext {
|
|
226
268
|
* Create a new parser context that will read from a raw input stream.
|
227
269
|
* Meant to be run in a separate thread by HtmlSaxPushParser.
|
228
270
|
*/
|
229
|
-
static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass
|
230
|
-
HtmlSaxParserContext ctx =
|
231
|
-
ctx.initialize(runtime);
|
271
|
+
static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klass, InputStream stream) {
|
272
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(runtime, klass);
|
232
273
|
ctx.setInputSource(stream);
|
233
274
|
return ctx;
|
234
275
|
}
|
235
276
|
|
236
277
|
@Override
|
237
278
|
protected void preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) {
|
238
|
-
//
|
239
|
-
// final String docFrag =
|
240
|
-
// "http://cyberneko.org/html/features/balance-tags/document-fragment";
|
241
|
-
// RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter();
|
242
|
-
// IRubyObject doc = adapter.getInstanceVariable(handlerRuby, "@document");
|
243
|
-
// RubyModule mod = runtime.getClassFromPath(path);
|
244
|
-
// try {
|
245
|
-
// if (doc != null && !doc.isNil() && adapter.isKindOf(doc, mod))
|
246
|
-
// parser.setFeature(docFrag, true);
|
247
|
-
// } catch (Exception e) {
|
248
|
-
// // ignore
|
249
|
-
// }
|
279
|
+
// this function is meant to be empty. It overrides the one in XmlSaxParserContext
|
250
280
|
}
|
251
281
|
|
252
282
|
}
|