nokogiri 1.7.2-java → 1.8.0-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/.cross_rubies +4 -4
- data/.travis.yml +43 -24
- data/CHANGELOG.md +54 -6
- data/Gemfile +8 -7
- data/Gemfile-libxml-ruby +3 -0
- data/LICENSE-DEPENDENCIES.md +1612 -0
- data/{LICENSE.txt → LICENSE.md} +1 -1
- data/Manifest.txt +5 -8
- data/README.md +8 -5
- data/Rakefile +15 -31
- data/appveyor.yml +2 -0
- data/dependencies.yml +12 -7
- data/ext/java/nokogiri/HtmlDocument.java +2 -2
- data/ext/java/nokogiri/HtmlSaxParserContext.java +20 -21
- data/ext/java/nokogiri/HtmlSaxPushParser.java +6 -10
- data/ext/java/nokogiri/NokogiriService.java +10 -31
- data/ext/java/nokogiri/XmlAttr.java +1 -26
- data/ext/java/nokogiri/XmlCdata.java +0 -1
- data/ext/java/nokogiri/XmlComment.java +1 -1
- data/ext/java/nokogiri/XmlDocument.java +4 -5
- data/ext/java/nokogiri/XmlDocumentFragment.java +29 -21
- data/ext/java/nokogiri/XmlDtd.java +1 -1
- data/ext/java/nokogiri/XmlElement.java +9 -10
- data/ext/java/nokogiri/XmlEntityDecl.java +4 -5
- data/ext/java/nokogiri/XmlNode.java +105 -103
- data/ext/java/nokogiri/XmlNodeSet.java +64 -76
- data/ext/java/nokogiri/XmlReader.java +48 -48
- data/ext/java/nokogiri/XmlRelaxng.java +1 -1
- data/ext/java/nokogiri/XmlSaxPushParser.java +37 -17
- data/ext/java/nokogiri/XmlSchema.java +7 -5
- data/ext/java/nokogiri/XmlSyntaxError.java +47 -35
- data/ext/java/nokogiri/XmlXpathContext.java +160 -132
- data/ext/java/nokogiri/XsltStylesheet.java +15 -24
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +19 -23
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +11 -13
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +5 -21
- data/ext/java/nokogiri/internals/NokogiriHandler.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +105 -142
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +16 -26
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +32 -50
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +10 -13
- data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +3 -10
- data/ext/java/nokogiri/internals/ParserContext.java +4 -8
- data/ext/java/nokogiri/internals/ReaderNode.java +53 -93
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +77 -89
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +6 -9
- data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +167 -0
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +17 -6
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +1 -1
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +28 -28
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +3 -4
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +2 -2
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +10 -10
- data/ext/java/nokogiri/internals/c14n/ElementProxy.java +5 -5
- data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +2 -2
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +1 -1
- data/ext/java/nokogiri/internals/c14n/XMLUtils.java +2 -2
- data/ext/java/org/apache/xml/dtm/ref/dom2dtm/DOM2DTMExt.java +1749 -0
- data/ext/nokogiri/extconf.rb +12 -17
- data/ext/nokogiri/nokogiri.h +0 -10
- data/ext/nokogiri/xml_attr.c +12 -8
- data/ext/nokogiri/xml_node.c +17 -14
- data/ext/nokogiri/xml_sax_push_parser.c +56 -12
- data/lib/nokogiri/html/sax/parser.rb +10 -0
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version.rb +5 -4
- data/lib/nokogiri/xml/document.rb +9 -9
- data/lib/nokogiri/xml/node.rb +7 -7
- data/lib/nokogiri/xml/node_set.rb +12 -7
- data/lib/nokogiri/xml/sax/parser.rb +6 -7
- data/lib/nokogiri/xml/searchable.rb +34 -25
- data/lib/nokogiri/xml/syntax_error.rb +24 -1
- data/test/decorators/test_slop.rb +4 -1
- data/test/helper.rb +10 -0
- data/test/html/sax/test_parser.rb +27 -0
- data/test/html/test_document.rb +12 -1
- data/test/html/test_document_encoding.rb +1 -3
- data/test/html/test_document_fragment.rb +3 -0
- data/test/xml/sax/test_push_parser.rb +48 -0
- data/test/xml/test_attr.rb +7 -0
- data/test/xml/test_document.rb +1 -1
- data/test/xml/test_document_fragment.rb +27 -0
- data/test/xml/test_entity_reference.rb +2 -2
- data/test/xml/test_node.rb +12 -15
- data/test/xml/test_node_reparenting.rb +14 -0
- data/test/xml/test_node_set.rb +8 -6
- data/test/xml/test_reader.rb +19 -0
- data/test/xml/test_syntax_error.rb +21 -15
- data/test/xml/test_unparented_node.rb +54 -11
- data/test/xml/test_xpath.rb +23 -6
- metadata +32 -20
- data/ext/java/nokogiri/internals/NokogiriDocumentCache.java +0 -73
- data/ext/java/nokogiri/internals/XsltExtensionFunction.java +0 -72
- data/suppressions/nokogiri_ree-1.8.7.358.supp +0 -61
- data/suppressions/nokogiri_ruby-1.8.7.370.supp +0 -0
- data/suppressions/nokogiri_ruby-1.9.2.320.supp +0 -28
- data/suppressions/nokogiri_ruby-1.9.3.327.supp +0 -28
- data/test_all +0 -105
@@ -82,16 +82,12 @@ import org.w3c.dom.Document;
|
|
82
82
|
*/
|
83
83
|
@JRubyClass(name="Nokogiri::XSLT::Stylesheet")
|
84
84
|
public class XsltStylesheet extends RubyObject {
|
85
|
-
|
85
|
+
|
86
86
|
private TransformerFactory factory = null;
|
87
87
|
private Templates sheet = null;
|
88
88
|
private IRubyObject stylesheet = null;
|
89
89
|
private boolean htmlish = false;
|
90
90
|
|
91
|
-
public static Map<String, Object> getRegistry() {
|
92
|
-
return registry;
|
93
|
-
}
|
94
|
-
|
95
91
|
public XsltStylesheet(Ruby ruby, RubyClass rubyClass) {
|
96
92
|
super(ruby, rubyClass);
|
97
93
|
}
|
@@ -215,8 +211,7 @@ public class XsltStylesheet extends RubyObject {
|
|
215
211
|
|
216
212
|
NokogiriXsltErrorListener elistener = new NokogiriXsltErrorListener();
|
217
213
|
DOMSource domSource = new DOMSource(((XmlDocument) args[0]).getDocument());
|
218
|
-
DOMResult result = null;
|
219
|
-
String stringResult = null;
|
214
|
+
final DOMResult result; String stringResult = null;
|
220
215
|
try{
|
221
216
|
result = tryXsltTransformation(context, args, domSource, elistener); // DOMResult
|
222
217
|
if (result.getNode().getFirstChild() == null) {
|
@@ -275,16 +270,17 @@ public class XsltStylesheet extends RubyObject {
|
|
275
270
|
pwriter.connect(preader);
|
276
271
|
StreamResult result = new StreamResult(pwriter);
|
277
272
|
transf.transform(domSource, result);
|
273
|
+
|
278
274
|
char[] cbuf = new char[1024];
|
279
275
|
int len = preader.read(cbuf, 0, 1024);
|
280
|
-
StringBuilder builder = new StringBuilder();
|
281
|
-
builder.append(
|
282
|
-
htmlish = isHtml(builder
|
276
|
+
StringBuilder builder = new StringBuilder(len);
|
277
|
+
builder.append(cbuf, 0, len);
|
278
|
+
htmlish = isHtml(builder); // judge from the first chunk
|
283
279
|
|
284
280
|
while (len == 1024) {
|
285
281
|
len = preader.read(cbuf, 0, 1024);
|
286
282
|
if (len > 0) {
|
287
|
-
builder.append(
|
283
|
+
builder.append(cbuf, 0, len);
|
288
284
|
}
|
289
285
|
}
|
290
286
|
|
@@ -308,20 +304,18 @@ public class XsltStylesheet extends RubyObject {
|
|
308
304
|
|
309
305
|
private Templates getTemplatesFromStreamSource() throws TransformerConfigurationException {
|
310
306
|
if (stylesheet instanceof RubyString) {
|
311
|
-
StringReader reader = new StringReader(
|
307
|
+
StringReader reader = new StringReader(stylesheet.asJavaString());
|
312
308
|
StreamSource xsltStreamSource = new StreamSource(reader);
|
313
309
|
return factory.newTemplates(xsltStreamSource);
|
314
310
|
}
|
315
311
|
return null;
|
316
312
|
}
|
317
313
|
|
318
|
-
private static Pattern
|
319
|
-
Pattern.compile("<(%s)*html", Pattern.CASE_INSENSITIVE);
|
314
|
+
private static final Pattern HTML_TAG = Pattern.compile("<(%s)*html", Pattern.CASE_INSENSITIVE);
|
320
315
|
|
321
|
-
private boolean isHtml(
|
322
|
-
Matcher
|
323
|
-
|
324
|
-
else return false;
|
316
|
+
private static boolean isHtml(CharSequence chunk) {
|
317
|
+
Matcher match = HTML_TAG.matcher(chunk);
|
318
|
+
return match.find();
|
325
319
|
}
|
326
320
|
|
327
321
|
private IRubyObject createDocumentFromString(ThreadContext context, Ruby runtime, String stringResult) {
|
@@ -346,12 +340,9 @@ public class XsltStylesheet extends RubyObject {
|
|
346
340
|
}
|
347
341
|
}
|
348
342
|
|
349
|
-
private void argumentTypeCheck(Ruby runtime, IRubyObject arg) {
|
350
|
-
if (arg instanceof XmlDocument)
|
351
|
-
|
352
|
-
} else {
|
353
|
-
throw runtime.newArgumentError("argument must be a Nokogiri::XML::Document");
|
354
|
-
}
|
343
|
+
private static void argumentTypeCheck(Ruby runtime, IRubyObject arg) {
|
344
|
+
if (arg instanceof XmlDocument) return;
|
345
|
+
throw runtime.newArgumentError("argument must be a Nokogiri::XML::Document");
|
355
346
|
}
|
356
347
|
|
357
348
|
@JRubyMethod(name = {"registr", "register"}, meta = true)
|
@@ -39,7 +39,6 @@ import nokogiri.HtmlDocument;
|
|
39
39
|
import nokogiri.NokogiriService;
|
40
40
|
import nokogiri.XmlDocument;
|
41
41
|
|
42
|
-
import org.apache.xerces.parsers.DOMParser;
|
43
42
|
import org.apache.xerces.xni.Augmentations;
|
44
43
|
import org.apache.xerces.xni.QName;
|
45
44
|
import org.apache.xerces.xni.XMLAttributes;
|
@@ -54,6 +53,7 @@ import org.jruby.runtime.ThreadContext;
|
|
54
53
|
import org.jruby.runtime.builtin.IRubyObject;
|
55
54
|
import org.w3c.dom.Document;
|
56
55
|
import org.w3c.dom.NamedNodeMap;
|
56
|
+
import org.w3c.dom.Node;
|
57
57
|
import org.w3c.dom.NodeList;
|
58
58
|
|
59
59
|
/**
|
@@ -65,8 +65,6 @@ import org.w3c.dom.NodeList;
|
|
65
65
|
*/
|
66
66
|
public class HtmlDomParserContext extends XmlDomParserContext {
|
67
67
|
|
68
|
-
private String encoding;
|
69
|
-
|
70
68
|
public HtmlDomParserContext(Ruby runtime, IRubyObject options) {
|
71
69
|
super(runtime, options);
|
72
70
|
}
|
@@ -87,7 +85,7 @@ public class HtmlDomParserContext extends XmlDomParserContext {
|
|
87
85
|
@Override
|
88
86
|
protected void initParser(Ruby runtime) {
|
89
87
|
XMLParserConfiguration config = new HTMLConfiguration();
|
90
|
-
XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
|
88
|
+
//XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
|
91
89
|
XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
|
92
90
|
//XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
|
93
91
|
XMLDocumentFilter[] filters = { elementValidityCheckFilter};
|
@@ -121,14 +119,12 @@ public class HtmlDomParserContext extends XmlDomParserContext {
|
|
121
119
|
|
122
120
|
@Override
|
123
121
|
protected XmlDocument getNewEmptyDocument(ThreadContext context) {
|
124
|
-
IRubyObject[] args =
|
122
|
+
IRubyObject[] args = IRubyObject.NULL_ARRAY;
|
125
123
|
return (XmlDocument) XmlDocument.rbNew(context, getNokogiriClass(context.getRuntime(), "Nokogiri::HTML::Document"), args);
|
126
124
|
}
|
127
125
|
|
128
126
|
@Override
|
129
|
-
protected XmlDocument wrapDocument(ThreadContext context,
|
130
|
-
RubyClass klazz,
|
131
|
-
Document document) {
|
127
|
+
protected XmlDocument wrapDocument(ThreadContext context, RubyClass klazz, Document document) {
|
132
128
|
HtmlDocument htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), klazz);
|
133
129
|
htmlDocument.setDocumentNode(context, document);
|
134
130
|
if (ruby_encoding.isNil()) {
|
@@ -149,18 +145,18 @@ public class HtmlDomParserContext extends XmlDomParserContext {
|
|
149
145
|
// NekoHtml doesn't understand HTML5 meta tag format. This fails to detect charset
|
150
146
|
// from an HTML5 style meta tag. Luckily, the meta tag and charset exists in DOM tree
|
151
147
|
// so, this method attempts to find the charset.
|
152
|
-
private String tryGetCharsetFromHtml5MetaTag(Document document) {
|
148
|
+
private static String tryGetCharsetFromHtml5MetaTag(Document document) {
|
153
149
|
if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) return null;
|
154
|
-
NodeList list = document.getDocumentElement().getChildNodes();
|
150
|
+
NodeList list = document.getDocumentElement().getChildNodes(); Node item;
|
155
151
|
for (int i = 0; i < list.getLength(); i++) {
|
156
|
-
if ("head".equalsIgnoreCase(list.item(i).getNodeName())) {
|
157
|
-
NodeList headers =
|
152
|
+
if ("head".equalsIgnoreCase((item = list.item(i)).getNodeName())) {
|
153
|
+
NodeList headers = item.getChildNodes();
|
158
154
|
for (int j = 0; j < headers.getLength(); j++) {
|
159
|
-
if ("meta".equalsIgnoreCase(headers.item(j).getNodeName())) {
|
160
|
-
NamedNodeMap nodeMap =
|
155
|
+
if ("meta".equalsIgnoreCase((item = headers.item(j)).getNodeName())) {
|
156
|
+
NamedNodeMap nodeMap = item.getAttributes();
|
161
157
|
for (int k = 0; k < nodeMap.getLength(); k++) {
|
162
|
-
if ("charset".equalsIgnoreCase(nodeMap.item(k).getNodeName())) {
|
163
|
-
return
|
158
|
+
if ("charset".equalsIgnoreCase((item = nodeMap.item(k)).getNodeName())) {
|
159
|
+
return item.getNodeValue();
|
164
160
|
}
|
165
161
|
}
|
166
162
|
}
|
@@ -227,12 +223,12 @@ public class HtmlDomParserContext extends XmlDomParserContext {
|
|
227
223
|
{} // z
|
228
224
|
};
|
229
225
|
|
230
|
-
private boolean isValid(String
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
for (int i=0; i<
|
235
|
-
if (
|
226
|
+
private static boolean isValid(final String name) {
|
227
|
+
int index = name.charAt(0) - 97;
|
228
|
+
if (index >= element_names.length) return false;
|
229
|
+
String[] elementNames = element_names[index];
|
230
|
+
for (int i=0; i<elementNames.length; i++) {
|
231
|
+
if (name.equals(elementNames[i])) {
|
236
232
|
return true;
|
237
233
|
}
|
238
234
|
}
|
@@ -242,7 +238,7 @@ public class HtmlDomParserContext extends XmlDomParserContext {
|
|
242
238
|
@Override
|
243
239
|
public void startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException {
|
244
240
|
if (!isValid(name.rawname)) {
|
245
|
-
errorHandler.
|
241
|
+
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
246
242
|
}
|
247
243
|
super.startElement(name, attrs, augs);
|
248
244
|
}
|
@@ -99,7 +99,7 @@ public class NokogiriDomParser extends DOMParser {
|
|
99
99
|
doc.setUserData(XmlDocument.DTD_RAW_DOCUMENT, dtd.getDocument(), null);
|
100
100
|
}
|
101
101
|
|
102
|
-
private class NokogiriXInlcudeEntityResolver implements org.xml.sax.EntityResolver {
|
102
|
+
private static class NokogiriXInlcudeEntityResolver implements org.xml.sax.EntityResolver {
|
103
103
|
InputSource source;
|
104
104
|
private NokogiriXInlcudeEntityResolver(InputSource source) {
|
105
105
|
this.source = source;
|
@@ -41,7 +41,7 @@ public class NokogiriEncodingReaderWrapper extends InputStream {
|
|
41
41
|
this.encodingReader = encodingReader;
|
42
42
|
this.ruby = context.getRuntime();
|
43
43
|
|
44
|
-
if (!RuntimeHelpers.invoke(context, encodingReader, "respond_to?", ruby.newSymbol("read")
|
44
|
+
if (!RuntimeHelpers.invoke(context, encodingReader, "respond_to?", ruby.newSymbol("read")).isTrue()
|
45
45
|
|| encodingReader.getInstanceVariable("@io") == null) {
|
46
46
|
throw ruby.newArgumentError("Argument doesn't respond to read or doesn't have instance variable @io");
|
47
47
|
}
|
@@ -19,7 +19,7 @@ import org.xml.sax.ext.EntityResolver2;
|
|
19
19
|
* to be relative to the current directory of the Ruby runtime.
|
20
20
|
*/
|
21
21
|
public class NokogiriEntityResolver implements EntityResolver2 {
|
22
|
-
protected Ruby runtime;
|
22
|
+
protected final Ruby runtime;
|
23
23
|
private final NokogiriErrorHandler handler;
|
24
24
|
private final Options options;
|
25
25
|
|
@@ -33,7 +33,7 @@ public class NokogiriEntityResolver implements EntityResolver2 {
|
|
33
33
|
@Override
|
34
34
|
public InputSource getExternalSubset(String name, String baseURI)
|
35
35
|
throws SAXException, IOException {
|
36
|
-
|
36
|
+
return null;
|
37
37
|
}
|
38
38
|
|
39
39
|
@Override
|
@@ -51,17 +51,16 @@ public class NokogiriEntityResolver implements EntityResolver2 {
|
|
51
51
|
return resolveEntity(runtime, name, publicId, baseURI, systemId);
|
52
52
|
}
|
53
53
|
|
54
|
-
private File join(String parent, String child) {
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
return new File(parent, child);
|
54
|
+
private static File join(String parent, String child) {
|
55
|
+
if (new File(parent).isFile()) {
|
56
|
+
parent = new File(parent).getParent();
|
57
|
+
}
|
58
|
+
return new File(parent, child);
|
60
59
|
}
|
61
60
|
|
62
|
-
private InputSource emptyInputSource(InputSource source) {
|
63
|
-
|
64
|
-
|
61
|
+
private static InputSource emptyInputSource(InputSource source) {
|
62
|
+
source.setByteStream(new ByteArrayInputStream(new byte[0]));
|
63
|
+
return source;
|
65
64
|
}
|
66
65
|
|
67
66
|
private boolean shouldLoadDtd() {
|
@@ -69,8 +68,7 @@ public class NokogiriEntityResolver implements EntityResolver2 {
|
|
69
68
|
}
|
70
69
|
|
71
70
|
private void addError(String errorMessage) {
|
72
|
-
|
73
|
-
handler.errors.add(new Exception(errorMessage));
|
71
|
+
if (handler != null) handler.errors.add(new Exception(errorMessage));
|
74
72
|
}
|
75
73
|
|
76
74
|
/**
|
@@ -32,18 +32,10 @@
|
|
32
32
|
|
33
33
|
package nokogiri.internals;
|
34
34
|
|
35
|
-
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
36
|
-
|
37
35
|
import java.util.ArrayList;
|
38
36
|
import java.util.List;
|
39
37
|
|
40
|
-
import nokogiri.NokogiriService;
|
41
|
-
import nokogiri.XmlSyntaxError;
|
42
|
-
|
43
38
|
import org.apache.xerces.xni.parser.XMLErrorHandler;
|
44
|
-
import org.jruby.Ruby;
|
45
|
-
import org.jruby.runtime.ThreadContext;
|
46
|
-
import org.jruby.runtime.builtin.IRubyObject;
|
47
39
|
import org.xml.sax.ErrorHandler;
|
48
40
|
|
49
41
|
/**
|
@@ -56,30 +48,22 @@ import org.xml.sax.ErrorHandler;
|
|
56
48
|
* @author Yoko Harada <yokolet@gmail.com>
|
57
49
|
*/
|
58
50
|
public abstract class NokogiriErrorHandler implements ErrorHandler, XMLErrorHandler {
|
59
|
-
protected List<Exception> errors;
|
51
|
+
protected final List<Exception> errors;
|
60
52
|
protected boolean noerror;
|
61
53
|
protected boolean nowarning;
|
62
54
|
|
63
55
|
public NokogiriErrorHandler(boolean noerror, boolean nowarning) {
|
64
|
-
errors = new ArrayList<Exception>();
|
56
|
+
this.errors = new ArrayList<Exception>(4);
|
65
57
|
this.noerror = noerror;
|
66
58
|
this.nowarning = nowarning;
|
67
59
|
}
|
68
60
|
|
69
|
-
|
61
|
+
List<Exception> getErrors() { return errors; }
|
70
62
|
|
71
|
-
public
|
72
|
-
Ruby runtime = context.getRuntime();
|
73
|
-
List<IRubyObject> res = new ArrayList<IRubyObject>();
|
74
|
-
for (int i = 0; i < errors.size(); i++) {
|
75
|
-
XmlSyntaxError xmlSyntaxError = (XmlSyntaxError) NokogiriService.XML_SYNTAXERROR_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime, "Nokogiri::XML::SyntaxError"));
|
76
|
-
xmlSyntaxError.setException(errors.get(i));
|
77
|
-
res.add(xmlSyntaxError);
|
78
|
-
}
|
79
|
-
return res;
|
80
|
-
}
|
63
|
+
public void addError(Exception ex) { errors.add(ex); }
|
81
64
|
|
82
65
|
protected boolean usesNekoHtml(String domain) {
|
83
66
|
return "http://cyberneko.org/html".equals(domain);
|
84
67
|
}
|
68
|
+
|
85
69
|
}
|
@@ -242,7 +242,7 @@ public class NokogiriHandler extends DefaultHandler2 implements XmlDeclHandler {
|
|
242
242
|
@Override
|
243
243
|
public void characters(char[] ch, int start, int length) throws SAXException {
|
244
244
|
StringBuffer sb = characterStack.peek();
|
245
|
-
sb.append(
|
245
|
+
sb.append(ch, start, length);
|
246
246
|
}
|
247
247
|
|
248
248
|
@Override
|
@@ -32,17 +32,14 @@
|
|
32
32
|
|
33
33
|
package nokogiri.internals;
|
34
34
|
|
35
|
+
import java.io.ByteArrayInputStream;
|
35
36
|
import java.io.File;
|
36
37
|
import java.io.UnsupportedEncodingException;
|
37
38
|
import java.lang.reflect.InvocationTargetException;
|
38
39
|
import java.lang.reflect.Method;
|
39
40
|
import java.nio.ByteBuffer;
|
40
41
|
import java.nio.CharBuffer;
|
41
|
-
import java.nio.charset.CharacterCodingException;
|
42
42
|
import java.nio.charset.Charset;
|
43
|
-
import java.nio.charset.CharsetEncoder;
|
44
|
-
import java.util.ArrayList;
|
45
|
-
import java.util.List;
|
46
43
|
import java.util.Set;
|
47
44
|
import java.util.regex.Matcher;
|
48
45
|
import java.util.regex.Pattern;
|
@@ -62,7 +59,6 @@ import nokogiri.XmlProcessingInstruction;
|
|
62
59
|
import nokogiri.XmlText;
|
63
60
|
import nokogiri.XmlXpathContext;
|
64
61
|
|
65
|
-
import org.jcodings.specific.UTF8Encoding;
|
66
62
|
import org.jruby.Ruby;
|
67
63
|
import org.jruby.RubyArray;
|
68
64
|
import org.jruby.RubyClass;
|
@@ -193,28 +189,28 @@ public class NokogiriHelpers {
|
|
193
189
|
return NokogiriService.getNokogiriClassCache(ruby).get(name);
|
194
190
|
}
|
195
191
|
|
196
|
-
public static IRubyObject stringOrNil(Ruby runtime, String
|
197
|
-
|
198
|
-
return convertJavaStringToRuby(runtime, s);
|
192
|
+
public static IRubyObject stringOrNil(Ruby runtime, String str) {
|
193
|
+
return str == null ? runtime.getNil() : convertString(runtime, str);
|
199
194
|
}
|
200
|
-
|
195
|
+
|
196
|
+
public static IRubyObject stringOrNil(Ruby runtime, CharSequence str) {
|
197
|
+
return str == null ? runtime.getNil() : convertString(runtime, str);
|
198
|
+
}
|
199
|
+
|
201
200
|
public static IRubyObject stringOrNil(Ruby runtime, byte[] bytes) {
|
202
|
-
|
203
|
-
return RubyString.newString(runtime, bytes);
|
201
|
+
return bytes == null ? runtime.getNil() : RubyString.newString(runtime, bytes);
|
204
202
|
}
|
205
|
-
|
206
|
-
public static IRubyObject stringOrBlank(Ruby runtime, String
|
207
|
-
|
208
|
-
return convertJavaStringToRuby(runtime, s);
|
203
|
+
|
204
|
+
public static IRubyObject stringOrBlank(Ruby runtime, String str) {
|
205
|
+
return str == null ? runtime.newString() : convertString(runtime, str);
|
209
206
|
}
|
210
207
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
}
|
208
|
+
public static RubyString convertString(Ruby runtime, String str) {
|
209
|
+
return RubyString.newUTF8String(runtime, str);
|
210
|
+
}
|
211
|
+
|
212
|
+
public static RubyString convertString(Ruby runtime, CharSequence str) {
|
213
|
+
return RubyString.newUTF8String(runtime, str);
|
218
214
|
}
|
219
215
|
|
220
216
|
/**
|
@@ -259,12 +255,7 @@ public class NokogiriHelpers {
|
|
259
255
|
return ("xmlns".equals(localName)) ? null : localName;
|
260
256
|
}
|
261
257
|
|
262
|
-
private static Charset
|
263
|
-
|
264
|
-
private static Charset getCharsetUTF8() {
|
265
|
-
if (utf8 == null) utf8 = Charset.forName("UTF-8");
|
266
|
-
return utf8;
|
267
|
-
}
|
258
|
+
private static final Charset UTF8 = Charset.forName("UTF-8");
|
268
259
|
|
269
260
|
/**
|
270
261
|
* Converts a RubyString in to a Java String. Assumes the
|
@@ -290,15 +281,7 @@ public class NokogiriHelpers {
|
|
290
281
|
}
|
291
282
|
|
292
283
|
private static String toJavaString(RubyString str) {
|
293
|
-
|
294
|
-
try {
|
295
|
-
if (str.getRuntime().is1_9()) {
|
296
|
-
return new String(value.getUnsafeBytes(), value.begin(), value.length(), str.getEncoding().toString());
|
297
|
-
}
|
298
|
-
return RubyEncoding.decodeUTF8(value.getUnsafeBytes(), value.begin(), value.length());
|
299
|
-
} catch (UnsupportedEncodingException uee) {
|
300
|
-
return str.toString();
|
301
|
-
}
|
284
|
+
return str.decodeString(); // toString()
|
302
285
|
}
|
303
286
|
|
304
287
|
public static String rubyStringToString(RubyString str) {
|
@@ -307,16 +290,15 @@ public class NokogiriHelpers {
|
|
307
290
|
int offset = byteList.begin();
|
308
291
|
int len = byteList.length();
|
309
292
|
ByteBuffer buf = ByteBuffer.wrap(data, offset, len);
|
310
|
-
return
|
293
|
+
return UTF8.decode(buf).toString();
|
311
294
|
}
|
312
|
-
|
313
|
-
public static
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
if (obj != null) list.add(obj.toString());
|
295
|
+
|
296
|
+
public static ByteArrayInputStream stringBytesToStream(final IRubyObject str) {
|
297
|
+
if (str instanceof RubyString || str.respondsTo("to_str")) {
|
298
|
+
final ByteList bytes = str.convertToString().getByteList();
|
299
|
+
return new ByteArrayInputStream(bytes.unsafeBytes(), bytes.begin(), bytes.length());
|
318
300
|
}
|
319
|
-
return
|
301
|
+
return null;
|
320
302
|
}
|
321
303
|
|
322
304
|
public static String getNodeCompletePath(Node node) {
|
@@ -325,19 +307,14 @@ public class NokogiriHelpers {
|
|
325
307
|
|
326
308
|
// TODO: Rename buffer to path.
|
327
309
|
String buffer = "";
|
328
|
-
String sep;
|
329
|
-
String name;
|
330
|
-
|
331
|
-
int occur = 0;
|
332
|
-
boolean generic;
|
333
310
|
|
334
311
|
cur = node;
|
335
312
|
|
336
313
|
do {
|
337
|
-
name = "";
|
338
|
-
sep = "?";
|
339
|
-
occur = 0;
|
340
|
-
generic = false;
|
314
|
+
String name = "";
|
315
|
+
String sep = "?";
|
316
|
+
int occur = 0;
|
317
|
+
boolean generic = false;
|
341
318
|
|
342
319
|
if(cur.getNodeType() == Node.DOCUMENT_NODE) {
|
343
320
|
if(buffer.startsWith("/")) break;
|
@@ -552,16 +529,16 @@ public class NokogiriHelpers {
|
|
552
529
|
((a != null) && (b != null) && (b.equals(a))));
|
553
530
|
}
|
554
531
|
|
555
|
-
private static Pattern encoded_pattern = Pattern.compile("&|>|<| ");
|
556
|
-
private static
|
557
|
-
private static
|
558
|
-
private static String[] decoded = {"&", ">", "<", "\r"};
|
532
|
+
private static final Pattern encoded_pattern = Pattern.compile("&|>|<| ");
|
533
|
+
private static final String[] encoded = {"&", ">", "<", " "};
|
534
|
+
private static final Pattern decoded_pattern = Pattern.compile("&|>|<|\r");
|
535
|
+
private static final String[] decoded = {"&", ">", "<", "\r"};
|
559
536
|
|
560
|
-
private static
|
537
|
+
private static StringBuffer convert(Pattern ptn, CharSequence input, String[] oldChars, String[] newChars) {
|
561
538
|
Matcher matcher = ptn.matcher(input);
|
562
539
|
boolean result = matcher.find();
|
563
|
-
StringBuffer sb = new StringBuffer();
|
564
|
-
while(result) {
|
540
|
+
StringBuffer sb = new StringBuffer(input.length() + 8);
|
541
|
+
while (result) {
|
565
542
|
String matched = matcher.group();
|
566
543
|
String replacement = "";
|
567
544
|
for (int i=0; i<oldChars.length; i++) {
|
@@ -574,15 +551,15 @@ public class NokogiriHelpers {
|
|
574
551
|
result = matcher.find();
|
575
552
|
}
|
576
553
|
matcher.appendTail(sb);
|
577
|
-
return sb
|
554
|
+
return sb;
|
578
555
|
}
|
579
556
|
|
580
|
-
public static
|
581
|
-
return convert(decoded_pattern,
|
557
|
+
public static CharSequence encodeJavaString(CharSequence str) {
|
558
|
+
return convert(decoded_pattern, str, decoded, encoded);
|
582
559
|
}
|
583
560
|
|
584
|
-
public static
|
585
|
-
return convert(encoded_pattern,
|
561
|
+
public static CharSequence decodeJavaString(CharSequence str) {
|
562
|
+
return convert(encoded_pattern, str, encoded, decoded);
|
586
563
|
}
|
587
564
|
|
588
565
|
public static String getNodeName(Node node) {
|
@@ -601,8 +578,7 @@ public class NokogiriHelpers {
|
|
601
578
|
|
602
579
|
public static final String XMLNS_URI = "http://www.w3.org/2000/xmlns/";
|
603
580
|
public static boolean isNamespace(Node node) {
|
604
|
-
return (XMLNS_URI.equals(node.getNamespaceURI()) ||
|
605
|
-
isNamespace(node.getNodeName()));
|
581
|
+
return (XMLNS_URI.equals(node.getNamespaceURI()) || isNamespace(node.getNodeName()));
|
606
582
|
}
|
607
583
|
|
608
584
|
public static boolean isNamespace(String nodeName) {
|
@@ -618,44 +594,40 @@ public class NokogiriHelpers {
|
|
618
594
|
}
|
619
595
|
|
620
596
|
public static boolean isWhitespaceText(ThreadContext context, IRubyObject obj) {
|
621
|
-
if (obj == null || obj.isNil()) return false;
|
622
|
-
|
623
|
-
XmlNode node = (XmlNode) obj;
|
624
|
-
if (!(node instanceof XmlText))
|
625
|
-
return false;
|
597
|
+
//if (obj == null || obj.isNil()) return false;
|
598
|
+
if ( !(obj instanceof XmlText) ) return false;
|
626
599
|
|
627
|
-
|
628
|
-
return content
|
600
|
+
CharSequence content = ((XmlNode) obj).getContentImpl();
|
601
|
+
return content == null || isWhitespaceText(content);
|
629
602
|
}
|
630
603
|
|
631
|
-
public static boolean isWhitespaceText(
|
632
|
-
|
604
|
+
public static boolean isWhitespaceText(CharSequence str) {
|
605
|
+
int len = str.length(); int beg = 0;
|
606
|
+
while ((beg < len) && (str.charAt(beg) <= ' ')) beg++;
|
607
|
+
return beg == len;
|
633
608
|
}
|
634
609
|
|
635
|
-
public static
|
636
|
-
|
637
|
-
|
610
|
+
public static CharSequence canonicalizeWhitespace(CharSequence str) {
|
611
|
+
final int len = str.length();
|
612
|
+
StringBuilder sb = new StringBuilder(len);
|
638
613
|
boolean newline_added = false;
|
639
|
-
for (int i=0; i<
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
newline_added = true;
|
614
|
+
for ( int i = 0; i < len; i++ ) {
|
615
|
+
char c = str.charAt(i);
|
616
|
+
if ( c == '\n' ) {
|
617
|
+
if ( ! newline_added ) {
|
618
|
+
sb.append(c); newline_added = true;
|
644
619
|
}
|
645
620
|
} else {
|
646
|
-
sb.append(
|
621
|
+
sb.append(c);
|
647
622
|
}
|
648
623
|
}
|
649
|
-
return sb
|
624
|
+
return sb;
|
650
625
|
}
|
651
626
|
|
652
627
|
public static String newQName(String newPrefix, Node node) {
|
653
628
|
String tagName = getLocalPart(node.getNodeName());
|
654
|
-
if(newPrefix == null)
|
655
|
-
|
656
|
-
} else {
|
657
|
-
return newPrefix + ":" + tagName;
|
658
|
-
}
|
629
|
+
if (newPrefix == null) return tagName;
|
630
|
+
return newPrefix + ':' + tagName;
|
659
631
|
}
|
660
632
|
|
661
633
|
public static RubyArray nodeListToRubyArray(Ruby ruby, NodeList nodes) {
|
@@ -695,8 +667,7 @@ public class NokogiriHelpers {
|
|
695
667
|
}
|
696
668
|
|
697
669
|
private static String guessEncoding() {
|
698
|
-
String name =
|
699
|
-
if (name == null) name = System.getProperty("file.encoding");
|
670
|
+
String name = System.getProperty("file.encoding");
|
700
671
|
if (name == null) name = "UTF-8";
|
701
672
|
return name;
|
702
673
|
}
|
@@ -722,8 +693,8 @@ public class NokogiriHelpers {
|
|
722
693
|
|
723
694
|
private static String resolveSystemId(String baseName, String systemId) {
|
724
695
|
if (baseName == null || baseName.length() < 1) return null;
|
725
|
-
String parentName
|
726
|
-
baseName = baseName.
|
696
|
+
String parentName;
|
697
|
+
baseName = baseName.replace("%20", " ");
|
727
698
|
File base = new File(baseName);
|
728
699
|
if (base.isDirectory()) parentName = baseName;
|
729
700
|
else parentName = base.getParent();
|
@@ -735,53 +706,45 @@ public class NokogiriHelpers {
|
|
735
706
|
}
|
736
707
|
|
737
708
|
public static boolean isUTF8(String encoding) {
|
738
|
-
if (encoding == null) return true;
|
739
|
-
|
740
|
-
return ret == 0;
|
709
|
+
if (encoding == null) return true; // no need to convert encoding
|
710
|
+
return Charset.forName(encoding).compareTo(UTF8) == 0;
|
741
711
|
}
|
742
712
|
|
743
|
-
public static
|
744
|
-
|
745
|
-
CharBuffer charBuffer = CharBuffer.wrap(input_string);
|
746
|
-
ByteBuffer byteBuffer = encoder.encode(charBuffer);
|
747
|
-
byte[] buffer = new byte[byteBuffer.remaining()];
|
748
|
-
byteBuffer.get(buffer);
|
749
|
-
return buffer;
|
713
|
+
public static ByteBuffer convertEncoding(Charset output_charset, CharSequence input_string) {
|
714
|
+
return output_charset.encode(CharBuffer.wrap(input_string)); // does replace implicitly on un-mappable characters
|
750
715
|
}
|
751
716
|
|
752
|
-
public static
|
753
|
-
if (!(doc instanceof HtmlDocument)) return
|
717
|
+
public static CharSequence convertEncodingByNKFIfNecessary(ThreadContext context, XmlDocument doc, CharSequence str) {
|
718
|
+
if (!(doc instanceof HtmlDocument)) return str;
|
754
719
|
String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding();
|
755
|
-
if (parsed_encoding == null) return
|
720
|
+
if (parsed_encoding == null) return str;
|
756
721
|
String ruby_encoding = rubyStringToString(doc.getEncoding());
|
757
|
-
if (ruby_encoding == null) return
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
}
|
722
|
+
if (ruby_encoding == null) return str;
|
723
|
+
Charset encoding = Charset.forName(ruby_encoding);
|
724
|
+
if (Charset.forName(parsed_encoding).compareTo(encoding) == 0) return str;
|
725
|
+
if (str.length() == 0) return str; // no need to convert
|
726
|
+
return NokogiriHelpers.nkf(context, encoding, str);
|
763
727
|
}
|
764
728
|
|
729
|
+
private static final ByteList _Sw = new ByteList(new byte[] { '-','S','w' }, false);
|
730
|
+
private static final ByteList _Jw = new ByteList(new byte[] { '-','J','w' }, false);
|
731
|
+
private static final ByteList _Ew = new ByteList(new byte[] { '-','E','w' }, false);
|
732
|
+
private static final ByteList _Ww = new ByteList(new byte[] { '-','W','w' }, false);
|
733
|
+
|
765
734
|
// This method is used from HTML documents. HTML meta tag with encoding specification
|
766
735
|
// might appear after non-ascii characters are used. For example, a title tag before
|
767
736
|
// a meta tag. In such a case, Xerces encodes characters in UTF-8 without seeing meta tag.
|
768
737
|
// Nokogiri uses NKF library to convert characters correct encoding. This means the method
|
769
738
|
// works only for JIS/Shift_JIS/EUC-JP.
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
if (NokogiriHelpers.shift_jis.compareTo(
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
} else {
|
780
|
-
// should not come here. should be treated before this method.
|
781
|
-
sb.append("W");
|
782
|
-
}
|
783
|
-
sb.append("w");
|
784
|
-
Class nkfClass = null;
|
739
|
+
private static CharSequence nkf(ThreadContext context, Charset encoding, CharSequence str) {
|
740
|
+
final Ruby runtime = context.getRuntime();
|
741
|
+
final ByteList opt;
|
742
|
+
if (NokogiriHelpers.shift_jis.compareTo(encoding) == 0) opt = _Sw;
|
743
|
+
else if (NokogiriHelpers.jis.compareTo(encoding) == 0) opt = _Jw;
|
744
|
+
else if (NokogiriHelpers.euc_jp.compareTo(encoding) == 0) opt = _Ew;
|
745
|
+
else opt = _Ww; // should not come here. should be treated before this method.
|
746
|
+
|
747
|
+
Class nkfClass;
|
785
748
|
try {
|
786
749
|
// JRuby 1.7 and later
|
787
750
|
nkfClass = runtime.getClassLoader().loadClass("org.jruby.ext.nkf.RubyNKF");
|
@@ -790,35 +753,35 @@ public class NokogiriHelpers {
|
|
790
753
|
// Before JRuby 1.7
|
791
754
|
nkfClass = runtime.getClassLoader().loadClass("org.jruby.RubyNKF");
|
792
755
|
} catch (ClassNotFoundException e2) {
|
793
|
-
return
|
756
|
+
return str;
|
794
757
|
}
|
795
758
|
}
|
796
759
|
Method nkf_method;
|
797
760
|
try {
|
798
761
|
nkf_method = nkfClass.getMethod("nkf", ThreadContext.class, IRubyObject.class, IRubyObject.class, IRubyObject.class);
|
799
762
|
RubyString r_str =
|
800
|
-
(RubyString)nkf_method.invoke(null,
|
763
|
+
(RubyString)nkf_method.invoke(null, context, null, runtime.newString(opt), runtime.newString(str.toString()));
|
801
764
|
return NokogiriHelpers.rubyStringToString(r_str);
|
802
765
|
} catch (SecurityException e) {
|
803
|
-
return
|
766
|
+
return str;
|
804
767
|
} catch (NoSuchMethodException e) {
|
805
|
-
return
|
768
|
+
return str;
|
806
769
|
} catch (IllegalArgumentException e) {
|
807
|
-
return
|
770
|
+
return str;
|
808
771
|
} catch (IllegalAccessException e) {
|
809
|
-
return
|
772
|
+
return str;
|
810
773
|
} catch (InvocationTargetException e) {
|
811
|
-
return
|
774
|
+
return str;
|
812
775
|
}
|
813
776
|
}
|
814
777
|
|
815
|
-
private static Charset shift_jis = Charset.forName("Shift_JIS");
|
816
|
-
private static Charset jis = Charset.forName("ISO-2022-JP");
|
817
|
-
private static Charset euc_jp = Charset.forName("EUC-JP");
|
778
|
+
private static final Charset shift_jis = Charset.forName("Shift_JIS");
|
779
|
+
private static final Charset jis = Charset.forName("ISO-2022-JP");
|
780
|
+
private static final Charset euc_jp = Charset.forName("EUC-JP");
|
818
781
|
|
819
782
|
public static boolean shouldEncode(Node text) {
|
820
|
-
|
821
|
-
|
783
|
+
final Boolean encoded = (Boolean) text.getUserData(NokogiriHelpers.ENCODED_STRING);
|
784
|
+
return encoded == null || ! encoded;
|
822
785
|
}
|
823
786
|
|
824
787
|
public static boolean shouldDecode(Node text) {
|