nokogiri 1.5.0-java → 1.5.1-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (81)
  1. data/CHANGELOG.ja.rdoc +56 -12
  2. data/CHANGELOG.rdoc +45 -0
  3. data/C_CODING_STYLE.rdoc +27 -0
  4. data/Manifest.txt +4 -0
  5. data/README.rdoc +11 -7
  6. data/Rakefile +44 -26
  7. data/bin/nokogiri +10 -2
  8. data/ext/java/nokogiri/HtmlDocument.java +37 -2
  9. data/ext/java/nokogiri/NokogiriService.java +10 -2
  10. data/ext/java/nokogiri/XmlAttr.java +1 -1
  11. data/ext/java/nokogiri/XmlDocument.java +68 -11
  12. data/ext/java/nokogiri/XmlDocumentFragment.java +16 -5
  13. data/ext/java/nokogiri/XmlElement.java +0 -40
  14. data/ext/java/nokogiri/XmlNamespace.java +8 -1
  15. data/ext/java/nokogiri/XmlNode.java +131 -27
  16. data/ext/java/nokogiri/XmlNodeSet.java +4 -1
  17. data/ext/java/nokogiri/XmlSaxParserContext.java +2 -13
  18. data/ext/java/nokogiri/XmlXpathContext.java +4 -1
  19. data/ext/java/nokogiri/XsltStylesheet.java +198 -37
  20. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +40 -2
  21. data/ext/java/nokogiri/internals/NokogiriHelpers.java +82 -9
  22. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +4 -3
  23. data/ext/java/nokogiri/internals/ParserContext.java +33 -3
  24. data/ext/java/nokogiri/internals/SaveContextVisitor.java +203 -12
  25. data/ext/java/nokogiri/internals/XmlDomParser.java +33 -2
  26. data/ext/java/nokogiri/internals/XmlDomParserContext.java +32 -12
  27. data/ext/nokogiri/extconf.rb +11 -3
  28. data/ext/nokogiri/html_document.c +16 -0
  29. data/ext/nokogiri/html_sax_parser_context.c +59 -37
  30. data/ext/nokogiri/html_sax_push_parser.c +87 -0
  31. data/ext/nokogiri/html_sax_push_parser.h +9 -0
  32. data/ext/nokogiri/nokogiri.c +6 -8
  33. data/ext/nokogiri/nokogiri.h +3 -0
  34. data/ext/nokogiri/xml_document.c +101 -3
  35. data/ext/nokogiri/xml_document.h +3 -3
  36. data/ext/nokogiri/xml_node.c +150 -58
  37. data/ext/nokogiri/xml_node_set.c +169 -120
  38. data/ext/nokogiri/xml_node_set.h +5 -0
  39. data/ext/nokogiri/xml_sax_parser_context.c +64 -41
  40. data/ext/nokogiri/xml_text.c +2 -0
  41. data/ext/nokogiri/xml_xpath_context.c +30 -24
  42. data/ext/nokogiri/xslt_stylesheet.c +62 -16
  43. data/ext/nokogiri/xslt_stylesheet.h +5 -0
  44. data/lib/nokogiri/css/parser.rb +163 -157
  45. data/lib/nokogiri/css/parser.y +6 -3
  46. data/lib/nokogiri/css/tokenizer.rb +1 -1
  47. data/lib/nokogiri/css/tokenizer.rex +1 -1
  48. data/lib/nokogiri/html.rb +1 -0
  49. data/lib/nokogiri/html/document.rb +82 -42
  50. data/lib/nokogiri/html/sax/push_parser.rb +16 -0
  51. data/lib/nokogiri/nokogiri.jar +0 -0
  52. data/lib/nokogiri/version.rb +1 -1
  53. data/lib/nokogiri/xml.rb +6 -0
  54. data/lib/nokogiri/xml/builder.rb +7 -1
  55. data/lib/nokogiri/xml/document.rb +32 -17
  56. data/lib/nokogiri/xml/document_fragment.rb +6 -1
  57. data/lib/nokogiri/xml/node.rb +40 -9
  58. data/lib/nokogiri/xslt.rb +5 -1
  59. data/tasks/cross_compile.rb +1 -0
  60. data/tasks/nokogiri.org.rb +6 -0
  61. data/tasks/test.rb +1 -0
  62. data/test/css/test_xpath_visitor.rb +6 -0
  63. data/test/helper.rb +1 -0
  64. data/test/html/test_document.rb +26 -0
  65. data/test/html/test_document_fragment.rb +1 -2
  66. data/test/test_memory_leak.rb +81 -1
  67. data/test/test_xslt_transforms.rb +152 -123
  68. data/test/xml/test_builder.rb +24 -2
  69. data/test/xml/test_c14n.rb +151 -0
  70. data/test/xml/test_document.rb +48 -0
  71. data/test/xml/test_namespace.rb +5 -0
  72. data/test/xml/test_node.rb +82 -1
  73. data/test/xml/test_node_attributes.rb +19 -0
  74. data/test/xml/test_node_inheritance.rb +32 -0
  75. data/test/xml/test_node_reparenting.rb +32 -0
  76. data/test/xml/test_node_set.rb +16 -8
  77. data/test/xml/test_reader_encoding.rb +16 -0
  78. data/test/xml/test_unparented_node.rb +32 -0
  79. data/test/xml/test_xinclude.rb +83 -0
  80. data/test/xml/test_xpath.rb +22 -0
  81. metadata +147 -123
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * (The MIT License)
3
3
  *
4
- * Copyright (c) 2008 - 2011:
4
+ * Copyright (c) 2008 - 2012:
5
5
  *
6
6
  * * {Aaron Patterson}[http://tenderlovemaking.com]
7
7
  * * {Mike Dalessio}[http://mike.daless.io]
@@ -132,6 +132,9 @@ public class XmlNodeSet extends RubyObject implements NodeList {
132
132
  @JRubyMethod
133
133
  public IRubyObject delete(ThreadContext context, IRubyObject node_or_namespace){
134
134
  if (nodes == null) return context.getRuntime().getNil();
135
+ if (node_or_namespace instanceof XmlNamespace) {
136
+ ((XmlNamespace)node_or_namespace).deleteHref();
137
+ }
135
138
  return nodes.delete(context, asXmlNodeOrNamespace(context, node_or_namespace), Block.NULL_BLOCK);
136
139
  }
137
140
 
@@ -32,7 +32,7 @@
32
32
 
33
33
  package nokogiri;
34
34
 
35
- import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
35
+ import static nokogiri.internals.NokogiriHelpers.isWhitespaceText;
36
36
  import static org.jruby.javasupport.util.RuntimeHelpers.invoke;
37
37
 
38
38
  import java.io.IOException;
@@ -248,7 +248,7 @@ public class XmlSaxParserContext extends ParserContext {
248
248
  // doesn't distinguish between empty and bad whereas
249
249
  // Nokogiri does.
250
250
  String message = spe.getMessage();
251
- if ("Premature end of file.".matches(message)) {
251
+ if ("Premature end of file.".matches(message) && stringDataSize < 1) {
252
252
  throw ruby.newRuntimeError(
253
253
  "couldn't parse document: " + message);
254
254
  } else {
@@ -340,17 +340,6 @@ public class XmlSaxParserContext extends ParserContext {
340
340
  ((XmlNode) doc).normalize();
341
341
  }
342
342
 
343
- protected boolean isWhitespaceText(ThreadContext context, IRubyObject obj) {
344
- if (obj == null || obj.isNil()) return false;
345
-
346
- XmlNode node = (XmlNode) obj;
347
- if (!(node instanceof XmlText))
348
- return false;
349
-
350
- String content = rubyStringToString(node.content(context));
351
- return content.trim().length() == 0;
352
- }
353
-
354
343
  @JRubyMethod(name="column")
355
344
  public IRubyObject column(ThreadContext context) {
356
345
  Integer number = handler.getColumn();
@@ -52,6 +52,7 @@ import org.jruby.Ruby;
52
52
  import org.jruby.RubyBoolean;
53
53
  import org.jruby.RubyClass;
54
54
  import org.jruby.RubyException;
55
+ import org.jruby.RubyFloat;
55
56
  import org.jruby.RubyNumeric;
56
57
  import org.jruby.RubyObject;
57
58
  import org.jruby.RubyString;
@@ -150,7 +151,9 @@ public class XmlXpathContext extends RubyObject {
150
151
  private IRubyObject tryGetOpaqueValue(XPathExpression xpathExpression) throws XPathExpressionException {
151
152
  String string = (String)xpathExpression.evaluate(context.node, XPathConstants.STRING);
152
153
  Double value = null;
153
- if ((value = getDoubleValue(string)) != null) return RubyNumeric.dbl2num(getRuntime(), value);
154
+ if ((value = getDoubleValue(string)) != null) {
155
+ return new RubyFloat(getRuntime(), value);
156
+ }
154
157
  if (doesMatch(boolean_pattern, string.toLowerCase())) return RubyBoolean.newBoolean(getRuntime(), Boolean.parseBoolean(string));
155
158
  return RubyString.newString(getRuntime(), string);
156
159
  }
@@ -33,13 +33,19 @@
33
33
  package nokogiri;
34
34
 
35
35
  import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
36
+ import static nokogiri.internals.NokogiriHelpers.stringOrBlank;
36
37
 
38
+ import java.io.IOException;
39
+ import java.io.PipedReader;
40
+ import java.io.PipedWriter;
41
+ import java.io.StringReader;
42
+ import java.nio.CharBuffer;
37
43
  import java.util.HashMap;
38
44
  import java.util.Map;
45
+ import java.util.Set;
39
46
  import java.util.regex.Matcher;
40
47
  import java.util.regex.Pattern;
41
48
 
42
- import javax.xml.transform.ErrorListener;
43
49
  import javax.xml.transform.Templates;
44
50
  import javax.xml.transform.Transformer;
45
51
  import javax.xml.transform.TransformerConfigurationException;
@@ -47,13 +53,17 @@ import javax.xml.transform.TransformerException;
47
53
  import javax.xml.transform.TransformerFactory;
48
54
  import javax.xml.transform.dom.DOMResult;
49
55
  import javax.xml.transform.dom.DOMSource;
56
+ import javax.xml.transform.stream.StreamResult;
57
+ import javax.xml.transform.stream.StreamSource;
50
58
 
51
59
  import nokogiri.internals.NokogiriXsltErrorListener;
52
60
 
53
61
  import org.jruby.Ruby;
54
62
  import org.jruby.RubyArray;
55
63
  import org.jruby.RubyClass;
64
+ import org.jruby.RubyHash;
56
65
  import org.jruby.RubyObject;
66
+ import org.jruby.RubyString;
57
67
  import org.jruby.anno.JRubyClass;
58
68
  import org.jruby.anno.JRubyMethod;
59
69
  import org.jruby.javasupport.util.RuntimeHelpers;
@@ -70,8 +80,10 @@ import org.w3c.dom.Document;
70
80
  @JRubyClass(name="Nokogiri::XSLT::Stylesheet")
71
81
  public class XsltStylesheet extends RubyObject {
72
82
  private static Map<String, Object> registry = new HashMap<String, Object>();
73
- private static TransformerFactory factory = null;
74
- private Templates sheet;
83
+ private TransformerFactory factory = null;
84
+ private Templates sheet = null;
85
+ private IRubyObject stylesheet = null;
86
+ private boolean htmlish = false;
75
87
 
76
88
  public static Map<String, Object> getRegistry() {
77
89
  return registry;
@@ -80,17 +92,45 @@ public class XsltStylesheet extends RubyObject {
80
92
  public XsltStylesheet(Ruby ruby, RubyClass rubyClass) {
81
93
  super(ruby, rubyClass);
82
94
  }
95
+
96
+ /**
97
+ * Create and return a copy of this object.
98
+ *
99
+ * @return a clone of this object
100
+ */
101
+ @Override
102
+ public Object clone() throws CloneNotSupportedException {
103
+ return super.clone();
104
+ }
83
105
 
84
106
  private void addParametersToTransformer(ThreadContext context, Transformer transf, IRubyObject parameters) {
85
- Ruby ruby = context.getRuntime();
86
- RubyArray params = parameters.convertToArray();
107
+ Ruby runtime = context.getRuntime();
108
+
109
+ if (parameters instanceof RubyHash) {
110
+ setHashParameters(transf, (RubyHash)parameters);
111
+ } else if (parameters instanceof RubyArray) {
112
+ setArrayParameters(transf, runtime, (RubyArray)parameters);
113
+ } else {
114
+ throw runtime.newTypeError("parameters should be given either Array or Hash");
115
+ }
116
+ }
117
+
118
+ private void setHashParameters(Transformer transformer, RubyHash hash) {
119
+ Set<String> keys = hash.keySet();
120
+ for (String key : keys) {
121
+ String value = (String)hash.get(key);
122
+ transformer.setParameter(key, unparseValue(value));
123
+ }
124
+ }
125
+
126
+ private void setArrayParameters(Transformer transformer, Ruby runtime, RubyArray params) {
87
127
  int limit = params.getLength();
88
128
  if(limit % 2 == 1) limit--;
89
129
 
90
130
  for(int i = 0; i < limit; i+=2) {
91
- String name = params.aref(ruby.newFixnum(i)).asJavaString();
92
- String value = params.aref(ruby.newFixnum(i+1)).asJavaString();
93
- transf.setParameter(name, unparseValue(value));
131
+ String name = params.aref(runtime.newFixnum(i)).asJavaString();
132
+ String value = params.aref(runtime.newFixnum(i+1)).asJavaString();
133
+ transformer.setParameter(name, unparseValue(value));
94
134
  }
95
135
  }
96
136
 
@@ -105,36 +145,51 @@ public class XsltStylesheet extends RubyObject {
105
145
  return orig;
106
146
  }
107
147
 
108
- @JRubyMethod(meta = true)
109
- public static IRubyObject parse_stylesheet_doc(ThreadContext context, IRubyObject cls, IRubyObject document) {
148
+ @JRubyMethod(meta = true, rest = true)
149
+ public static IRubyObject parse_stylesheet_doc(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
110
150
 
111
- Ruby ruby = context.getRuntime();
112
-
113
- if(!(document instanceof XmlDocument)) {
114
- throw ruby.newArgumentError("doc must be a Nokogiri::XML::Document instance");
115
- }
116
-
117
- XmlDocument xmlDoc = (XmlDocument) document;
151
+ Ruby runtime = context.getRuntime();
118
152
 
119
- RubyArray errors = (RubyArray) xmlDoc.getInstanceVariable("@errors");
153
+ ensureFirstArgIsDocument(runtime, args[0]);
120
154
 
121
- if(!errors.isEmpty()) {
122
- throw ruby.newRuntimeError(errors.first().asJavaString());
123
- }
155
+ XmlDocument xmlDoc = (XmlDocument) args[0];
156
+ ensureDocumentHasNoError(context, xmlDoc);
124
157
 
125
158
  Document doc = ((XmlDocument) xmlDoc.dup_implementation(context, true)).getDocument();
126
159
 
127
- XsltStylesheet xslt = new XsltStylesheet(ruby, (RubyClass) cls);
160
+ XsltStylesheet xslt =
161
+ (XsltStylesheet) NokogiriService.XSLT_STYLESHEET_ALLOCATOR.allocate(runtime, (RubyClass)klazz);
128
162
 
129
163
  try {
130
- if (factory == null) factory = TransformerFactory.newInstance();
131
- xslt.sheet = factory.newTemplates(new DOMSource(doc));
164
+ xslt.init(args[1], doc);
132
165
  } catch (TransformerConfigurationException ex) {
133
- ruby.newRuntimeError("could not parse xslt stylesheet");
166
+ runtime.newRuntimeError("could not parse xslt stylesheet");
134
167
  }
135
168
 
136
169
  return xslt;
137
170
  }
171
+
172
+ private void init(IRubyObject stylesheet, Document document) throws TransformerConfigurationException {
173
+ this.stylesheet = stylesheet; // either RubyString or RubyFile
174
+ if (factory == null) factory = TransformerFactory.newInstance();
175
+ sheet = factory.newTemplates(new DOMSource(document));
176
+ }
177
+
178
+ private static void ensureFirstArgIsDocument(Ruby runtime, IRubyObject arg) {
179
+ if (arg instanceof XmlDocument) {
180
+ return;
181
+ } else {
182
+ throw runtime.newArgumentError("doc must be a Nokogiri::XML::Document instance");
183
+ }
184
+ }
185
+
186
+ private static void ensureDocumentHasNoError(ThreadContext context, XmlDocument xmlDoc) {
187
+ Ruby runtime = context.getRuntime();
188
+ RubyArray errors_of_xmlDoc = (RubyArray) xmlDoc.getInstanceVariable("@errors");
189
+ if (!errors_of_xmlDoc.isEmpty()) {
190
+ throw runtime.newRuntimeError(errors_of_xmlDoc.first().asJavaString());
191
+ }
192
+ }
138
193
 
139
194
  @JRubyMethod
140
195
  public IRubyObject serialize(ThreadContext context, IRubyObject doc) {
@@ -147,21 +202,23 @@ public class XsltStylesheet extends RubyObject {
147
202
  public IRubyObject transform(ThreadContext context, IRubyObject[] args) {
148
203
  Ruby runtime = context.getRuntime();
149
204
 
150
- DOMSource docSource = new DOMSource(((XmlDocument) args[0]).getDocument());
151
- DOMResult result = new DOMResult();
205
+ argumentTypeCheck(runtime, args[0]);
152
206
 
153
207
  NokogiriXsltErrorListener elistener = new NokogiriXsltErrorListener();
208
+ DOMSource domSource = new DOMSource(((XmlDocument) args[0]).getDocument());
209
+ DOMResult result = null;
210
+ String stringResult = null;
154
211
  try{
155
- Transformer transf = this.sheet.newTransformer();
156
- transf.setErrorListener(elistener);
157
- if(args.length > 1) {
158
- addParametersToTransformer(context, transf, args[1]);
212
+ result = tryXsltTransformation(context, args, domSource, elistener); // DOMResult
213
+ if (result.getNode().getFirstChild() == null) {
214
+ stringResult = retryXsltTransformation(context, args, domSource, elistener); // StreamResult
159
215
  }
160
- transf.transform(docSource, result);
161
216
  } catch(TransformerConfigurationException ex) {
162
- // processes later
217
+ throw runtime.newRuntimeError(ex.getMessage());
163
218
  } catch(TransformerException ex) {
164
- // processes later
219
+ throw runtime.newRuntimeError(ex.getMessage());
220
+ } catch (IOException ex) {
221
+ throw runtime.newRuntimeError(ex.getMessage());
165
222
  }
166
223
 
167
224
  switch (elistener.getErrorType()) {
@@ -172,18 +229,122 @@ public class XsltStylesheet extends RubyObject {
172
229
  default:
173
230
  // no-op
174
231
  }
232
+
233
+ if (stringResult == null) {
234
+ return createDocumentFromDomResult(context, runtime, result);
235
+ } else {
236
+ return createDocumentFromString(context, runtime, stringResult);
237
+ }
238
+ }
239
+
240
+ private DOMResult tryXsltTransformation(ThreadContext context, IRubyObject[] args, DOMSource domSource, NokogiriXsltErrorListener elistener) throws TransformerException {
241
+ Transformer transf = sheet.newTransformer();
242
+ transf.reset();
243
+ transf.setErrorListener(elistener);
244
+ if (args.length > 1) {
245
+ addParametersToTransformer(context, transf, args[1]);
246
+ }
247
+
248
+ DOMResult result = new DOMResult();
249
+ transf.transform(domSource, result);
250
+ return result;
251
+ }
252
+
253
+ private String retryXsltTransformation(ThreadContext context,
254
+ IRubyObject[] args,
255
+ DOMSource domSource,
256
+ NokogiriXsltErrorListener elistener)
257
+ throws TransformerException, IOException {
258
+ Templates templates = getTemplatesFromStreamSource();
259
+ Transformer transf = templates.newTransformer();
260
+ transf.setErrorListener(elistener);
261
+ if (args.length > 1) {
262
+ addParametersToTransformer(context, transf, args[1]);
263
+ }
264
+ PipedWriter pwriter = new PipedWriter();
265
+ PipedReader preader = new PipedReader();
266
+ pwriter.connect(preader);
267
+ StreamResult result = new StreamResult(pwriter);
268
+ transf.transform(domSource, result);
269
+ char[] cbuf = new char[1024];
270
+ int len = preader.read(cbuf, 0, 1024);
271
+ StringBuilder builder = new StringBuilder();
272
+ builder.append(CharBuffer.wrap(cbuf, 0, len));
273
+ htmlish = isHtml(builder.toString()); // judge from the first chunk
175
274
 
176
- if ("html".equals(result.getNode().getFirstChild().getNodeName())) {
275
+ while (len == 1024) {
276
+ len = preader.read(cbuf, 0, 1024);
277
+ if (len > 0) {
278
+ builder.append(CharBuffer.wrap(cbuf, 0, len));
279
+ }
280
+ }
281
+
282
+ preader.close();
283
+ pwriter.close();
284
+
285
+ return builder.toString();
286
+ }
287
+
288
+ private IRubyObject createDocumentFromDomResult(ThreadContext context, Ruby runtime, DOMResult domResult) {
289
+ if ("html".equals(domResult.getNode().getFirstChild().getNodeName())) {
177
290
  HtmlDocument htmlDocument = (HtmlDocument) getNokogiriClass(runtime, "Nokogiri::HTML::Document").allocate();
178
- htmlDocument.setNode(context, (Document) result.getNode());
291
+ htmlDocument.setDocumentNode(context, (Document) domResult.getNode());
179
292
  return htmlDocument;
180
293
  } else {
181
294
  XmlDocument xmlDocument = (XmlDocument) NokogiriService.XML_DOCUMENT_ALLOCATOR.allocate(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"));
182
- xmlDocument.setNode(context, (Document) result.getNode());
295
+ xmlDocument.setDocumentNode(context, (Document) domResult.getNode());
296
+ return xmlDocument;
297
+ }
298
+ }
299
+
300
+ private Templates getTemplatesFromStreamSource() throws TransformerConfigurationException {
301
+ if (stylesheet instanceof RubyString) {
302
+ StringReader reader = new StringReader((String)stylesheet.toJava(String.class));
303
+ StreamSource xsltStreamSource = new StreamSource(reader);
304
+ return factory.newTemplates(xsltStreamSource);
305
+ }
306
+ return null;
307
+ }
308
+
309
+ private static Pattern html_tag =
310
+ Pattern.compile("<(%s)*html", Pattern.CASE_INSENSITIVE);
311
+
312
+ private boolean isHtml(String chunk) {
313
+ Matcher m = XsltStylesheet.html_tag.matcher(chunk);
314
+ if (m.find()) return true;
315
+ else return false;
316
+ }
317
+
318
+ private IRubyObject createDocumentFromString(ThreadContext context, Ruby runtime, String stringResult) {
319
+ IRubyObject[] args = new IRubyObject[4];
320
+ args[0] = stringOrBlank(runtime, stringResult);
321
+ args[1] = runtime.getNil(); // url
322
+ args[2] = runtime.getNil(); // encoding
323
+ RubyClass parse_options = (RubyClass)runtime.getClassFromPath("Nokogiri::XML::ParseOptions");
324
+ if (htmlish) {
325
+ args[3] = parse_options.getConstant("DEFAULT_HTML");
326
+ RubyClass htmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::HTML::Document");
327
+ return RuntimeHelpers.invoke(context, htmlDocumentClass, "parse", args);
328
+ } else {
329
+ args[3] = parse_options.getConstant("DEFAULT_XML");
330
+ RubyClass xmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::XML::Document");
331
+ XmlDocument xmlDocument = (XmlDocument) RuntimeHelpers.invoke(context, xmlDocumentClass, "parse", args);
332
+ if (((Document)xmlDocument.getNode()).getDocumentElement() == null) {
333
+ RubyArray errors = (RubyArray) xmlDocument.getInstanceVariable("@errors");
334
+ RuntimeHelpers.invoke(context, errors, "<<", args[0]);
335
+ }
183
336
  return xmlDocument;
184
337
  }
185
338
  }
186
339
 
340
+ private void argumentTypeCheck(Ruby runtime, IRubyObject arg) {
341
+ if (arg instanceof XmlDocument) {
342
+ return;
343
+ } else {
344
+ throw runtime.newArgumentError("argument must be a Nokogiri::XML::Document");
345
+ }
346
+ }
347
+
187
348
  @JRubyMethod(name = {"registr", "register"}, meta = true)
188
349
  public static IRubyObject register(ThreadContext context, IRubyObject cls, IRubyObject uri, IRubyObject receiver) {
189
350
  throw context.getRuntime().newNotImplementedError("Nokogiri::XSLT.register method is not implemented");
@@ -34,6 +34,7 @@ package nokogiri.internals;
34
34
 
35
35
  import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
36
36
  import static nokogiri.internals.NokogiriHelpers.isNamespace;
37
+ import static nokogiri.internals.NokogiriHelpers.stringOrNil;
37
38
  import nokogiri.HtmlDocument;
38
39
  import nokogiri.NokogiriService;
39
40
  import nokogiri.XmlDocument;
@@ -52,6 +53,8 @@ import org.jruby.RubyClass;
52
53
  import org.jruby.runtime.ThreadContext;
53
54
  import org.jruby.runtime.builtin.IRubyObject;
54
55
  import org.w3c.dom.Document;
56
+ import org.w3c.dom.NamedNodeMap;
57
+ import org.w3c.dom.NodeList;
55
58
 
56
59
  /**
57
60
  * Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml.
@@ -84,7 +87,8 @@ public class HtmlDomParserContext extends XmlDomParserContext {
84
87
  XMLParserConfiguration config = new HTMLConfiguration();
85
88
  XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
86
89
  XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
87
- XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
90
+ //XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
91
+ XMLDocumentFilter[] filters = { elementValidityCheckFilter};
88
92
 
89
93
  config.setErrorHandler(this.errorHandler);
90
94
  parser = new DOMParser(config);
@@ -119,10 +123,44 @@ public class HtmlDomParserContext extends XmlDomParserContext {
119
123
  RubyClass klazz,
120
124
  Document document) {
121
125
  HtmlDocument htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), klazz);
122
- htmlDocument.setNode(context, document);
126
+ htmlDocument.setDocumentNode(context, document);
127
+ if (ruby_encoding.isNil()) {
128
+ // ruby_encoding might have been detected by HtmlDocument::EncodingReader
129
+ if (detected_encoding != null && !detected_encoding.isNil()) {
130
+ ruby_encoding = detected_encoding;
131
+ } else {
132
+ // no encoding given & no encoding detected, then try to get it
133
+ String charset = tryGetCharsetFromHtml5MetaTag(document);
134
+ ruby_encoding = stringOrNil(context.getRuntime(), charset);
135
+ }
136
+ }
123
137
  htmlDocument.setEncoding(ruby_encoding);
124
138
  return htmlDocument;
125
139
  }
140
+
141
+ // NekoHtml doesn't understand the HTML5 meta tag format, so it fails to detect the charset
142
+ // from an HTML5-style meta tag. Luckily, the meta tag and its charset exist in the DOM tree,
143
+ // so this method attempts to find the charset there.
144
+ private String tryGetCharsetFromHtml5MetaTag(Document document) {
145
+ if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) return null;
146
+ NodeList list = document.getDocumentElement().getChildNodes();
147
+ for (int i = 0; i < list.getLength(); i++) {
148
+ if ("head".equalsIgnoreCase(list.item(i).getNodeName())) {
149
+ NodeList headers = list.item(i).getChildNodes();
150
+ for (int j = 0; j < headers.getLength(); j++) {
151
+ if ("meta".equalsIgnoreCase(headers.item(j).getNodeName())) {
152
+ NamedNodeMap nodeMap = headers.item(j).getAttributes();
153
+ for (int k = 0; k < nodeMap.getLength(); k++) {
154
+ if ("charset".equalsIgnoreCase(nodeMap.item(k).getNodeName())) {
155
+ return nodeMap.item(k).getNodeValue();
156
+ }
157
+ }
158
+ }
159
+ }
160
+ }
161
+ }
162
+ return null;
163
+ }
126
164
 
127
165
  /**
128
166
  * Filter to strip out attributes that pertain to XML namespaces.