nokogiri 1.11.0.rc4-java → 1.11.5-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (144) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +12 -12
  4. data/LICENSE.md +1 -1
  5. data/README.md +168 -91
  6. data/dependencies.yml +12 -12
  7. data/ext/java/nokogiri/EncodingHandler.java +76 -89
  8. data/ext/java/nokogiri/HtmlDocument.java +135 -144
  9. data/ext/java/nokogiri/HtmlElementDescription.java +102 -117
  10. data/ext/java/nokogiri/HtmlEntityLookup.java +33 -60
  11. data/ext/java/nokogiri/HtmlSaxParserContext.java +218 -222
  12. data/ext/java/nokogiri/HtmlSaxPushParser.java +162 -169
  13. data/ext/java/nokogiri/NokogiriService.java +595 -556
  14. data/ext/java/nokogiri/XmlAttr.java +118 -126
  15. data/ext/java/nokogiri/XmlAttributeDecl.java +95 -106
  16. data/ext/java/nokogiri/XmlCdata.java +35 -58
  17. data/ext/java/nokogiri/XmlComment.java +46 -67
  18. data/ext/java/nokogiri/XmlDocument.java +645 -572
  19. data/ext/java/nokogiri/XmlDocumentFragment.java +125 -137
  20. data/ext/java/nokogiri/XmlDtd.java +448 -414
  21. data/ext/java/nokogiri/XmlElement.java +23 -48
  22. data/ext/java/nokogiri/XmlElementContent.java +343 -316
  23. data/ext/java/nokogiri/XmlElementDecl.java +124 -125
  24. data/ext/java/nokogiri/XmlEntityDecl.java +119 -127
  25. data/ext/java/nokogiri/XmlEntityReference.java +49 -72
  26. data/ext/java/nokogiri/XmlNamespace.java +175 -175
  27. data/ext/java/nokogiri/XmlNode.java +1843 -1620
  28. data/ext/java/nokogiri/XmlNodeSet.java +361 -331
  29. data/ext/java/nokogiri/XmlProcessingInstruction.java +47 -69
  30. data/ext/java/nokogiri/XmlReader.java +513 -450
  31. data/ext/java/nokogiri/XmlRelaxng.java +85 -104
  32. data/ext/java/nokogiri/XmlSaxParserContext.java +328 -315
  33. data/ext/java/nokogiri/XmlSaxPushParser.java +227 -220
  34. data/ext/java/nokogiri/XmlSchema.java +328 -295
  35. data/ext/java/nokogiri/XmlSyntaxError.java +113 -115
  36. data/ext/java/nokogiri/XmlText.java +55 -76
  37. data/ext/java/nokogiri/XmlXpathContext.java +240 -238
  38. data/ext/java/nokogiri/XsltStylesheet.java +280 -269
  39. data/ext/java/nokogiri/internals/ClosedStreamException.java +5 -2
  40. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +201 -202
  41. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +17 -10
  42. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +43 -16
  43. data/ext/java/nokogiri/internals/NokogiriDomParser.java +63 -80
  44. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +107 -88
  45. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +27 -52
  46. data/ext/java/nokogiri/internals/NokogiriHandler.java +316 -286
  47. data/ext/java/nokogiri/internals/NokogiriHelpers.java +736 -652
  48. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +184 -173
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +81 -98
  50. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +64 -79
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +84 -99
  52. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +48 -65
  53. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +116 -131
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +34 -56
  55. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +23 -46
  56. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +55 -72
  57. data/ext/java/nokogiri/internals/ParserContext.java +206 -211
  58. data/ext/java/nokogiri/internals/ReaderNode.java +478 -403
  59. data/ext/java/nokogiri/internals/SaveContextVisitor.java +822 -739
  60. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +31 -54
  61. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +129 -123
  62. data/ext/java/nokogiri/internals/XmlDeclHandler.java +3 -34
  63. data/ext/java/nokogiri/internals/XmlDomParserContext.java +206 -207
  64. data/ext/java/nokogiri/internals/XmlSaxParser.java +22 -47
  65. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +71 -68
  66. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +137 -118
  67. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +27 -21
  68. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +74 -61
  69. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +230 -205
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +572 -547
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +17 -10
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +17 -10
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +323 -302
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +232 -219
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +22 -15
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +23 -16
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +23 -16
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +22 -15
  79. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +575 -545
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +141 -120
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +39 -38
  82. data/ext/java/nokogiri/internals/c14n/Constants.java +13 -10
  83. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +279 -247
  84. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +66 -53
  85. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +44 -37
  86. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +135 -120
  87. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +59 -48
  88. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +384 -334
  89. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +25 -24
  90. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +151 -140
  91. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +456 -423
  92. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1466 -1500
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +626 -574
  94. data/ext/nokogiri/depend +34 -474
  95. data/ext/nokogiri/extconf.rb +270 -183
  96. data/ext/nokogiri/html_document.c +10 -15
  97. data/ext/nokogiri/html_element_description.c +84 -71
  98. data/ext/nokogiri/html_entity_lookup.c +21 -16
  99. data/ext/nokogiri/html_sax_parser_context.c +67 -64
  100. data/ext/nokogiri/html_sax_push_parser.c +42 -34
  101. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  102. data/ext/nokogiri/nokogiri.c +190 -60
  103. data/ext/nokogiri/test_global_handlers.c +40 -0
  104. data/ext/nokogiri/xml_attr.c +15 -15
  105. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  106. data/ext/nokogiri/xml_cdata.c +13 -18
  107. data/ext/nokogiri/xml_comment.c +19 -26
  108. data/ext/nokogiri/xml_document.c +246 -188
  109. data/ext/nokogiri/xml_document_fragment.c +13 -15
  110. data/ext/nokogiri/xml_dtd.c +54 -48
  111. data/ext/nokogiri/xml_element_content.c +30 -27
  112. data/ext/nokogiri/xml_element_decl.c +22 -22
  113. data/ext/nokogiri/xml_encoding_handler.c +17 -11
  114. data/ext/nokogiri/xml_entity_decl.c +32 -30
  115. data/ext/nokogiri/xml_entity_reference.c +16 -18
  116. data/ext/nokogiri/xml_namespace.c +56 -49
  117. data/ext/nokogiri/xml_node.c +371 -320
  118. data/ext/nokogiri/xml_node_set.c +168 -156
  119. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  120. data/ext/nokogiri/xml_reader.c +191 -157
  121. data/ext/nokogiri/xml_relax_ng.c +29 -23
  122. data/ext/nokogiri/xml_sax_parser.c +117 -112
  123. data/ext/nokogiri/xml_sax_parser_context.c +101 -84
  124. data/ext/nokogiri/xml_sax_push_parser.c +36 -27
  125. data/ext/nokogiri/xml_schema.c +48 -42
  126. data/ext/nokogiri/xml_syntax_error.c +42 -21
  127. data/ext/nokogiri/xml_text.c +13 -17
  128. data/ext/nokogiri/xml_xpath_context.c +134 -127
  129. data/ext/nokogiri/xslt_stylesheet.c +157 -157
  130. data/lib/nokogiri.rb +2 -6
  131. data/lib/nokogiri/css/parser.rb +1 -1
  132. data/lib/nokogiri/extension.rb +26 -0
  133. data/lib/nokogiri/html/document_fragment.rb +15 -15
  134. data/lib/nokogiri/nokogiri.jar +0 -0
  135. data/lib/nokogiri/version/constant.rb +1 -1
  136. data/lib/nokogiri/version/info.rb +32 -8
  137. data/lib/nokogiri/xml/document.rb +74 -28
  138. data/lib/nokogiri/xml/node.rb +39 -42
  139. data/lib/nokogiri/xml/reader.rb +2 -9
  140. data/lib/nokogiri/xml/xpath.rb +1 -3
  141. data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -1
  142. metadata +62 -127
  143. data/ext/nokogiri/xml_io.c +0 -63
  144. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
@@ -1,9 +1,12 @@
1
1
  package nokogiri.internals;
2
2
 
3
3
  @SuppressWarnings("serial")
4
- public class ClosedStreamException extends Exception {
4
+ public class ClosedStreamException extends Exception
5
+ {
5
6
 
6
- public ClosedStreamException(String message) {
7
+ public
8
+ ClosedStreamException(String message)
9
+ {
7
10
  super(message);
8
11
  }
9
12
 
@@ -1,35 +1,3 @@
1
- /**
2
- * (The MIT License)
3
- *
4
- * Copyright (c) 2008 - 2012:
5
- *
6
- * * {Aaron Patterson}[http://tenderlovemaking.com]
7
- * * {Mike Dalessio}[http://mike.daless.io]
8
- * * {Charles Nutter}[http://blog.headius.com]
9
- * * {Sergio Arbeo}[http://www.serabe.com]
10
- * * {Patrick Mahoney}[http://polycrystal.org]
11
- * * {Yoko Harada}[http://yokolet.blogspot.com]
12
- *
13
- * Permission is hereby granted, free of charge, to any person obtaining
14
- * a copy of this software and associated documentation files (the
15
- * 'Software'), to deal in the Software without restriction, including
16
- * without limitation the rights to use, copy, modify, merge, publish,
17
- * distribute, sublicense, and/or sell copies of the Software, and to
18
- * permit persons to whom the Software is furnished to do so, subject to
19
- * the following conditions:
20
- *
21
- * The above copyright notice and this permission notice shall be
22
- * included in all copies or substantial portions of the Software.
23
- *
24
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
25
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31
- */
32
-
33
1
  package nokogiri.internals;
34
2
 
35
3
  import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
@@ -52,6 +20,7 @@ import org.cyberneko.html.filters.DefaultFilter;
52
20
  import org.jruby.Ruby;
53
21
  import org.jruby.RubyClass;
54
22
  import org.jruby.runtime.ThreadContext;
23
+ import org.jruby.runtime.Helpers;
55
24
  import org.jruby.runtime.builtin.IRubyObject;
56
25
  import org.w3c.dom.Document;
57
26
  import org.w3c.dom.NamedNodeMap;
@@ -60,193 +29,223 @@ import org.w3c.dom.NodeList;
60
29
 
61
30
  /**
62
31
  * Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml.
63
- *
32
+ *
64
33
  * @author sergio
65
34
  * @author Patrick Mahoney <pat@polycrystal.org>
66
35
  * @author Yoko Harada <yokolet@gmail.com>
67
36
  */
68
- public class HtmlDomParserContext extends XmlDomParserContext {
69
-
70
- public HtmlDomParserContext(Ruby runtime, IRubyObject options) {
71
- this(runtime, runtime.getNil(), options);
37
+ public class HtmlDomParserContext extends XmlDomParserContext
38
+ {
39
+
40
+ public
41
+ HtmlDomParserContext(Ruby runtime, IRubyObject options)
42
+ {
43
+ this(runtime, runtime.getNil(), options);
44
+ }
45
+
46
+ public
47
+ HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options)
48
+ {
49
+ super(runtime, encoding, options);
50
+ java_encoding = NokogiriHelpers.getValidEncoding(encoding);
51
+ }
52
+
53
+ @Override
54
+ protected void
55
+ initParser(Ruby runtime)
56
+ {
57
+ XMLParserConfiguration config = new HTMLConfiguration();
58
+ //XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
59
+ XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
60
+ //XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
61
+ XMLDocumentFilter[] filters = { elementValidityCheckFilter};
62
+
63
+ config.setErrorHandler(this.errorHandler);
64
+
65
+ parser = new NokogiriDomParser(config);
66
+
67
+ // see http://nekohtml.sourceforge.net/settings.html for details
68
+ setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
69
+ setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
70
+ setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
71
+ setProperty("http://cyberneko.org/html/properties/filters", filters);
72
+ setFeature("http://cyberneko.org/html/features/report-errors", true);
73
+ setFeature("http://xml.org/sax/features/namespaces", false);
74
+ }
75
+
76
+ @Override
77
+ public void
78
+ setEncoding(String encoding)
79
+ {
80
+ super.setEncoding(encoding);
81
+ }
82
+
83
+ /**
84
+ * Enable NekoHTML feature for balancing tags in a document fragment.
85
+ *
86
+ * This method is used in XmlNode#in_context method.
87
+ */
88
+ public void
89
+ enableDocumentFragment()
90
+ {
91
+ setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
92
+ }
93
+
94
+ @Override
95
+ public XmlDocument
96
+ parse(ThreadContext context, RubyClass klass, IRubyObject url)
97
+ {
98
+ XmlDocument xmlDoc = super.parse(context, klass, url);
99
+
100
+ // let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT)
101
+ // https://github.com/sparklemotion/nokogiri/issues/2130
102
+ if (!options.recover && errorHandler.getErrors().size() > 0) {
103
+ XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
104
+ String exceptionMsg = String.format("%s: '%s'",
105
+ "Parser without recover option encountered error or warning",
106
+ errorHandler.getErrors().get(0));
107
+ xmlSyntaxError.setException(new Exception(exceptionMsg));
108
+ throw xmlSyntaxError.toThrowable();
72
109
  }
73
110
 
74
- public HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) {
75
- super(runtime, encoding, options);
76
- java_encoding = NokogiriHelpers.getValidEncoding(encoding);
111
+ return xmlDoc;
112
+ }
113
+
114
+ @Override
115
+ protected XmlDocument
116
+ wrapDocument(ThreadContext context, RubyClass klass, Document document)
117
+ {
118
+ HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document);
119
+ htmlDocument.setDocumentNode(context.runtime, document);
120
+ Helpers.invoke(context, htmlDocument, "initialize");
121
+
122
+ if (ruby_encoding.isNil()) {
123
+ // ruby_encoding might have detected by HtmlDocument::EncodingReader
124
+ if (detected_encoding != null && !detected_encoding.isNil()) {
125
+ ruby_encoding = detected_encoding;
126
+ } else {
127
+ // no encoding given & no encoding detected, then try to get it
128
+ String charset = tryGetCharsetFromHtml5MetaTag(document);
129
+ ruby_encoding = stringOrNil(context.runtime, charset);
130
+ }
77
131
  }
78
-
79
- @Override
80
- protected void initParser(Ruby runtime) {
81
- XMLParserConfiguration config = new HTMLConfiguration();
82
- //XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
83
- XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
84
- //XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
85
- XMLDocumentFilter[] filters = { elementValidityCheckFilter};
86
-
87
- config.setErrorHandler(this.errorHandler);
88
-
89
- parser = new NokogiriDomParser(config);
90
-
91
- // see http://nekohtml.sourceforge.net/settings.html for details
92
- setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
93
- setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
94
- setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
95
- setProperty("http://cyberneko.org/html/properties/filters", filters);
96
- setFeature("http://cyberneko.org/html/features/report-errors", true);
97
- setFeature("http://xml.org/sax/features/namespaces", false);
132
+ htmlDocument.setEncoding(ruby_encoding);
133
+ htmlDocument.setParsedEncoding(java_encoding);
134
+ return htmlDocument;
135
+ }
136
+
137
+ // NekoHtml doesn't understand HTML5 meta tag format. This fails to detect charset
138
+ // from an HTML5 style meta tag. Luckily, the meta tag and charset exists in DOM tree
139
+ // so, this method attempts to find the charset.
140
+ private static String
141
+ tryGetCharsetFromHtml5MetaTag(Document document)
142
+ {
143
+ if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) { return null; }
144
+ NodeList list = document.getDocumentElement().getChildNodes();
145
+ Node item;
146
+ for (int i = 0; i < list.getLength(); i++) {
147
+ if ("head".equalsIgnoreCase((item = list.item(i)).getNodeName())) {
148
+ NodeList headers = item.getChildNodes();
149
+ for (int j = 0; j < headers.getLength(); j++) {
150
+ if ("meta".equalsIgnoreCase((item = headers.item(j)).getNodeName())) {
151
+ NamedNodeMap nodeMap = item.getAttributes();
152
+ for (int k = 0; k < nodeMap.getLength(); k++) {
153
+ if ("charset".equalsIgnoreCase((item = nodeMap.item(k)).getNodeName())) {
154
+ return item.getNodeValue();
155
+ }
156
+ }
157
+ }
158
+ }
159
+ }
98
160
  }
99
-
161
+ return null;
162
+ }
163
+
164
+ /**
165
+ * Filter to strip out attributes that pertain to XML namespaces.
166
+ */
167
+ public static class RemoveNSAttrsFilter extends DefaultFilter
168
+ {
100
169
  @Override
101
- public void setEncoding(String encoding) {
102
- super.setEncoding(encoding);
103
- }
170
+ public void
171
+ startElement(QName element, XMLAttributes attrs,
172
+ Augmentations augs) throws XNIException
173
+ {
174
+ int i;
175
+ for (i = 0; i < attrs.getLength(); ++i) {
176
+ if (isNamespace(attrs.getQName(i))) {
177
+ attrs.removeAttributeAt(i);
178
+ --i;
179
+ }
180
+ }
104
181
 
105
- /**
106
- * Enable NekoHTML feature for balancing tags in a document fragment.
107
- *
108
- * This method is used in XmlNode#in_context method.
109
- */
110
- public void enableDocumentFragment() {
111
- setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
182
+ element.uri = null;
183
+ super.startElement(element, attrs, augs);
112
184
  }
185
+ }
113
186
 
114
- @Override
115
- public XmlDocument parse(ThreadContext context, RubyClass klass, IRubyObject url) {
116
- XmlDocument xmlDoc = super.parse(context, klass, url);
117
-
118
- // let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT)
119
- // https://github.com/sparklemotion/nokogiri/issues/2130
120
- if (!options.recover && errorHandler.getErrors().size() > 0) {
121
- XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
122
- String exceptionMsg = String.format("%s: '%s'",
123
- "Parser without recover option encountered error or warning",
124
- errorHandler.getErrors().get(0));
125
- xmlSyntaxError.setException(new Exception(exceptionMsg));
126
- throw xmlSyntaxError.toThrowable();
127
- }
187
+ public static class ElementValidityCheckFilter extends DefaultFilter
188
+ {
189
+ private NokogiriErrorHandler errorHandler;
128
190
 
129
- return xmlDoc;
191
+ private
192
+ ElementValidityCheckFilter(NokogiriErrorHandler errorHandler)
193
+ {
194
+ this.errorHandler = errorHandler;
130
195
  }
131
196
 
132
- @Override
133
- protected XmlDocument wrapDocument(ThreadContext context, RubyClass klass, Document document) {
134
- HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document);
135
- htmlDocument.setDocumentNode(context.runtime, document);
136
- if (ruby_encoding.isNil()) {
137
- // ruby_encoding might have detected by HtmlDocument::EncodingReader
138
- if (detected_encoding != null && !detected_encoding.isNil()) {
139
- ruby_encoding = detected_encoding;
140
- } else {
141
- // no encoding given & no encoding detected, then try to get it
142
- String charset = tryGetCharsetFromHtml5MetaTag(document);
143
- ruby_encoding = stringOrNil(context.runtime, charset);
144
- }
145
- }
146
- htmlDocument.setEncoding(ruby_encoding);
147
- htmlDocument.setParsedEncoding(java_encoding);
148
- return htmlDocument;
149
- }
150
-
151
- // NekoHtml doesn't understand HTML5 meta tag format. This fails to detect charset
152
- // from an HTML5 style meta tag. Luckily, the meta tag and charset exists in DOM tree
153
- // so, this method attempts to find the charset.
154
- private static String tryGetCharsetFromHtml5MetaTag(Document document) {
155
- if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) return null;
156
- NodeList list = document.getDocumentElement().getChildNodes(); Node item;
157
- for (int i = 0; i < list.getLength(); i++) {
158
- if ("head".equalsIgnoreCase((item = list.item(i)).getNodeName())) {
159
- NodeList headers = item.getChildNodes();
160
- for (int j = 0; j < headers.getLength(); j++) {
161
- if ("meta".equalsIgnoreCase((item = headers.item(j)).getNodeName())) {
162
- NamedNodeMap nodeMap = item.getAttributes();
163
- for (int k = 0; k < nodeMap.getLength(); k++) {
164
- if ("charset".equalsIgnoreCase((item = nodeMap.item(k)).getNodeName())) {
165
- return item.getNodeValue();
166
- }
167
- }
168
- }
169
- }
170
- }
197
+ // element names from xhtml1-strict.dtd
198
+ private static String[][] element_names = {
199
+ {"a", "abbr", "acronym", "address", "area"},
200
+ {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
201
+ {"caption", "cite", "code", "col", "colgroup"},
202
+ {"dd", "del", "dfn", "div", "dl", "dt"},
203
+ {"em"},
204
+ {"fieldset", "font", "form", "frame", "frameset"},
205
+ {}, // g
206
+ {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
207
+ {"i", "iframe", "img", "input", "ins"},
208
+ {}, // j
209
+ {"kbd"},
210
+ {"label", "legend", "li", "link"},
211
+ {"map", "meta"},
212
+ {"noframes", "noscript"},
213
+ {"object", "ol", "optgroup", "option"},
214
+ {"p", "param", "pre"},
215
+ {"q"},
216
+ {}, // r
217
+ {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
218
+ {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
219
+ {"u", "ul"},
220
+ {"var"},
221
+ {}, // w
222
+ {}, // x
223
+ {}, // y
224
+ {} // z
225
+ };
226
+
227
+ private static boolean
228
+ isValid(final String name)
229
+ {
230
+ int index = name.charAt(0) - 97;
231
+ if (index >= element_names.length) { return false; }
232
+ String[] elementNames = element_names[index];
233
+ for (int i = 0; i < elementNames.length; i++) {
234
+ if (name.equals(elementNames[i])) {
235
+ return true;
171
236
  }
172
- return null;
237
+ }
238
+ return false;
173
239
  }
174
240
 
175
- /**
176
- * Filter to strip out attributes that pertain to XML namespaces.
177
- */
178
- public static class RemoveNSAttrsFilter extends DefaultFilter {
179
- @Override
180
- public void startElement(QName element, XMLAttributes attrs,
181
- Augmentations augs) throws XNIException {
182
- int i;
183
- for (i = 0; i < attrs.getLength(); ++i) {
184
- if (isNamespace(attrs.getQName(i))) {
185
- attrs.removeAttributeAt(i);
186
- --i;
187
- }
188
- }
189
-
190
- element.uri = null;
191
- super.startElement(element, attrs, augs);
192
- }
193
- }
194
-
195
- public static class ElementValidityCheckFilter extends DefaultFilter {
196
- private NokogiriErrorHandler errorHandler;
197
-
198
- private ElementValidityCheckFilter(NokogiriErrorHandler errorHandler) {
199
- this.errorHandler = errorHandler;
200
- }
201
-
202
- // element names from xhtml1-strict.dtd
203
- private static String[][] element_names = {
204
- {"a", "abbr", "acronym", "address", "area"},
205
- {"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
206
- {"caption", "cite", "code", "col", "colgroup"},
207
- {"dd", "del", "dfn", "div", "dl", "dt"},
208
- {"em"},
209
- {"fieldset", "font", "form", "frame", "frameset"},
210
- {}, // g
211
- {"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
212
- {"i", "iframe", "img", "input", "ins"},
213
- {}, // j
214
- {"kbd"},
215
- {"label", "legend", "li", "link"},
216
- {"map", "meta"},
217
- {"noframes", "noscript"},
218
- {"object", "ol", "optgroup", "option"},
219
- {"p", "param", "pre"},
220
- {"q"},
221
- {}, // r
222
- {"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
223
- {"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
224
- {"u", "ul"},
225
- {"var"},
226
- {}, // w
227
- {}, // x
228
- {}, // y
229
- {} // z
230
- };
231
-
232
- private static boolean isValid(final String name) {
233
- int index = name.charAt(0) - 97;
234
- if (index >= element_names.length) return false;
235
- String[] elementNames = element_names[index];
236
- for (int i=0; i<elementNames.length; i++) {
237
- if (name.equals(elementNames[i])) {
238
- return true;
239
- }
240
- }
241
- return false;
242
- }
243
-
244
- @Override
245
- public void startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException {
246
- if (!isValid(name.rawname)) {
247
- errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
248
- }
249
- super.startElement(name, attrs, augs);
250
- }
241
+ @Override
242
+ public void
243
+ startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException
244
+ {
245
+ if (!isValid(name.rawname)) {
246
+ errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
247
+ }
248
+ super.startElement(name, attrs, augs);
251
249
  }
250
+ }
252
251
  }