nokogiri 1.11.0.rc1-java → 1.11.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/LICENSE.md +1 -1
- data/README.md +171 -94
- data/ext/java/nokogiri/EncodingHandler.java +78 -59
- data/ext/java/nokogiri/HtmlDocument.java +137 -114
- data/ext/java/nokogiri/HtmlElementDescription.java +104 -87
- data/ext/java/nokogiri/HtmlEntityLookup.java +31 -26
- data/ext/java/nokogiri/HtmlSaxParserContext.java +220 -192
- data/ext/java/nokogiri/HtmlSaxPushParser.java +164 -139
- data/ext/java/nokogiri/NokogiriService.java +597 -526
- data/ext/java/nokogiri/XmlAttr.java +120 -96
- data/ext/java/nokogiri/XmlAttributeDecl.java +97 -76
- data/ext/java/nokogiri/XmlCdata.java +35 -26
- data/ext/java/nokogiri/XmlComment.java +48 -37
- data/ext/java/nokogiri/XmlDocument.java +642 -540
- data/ext/java/nokogiri/XmlDocumentFragment.java +127 -107
- data/ext/java/nokogiri/XmlDtd.java +450 -384
- data/ext/java/nokogiri/XmlElement.java +25 -18
- data/ext/java/nokogiri/XmlElementContent.java +345 -286
- data/ext/java/nokogiri/XmlElementDecl.java +126 -95
- data/ext/java/nokogiri/XmlEntityDecl.java +121 -97
- data/ext/java/nokogiri/XmlEntityReference.java +51 -42
- data/ext/java/nokogiri/XmlNamespace.java +177 -145
- data/ext/java/nokogiri/XmlNode.java +1843 -1590
- data/ext/java/nokogiri/XmlNodeSet.java +361 -299
- data/ext/java/nokogiri/XmlProcessingInstruction.java +49 -39
- data/ext/java/nokogiri/XmlReader.java +513 -418
- data/ext/java/nokogiri/XmlRelaxng.java +92 -72
- data/ext/java/nokogiri/XmlSaxParserContext.java +330 -280
- data/ext/java/nokogiri/XmlSaxPushParser.java +229 -190
- data/ext/java/nokogiri/XmlSchema.java +335 -210
- data/ext/java/nokogiri/XmlSyntaxError.java +113 -87
- data/ext/java/nokogiri/XmlText.java +57 -46
- data/ext/java/nokogiri/XmlXpathContext.java +242 -178
- data/ext/java/nokogiri/XsltStylesheet.java +282 -239
- data/ext/java/nokogiri/internals/ClosedStreamException.java +5 -2
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +203 -160
- data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +17 -10
- data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +43 -16
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +65 -50
- data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +107 -88
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +25 -18
- data/ext/java/nokogiri/internals/NokogiriHandler.java +316 -254
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +738 -622
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +186 -143
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +81 -59
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +66 -49
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +86 -69
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +44 -29
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +121 -48
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +34 -22
- data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +25 -17
- data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +57 -42
- data/ext/java/nokogiri/internals/ParserContext.java +206 -179
- data/ext/java/nokogiri/internals/ReaderNode.java +478 -371
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +822 -707
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +28 -19
- data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +129 -123
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +5 -4
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +208 -177
- data/ext/java/nokogiri/internals/XmlSaxParser.java +24 -17
- data/ext/java/nokogiri/internals/c14n/AttrCompare.java +71 -68
- data/ext/java/nokogiri/internals/c14n/C14nHelper.java +137 -118
- data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +27 -21
- data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +74 -61
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +230 -205
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +572 -547
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +323 -302
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +232 -219
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +575 -545
- data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +141 -120
- data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +39 -38
- data/ext/java/nokogiri/internals/c14n/Constants.java +13 -10
- data/ext/java/nokogiri/internals/c14n/ElementProxy.java +279 -247
- data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +66 -53
- data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +44 -37
- data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +135 -120
- data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +59 -48
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +384 -334
- data/ext/java/nokogiri/internals/c14n/NodeFilter.java +25 -24
- data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +151 -140
- data/ext/java/nokogiri/internals/c14n/XMLUtils.java +456 -423
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1466 -1500
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +626 -570
- data/ext/nokogiri/depend +37 -358
- data/ext/nokogiri/extconf.rb +585 -374
- data/ext/nokogiri/html_document.c +78 -82
- data/ext/nokogiri/html_element_description.c +84 -71
- data/ext/nokogiri/html_entity_lookup.c +21 -16
- data/ext/nokogiri/html_sax_parser_context.c +69 -66
- data/ext/nokogiri/html_sax_push_parser.c +42 -34
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +192 -93
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +15 -15
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +13 -18
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +225 -163
- data/ext/nokogiri/xml_document_fragment.c +13 -15
- data/ext/nokogiri/xml_dtd.c +54 -48
- data/ext/nokogiri/xml_element_content.c +30 -27
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +17 -11
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +56 -49
- data/ext/nokogiri/xml_node.c +338 -286
- data/ext/nokogiri/xml_node_set.c +168 -156
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +195 -172
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +118 -118
- data/ext/nokogiri/xml_sax_parser_context.c +103 -86
- data/ext/nokogiri/xml_sax_push_parser.c +36 -27
- data/ext/nokogiri/xml_schema.c +111 -34
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +13 -17
- data/ext/nokogiri/xml_xpath_context.c +206 -123
- data/ext/nokogiri/xslt_stylesheet.c +158 -161
- data/lib/nokogiri.rb +4 -8
- data/lib/nokogiri/css/parser.rb +62 -62
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +38 -36
- data/lib/nokogiri/css/xpath_visitor.rb +70 -42
- data/lib/nokogiri/extension.rb +26 -0
- data/lib/nokogiri/html/document.rb +12 -26
- data/lib/nokogiri/html/document_fragment.rb +15 -15
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version.rb +2 -148
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +205 -0
- data/lib/nokogiri/xml/builder.rb +2 -2
- data/lib/nokogiri/xml/document.rb +48 -18
- data/lib/nokogiri/xml/document_fragment.rb +4 -6
- data/lib/nokogiri/xml/node.rb +599 -279
- data/lib/nokogiri/xml/parse_options.rb +6 -0
- data/lib/nokogiri/xml/reader.rb +2 -9
- data/lib/nokogiri/xml/relax_ng.rb +6 -2
- data/lib/nokogiri/xml/schema.rb +12 -4
- data/lib/nokogiri/xml/searchable.rb +24 -16
- data/lib/nokogiri/xml/xpath.rb +1 -3
- data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -1
- metadata +87 -158
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/nokogiri.h +0 -122
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
package nokogiri.internals;
|
|
2
2
|
|
|
3
3
|
@SuppressWarnings("serial")
|
|
4
|
-
public class ClosedStreamException extends Exception
|
|
4
|
+
public class ClosedStreamException extends Exception
|
|
5
|
+
{
|
|
5
6
|
|
|
6
|
-
public
|
|
7
|
+
public
|
|
8
|
+
ClosedStreamException(String message)
|
|
9
|
+
{
|
|
7
10
|
super(message);
|
|
8
11
|
}
|
|
9
12
|
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
18
18
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
19
19
|
* the following conditions:
|
|
20
|
-
*
|
|
20
|
+
*
|
|
21
21
|
* The above copyright notice and this permission notice shall be
|
|
22
22
|
* included in all copies or substantial portions of the Software.
|
|
23
|
-
*
|
|
23
|
+
*
|
|
24
24
|
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
|
25
25
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
26
26
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
@@ -35,9 +35,11 @@ package nokogiri.internals;
|
|
|
35
35
|
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
|
36
36
|
import static nokogiri.internals.NokogiriHelpers.isNamespace;
|
|
37
37
|
import static nokogiri.internals.NokogiriHelpers.stringOrNil;
|
|
38
|
+
|
|
38
39
|
import nokogiri.HtmlDocument;
|
|
39
40
|
import nokogiri.NokogiriService;
|
|
40
41
|
import nokogiri.XmlDocument;
|
|
42
|
+
import nokogiri.XmlSyntaxError;
|
|
41
43
|
|
|
42
44
|
import org.apache.xerces.xni.Augmentations;
|
|
43
45
|
import org.apache.xerces.xni.QName;
|
|
@@ -50,6 +52,7 @@ import org.cyberneko.html.filters.DefaultFilter;
|
|
|
50
52
|
import org.jruby.Ruby;
|
|
51
53
|
import org.jruby.RubyClass;
|
|
52
54
|
import org.jruby.runtime.ThreadContext;
|
|
55
|
+
import org.jruby.runtime.Helpers;
|
|
53
56
|
import org.jruby.runtime.builtin.IRubyObject;
|
|
54
57
|
import org.w3c.dom.Document;
|
|
55
58
|
import org.w3c.dom.NamedNodeMap;
|
|
@@ -58,183 +61,223 @@ import org.w3c.dom.NodeList;
|
|
|
58
61
|
|
|
59
62
|
/**
|
|
60
63
|
* Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml.
|
|
61
|
-
*
|
|
64
|
+
*
|
|
62
65
|
* @author sergio
|
|
63
66
|
* @author Patrick Mahoney <pat@polycrystal.org>
|
|
64
67
|
* @author Yoko Harada <yokolet@gmail.com>
|
|
65
68
|
*/
|
|
66
|
-
public class HtmlDomParserContext extends XmlDomParserContext
|
|
69
|
+
public class HtmlDomParserContext extends XmlDomParserContext
|
|
70
|
+
{
|
|
67
71
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
super(runtime, encoding, options);
|
|
74
|
-
}
|
|
72
|
+
public
|
|
73
|
+
HtmlDomParserContext(Ruby runtime, IRubyObject options)
|
|
74
|
+
{
|
|
75
|
+
this(runtime, runtime.getNil(), options);
|
|
76
|
+
}
|
|
75
77
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
}
|
|
83
|
-
}
|
|
78
|
+
public
|
|
79
|
+
HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options)
|
|
80
|
+
{
|
|
81
|
+
super(runtime, encoding, options);
|
|
82
|
+
java_encoding = NokogiriHelpers.getValidEncoding(encoding);
|
|
83
|
+
}
|
|
84
84
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
85
|
+
@Override
|
|
86
|
+
protected void
|
|
87
|
+
initParser(Ruby runtime)
|
|
88
|
+
{
|
|
89
|
+
XMLParserConfiguration config = new HTMLConfiguration();
|
|
90
|
+
//XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
|
|
91
|
+
XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
|
|
92
|
+
//XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
|
|
93
|
+
XMLDocumentFilter[] filters = { elementValidityCheckFilter};
|
|
94
|
+
|
|
95
|
+
config.setErrorHandler(this.errorHandler);
|
|
96
|
+
|
|
97
|
+
parser = new NokogiriDomParser(config);
|
|
98
|
+
|
|
99
|
+
// see http://nekohtml.sourceforge.net/settings.html for details
|
|
100
|
+
setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
|
|
101
|
+
setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
|
|
102
|
+
setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
|
|
103
|
+
setProperty("http://cyberneko.org/html/properties/filters", filters);
|
|
104
|
+
setFeature("http://cyberneko.org/html/features/report-errors", true);
|
|
105
|
+
setFeature("http://xml.org/sax/features/namespaces", false);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
@Override
|
|
109
|
+
public void
|
|
110
|
+
setEncoding(String encoding)
|
|
111
|
+
{
|
|
112
|
+
super.setEncoding(encoding);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Enable NekoHTML feature for balancing tags in a document fragment.
|
|
117
|
+
*
|
|
118
|
+
* This method is used in XmlNode#in_context method.
|
|
119
|
+
*/
|
|
120
|
+
public void
|
|
121
|
+
enableDocumentFragment()
|
|
122
|
+
{
|
|
123
|
+
setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
@Override
|
|
127
|
+
public XmlDocument
|
|
128
|
+
parse(ThreadContext context, RubyClass klass, IRubyObject url)
|
|
129
|
+
{
|
|
130
|
+
XmlDocument xmlDoc = super.parse(context, klass, url);
|
|
131
|
+
|
|
132
|
+
// let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT)
|
|
133
|
+
// https://github.com/sparklemotion/nokogiri/issues/2130
|
|
134
|
+
if (!options.recover && errorHandler.getErrors().size() > 0) {
|
|
135
|
+
XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
|
|
136
|
+
String exceptionMsg = String.format("%s: '%s'",
|
|
137
|
+
"Parser without recover option encountered error or warning",
|
|
138
|
+
errorHandler.getErrors().get(0));
|
|
139
|
+
xmlSyntaxError.setException(new Exception(exceptionMsg));
|
|
140
|
+
throw xmlSyntaxError.toThrowable();
|
|
109
141
|
}
|
|
110
142
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
143
|
+
return xmlDoc;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
@Override
|
|
147
|
+
protected XmlDocument
|
|
148
|
+
wrapDocument(ThreadContext context, RubyClass klass, Document document)
|
|
149
|
+
{
|
|
150
|
+
HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document);
|
|
151
|
+
htmlDocument.setDocumentNode(context.runtime, document);
|
|
152
|
+
Helpers.invoke(context, htmlDocument, "initialize");
|
|
153
|
+
|
|
154
|
+
if (ruby_encoding.isNil()) {
|
|
155
|
+
// ruby_encoding might have detected by HtmlDocument::EncodingReader
|
|
156
|
+
if (detected_encoding != null && !detected_encoding.isNil()) {
|
|
157
|
+
ruby_encoding = detected_encoding;
|
|
158
|
+
} else {
|
|
159
|
+
// no encoding given & no encoding detected, then try to get it
|
|
160
|
+
String charset = tryGetCharsetFromHtml5MetaTag(document);
|
|
161
|
+
ruby_encoding = stringOrNil(context.runtime, charset);
|
|
162
|
+
}
|
|
118
163
|
}
|
|
164
|
+
htmlDocument.setEncoding(ruby_encoding);
|
|
165
|
+
htmlDocument.setParsedEncoding(java_encoding);
|
|
166
|
+
return htmlDocument;
|
|
167
|
+
}
|
|
119
168
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
169
|
+
// NekoHtml doesn't understand HTML5 meta tag format. This fails to detect charset
|
|
170
|
+
// from an HTML5 style meta tag. Luckily, the meta tag and charset exists in DOM tree
|
|
171
|
+
// so, this method attempts to find the charset.
|
|
172
|
+
private static String
|
|
173
|
+
tryGetCharsetFromHtml5MetaTag(Document document)
|
|
174
|
+
{
|
|
175
|
+
if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) { return null; }
|
|
176
|
+
NodeList list = document.getDocumentElement().getChildNodes();
|
|
177
|
+
Node item;
|
|
178
|
+
for (int i = 0; i < list.getLength(); i++) {
|
|
179
|
+
if ("head".equalsIgnoreCase((item = list.item(i)).getNodeName())) {
|
|
180
|
+
NodeList headers = item.getChildNodes();
|
|
181
|
+
for (int j = 0; j < headers.getLength(); j++) {
|
|
182
|
+
if ("meta".equalsIgnoreCase((item = headers.item(j)).getNodeName())) {
|
|
183
|
+
NamedNodeMap nodeMap = item.getAttributes();
|
|
184
|
+
for (int k = 0; k < nodeMap.getLength(); k++) {
|
|
185
|
+
if ("charset".equalsIgnoreCase((item = nodeMap.item(k)).getNodeName())) {
|
|
186
|
+
return item.getNodeValue();
|
|
187
|
+
}
|
|
132
188
|
}
|
|
189
|
+
}
|
|
133
190
|
}
|
|
134
|
-
|
|
135
|
-
htmlDocument.setParsedEncoding(java_encoding);
|
|
136
|
-
return htmlDocument;
|
|
191
|
+
}
|
|
137
192
|
}
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
}
|
|
193
|
+
return null;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Filter to strip out attributes that pertain to XML namespaces.
|
|
198
|
+
*/
|
|
199
|
+
public static class RemoveNSAttrsFilter extends DefaultFilter
|
|
200
|
+
{
|
|
201
|
+
@Override
|
|
202
|
+
public void
|
|
203
|
+
startElement(QName element, XMLAttributes attrs,
|
|
204
|
+
Augmentations augs) throws XNIException
|
|
205
|
+
{
|
|
206
|
+
int i;
|
|
207
|
+
for (i = 0; i < attrs.getLength(); ++i) {
|
|
208
|
+
if (isNamespace(attrs.getQName(i))) {
|
|
209
|
+
attrs.removeAttributeAt(i);
|
|
210
|
+
--i;
|
|
159
211
|
}
|
|
160
|
-
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
element.uri = null;
|
|
215
|
+
super.startElement(element, attrs, augs);
|
|
161
216
|
}
|
|
217
|
+
}
|
|
162
218
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
public static class RemoveNSAttrsFilter extends DefaultFilter {
|
|
167
|
-
@Override
|
|
168
|
-
public void startElement(QName element, XMLAttributes attrs,
|
|
169
|
-
Augmentations augs) throws XNIException {
|
|
170
|
-
int i;
|
|
171
|
-
for (i = 0; i < attrs.getLength(); ++i) {
|
|
172
|
-
if (isNamespace(attrs.getQName(i))) {
|
|
173
|
-
attrs.removeAttributeAt(i);
|
|
174
|
-
--i;
|
|
175
|
-
}
|
|
176
|
-
}
|
|
219
|
+
public static class ElementValidityCheckFilter extends DefaultFilter
|
|
220
|
+
{
|
|
221
|
+
private NokogiriErrorHandler errorHandler;
|
|
177
222
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
223
|
+
private
|
|
224
|
+
ElementValidityCheckFilter(NokogiriErrorHandler errorHandler)
|
|
225
|
+
{
|
|
226
|
+
this.errorHandler = errorHandler;
|
|
181
227
|
}
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
if (index >= element_names.length) return false;
|
|
223
|
-
String[] elementNames = element_names[index];
|
|
224
|
-
for (int i=0; i<elementNames.length; i++) {
|
|
225
|
-
if (name.equals(elementNames[i])) {
|
|
226
|
-
return true;
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
return false;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
@Override
|
|
233
|
-
public void startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException {
|
|
234
|
-
if (!isValid(name.rawname)) {
|
|
235
|
-
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
|
236
|
-
}
|
|
237
|
-
super.startElement(name, attrs, augs);
|
|
228
|
+
|
|
229
|
+
// element names from xhtml1-strict.dtd
|
|
230
|
+
private static String[][] element_names = {
|
|
231
|
+
{"a", "abbr", "acronym", "address", "area"},
|
|
232
|
+
{"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
|
|
233
|
+
{"caption", "cite", "code", "col", "colgroup"},
|
|
234
|
+
{"dd", "del", "dfn", "div", "dl", "dt"},
|
|
235
|
+
{"em"},
|
|
236
|
+
{"fieldset", "font", "form", "frame", "frameset"},
|
|
237
|
+
{}, // g
|
|
238
|
+
{"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
|
|
239
|
+
{"i", "iframe", "img", "input", "ins"},
|
|
240
|
+
{}, // j
|
|
241
|
+
{"kbd"},
|
|
242
|
+
{"label", "legend", "li", "link"},
|
|
243
|
+
{"map", "meta"},
|
|
244
|
+
{"noframes", "noscript"},
|
|
245
|
+
{"object", "ol", "optgroup", "option"},
|
|
246
|
+
{"p", "param", "pre"},
|
|
247
|
+
{"q"},
|
|
248
|
+
{}, // r
|
|
249
|
+
{"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
|
|
250
|
+
{"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
|
|
251
|
+
{"u", "ul"},
|
|
252
|
+
{"var"},
|
|
253
|
+
{}, // w
|
|
254
|
+
{}, // x
|
|
255
|
+
{}, // y
|
|
256
|
+
{} // z
|
|
257
|
+
};
|
|
258
|
+
|
|
259
|
+
private static boolean
|
|
260
|
+
isValid(final String name)
|
|
261
|
+
{
|
|
262
|
+
int index = name.charAt(0) - 97;
|
|
263
|
+
if (index >= element_names.length) { return false; }
|
|
264
|
+
String[] elementNames = element_names[index];
|
|
265
|
+
for (int i = 0; i < elementNames.length; i++) {
|
|
266
|
+
if (name.equals(elementNames[i])) {
|
|
267
|
+
return true;
|
|
238
268
|
}
|
|
269
|
+
}
|
|
270
|
+
return false;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
@Override
|
|
274
|
+
public void
|
|
275
|
+
startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException
|
|
276
|
+
{
|
|
277
|
+
if (!isValid(name.rawname)) {
|
|
278
|
+
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
|
279
|
+
}
|
|
280
|
+
super.startElement(name, attrs, augs);
|
|
239
281
|
}
|
|
282
|
+
}
|
|
240
283
|
}
|
|
@@ -4,17 +4,24 @@ import org.xml.sax.ErrorHandler;
|
|
|
4
4
|
import org.xml.sax.SAXException;
|
|
5
5
|
import org.xml.sax.SAXParseException;
|
|
6
6
|
|
|
7
|
-
public class IgnoreSchemaErrorsErrorHandler implements ErrorHandler
|
|
7
|
+
public class IgnoreSchemaErrorsErrorHandler implements ErrorHandler
|
|
8
|
+
{
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
@Override
|
|
11
|
+
public void
|
|
12
|
+
warning(SAXParseException exception) throws SAXException
|
|
13
|
+
{
|
|
14
|
+
}
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
+
@Override
|
|
17
|
+
public void
|
|
18
|
+
error(SAXParseException exception) throws SAXException
|
|
19
|
+
{
|
|
20
|
+
}
|
|
16
21
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
22
|
+
@Override
|
|
23
|
+
public void
|
|
24
|
+
fatalError(SAXParseException exception) throws SAXException
|
|
25
|
+
{
|
|
26
|
+
}
|
|
20
27
|
}
|