nokogiri 1.11.0.rc3-java → 1.11.4-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/LICENSE.md +1 -1
- data/README.md +168 -91
- data/dependencies.yml +12 -12
- data/ext/java/nokogiri/EncodingHandler.java +76 -89
- data/ext/java/nokogiri/HtmlDocument.java +135 -144
- data/ext/java/nokogiri/HtmlElementDescription.java +102 -117
- data/ext/java/nokogiri/HtmlEntityLookup.java +33 -60
- data/ext/java/nokogiri/HtmlSaxParserContext.java +218 -222
- data/ext/java/nokogiri/HtmlSaxPushParser.java +162 -169
- data/ext/java/nokogiri/NokogiriService.java +595 -556
- data/ext/java/nokogiri/XmlAttr.java +118 -126
- data/ext/java/nokogiri/XmlAttributeDecl.java +95 -106
- data/ext/java/nokogiri/XmlCdata.java +35 -58
- data/ext/java/nokogiri/XmlComment.java +46 -67
- data/ext/java/nokogiri/XmlDocument.java +645 -572
- data/ext/java/nokogiri/XmlDocumentFragment.java +125 -137
- data/ext/java/nokogiri/XmlDtd.java +448 -414
- data/ext/java/nokogiri/XmlElement.java +23 -48
- data/ext/java/nokogiri/XmlElementContent.java +343 -316
- data/ext/java/nokogiri/XmlElementDecl.java +124 -125
- data/ext/java/nokogiri/XmlEntityDecl.java +119 -127
- data/ext/java/nokogiri/XmlEntityReference.java +49 -72
- data/ext/java/nokogiri/XmlNamespace.java +175 -175
- data/ext/java/nokogiri/XmlNode.java +1843 -1622
- data/ext/java/nokogiri/XmlNodeSet.java +361 -331
- data/ext/java/nokogiri/XmlProcessingInstruction.java +47 -69
- data/ext/java/nokogiri/XmlReader.java +513 -450
- data/ext/java/nokogiri/XmlRelaxng.java +89 -101
- data/ext/java/nokogiri/XmlSaxParserContext.java +328 -310
- data/ext/java/nokogiri/XmlSaxPushParser.java +227 -220
- data/ext/java/nokogiri/XmlSchema.java +335 -242
- data/ext/java/nokogiri/XmlSyntaxError.java +113 -119
- data/ext/java/nokogiri/XmlText.java +55 -76
- data/ext/java/nokogiri/XmlXpathContext.java +242 -210
- data/ext/java/nokogiri/XsltStylesheet.java +280 -269
- data/ext/java/nokogiri/internals/ClosedStreamException.java +5 -2
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +201 -190
- data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +17 -10
- data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +43 -16
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +63 -80
- data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +107 -88
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +27 -52
- data/ext/java/nokogiri/internals/NokogiriHandler.java +316 -286
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +736 -652
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +184 -173
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +79 -89
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +64 -79
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +84 -99
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +48 -65
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +119 -78
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +34 -54
- data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +23 -46
- data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +55 -72
- data/ext/java/nokogiri/internals/ParserContext.java +206 -211
- data/ext/java/nokogiri/internals/ReaderNode.java +478 -403
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +822 -739
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +31 -54
- data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +129 -123
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +3 -34
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +206 -207
- data/ext/java/nokogiri/internals/XmlSaxParser.java +22 -47
- data/ext/java/nokogiri/internals/c14n/AttrCompare.java +71 -68
- data/ext/java/nokogiri/internals/c14n/C14nHelper.java +137 -118
- data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +27 -21
- data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +74 -61
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +230 -205
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +572 -547
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +323 -302
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +232 -219
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +575 -545
- data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +141 -120
- data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +39 -38
- data/ext/java/nokogiri/internals/c14n/Constants.java +13 -10
- data/ext/java/nokogiri/internals/c14n/ElementProxy.java +279 -247
- data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +66 -53
- data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +44 -37
- data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +135 -120
- data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +59 -48
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +384 -334
- data/ext/java/nokogiri/internals/c14n/NodeFilter.java +25 -24
- data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +151 -140
- data/ext/java/nokogiri/internals/c14n/XMLUtils.java +456 -423
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1466 -1500
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +626 -574
- data/ext/nokogiri/depend +37 -358
- data/ext/nokogiri/extconf.rb +581 -374
- data/ext/nokogiri/html_document.c +78 -82
- data/ext/nokogiri/html_element_description.c +84 -71
- data/ext/nokogiri/html_entity_lookup.c +21 -16
- data/ext/nokogiri/html_sax_parser_context.c +69 -66
- data/ext/nokogiri/html_sax_push_parser.c +42 -34
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +192 -93
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +15 -15
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +13 -18
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +246 -188
- data/ext/nokogiri/xml_document_fragment.c +13 -15
- data/ext/nokogiri/xml_dtd.c +54 -48
- data/ext/nokogiri/xml_element_content.c +30 -27
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +17 -11
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +56 -49
- data/ext/nokogiri/xml_node.c +371 -320
- data/ext/nokogiri/xml_node_set.c +168 -156
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +191 -157
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +118 -118
- data/ext/nokogiri/xml_sax_parser_context.c +103 -86
- data/ext/nokogiri/xml_sax_push_parser.c +36 -27
- data/ext/nokogiri/xml_schema.c +95 -47
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +13 -17
- data/ext/nokogiri/xml_xpath_context.c +206 -123
- data/ext/nokogiri/xslt_stylesheet.c +158 -161
- data/lib/nokogiri.rb +3 -7
- data/lib/nokogiri/css/parser.rb +3 -3
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +70 -42
- data/lib/nokogiri/extension.rb +26 -0
- data/lib/nokogiri/html/document.rb +12 -26
- data/lib/nokogiri/html/document_fragment.rb +15 -15
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version.rb +2 -149
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +205 -0
- data/lib/nokogiri/xml/document.rb +91 -35
- data/lib/nokogiri/xml/document_fragment.rb +4 -6
- data/lib/nokogiri/xml/node.rb +89 -69
- data/lib/nokogiri/xml/parse_options.rb +6 -0
- data/lib/nokogiri/xml/reader.rb +2 -9
- data/lib/nokogiri/xml/relax_ng.rb +6 -2
- data/lib/nokogiri/xml/schema.rb +12 -4
- data/lib/nokogiri/xml/searchable.rb +3 -1
- data/lib/nokogiri/xml/xpath.rb +1 -3
- data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -1
- metadata +86 -177
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/nokogiri.h +0 -134
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -63
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
@@ -1,9 +1,12 @@
|
|
1
1
|
package nokogiri.internals;
|
2
2
|
|
3
3
|
@SuppressWarnings("serial")
|
4
|
-
public class ClosedStreamException extends Exception
|
4
|
+
public class ClosedStreamException extends Exception
|
5
|
+
{
|
5
6
|
|
6
|
-
public
|
7
|
+
public
|
8
|
+
ClosedStreamException(String message)
|
9
|
+
{
|
7
10
|
super(message);
|
8
11
|
}
|
9
12
|
|
@@ -1,43 +1,13 @@
|
|
1
|
-
/**
|
2
|
-
* (The MIT License)
|
3
|
-
*
|
4
|
-
* Copyright (c) 2008 - 2012:
|
5
|
-
*
|
6
|
-
* * {Aaron Patterson}[http://tenderlovemaking.com]
|
7
|
-
* * {Mike Dalessio}[http://mike.daless.io]
|
8
|
-
* * {Charles Nutter}[http://blog.headius.com]
|
9
|
-
* * {Sergio Arbeo}[http://www.serabe.com]
|
10
|
-
* * {Patrick Mahoney}[http://polycrystal.org]
|
11
|
-
* * {Yoko Harada}[http://yokolet.blogspot.com]
|
12
|
-
*
|
13
|
-
* Permission is hereby granted, free of charge, to any person obtaining
|
14
|
-
* a copy of this software and associated documentation files (the
|
15
|
-
* 'Software'), to deal in the Software without restriction, including
|
16
|
-
* without limitation the rights to use, copy, modify, merge, publish,
|
17
|
-
* distribute, sublicense, and/or sell copies of the Software, and to
|
18
|
-
* permit persons to whom the Software is furnished to do so, subject to
|
19
|
-
* the following conditions:
|
20
|
-
*
|
21
|
-
* The above copyright notice and this permission notice shall be
|
22
|
-
* included in all copies or substantial portions of the Software.
|
23
|
-
*
|
24
|
-
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
25
|
-
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
26
|
-
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
27
|
-
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
28
|
-
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
29
|
-
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
30
|
-
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
31
|
-
*/
|
32
|
-
|
33
1
|
package nokogiri.internals;
|
34
2
|
|
35
3
|
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
36
4
|
import static nokogiri.internals.NokogiriHelpers.isNamespace;
|
37
5
|
import static nokogiri.internals.NokogiriHelpers.stringOrNil;
|
6
|
+
|
38
7
|
import nokogiri.HtmlDocument;
|
39
8
|
import nokogiri.NokogiriService;
|
40
9
|
import nokogiri.XmlDocument;
|
10
|
+
import nokogiri.XmlSyntaxError;
|
41
11
|
|
42
12
|
import org.apache.xerces.xni.Augmentations;
|
43
13
|
import org.apache.xerces.xni.QName;
|
@@ -50,6 +20,7 @@ import org.cyberneko.html.filters.DefaultFilter;
|
|
50
20
|
import org.jruby.Ruby;
|
51
21
|
import org.jruby.RubyClass;
|
52
22
|
import org.jruby.runtime.ThreadContext;
|
23
|
+
import org.jruby.runtime.Helpers;
|
53
24
|
import org.jruby.runtime.builtin.IRubyObject;
|
54
25
|
import org.w3c.dom.Document;
|
55
26
|
import org.w3c.dom.NamedNodeMap;
|
@@ -58,183 +29,223 @@ import org.w3c.dom.NodeList;
|
|
58
29
|
|
59
30
|
/**
|
60
31
|
* Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml.
|
61
|
-
*
|
32
|
+
*
|
62
33
|
* @author sergio
|
63
34
|
* @author Patrick Mahoney <pat@polycrystal.org>
|
64
35
|
* @author Yoko Harada <yokolet@gmail.com>
|
65
36
|
*/
|
66
|
-
public class HtmlDomParserContext extends XmlDomParserContext
|
37
|
+
public class HtmlDomParserContext extends XmlDomParserContext
|
38
|
+
{
|
67
39
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
super(runtime, encoding, options);
|
74
|
-
}
|
40
|
+
public
|
41
|
+
HtmlDomParserContext(Ruby runtime, IRubyObject options)
|
42
|
+
{
|
43
|
+
this(runtime, runtime.getNil(), options);
|
44
|
+
}
|
75
45
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
}
|
83
|
-
}
|
46
|
+
public
|
47
|
+
HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options)
|
48
|
+
{
|
49
|
+
super(runtime, encoding, options);
|
50
|
+
java_encoding = NokogiriHelpers.getValidEncoding(encoding);
|
51
|
+
}
|
84
52
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
53
|
+
@Override
|
54
|
+
protected void
|
55
|
+
initParser(Ruby runtime)
|
56
|
+
{
|
57
|
+
XMLParserConfiguration config = new HTMLConfiguration();
|
58
|
+
//XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
|
59
|
+
XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
|
60
|
+
//XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
|
61
|
+
XMLDocumentFilter[] filters = { elementValidityCheckFilter};
|
62
|
+
|
63
|
+
config.setErrorHandler(this.errorHandler);
|
64
|
+
|
65
|
+
parser = new NokogiriDomParser(config);
|
66
|
+
|
67
|
+
// see http://nekohtml.sourceforge.net/settings.html for details
|
68
|
+
setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
|
69
|
+
setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
|
70
|
+
setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
|
71
|
+
setProperty("http://cyberneko.org/html/properties/filters", filters);
|
72
|
+
setFeature("http://cyberneko.org/html/features/report-errors", true);
|
73
|
+
setFeature("http://xml.org/sax/features/namespaces", false);
|
74
|
+
}
|
75
|
+
|
76
|
+
@Override
|
77
|
+
public void
|
78
|
+
setEncoding(String encoding)
|
79
|
+
{
|
80
|
+
super.setEncoding(encoding);
|
81
|
+
}
|
82
|
+
|
83
|
+
/**
|
84
|
+
* Enable NekoHTML feature for balancing tags in a document fragment.
|
85
|
+
*
|
86
|
+
* This method is used in XmlNode#in_context method.
|
87
|
+
*/
|
88
|
+
public void
|
89
|
+
enableDocumentFragment()
|
90
|
+
{
|
91
|
+
setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
|
92
|
+
}
|
93
|
+
|
94
|
+
@Override
|
95
|
+
public XmlDocument
|
96
|
+
parse(ThreadContext context, RubyClass klass, IRubyObject url)
|
97
|
+
{
|
98
|
+
XmlDocument xmlDoc = super.parse(context, klass, url);
|
99
|
+
|
100
|
+
// let's be consistent in how we handle RECOVER and NORECOVER (a.k.a. STRICT)
|
101
|
+
// https://github.com/sparklemotion/nokogiri/issues/2130
|
102
|
+
if (!options.recover && errorHandler.getErrors().size() > 0) {
|
103
|
+
XmlSyntaxError xmlSyntaxError = XmlSyntaxError.createXMLSyntaxError(context.runtime);
|
104
|
+
String exceptionMsg = String.format("%s: '%s'",
|
105
|
+
"Parser without recover option encountered error or warning",
|
106
|
+
errorHandler.getErrors().get(0));
|
107
|
+
xmlSyntaxError.setException(new Exception(exceptionMsg));
|
108
|
+
throw xmlSyntaxError.toThrowable();
|
109
109
|
}
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
111
|
+
return xmlDoc;
|
112
|
+
}
|
113
|
+
|
114
|
+
@Override
|
115
|
+
protected XmlDocument
|
116
|
+
wrapDocument(ThreadContext context, RubyClass klass, Document document)
|
117
|
+
{
|
118
|
+
HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document);
|
119
|
+
htmlDocument.setDocumentNode(context.runtime, document);
|
120
|
+
Helpers.invoke(context, htmlDocument, "initialize");
|
121
|
+
|
122
|
+
if (ruby_encoding.isNil()) {
|
123
|
+
// ruby_encoding might have detected by HtmlDocument::EncodingReader
|
124
|
+
if (detected_encoding != null && !detected_encoding.isNil()) {
|
125
|
+
ruby_encoding = detected_encoding;
|
126
|
+
} else {
|
127
|
+
// no encoding given & no encoding detected, then try to get it
|
128
|
+
String charset = tryGetCharsetFromHtml5MetaTag(document);
|
129
|
+
ruby_encoding = stringOrNil(context.runtime, charset);
|
130
|
+
}
|
118
131
|
}
|
132
|
+
htmlDocument.setEncoding(ruby_encoding);
|
133
|
+
htmlDocument.setParsedEncoding(java_encoding);
|
134
|
+
return htmlDocument;
|
135
|
+
}
|
119
136
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
137
|
+
// NekoHtml doesn't understand HTML5 meta tag format. This fails to detect charset
|
138
|
+
// from an HTML5 style meta tag. Luckily, the meta tag and charset exists in DOM tree
|
139
|
+
// so, this method attempts to find the charset.
|
140
|
+
private static String
|
141
|
+
tryGetCharsetFromHtml5MetaTag(Document document)
|
142
|
+
{
|
143
|
+
if (!"html".equalsIgnoreCase(document.getDocumentElement().getNodeName())) { return null; }
|
144
|
+
NodeList list = document.getDocumentElement().getChildNodes();
|
145
|
+
Node item;
|
146
|
+
for (int i = 0; i < list.getLength(); i++) {
|
147
|
+
if ("head".equalsIgnoreCase((item = list.item(i)).getNodeName())) {
|
148
|
+
NodeList headers = item.getChildNodes();
|
149
|
+
for (int j = 0; j < headers.getLength(); j++) {
|
150
|
+
if ("meta".equalsIgnoreCase((item = headers.item(j)).getNodeName())) {
|
151
|
+
NamedNodeMap nodeMap = item.getAttributes();
|
152
|
+
for (int k = 0; k < nodeMap.getLength(); k++) {
|
153
|
+
if ("charset".equalsIgnoreCase((item = nodeMap.item(k)).getNodeName())) {
|
154
|
+
return item.getNodeValue();
|
155
|
+
}
|
132
156
|
}
|
157
|
+
}
|
133
158
|
}
|
134
|
-
|
135
|
-
htmlDocument.setParsedEncoding(java_encoding);
|
136
|
-
return htmlDocument;
|
159
|
+
}
|
137
160
|
}
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
}
|
157
|
-
}
|
158
|
-
}
|
161
|
+
return null;
|
162
|
+
}
|
163
|
+
|
164
|
+
/**
|
165
|
+
* Filter to strip out attributes that pertain to XML namespaces.
|
166
|
+
*/
|
167
|
+
public static class RemoveNSAttrsFilter extends DefaultFilter
|
168
|
+
{
|
169
|
+
@Override
|
170
|
+
public void
|
171
|
+
startElement(QName element, XMLAttributes attrs,
|
172
|
+
Augmentations augs) throws XNIException
|
173
|
+
{
|
174
|
+
int i;
|
175
|
+
for (i = 0; i < attrs.getLength(); ++i) {
|
176
|
+
if (isNamespace(attrs.getQName(i))) {
|
177
|
+
attrs.removeAttributeAt(i);
|
178
|
+
--i;
|
159
179
|
}
|
160
|
-
|
180
|
+
}
|
181
|
+
|
182
|
+
element.uri = null;
|
183
|
+
super.startElement(element, attrs, augs);
|
161
184
|
}
|
185
|
+
}
|
162
186
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
public static class RemoveNSAttrsFilter extends DefaultFilter {
|
167
|
-
@Override
|
168
|
-
public void startElement(QName element, XMLAttributes attrs,
|
169
|
-
Augmentations augs) throws XNIException {
|
170
|
-
int i;
|
171
|
-
for (i = 0; i < attrs.getLength(); ++i) {
|
172
|
-
if (isNamespace(attrs.getQName(i))) {
|
173
|
-
attrs.removeAttributeAt(i);
|
174
|
-
--i;
|
175
|
-
}
|
176
|
-
}
|
187
|
+
public static class ElementValidityCheckFilter extends DefaultFilter
|
188
|
+
{
|
189
|
+
private NokogiriErrorHandler errorHandler;
|
177
190
|
|
178
|
-
|
179
|
-
|
180
|
-
|
191
|
+
private
|
192
|
+
ElementValidityCheckFilter(NokogiriErrorHandler errorHandler)
|
193
|
+
{
|
194
|
+
this.errorHandler = errorHandler;
|
181
195
|
}
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
if (index >= element_names.length) return false;
|
223
|
-
String[] elementNames = element_names[index];
|
224
|
-
for (int i=0; i<elementNames.length; i++) {
|
225
|
-
if (name.equals(elementNames[i])) {
|
226
|
-
return true;
|
227
|
-
}
|
228
|
-
}
|
229
|
-
return false;
|
230
|
-
}
|
231
|
-
|
232
|
-
@Override
|
233
|
-
public void startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException {
|
234
|
-
if (!isValid(name.rawname)) {
|
235
|
-
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
236
|
-
}
|
237
|
-
super.startElement(name, attrs, augs);
|
196
|
+
|
197
|
+
// element names from xhtml1-strict.dtd
|
198
|
+
private static String[][] element_names = {
|
199
|
+
{"a", "abbr", "acronym", "address", "area"},
|
200
|
+
{"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
|
201
|
+
{"caption", "cite", "code", "col", "colgroup"},
|
202
|
+
{"dd", "del", "dfn", "div", "dl", "dt"},
|
203
|
+
{"em"},
|
204
|
+
{"fieldset", "font", "form", "frame", "frameset"},
|
205
|
+
{}, // g
|
206
|
+
{"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
|
207
|
+
{"i", "iframe", "img", "input", "ins"},
|
208
|
+
{}, // j
|
209
|
+
{"kbd"},
|
210
|
+
{"label", "legend", "li", "link"},
|
211
|
+
{"map", "meta"},
|
212
|
+
{"noframes", "noscript"},
|
213
|
+
{"object", "ol", "optgroup", "option"},
|
214
|
+
{"p", "param", "pre"},
|
215
|
+
{"q"},
|
216
|
+
{}, // r
|
217
|
+
{"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
|
218
|
+
{"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
|
219
|
+
{"u", "ul"},
|
220
|
+
{"var"},
|
221
|
+
{}, // w
|
222
|
+
{}, // x
|
223
|
+
{}, // y
|
224
|
+
{} // z
|
225
|
+
};
|
226
|
+
|
227
|
+
private static boolean
|
228
|
+
isValid(final String name)
|
229
|
+
{
|
230
|
+
int index = name.charAt(0) - 97;
|
231
|
+
if (index >= element_names.length) { return false; }
|
232
|
+
String[] elementNames = element_names[index];
|
233
|
+
for (int i = 0; i < elementNames.length; i++) {
|
234
|
+
if (name.equals(elementNames[i])) {
|
235
|
+
return true;
|
238
236
|
}
|
237
|
+
}
|
238
|
+
return false;
|
239
|
+
}
|
240
|
+
|
241
|
+
@Override
|
242
|
+
public void
|
243
|
+
startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException
|
244
|
+
{
|
245
|
+
if (!isValid(name.rawname)) {
|
246
|
+
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
247
|
+
}
|
248
|
+
super.startElement(name, attrs, augs);
|
239
249
|
}
|
250
|
+
}
|
240
251
|
}
|
@@ -4,17 +4,24 @@ import org.xml.sax.ErrorHandler;
|
|
4
4
|
import org.xml.sax.SAXException;
|
5
5
|
import org.xml.sax.SAXParseException;
|
6
6
|
|
7
|
-
public class IgnoreSchemaErrorsErrorHandler implements ErrorHandler
|
7
|
+
public class IgnoreSchemaErrorsErrorHandler implements ErrorHandler
|
8
|
+
{
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
@Override
|
11
|
+
public void
|
12
|
+
warning(SAXParseException exception) throws SAXException
|
13
|
+
{
|
14
|
+
}
|
12
15
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
+
@Override
|
17
|
+
public void
|
18
|
+
error(SAXParseException exception) throws SAXException
|
19
|
+
{
|
20
|
+
}
|
16
21
|
|
17
|
-
|
18
|
-
|
19
|
-
|
22
|
+
@Override
|
23
|
+
public void
|
24
|
+
fatalError(SAXParseException exception) throws SAXException
|
25
|
+
{
|
26
|
+
}
|
20
27
|
}
|