nokogiri 1.11.1-java → 1.11.6-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/LICENSE-DEPENDENCIES.md +12 -12
- data/LICENSE.md +1 -1
- data/README.md +21 -16
- data/dependencies.yml +12 -12
- data/ext/java/nokogiri/EncodingHandler.java +76 -89
- data/ext/java/nokogiri/HtmlDocument.java +135 -144
- data/ext/java/nokogiri/HtmlElementDescription.java +102 -117
- data/ext/java/nokogiri/HtmlEntityLookup.java +33 -60
- data/ext/java/nokogiri/HtmlSaxParserContext.java +218 -222
- data/ext/java/nokogiri/HtmlSaxPushParser.java +162 -169
- data/ext/java/nokogiri/NokogiriService.java +595 -556
- data/ext/java/nokogiri/XmlAttr.java +118 -126
- data/ext/java/nokogiri/XmlAttributeDecl.java +95 -106
- data/ext/java/nokogiri/XmlCdata.java +35 -58
- data/ext/java/nokogiri/XmlComment.java +46 -67
- data/ext/java/nokogiri/XmlDocument.java +645 -572
- data/ext/java/nokogiri/XmlDocumentFragment.java +125 -137
- data/ext/java/nokogiri/XmlDtd.java +448 -414
- data/ext/java/nokogiri/XmlElement.java +23 -48
- data/ext/java/nokogiri/XmlElementContent.java +343 -316
- data/ext/java/nokogiri/XmlElementDecl.java +124 -125
- data/ext/java/nokogiri/XmlEntityDecl.java +119 -127
- data/ext/java/nokogiri/XmlEntityReference.java +49 -72
- data/ext/java/nokogiri/XmlNamespace.java +175 -175
- data/ext/java/nokogiri/XmlNode.java +1843 -1620
- data/ext/java/nokogiri/XmlNodeSet.java +361 -331
- data/ext/java/nokogiri/XmlProcessingInstruction.java +47 -69
- data/ext/java/nokogiri/XmlReader.java +513 -450
- data/ext/java/nokogiri/XmlRelaxng.java +85 -104
- data/ext/java/nokogiri/XmlSaxParserContext.java +328 -315
- data/ext/java/nokogiri/XmlSaxPushParser.java +227 -220
- data/ext/java/nokogiri/XmlSchema.java +328 -295
- data/ext/java/nokogiri/XmlSyntaxError.java +113 -115
- data/ext/java/nokogiri/XmlText.java +55 -76
- data/ext/java/nokogiri/XmlXpathContext.java +240 -238
- data/ext/java/nokogiri/XsltStylesheet.java +280 -269
- data/ext/java/nokogiri/internals/ClosedStreamException.java +5 -2
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +201 -202
- data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +17 -10
- data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +43 -16
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +63 -80
- data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +107 -88
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +27 -52
- data/ext/java/nokogiri/internals/NokogiriHandler.java +316 -286
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +736 -652
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +184 -173
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +81 -98
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +64 -79
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +84 -99
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +48 -65
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +116 -131
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +34 -56
- data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +23 -46
- data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +55 -72
- data/ext/java/nokogiri/internals/ParserContext.java +206 -211
- data/ext/java/nokogiri/internals/ReaderNode.java +478 -403
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +822 -739
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +31 -54
- data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +129 -123
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +3 -34
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +206 -207
- data/ext/java/nokogiri/internals/XmlSaxParser.java +22 -47
- data/ext/java/nokogiri/internals/c14n/AttrCompare.java +71 -68
- data/ext/java/nokogiri/internals/c14n/C14nHelper.java +137 -118
- data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +27 -21
- data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +74 -61
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +230 -205
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +572 -547
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +17 -10
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +323 -302
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +232 -219
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +23 -16
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +22 -15
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +575 -545
- data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +141 -120
- data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +39 -38
- data/ext/java/nokogiri/internals/c14n/Constants.java +13 -10
- data/ext/java/nokogiri/internals/c14n/ElementProxy.java +279 -247
- data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +66 -53
- data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +44 -37
- data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +135 -120
- data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +59 -48
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +384 -334
- data/ext/java/nokogiri/internals/c14n/NodeFilter.java +25 -24
- data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +151 -140
- data/ext/java/nokogiri/internals/c14n/XMLUtils.java +456 -423
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1466 -1500
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +626 -574
- data/ext/nokogiri/depend +34 -474
- data/ext/nokogiri/extconf.rb +253 -183
- data/ext/nokogiri/html_document.c +10 -15
- data/ext/nokogiri/html_element_description.c +84 -71
- data/ext/nokogiri/html_entity_lookup.c +21 -16
- data/ext/nokogiri/html_sax_parser_context.c +66 -65
- data/ext/nokogiri/html_sax_push_parser.c +29 -27
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +190 -63
- data/ext/nokogiri/test_global_handlers.c +3 -4
- data/ext/nokogiri/xml_attr.c +15 -15
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +13 -18
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +246 -188
- data/ext/nokogiri/xml_document_fragment.c +13 -15
- data/ext/nokogiri/xml_dtd.c +54 -48
- data/ext/nokogiri/xml_element_content.c +30 -27
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +17 -11
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +56 -49
- data/ext/nokogiri/xml_node.c +385 -326
- data/ext/nokogiri/xml_node_set.c +168 -156
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +191 -157
- data/ext/nokogiri/xml_relax_ng.c +29 -23
- data/ext/nokogiri/xml_sax_parser.c +117 -112
- data/ext/nokogiri/xml_sax_parser_context.c +100 -85
- data/ext/nokogiri/xml_sax_push_parser.c +34 -27
- data/ext/nokogiri/xml_schema.c +48 -42
- data/ext/nokogiri/xml_syntax_error.c +21 -23
- data/ext/nokogiri/xml_text.c +13 -17
- data/ext/nokogiri/xml_xpath_context.c +134 -127
- data/ext/nokogiri/xslt_stylesheet.c +157 -157
- data/lib/nokogiri.rb +1 -22
- data/lib/nokogiri/css/parser.rb +1 -1
- data/lib/nokogiri/extension.rb +26 -0
- data/lib/nokogiri/html/document_fragment.rb +15 -15
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +32 -8
- data/lib/nokogiri/xml/document.rb +74 -28
- data/lib/nokogiri/xml/node.rb +39 -42
- data/lib/nokogiri/xml/reader.rb +2 -9
- data/lib/nokogiri/xml/xpath.rb +1 -3
- data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -1
- metadata +7 -8
- data/ext/nokogiri/xml_io.c +0 -63
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
@@ -1,35 +1,3 @@
|
|
1
|
-
/**
|
2
|
-
* (The MIT License)
|
3
|
-
*
|
4
|
-
* Copyright (c) 2008 - 2012:
|
5
|
-
*
|
6
|
-
* * {Aaron Patterson}[http://tenderlovemaking.com]
|
7
|
-
* * {Mike Dalessio}[http://mike.daless.io]
|
8
|
-
* * {Charles Nutter}[http://blog.headius.com]
|
9
|
-
* * {Sergio Arbeo}[http://www.serabe.com]
|
10
|
-
* * {Patrick Mahoney}[http://polycrystal.org]
|
11
|
-
* * {Yoko Harada}[http://yokolet.blogspot.com]
|
12
|
-
*
|
13
|
-
* Permission is hereby granted, free of charge, to any person obtaining
|
14
|
-
* a copy of this software and associated documentation files (the
|
15
|
-
* 'Software'), to deal in the Software without restriction, including
|
16
|
-
* without limitation the rights to use, copy, modify, merge, publish,
|
17
|
-
* distribute, sublicense, and/or sell copies of the Software, and to
|
18
|
-
* permit persons to whom the Software is furnished to do so, subject to
|
19
|
-
* the following conditions:
|
20
|
-
*
|
21
|
-
* The above copyright notice and this permission notice shall be
|
22
|
-
* included in all copies or substantial portions of the Software.
|
23
|
-
*
|
24
|
-
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
25
|
-
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
26
|
-
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
27
|
-
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
28
|
-
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
29
|
-
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
30
|
-
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
31
|
-
*/
|
32
|
-
|
33
1
|
package nokogiri;
|
34
2
|
|
35
3
|
import java.util.ArrayList;
|
@@ -49,100 +17,117 @@ import org.jruby.runtime.builtin.IRubyObject;
|
|
49
17
|
|
50
18
|
/**
|
51
19
|
* Class for Nokogiri::HTML::ElementDescription.
|
52
|
-
*
|
20
|
+
*
|
53
21
|
* @author Patrick Mahoney <pat@polycrystal.org>
|
54
22
|
*/
|
55
|
-
@JRubyClass(name="Nokogiri::HTML::ElementDescription")
|
56
|
-
public class HtmlElementDescription extends RubyObject
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
23
|
+
@JRubyClass(name = "Nokogiri::HTML::ElementDescription")
|
24
|
+
public class HtmlElementDescription extends RubyObject
|
25
|
+
{
|
26
|
+
|
27
|
+
/**
|
28
|
+
* Stores memoized hash of element -> list of valid subelements.
|
29
|
+
*/
|
30
|
+
static protected Map<Short, List<String>> subElements;
|
31
|
+
static
|
32
|
+
{
|
33
|
+
Map<Short, List<String>> _subElements =
|
34
|
+
new HashMap<Short, List<String>>();
|
35
|
+
subElements = Collections.synchronizedMap(_subElements);
|
36
|
+
}
|
37
|
+
|
38
|
+
protected HTMLElements.Element element;
|
39
|
+
|
40
|
+
public
|
41
|
+
HtmlElementDescription(Ruby runtime, RubyClass rubyClass)
|
42
|
+
{
|
43
|
+
super(runtime, rubyClass);
|
44
|
+
}
|
45
|
+
|
46
|
+
/**
|
47
|
+
* Lookup the list of sub elements of <code>code</code>. If not
|
48
|
+
* already stored, iterate through all elements to find valid
|
49
|
+
* subelements; save this list and return it.
|
50
|
+
*/
|
51
|
+
protected static List<String>
|
52
|
+
findSubElements(HTMLElements.Element elem)
|
53
|
+
{
|
54
|
+
List<String> subs = subElements.get(elem.code);
|
55
|
+
|
56
|
+
if (subs == null) {
|
57
|
+
subs = new ArrayList<String>();
|
58
|
+
|
59
|
+
/*
|
60
|
+
* A bit of a hack. NekoHtml source code shows that
|
61
|
+
* UNKNOWN is the highest value element. We cannot access
|
62
|
+
* the list of elements directly because it's protected.
|
63
|
+
*/
|
64
|
+
for (short c = 0; c < HTMLElements.UNKNOWN; c++) {
|
65
|
+
HTMLElements.Element maybe_sub =
|
66
|
+
HTMLElements.getElement(c);
|
67
|
+
if (maybe_sub.isParent(elem)) {
|
68
|
+
subs.add(maybe_sub.name);
|
99
69
|
}
|
70
|
+
}
|
100
71
|
|
101
|
-
|
72
|
+
subElements.put(elem.code, subs);
|
102
73
|
}
|
103
74
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
75
|
+
return subs;
|
76
|
+
}
|
77
|
+
|
78
|
+
@JRubyMethod(name = "[]", meta = true)
|
79
|
+
public static IRubyObject
|
80
|
+
get(ThreadContext context,
|
81
|
+
IRubyObject klazz, IRubyObject name)
|
82
|
+
{
|
83
|
+
|
84
|
+
// nekohtml will return an element even for invalid names, see
|
85
|
+
// http://sourceforge.net/p/nekohtml/code/HEAD/tree/trunk/src/org/cyberneko/html/HTMLElements.java#l514
|
86
|
+
// which breaks `test_fetch_nonexistent'
|
87
|
+
HTMLElements.Element elem = HTMLElements.getElement(name.asJavaString(), HTMLElements.NO_SUCH_ELEMENT);
|
88
|
+
if (elem == HTMLElements.NO_SUCH_ELEMENT) {
|
89
|
+
return context.nil;
|
119
90
|
}
|
120
91
|
|
121
|
-
|
122
|
-
|
123
|
-
|
92
|
+
HtmlElementDescription desc =
|
93
|
+
new HtmlElementDescription(context.getRuntime(), (RubyClass)klazz);
|
94
|
+
desc.element = elem;
|
95
|
+
return desc;
|
96
|
+
}
|
97
|
+
|
98
|
+
@JRubyMethod()
|
99
|
+
public IRubyObject
|
100
|
+
name(ThreadContext context)
|
101
|
+
{
|
102
|
+
return context.getRuntime().newString(element.name.toLowerCase());
|
103
|
+
}
|
104
|
+
|
105
|
+
@JRubyMethod(name = "inline?")
|
106
|
+
public IRubyObject
|
107
|
+
inline_eh(ThreadContext context)
|
108
|
+
{
|
109
|
+
return context.getRuntime().newBoolean(element.isInline());
|
110
|
+
}
|
111
|
+
|
112
|
+
@JRubyMethod(name = "empty?")
|
113
|
+
public IRubyObject
|
114
|
+
empty_eh(ThreadContext context)
|
115
|
+
{
|
116
|
+
return context.getRuntime().newBoolean(element.isEmpty());
|
117
|
+
}
|
118
|
+
|
119
|
+
@JRubyMethod()
|
120
|
+
public IRubyObject
|
121
|
+
sub_elements(ThreadContext context)
|
122
|
+
{
|
123
|
+
Ruby ruby = context.getRuntime();
|
124
|
+
List<String> subs = findSubElements(element);
|
125
|
+
IRubyObject[] ary = new IRubyObject[subs.size()];
|
126
|
+
for (int i = 0; i < subs.size(); ++i) {
|
127
|
+
ary[i] = ruby.newString(subs.get(i));
|
124
128
|
}
|
125
129
|
|
126
|
-
|
127
|
-
|
128
|
-
return context.getRuntime().newBoolean(element.isInline());
|
129
|
-
}
|
130
|
-
|
131
|
-
@JRubyMethod(name="empty?")
|
132
|
-
public IRubyObject empty_eh(ThreadContext context) {
|
133
|
-
return context.getRuntime().newBoolean(element.isEmpty());
|
134
|
-
}
|
135
|
-
|
136
|
-
@JRubyMethod()
|
137
|
-
public IRubyObject sub_elements(ThreadContext context) {
|
138
|
-
Ruby ruby = context.getRuntime();
|
139
|
-
List<String> subs = findSubElements(element);
|
140
|
-
IRubyObject[] ary = new IRubyObject[subs.size()];
|
141
|
-
for (int i = 0; i < subs.size(); ++i) {
|
142
|
-
ary[i] = ruby.newString(subs.get(i));
|
143
|
-
}
|
144
|
-
|
145
|
-
return ruby.newArray(ary);
|
146
|
-
}
|
130
|
+
return ruby.newArray(ary);
|
131
|
+
}
|
147
132
|
|
148
133
|
}
|
@@ -1,35 +1,3 @@
|
|
1
|
-
/**
|
2
|
-
* (The MIT License)
|
3
|
-
*
|
4
|
-
* Copyright (c) 2008 - 2012:
|
5
|
-
*
|
6
|
-
* * {Aaron Patterson}[http://tenderlovemaking.com]
|
7
|
-
* * {Mike Dalessio}[http://mike.daless.io]
|
8
|
-
* * {Charles Nutter}[http://blog.headius.com]
|
9
|
-
* * {Sergio Arbeo}[http://www.serabe.com]
|
10
|
-
* * {Patrick Mahoney}[http://polycrystal.org]
|
11
|
-
* * {Yoko Harada}[http://yokolet.blogspot.com]
|
12
|
-
*
|
13
|
-
* Permission is hereby granted, free of charge, to any person obtaining
|
14
|
-
* a copy of this software and associated documentation files (the
|
15
|
-
* 'Software'), to deal in the Software without restriction, including
|
16
|
-
* without limitation the rights to use, copy, modify, merge, publish,
|
17
|
-
* distribute, sublicense, and/or sell copies of the Software, and to
|
18
|
-
* permit persons to whom the Software is furnished to do so, subject to
|
19
|
-
* the following conditions:
|
20
|
-
*
|
21
|
-
* The above copyright notice and this permission notice shall be
|
22
|
-
* included in all copies or substantial portions of the Software.
|
23
|
-
*
|
24
|
-
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
25
|
-
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
26
|
-
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
27
|
-
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
28
|
-
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
29
|
-
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
30
|
-
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
31
|
-
*/
|
32
|
-
|
33
1
|
package nokogiri;
|
34
2
|
|
35
3
|
import static org.jruby.runtime.Helpers.invoke;
|
@@ -45,35 +13,40 @@ import org.jruby.runtime.builtin.IRubyObject;
|
|
45
13
|
|
46
14
|
/**
|
47
15
|
* Class for Nokogiri::HTML::EntityLookup.
|
48
|
-
*
|
16
|
+
*
|
49
17
|
* @author Patrick Mahoney <pat@polycrystal.org>
|
50
18
|
*/
|
51
|
-
@JRubyClass(name="Nokogiri::HTML::EntityLookup")
|
52
|
-
public class HtmlEntityLookup extends RubyObject
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
19
|
+
@JRubyClass(name = "Nokogiri::HTML::EntityLookup")
|
20
|
+
public class HtmlEntityLookup extends RubyObject
|
21
|
+
{
|
22
|
+
|
23
|
+
public
|
24
|
+
HtmlEntityLookup(Ruby runtime, RubyClass rubyClass)
|
25
|
+
{
|
26
|
+
super(runtime, rubyClass);
|
27
|
+
}
|
28
|
+
|
29
|
+
/**
|
30
|
+
* Looks up an HTML entity <code>key</code>.
|
31
|
+
*
|
32
|
+
* The description is a bit lacking.
|
33
|
+
*/
|
34
|
+
@JRubyMethod()
|
35
|
+
public IRubyObject
|
36
|
+
get(ThreadContext context, IRubyObject key)
|
37
|
+
{
|
38
|
+
Ruby ruby = context.getRuntime();
|
39
|
+
String name = key.toString();
|
40
|
+
int val = HTMLEntities.get(name);
|
41
|
+
if (val == -1) { return ruby.getNil(); }
|
42
|
+
|
43
|
+
IRubyObject edClass =
|
44
|
+
ruby.getClassFromPath("Nokogiri::HTML::EntityDescription");
|
45
|
+
IRubyObject edObj = invoke(context, edClass, "new",
|
46
|
+
ruby.newFixnum(val), ruby.newString(name),
|
47
|
+
ruby.newString(name + " entity"));
|
48
|
+
|
49
|
+
return edObj;
|
50
|
+
}
|
78
51
|
|
79
52
|
}
|
@@ -1,35 +1,3 @@
|
|
1
|
-
/**
|
2
|
-
* (The MIT License)
|
3
|
-
*
|
4
|
-
* Copyright (c) 2008 - 2011:
|
5
|
-
*
|
6
|
-
* * {Aaron Patterson}[http://tenderlovemaking.com]
|
7
|
-
* * {Mike Dalessio}[http://mike.daless.io]
|
8
|
-
* * {Charles Nutter}[http://blog.headius.com]
|
9
|
-
* * {Sergio Arbeo}[http://www.serabe.com]
|
10
|
-
* * {Patrick Mahoney}[http://polycrystal.org]
|
11
|
-
* * {Yoko Harada}[http://yokolet.blogspot.com]
|
12
|
-
*
|
13
|
-
* Permission is hereby granted, free of charge, to any person obtaining
|
14
|
-
* a copy of this software and associated documentation files (the
|
15
|
-
* 'Software'), to deal in the Software without restriction, including
|
16
|
-
* without limitation the rights to use, copy, modify, merge, publish,
|
17
|
-
* distribute, sublicense, and/or sell copies of the Software, and to
|
18
|
-
* permit persons to whom the Software is furnished to do so, subject to
|
19
|
-
* the following conditions:
|
20
|
-
*
|
21
|
-
* The above copyright notice and this permission notice shall be
|
22
|
-
* included in all copies or substantial portions of the Software.
|
23
|
-
*
|
24
|
-
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
25
|
-
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
26
|
-
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
27
|
-
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
28
|
-
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
29
|
-
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
30
|
-
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
31
|
-
*/
|
32
|
-
|
33
1
|
package nokogiri;
|
34
2
|
|
35
3
|
import java.io.ByteArrayInputStream;
|
@@ -63,220 +31,248 @@ import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
|
|
63
31
|
* @author Yoko Harada <yokolet@gmail.com>
|
64
32
|
*/
|
65
33
|
|
66
|
-
@JRubyClass(name="Nokogiri::HTML::SAX::ParserContext", parent="Nokogiri::XML::SAX::ParserContext")
|
67
|
-
public class HtmlSaxParserContext extends XmlSaxParserContext
|
34
|
+
@JRubyClass(name = "Nokogiri::HTML::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext")
|
35
|
+
public class HtmlSaxParserContext extends XmlSaxParserContext
|
36
|
+
{
|
68
37
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
38
|
+
static HtmlSaxParserContext
|
39
|
+
newInstance(final Ruby runtime, final RubyClass klazz)
|
40
|
+
{
|
41
|
+
HtmlSaxParserContext instance = new HtmlSaxParserContext(runtime, klazz);
|
42
|
+
instance.initialize(runtime);
|
43
|
+
return instance;
|
44
|
+
}
|
74
45
|
|
75
|
-
|
76
|
-
|
77
|
-
|
46
|
+
public
|
47
|
+
HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass)
|
48
|
+
{
|
49
|
+
super(ruby, rubyClass);
|
50
|
+
}
|
51
|
+
|
52
|
+
@Override
|
53
|
+
protected AbstractSAXParser
|
54
|
+
createParser() throws SAXException
|
55
|
+
{
|
56
|
+
SAXParser parser = new SAXParser();
|
78
57
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
return parser;
|
94
|
-
} catch(SAXException ex) {
|
95
|
-
throw new SAXException(
|
96
|
-
"Problem while creating HTML SAX Parser: " + ex.toString());
|
97
|
-
}
|
58
|
+
try {
|
59
|
+
parser.setProperty(
|
60
|
+
"http://cyberneko.org/html/properties/names/elems", "lower");
|
61
|
+
parser.setProperty(
|
62
|
+
"http://cyberneko.org/html/properties/names/attrs", "lower");
|
63
|
+
|
64
|
+
// NekoHTML should not try to guess the encoding based on the meta
|
65
|
+
// tags or other information in the document. This is already
|
66
|
+
// handled by the EncodingReader.
|
67
|
+
parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
|
68
|
+
return parser;
|
69
|
+
} catch (SAXException ex) {
|
70
|
+
throw new SAXException(
|
71
|
+
"Problem while creating HTML SAX Parser: " + ex.toString());
|
98
72
|
}
|
73
|
+
}
|
99
74
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
75
|
+
@JRubyMethod(name = "memory", meta = true)
|
76
|
+
public static IRubyObject
|
77
|
+
parse_memory(ThreadContext context,
|
78
|
+
IRubyObject klazz,
|
79
|
+
IRubyObject data,
|
80
|
+
IRubyObject encoding)
|
81
|
+
{
|
82
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
|
83
|
+
String javaEncoding = findEncodingName(context, encoding);
|
84
|
+
if (javaEncoding != null) {
|
85
|
+
CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
|
86
|
+
ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
|
87
|
+
ctx.setInputSource(istream);
|
88
|
+
ctx.getInputSource().setEncoding(javaEncoding);
|
114
89
|
}
|
90
|
+
return ctx;
|
91
|
+
}
|
92
|
+
|
93
|
+
public enum EncodingType {
|
94
|
+
NONE(0, "NONE"),
|
95
|
+
UTF_8(1, "UTF-8"),
|
96
|
+
UTF16LE(2, "UTF16LE"),
|
97
|
+
UTF16BE(3, "UTF16BE"),
|
98
|
+
UCS4LE(4, "UCS4LE"),
|
99
|
+
UCS4BE(5, "UCS4BE"),
|
100
|
+
EBCDIC(6, "EBCDIC"),
|
101
|
+
UCS4_2143(7, "ICS4-2143"),
|
102
|
+
UCS4_3412(8, "UCS4-3412"),
|
103
|
+
UCS2(9, "UCS2"),
|
104
|
+
ISO_8859_1(10, "ISO-8859-1"),
|
105
|
+
ISO_8859_2(11, "ISO-8859-2"),
|
106
|
+
ISO_8859_3(12, "ISO-8859-3"),
|
107
|
+
ISO_8859_4(13, "ISO-8859-4"),
|
108
|
+
ISO_8859_5(14, "ISO-8859-5"),
|
109
|
+
ISO_8859_6(15, "ISO-8859-6"),
|
110
|
+
ISO_8859_7(16, "ISO-8859-7"),
|
111
|
+
ISO_8859_8(17, "ISO-8859-8"),
|
112
|
+
ISO_8859_9(18, "ISO-8859-9"),
|
113
|
+
ISO_2022_JP(19, "ISO-2022-JP"),
|
114
|
+
SHIFT_JIS(20, "SHIFT-JIS"),
|
115
|
+
EUC_JP(21, "EUC-JP"),
|
116
|
+
ASCII(22, "ASCII");
|
115
117
|
|
116
|
-
|
117
|
-
|
118
|
-
UTF_8(1, "UTF-8"),
|
119
|
-
UTF16LE(2, "UTF16LE"),
|
120
|
-
UTF16BE(3, "UTF16BE"),
|
121
|
-
UCS4LE(4, "UCS4LE"),
|
122
|
-
UCS4BE(5, "UCS4BE"),
|
123
|
-
EBCDIC(6, "EBCDIC"),
|
124
|
-
UCS4_2143(7, "ICS4-2143"),
|
125
|
-
UCS4_3412(8, "UCS4-3412"),
|
126
|
-
UCS2(9, "UCS2"),
|
127
|
-
ISO_8859_1(10, "ISO-8859-1"),
|
128
|
-
ISO_8859_2(11, "ISO-8859-2"),
|
129
|
-
ISO_8859_3(12, "ISO-8859-3"),
|
130
|
-
ISO_8859_4(13, "ISO-8859-4"),
|
131
|
-
ISO_8859_5(14, "ISO-8859-5"),
|
132
|
-
ISO_8859_6(15, "ISO-8859-6"),
|
133
|
-
ISO_8859_7(16, "ISO-8859-7"),
|
134
|
-
ISO_8859_8(17, "ISO-8859-8"),
|
135
|
-
ISO_8859_9(18, "ISO-8859-9"),
|
136
|
-
ISO_2022_JP(19, "ISO-2022-JP"),
|
137
|
-
SHIFT_JIS(20, "SHIFT-JIS"),
|
138
|
-
EUC_JP(21, "EUC-JP"),
|
139
|
-
ASCII(22, "ASCII");
|
140
|
-
|
141
|
-
private final int value;
|
142
|
-
private final String name;
|
143
|
-
|
144
|
-
EncodingType(int value, String name) {
|
145
|
-
this.value = value;
|
146
|
-
this.name = name;
|
147
|
-
}
|
148
|
-
|
149
|
-
public int getValue() {
|
150
|
-
return value;
|
151
|
-
}
|
152
|
-
|
153
|
-
public String toString() {
|
154
|
-
return name;
|
155
|
-
}
|
156
|
-
|
157
|
-
private static transient EncodingType[] values;
|
158
|
-
|
159
|
-
// NOTE: assuming ordinal == value
|
160
|
-
static EncodingType get(final int ordinal) {
|
161
|
-
EncodingType[] values = EncodingType.values;
|
162
|
-
if (values == null) {
|
163
|
-
values = EncodingType.values();
|
164
|
-
EncodingType.values = values;
|
165
|
-
}
|
166
|
-
if (ordinal >= 0 && ordinal < values.length) {
|
167
|
-
return values[ordinal];
|
168
|
-
}
|
169
|
-
return null;
|
170
|
-
}
|
118
|
+
private final int value;
|
119
|
+
private final String name;
|
171
120
|
|
121
|
+
EncodingType(int value, String name)
|
122
|
+
{
|
123
|
+
this.value = value;
|
124
|
+
this.name = name;
|
172
125
|
}
|
173
126
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
assert type.value == value;
|
178
|
-
return type.name;
|
127
|
+
public int getValue()
|
128
|
+
{
|
129
|
+
return value;
|
179
130
|
}
|
180
131
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
rubyEncoding = rubyStringToString((RubyString) encoding);
|
185
|
-
}
|
186
|
-
else if (encoding instanceof RubyFixnum) {
|
187
|
-
rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
|
188
|
-
}
|
189
|
-
if (rubyEncoding == null) return null;
|
190
|
-
try {
|
191
|
-
return Charset.forName(rubyEncoding).displayName();
|
192
|
-
}
|
193
|
-
catch (UnsupportedCharsetException e) {
|
194
|
-
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
|
195
|
-
}
|
196
|
-
catch (IllegalCharsetNameException e) {
|
197
|
-
throw context.getRuntime().newEncodingError(e.getMessage());
|
198
|
-
}
|
132
|
+
public String toString()
|
133
|
+
{
|
134
|
+
return name;
|
199
135
|
}
|
200
136
|
|
201
|
-
private static
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
return input;
|
137
|
+
private static transient EncodingType[] values;
|
138
|
+
|
139
|
+
// NOTE: assuming ordinal == value
|
140
|
+
static EncodingType get(final int ordinal)
|
141
|
+
{
|
142
|
+
EncodingType[] values = EncodingType.values;
|
143
|
+
if (values == null) {
|
144
|
+
values = EncodingType.values();
|
145
|
+
EncodingType.values = values;
|
146
|
+
}
|
147
|
+
if (ordinal >= 0 && ordinal < values.length) {
|
148
|
+
return values[ordinal];
|
149
|
+
}
|
150
|
+
return null;
|
216
151
|
}
|
217
152
|
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
153
|
+
}
|
154
|
+
|
155
|
+
private static String
|
156
|
+
findEncodingName(final int value)
|
157
|
+
{
|
158
|
+
EncodingType type = EncodingType.get(value);
|
159
|
+
if (type == null) { return null; }
|
160
|
+
assert type.value == value;
|
161
|
+
return type.name;
|
162
|
+
}
|
163
|
+
|
164
|
+
private static String
|
165
|
+
findEncodingName(ThreadContext context, IRubyObject encoding)
|
166
|
+
{
|
167
|
+
String rubyEncoding = null;
|
168
|
+
if (encoding instanceof RubyString) {
|
169
|
+
rubyEncoding = rubyStringToString((RubyString) encoding);
|
170
|
+
} else if (encoding instanceof RubyFixnum) {
|
171
|
+
rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
|
237
172
|
}
|
173
|
+
if (rubyEncoding == null) { return null; }
|
174
|
+
try {
|
175
|
+
return Charset.forName(rubyEncoding).displayName();
|
176
|
+
} catch (UnsupportedCharsetException e) {
|
177
|
+
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
|
178
|
+
} catch (IllegalCharsetNameException e) {
|
179
|
+
throw context.getRuntime().newEncodingError(e.getMessage());
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
|
184
|
+
Pattern.CASE_INSENSITIVE);
|
238
185
|
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
186
|
+
private static CharSequence
|
187
|
+
applyEncoding(final String input, final String enc)
|
188
|
+
{
|
189
|
+
int start_pos = 0;
|
190
|
+
int end_pos = 0;
|
191
|
+
if (containsIgnoreCase(input, "charset")) {
|
192
|
+
Matcher m = CHARSET_PATTERN.matcher(input);
|
193
|
+
while (m.find()) {
|
194
|
+
start_pos = m.start();
|
195
|
+
end_pos = m.end();
|
196
|
+
}
|
197
|
+
}
|
198
|
+
if (start_pos != end_pos) {
|
199
|
+
return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
|
251
200
|
}
|
201
|
+
return input;
|
202
|
+
}
|
252
203
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
204
|
+
private static boolean
|
205
|
+
containsIgnoreCase(final String str, final String sub)
|
206
|
+
{
|
207
|
+
final int len = sub.length();
|
208
|
+
final int max = str.length() - len;
|
209
|
+
|
210
|
+
if (len == 0) { return true; }
|
211
|
+
final char c0Lower = Character.toLowerCase(sub.charAt(0));
|
212
|
+
final char c0Upper = Character.toUpperCase(sub.charAt(0));
|
213
|
+
|
214
|
+
for (int i = 0; i <= max; i++) {
|
215
|
+
final char ch = str.charAt(i);
|
216
|
+
if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
|
217
|
+
continue; // first char doesn't match
|
218
|
+
}
|
219
|
+
|
220
|
+
if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
|
221
|
+
return true;
|
222
|
+
}
|
265
223
|
}
|
224
|
+
return false;
|
225
|
+
}
|
266
226
|
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
227
|
+
@JRubyMethod(name = "file", meta = true)
|
228
|
+
public static IRubyObject
|
229
|
+
parse_file(ThreadContext context,
|
230
|
+
IRubyObject klass,
|
231
|
+
IRubyObject data,
|
232
|
+
IRubyObject encoding)
|
233
|
+
{
|
234
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
|
235
|
+
ctx.setInputSourceFile(context, data);
|
236
|
+
String javaEncoding = findEncodingName(context, encoding);
|
237
|
+
if (javaEncoding != null) {
|
238
|
+
ctx.getInputSource().setEncoding(javaEncoding);
|
275
239
|
}
|
240
|
+
return ctx;
|
241
|
+
}
|
276
242
|
|
277
|
-
|
278
|
-
|
279
|
-
|
243
|
+
@JRubyMethod(name = "io", meta = true)
|
244
|
+
public static IRubyObject
|
245
|
+
parse_io(ThreadContext context,
|
246
|
+
IRubyObject klass,
|
247
|
+
IRubyObject data,
|
248
|
+
IRubyObject encoding)
|
249
|
+
{
|
250
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass);
|
251
|
+
ctx.setIOInputSource(context, data, context.nil);
|
252
|
+
String javaEncoding = findEncodingName(context, encoding);
|
253
|
+
if (javaEncoding != null) {
|
254
|
+
ctx.getInputSource().setEncoding(javaEncoding);
|
280
255
|
}
|
256
|
+
return ctx;
|
257
|
+
}
|
258
|
+
|
259
|
+
/**
|
260
|
+
* Create a new parser context that will read from a raw input stream.
|
261
|
+
* Meant to be run in a separate thread by HtmlSaxPushParser.
|
262
|
+
*/
|
263
|
+
static HtmlSaxParserContext
|
264
|
+
parse_stream(final Ruby runtime, RubyClass klass, InputStream stream)
|
265
|
+
{
|
266
|
+
HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(runtime, klass);
|
267
|
+
ctx.setInputSource(stream);
|
268
|
+
return ctx;
|
269
|
+
}
|
270
|
+
|
271
|
+
@Override
|
272
|
+
protected void
|
273
|
+
preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
|
274
|
+
{
|
275
|
+
// this function is meant to be empty. It overrides the one in XmlSaxParserContext
|
276
|
+
}
|
281
277
|
|
282
278
|
}
|