nokogiri 1.16.8-java → 1.17.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/java/nokogiri/Html4Document.java +3 -3
- data/ext/java/nokogiri/Html4SaxParserContext.java +47 -175
- data/ext/java/nokogiri/NokogiriService.java +2 -2
- data/ext/java/nokogiri/XmlCdata.java +3 -0
- data/ext/java/nokogiri/XmlDocument.java +7 -14
- data/ext/java/nokogiri/XmlDocumentFragment.java +4 -92
- data/ext/java/nokogiri/XmlDtd.java +2 -2
- data/ext/java/nokogiri/XmlEntityReference.java +16 -12
- data/ext/java/nokogiri/XmlNode.java +26 -47
- data/ext/java/nokogiri/XmlNodeSet.java +10 -1
- data/ext/java/nokogiri/XmlSaxParserContext.java +73 -36
- data/ext/java/nokogiri/XmlSchema.java +15 -16
- data/ext/java/nokogiri/XsltStylesheet.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +3 -3
- data/ext/java/nokogiri/internals/NokogiriHandler.java +59 -15
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +1 -1
- data/ext/java/nokogiri/internals/ParserContext.java +51 -21
- data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +8 -19
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +1 -1
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +1 -1
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +10 -11
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +5 -5
- data/ext/java/nokogiri/internals/c14n/{UtfHelpper.java → UtfHelper.java} +2 -2
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +8 -8
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +2 -2
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +130 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- metadata +9 -5
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e72cb3083fa1cff82029cbbf2fec76f1a8dd39937c017633b0d944925d72ccaa
|
4
|
+
data.tar.gz: 45760600fd16493478685f69e2d122bd807f869052e1a3d2f640e2ba2d0a1451
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db57663703ce10b22d3aa59ab5aa2a308ac5fda082cea1a4bb01245277b1b2df7b20eb103966b1fc814465276d921e52ff971adca42ea7e71b67fd20219fbeba
|
7
|
+
data.tar.gz: 979db2b053dc00919e9ea60a548af51eb30d2b447a8ae46bdcf5d66acba204ab5019b277614a321f03a0d8b7f6a62171d113b8d392f172c56af305ce343c5bdd
|
data/Gemfile
CHANGED
@@ -5,44 +5,34 @@ source "https://rubygems.org"
|
|
5
5
|
gemspec
|
6
6
|
|
7
7
|
group :development do
|
8
|
-
# ruby 3.4.0-dev removed some gems from the default set
|
9
|
-
#
|
10
|
-
# TODO: we should be able to remove these as our gem dependencies sort it out and we pull them in
|
11
|
-
# transitively.
|
12
|
-
gem "mutex_m"
|
13
|
-
|
14
8
|
# bootstrapping
|
15
9
|
gem "bundler", "~> 2.3"
|
16
|
-
gem "rake", "13.1
|
10
|
+
gem "rake", "13.2.1"
|
17
11
|
|
18
12
|
# building extensions
|
19
|
-
gem "rake-compiler", "1.2.
|
20
|
-
gem "rake-compiler-dock", "1.
|
13
|
+
gem "rake-compiler", "1.2.8"
|
14
|
+
gem "rake-compiler-dock", "1.5.2"
|
21
15
|
|
22
16
|
# parser generator
|
23
|
-
gem "rexical", "
|
17
|
+
gem "rexical", "1.0.8"
|
24
18
|
|
25
19
|
# tests
|
26
|
-
gem "minitest", "5.
|
20
|
+
gem "minitest", "5.25.2"
|
27
21
|
gem "minitest-parallel_fork", "2.0.0"
|
28
|
-
gem "ruby_memcheck", "
|
22
|
+
gem "ruby_memcheck", "3.0.0"
|
29
23
|
gem "rubyzip", "~> 2.3.2"
|
30
24
|
gem "simplecov", "= 0.21.2"
|
31
25
|
|
32
26
|
# rubocop
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
gem "rubocop-performance", "1.20.2"
|
38
|
-
gem "rubocop-rake", "= 0.6.0"
|
39
|
-
gem "rubocop-shopify", "2.14.0"
|
40
|
-
end
|
27
|
+
gem "standard", "1.42.1"
|
28
|
+
gem "rubocop-minitest", "0.36.0"
|
29
|
+
gem "rubocop-packaging", "0.5.2"
|
30
|
+
gem "rubocop-rake", "0.6.0"
|
41
31
|
end
|
42
32
|
|
43
33
|
# If Psych doesn't build, you can disable this group locally by running
|
44
34
|
# `bundle config set --local without rdoc`
|
45
35
|
# Then re-run `bundle install`.
|
46
36
|
group :rdoc do
|
47
|
-
gem "rdoc", "6.
|
37
|
+
gem "rdoc", "6.8.1"
|
48
38
|
end
|
data/README.md
CHANGED
@@ -117,6 +117,10 @@ Requirements:
|
|
117
117
|
- Ruby >= 3.0
|
118
118
|
- JRuby >= 9.4.0.0
|
119
119
|
|
120
|
+
If you are compiling the native extension against a system version of libxml2:
|
121
|
+
|
122
|
+
- libxml2 >= 2.9.2 (recommended >= 2.12.0)
|
123
|
+
|
120
124
|
|
121
125
|
### Native Gems: Faster, more reliable installation
|
122
126
|
|
data/dependencies.yml
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
---
|
2
2
|
libxml2:
|
3
|
-
version: "2.
|
4
|
-
sha256: "
|
5
|
-
# sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.
|
3
|
+
version: "2.13.5"
|
4
|
+
sha256: "74fc163217a3964257d3be39af943e08861263c4231f9ef5b496b6f6d4c7b2b6"
|
5
|
+
# sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.13/libxml2-2.13.5.sha256sum
|
6
6
|
|
7
7
|
libxslt:
|
8
|
-
version: "1.1.
|
9
|
-
sha256: "
|
10
|
-
# sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.
|
8
|
+
version: "1.1.42"
|
9
|
+
sha256: "85ca62cac0d41fc77d3f6033da9df6fd73d20ea2fc18b0a3609ffb4110e1baeb"
|
10
|
+
# sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.42.sha256sum
|
11
11
|
|
12
12
|
zlib:
|
13
13
|
version: "1.3.1"
|
@@ -30,7 +30,7 @@ public class Html4Document extends XmlDocument
|
|
30
30
|
|
31
31
|
private static final String DEFAULT_CONTENT_TYPE = "html";
|
32
32
|
private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN";
|
33
|
-
private static final String
|
33
|
+
private static final String DEFAULT_SYSTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";
|
34
34
|
|
35
35
|
private String parsed_encoding = null;
|
36
36
|
|
@@ -88,7 +88,7 @@ public class Html4Document extends XmlDocument
|
|
88
88
|
getDocument(),
|
89
89
|
context.getRuntime().newString(DEFAULT_CONTENT_TYPE),
|
90
90
|
context.getRuntime().newString(DEFAULT_PUBLIC_ID),
|
91
|
-
context.getRuntime().newString(
|
91
|
+
context.getRuntime().newString(DEFAULT_SYSTEM_ID));
|
92
92
|
setInternalSubset(internalSubset);
|
93
93
|
}
|
94
94
|
|
@@ -132,7 +132,7 @@ public class Html4Document extends XmlDocument
|
|
132
132
|
}
|
133
133
|
|
134
134
|
public String
|
135
|
-
|
135
|
+
getParsedEncoding()
|
136
136
|
{
|
137
137
|
return parsed_encoding;
|
138
138
|
}
|
@@ -2,16 +2,12 @@ package nokogiri;
|
|
2
2
|
|
3
3
|
import java.io.ByteArrayInputStream;
|
4
4
|
import java.io.InputStream;
|
5
|
-
import java.nio.charset.Charset;
|
6
|
-
import java.nio.charset.IllegalCharsetNameException;
|
7
|
-
import java.nio.charset.UnsupportedCharsetException;
|
8
|
-
import java.util.regex.Matcher;
|
9
|
-
import java.util.regex.Pattern;
|
10
5
|
|
11
6
|
import org.apache.xerces.parsers.AbstractSAXParser;
|
12
7
|
import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
|
13
8
|
import org.jruby.Ruby;
|
14
9
|
import org.jruby.RubyClass;
|
10
|
+
import org.jruby.RubyEncoding;
|
15
11
|
import org.jruby.RubyFixnum;
|
16
12
|
import org.jruby.RubyString;
|
17
13
|
import org.jruby.anno.JRubyClass;
|
@@ -23,6 +19,8 @@ import org.xml.sax.SAXException;
|
|
23
19
|
import nokogiri.internals.NokogiriHandler;
|
24
20
|
import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
|
25
21
|
|
22
|
+
import static org.jruby.runtime.Helpers.invoke;
|
23
|
+
|
26
24
|
/**
|
27
25
|
* Class for Nokogiri::HTML4::SAX::ParserContext.
|
28
26
|
*
|
@@ -56,10 +54,9 @@ public class Html4SaxParserContext extends XmlSaxParserContext
|
|
56
54
|
SAXParser parser = new SAXParser();
|
57
55
|
|
58
56
|
try {
|
59
|
-
parser.setProperty(
|
60
|
-
|
61
|
-
parser.
|
62
|
-
"http://cyberneko.org/html/properties/names/attrs", "lower");
|
57
|
+
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
|
58
|
+
parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
|
59
|
+
parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
|
63
60
|
|
64
61
|
// NekoHTML should not try to guess the encoding based on the meta
|
65
62
|
// tags or other information in the document. This is already
|
@@ -72,198 +69,73 @@ public class Html4SaxParserContext extends XmlSaxParserContext
|
|
72
69
|
}
|
73
70
|
}
|
74
71
|
|
75
|
-
@JRubyMethod(name = "
|
72
|
+
@JRubyMethod(name = "native_memory", meta = true)
|
76
73
|
public static IRubyObject
|
77
|
-
parse_memory(ThreadContext context,
|
78
|
-
IRubyObject klazz,
|
79
|
-
IRubyObject data,
|
80
|
-
IRubyObject encoding)
|
74
|
+
parse_memory(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
|
81
75
|
{
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
|
87
|
-
ctx.setInputSource(istream);
|
88
|
-
ctx.getInputSource().setEncoding(javaEncoding);
|
89
|
-
}
|
90
|
-
return ctx;
|
91
|
-
}
|
92
|
-
|
93
|
-
public enum EncodingType {
|
94
|
-
NONE(0, "NONE"),
|
95
|
-
UTF_8(1, "UTF-8"),
|
96
|
-
UTF16LE(2, "UTF16LE"),
|
97
|
-
UTF16BE(3, "UTF16BE"),
|
98
|
-
UCS4LE(4, "UCS4LE"),
|
99
|
-
UCS4BE(5, "UCS4BE"),
|
100
|
-
EBCDIC(6, "EBCDIC"),
|
101
|
-
UCS4_2143(7, "ICS4-2143"),
|
102
|
-
UCS4_3412(8, "UCS4-3412"),
|
103
|
-
UCS2(9, "UCS2"),
|
104
|
-
ISO_8859_1(10, "ISO-8859-1"),
|
105
|
-
ISO_8859_2(11, "ISO-8859-2"),
|
106
|
-
ISO_8859_3(12, "ISO-8859-3"),
|
107
|
-
ISO_8859_4(13, "ISO-8859-4"),
|
108
|
-
ISO_8859_5(14, "ISO-8859-5"),
|
109
|
-
ISO_8859_6(15, "ISO-8859-6"),
|
110
|
-
ISO_8859_7(16, "ISO-8859-7"),
|
111
|
-
ISO_8859_8(17, "ISO-8859-8"),
|
112
|
-
ISO_8859_9(18, "ISO-8859-9"),
|
113
|
-
ISO_2022_JP(19, "ISO-2022-JP"),
|
114
|
-
SHIFT_JIS(20, "SHIFT-JIS"),
|
115
|
-
EUC_JP(21, "EUC-JP"),
|
116
|
-
ASCII(22, "ASCII");
|
117
|
-
|
118
|
-
private final int value;
|
119
|
-
private final String name;
|
120
|
-
|
121
|
-
EncodingType(int value, String name)
|
122
|
-
{
|
123
|
-
this.value = value;
|
124
|
-
this.name = name;
|
125
|
-
}
|
126
|
-
|
127
|
-
public int getValue()
|
128
|
-
{
|
129
|
-
return value;
|
130
|
-
}
|
131
|
-
|
132
|
-
public String toString()
|
133
|
-
{
|
134
|
-
return name;
|
135
|
-
}
|
136
|
-
|
137
|
-
private static transient EncodingType[] values;
|
138
|
-
|
139
|
-
// NOTE: assuming ordinal == value
|
140
|
-
static EncodingType get(final int ordinal)
|
141
|
-
{
|
142
|
-
EncodingType[] values = EncodingType.values;
|
143
|
-
if (values == null) {
|
144
|
-
values = EncodingType.values();
|
145
|
-
EncodingType.values = values;
|
76
|
+
String java_encoding = null;
|
77
|
+
if (encoding != context.runtime.getNil()) {
|
78
|
+
if (!(encoding instanceof RubyEncoding)) {
|
79
|
+
throw context.runtime.newTypeError("encoding must be kind_of Encoding");
|
146
80
|
}
|
147
|
-
|
148
|
-
return values[ordinal];
|
149
|
-
}
|
150
|
-
return null;
|
81
|
+
java_encoding = ((RubyEncoding)encoding).toString();
|
151
82
|
}
|
152
83
|
|
153
|
-
|
154
|
-
|
155
|
-
private static String
|
156
|
-
findEncodingName(final int value)
|
157
|
-
{
|
158
|
-
EncodingType type = EncodingType.get(value);
|
159
|
-
if (type == null) { return null; }
|
160
|
-
assert type.value == value;
|
161
|
-
return type.name;
|
162
|
-
}
|
163
|
-
|
164
|
-
private static String
|
165
|
-
findEncodingName(ThreadContext context, IRubyObject encoding)
|
166
|
-
{
|
167
|
-
String rubyEncoding = null;
|
168
|
-
if (encoding instanceof RubyString) {
|
169
|
-
rubyEncoding = rubyStringToString((RubyString) encoding);
|
170
|
-
} else if (encoding instanceof RubyFixnum) {
|
171
|
-
rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
|
172
|
-
}
|
173
|
-
if (rubyEncoding == null) { return null; }
|
174
|
-
try {
|
175
|
-
return Charset.forName(rubyEncoding).displayName();
|
176
|
-
} catch (UnsupportedCharsetException e) {
|
177
|
-
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
|
178
|
-
} catch (IllegalCharsetNameException e) {
|
179
|
-
throw context.getRuntime().newEncodingError(e.getMessage());
|
180
|
-
}
|
181
|
-
}
|
182
|
-
|
183
|
-
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
|
184
|
-
Pattern.CASE_INSENSITIVE);
|
84
|
+
Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
|
85
|
+
ctx.setStringInputSourceNoEnc(context, data, context.runtime.getNil());
|
185
86
|
|
186
|
-
|
187
|
-
|
188
|
-
{
|
189
|
-
int start_pos = 0;
|
190
|
-
int end_pos = 0;
|
191
|
-
if (containsIgnoreCase(input, "charset")) {
|
192
|
-
Matcher m = CHARSET_PATTERN.matcher(input);
|
193
|
-
while (m.find()) {
|
194
|
-
start_pos = m.start();
|
195
|
-
end_pos = m.end();
|
196
|
-
}
|
87
|
+
if (java_encoding != null) {
|
88
|
+
ctx.getInputSource().setEncoding(java_encoding);
|
197
89
|
}
|
198
|
-
if (start_pos != end_pos) {
|
199
|
-
return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
|
200
|
-
}
|
201
|
-
return input;
|
202
|
-
}
|
203
90
|
|
204
|
-
|
205
|
-
containsIgnoreCase(final String str, final String sub)
|
206
|
-
{
|
207
|
-
final int len = sub.length();
|
208
|
-
final int max = str.length() - len;
|
209
|
-
|
210
|
-
if (len == 0) { return true; }
|
211
|
-
final char c0Lower = Character.toLowerCase(sub.charAt(0));
|
212
|
-
final char c0Upper = Character.toUpperCase(sub.charAt(0));
|
213
|
-
|
214
|
-
for (int i = 0; i <= max; i++) {
|
215
|
-
final char ch = str.charAt(i);
|
216
|
-
if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
|
217
|
-
continue; // first char doesn't match
|
218
|
-
}
|
219
|
-
|
220
|
-
if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
|
221
|
-
return true;
|
222
|
-
}
|
223
|
-
}
|
224
|
-
return false;
|
91
|
+
return ctx;
|
225
92
|
}
|
226
93
|
|
227
|
-
@JRubyMethod(name = "
|
94
|
+
@JRubyMethod(name = "native_file", meta = true)
|
228
95
|
public static IRubyObject
|
229
|
-
parse_file(ThreadContext context,
|
230
|
-
IRubyObject klass,
|
231
|
-
IRubyObject data,
|
232
|
-
IRubyObject encoding)
|
96
|
+
parse_file(ThreadContext context, IRubyObject klass, IRubyObject data, IRubyObject encoding)
|
233
97
|
{
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
98
|
+
String java_encoding = null;
|
99
|
+
if (encoding != context.runtime.getNil()) {
|
100
|
+
if (!(encoding instanceof RubyEncoding)) {
|
101
|
+
throw context.runtime.newTypeError("encoding must be kind_of Encoding");
|
102
|
+
}
|
103
|
+
java_encoding = ((RubyEncoding)encoding).toString();
|
239
104
|
}
|
240
105
|
|
241
106
|
Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
|
242
107
|
ctx.setInputSourceFile(context, data);
|
243
|
-
|
244
|
-
if (
|
245
|
-
ctx.getInputSource().setEncoding(
|
108
|
+
|
109
|
+
if (java_encoding != null) {
|
110
|
+
ctx.getInputSource().setEncoding(java_encoding);
|
246
111
|
}
|
112
|
+
|
247
113
|
return ctx;
|
248
114
|
}
|
249
115
|
|
250
|
-
@JRubyMethod(name = "
|
116
|
+
@JRubyMethod(name = "native_io", meta = true)
|
251
117
|
public static IRubyObject
|
252
|
-
parse_io(ThreadContext context,
|
253
|
-
IRubyObject klass,
|
254
|
-
IRubyObject data,
|
255
|
-
IRubyObject encoding)
|
118
|
+
parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
|
256
119
|
{
|
257
|
-
if (!(
|
258
|
-
throw context.
|
120
|
+
if (!invoke(context, data, "respond_to?", context.runtime.newSymbol("read")).isTrue()) {
|
121
|
+
throw context.runtime.newTypeError("argument expected to respond to :read");
|
259
122
|
}
|
260
123
|
|
261
|
-
|
124
|
+
String java_encoding = null;
|
125
|
+
if (encoding != context.runtime.getNil()) {
|
126
|
+
if (!(encoding instanceof RubyEncoding)) {
|
127
|
+
throw context.runtime.newTypeError("encoding must be kind_of Encoding");
|
128
|
+
}
|
129
|
+
java_encoding = ((RubyEncoding)encoding).toString();
|
130
|
+
}
|
131
|
+
|
132
|
+
Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
|
262
133
|
ctx.setIOInputSource(context, data, context.nil);
|
263
|
-
|
264
|
-
if (
|
265
|
-
ctx.getInputSource().setEncoding(
|
134
|
+
|
135
|
+
if (java_encoding != null) {
|
136
|
+
ctx.getInputSource().setEncoding(java_encoding);
|
266
137
|
}
|
138
|
+
|
267
139
|
return ctx;
|
268
140
|
}
|
269
141
|
|
@@ -37,7 +37,7 @@ public class NokogiriService implements BasicLibraryService
|
|
37
37
|
}
|
38
38
|
|
39
39
|
private static Map<String, RubyClass>
|
40
|
-
|
40
|
+
populateNokogiriClassCache(Ruby ruby)
|
41
41
|
{
|
42
42
|
Map<String, RubyClass> nokogiriClassCache = new HashMap<String, RubyClass>();
|
43
43
|
nokogiriClassCache.put("Nokogiri::HTML4::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::Document"));
|
@@ -91,7 +91,7 @@ public class NokogiriService implements BasicLibraryService
|
|
91
91
|
createDocuments(ruby, xmlModule, htmlModule, xmlNode);
|
92
92
|
createSaxModule(ruby, xmlSaxModule, htmlSaxModule);
|
93
93
|
createXsltModule(ruby, xsltModule);
|
94
|
-
nokogiri.setInternalVariable("cache",
|
94
|
+
nokogiri.setInternalVariable("cache", populateNokogiriClassCache(ruby));
|
95
95
|
}
|
96
96
|
|
97
97
|
private void
|
@@ -46,6 +46,9 @@ public class XmlCdata extends XmlText
|
|
46
46
|
IRubyObject rbDocument = args[0];
|
47
47
|
content = args[1];
|
48
48
|
|
49
|
+
if (content.isNil()) {
|
50
|
+
throw context.runtime.newTypeError("expected second parameter to be a String, received NilClass");
|
51
|
+
}
|
49
52
|
if (!(rbDocument instanceof XmlNode)) {
|
50
53
|
String msg = "expected first parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass();
|
51
54
|
throw context.runtime.newTypeError(msg);
|
@@ -414,20 +414,13 @@ public class XmlDocument extends XmlNode
|
|
414
414
|
return getCachedNodeOrCreate(context.runtime, rootNode);
|
415
415
|
}
|
416
416
|
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
// dup = doc.dup
|
425
|
-
// dup.equal?(dup.children[0].document)
|
426
|
-
//
|
427
|
-
// Since `dup.children[0].document' will end up creating a new
|
428
|
-
// XmlDocument. See #1060.
|
429
|
-
doc.resetCache();
|
430
|
-
return doc;
|
417
|
+
@JRubyMethod(visibility = Visibility.PROTECTED)
|
418
|
+
public IRubyObject
|
419
|
+
initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level)
|
420
|
+
{
|
421
|
+
super.initialize_copy_with_args(context, other, level, null);
|
422
|
+
resetCache();
|
423
|
+
return this;
|
431
424
|
}
|
432
425
|
|
433
426
|
@JRubyMethod(name = "root=")
|
@@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode
|
|
48
48
|
super(ruby, klazz);
|
49
49
|
}
|
50
50
|
|
51
|
-
@JRubyMethod(name = "
|
51
|
+
@JRubyMethod(name = "native_new", meta = true)
|
52
52
|
public static IRubyObject
|
53
|
-
rbNew(ThreadContext context, IRubyObject cls, IRubyObject
|
53
|
+
rbNew(ThreadContext context, IRubyObject cls, IRubyObject value)
|
54
54
|
{
|
55
|
-
if (
|
56
|
-
throw context.runtime.newArgumentError(args.length, 1);
|
57
|
-
}
|
58
|
-
|
59
|
-
if (!(args[0] instanceof XmlDocument)) {
|
55
|
+
if (!(value instanceof XmlDocument)) {
|
60
56
|
throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance");
|
61
57
|
}
|
62
58
|
|
63
|
-
XmlDocument doc = (XmlDocument)
|
64
|
-
|
65
|
-
// make wellformed fragment, ignore invalid namespace, or add appropriate namespace to parse
|
66
|
-
if (args.length > 1 && args[1] instanceof RubyString) {
|
67
|
-
final RubyString arg1 = (RubyString) args[1];
|
68
|
-
if (XmlDocumentFragment.isTag(arg1)) {
|
69
|
-
args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1)));
|
70
|
-
}
|
71
|
-
}
|
59
|
+
XmlDocument doc = (XmlDocument) value;
|
72
60
|
|
73
61
|
XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate(
|
74
62
|
context.runtime, (RubyClass)cls);
|
75
63
|
fragment.setDocument(context, doc);
|
76
64
|
fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment());
|
77
65
|
|
78
|
-
Helpers.invoke(context, fragment, "initialize", args, block);
|
79
66
|
return fragment;
|
80
67
|
}
|
81
68
|
|
82
|
-
private static final ByteList TAG_BEG = ByteList.create("<");
|
83
|
-
private static final ByteList TAG_END = ByteList.create(">");
|
84
|
-
|
85
|
-
private static boolean
|
86
|
-
isTag(final RubyString str)
|
87
|
-
{
|
88
|
-
return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END);
|
89
|
-
}
|
90
|
-
|
91
|
-
private static boolean
|
92
|
-
isNamespaceDefined(String qName, NamedNodeMap nodeMap)
|
93
|
-
{
|
94
|
-
if (isNamespace(qName.intern())) { return true; }
|
95
|
-
for (int i = 0; i < nodeMap.getLength(); i++) {
|
96
|
-
Attr attr = (Attr)nodeMap.item(i);
|
97
|
-
if (isNamespace(attr.getNodeName())) {
|
98
|
-
String localPart = getLocalNameForNamespace(attr.getNodeName(), null);
|
99
|
-
if (getPrefix(qName).equals(localPart)) {
|
100
|
-
return true;
|
101
|
-
}
|
102
|
-
}
|
103
|
-
}
|
104
|
-
return false;
|
105
|
-
}
|
106
|
-
|
107
|
-
private static final Pattern QNAME_RE = Pattern.compile("[^</:>\\s]+:[^</:>=\\s]+");
|
108
|
-
private static final Pattern START_TAG_RE = Pattern.compile("<[^</>]+>");
|
109
|
-
|
110
|
-
private static String
|
111
|
-
addNamespaceDeclIfNeeded(XmlDocument doc, String tags)
|
112
|
-
{
|
113
|
-
if (doc.getDocument() == null) { return tags; }
|
114
|
-
if (doc.getDocument().getDocumentElement() == null) { return tags; }
|
115
|
-
Matcher matcher = START_TAG_RE.matcher(tags);
|
116
|
-
Map<CharSequence, CharSequence> rewriteTable = null;
|
117
|
-
while (matcher.find()) {
|
118
|
-
String start_tag = matcher.group();
|
119
|
-
Matcher matcher2 = QNAME_RE.matcher(start_tag);
|
120
|
-
while (matcher2.find()) {
|
121
|
-
String qName = matcher2.group();
|
122
|
-
NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes();
|
123
|
-
if (isNamespaceDefined(qName, nodeMap)) {
|
124
|
-
CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap);
|
125
|
-
if (namespaceDecl != null) {
|
126
|
-
if (rewriteTable == null) { rewriteTable = new HashMap<CharSequence, CharSequence>(8, 1); }
|
127
|
-
StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3);
|
128
|
-
String key = str.append('<').append(qName).append('>').toString();
|
129
|
-
str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' '
|
130
|
-
rewriteTable.put(key, str.append(namespaceDecl).append('>'));
|
131
|
-
}
|
132
|
-
}
|
133
|
-
}
|
134
|
-
}
|
135
|
-
if (rewriteTable != null) {
|
136
|
-
for (Map.Entry<CharSequence, CharSequence> e : rewriteTable.entrySet()) {
|
137
|
-
tags = tags.replace(e.getKey(), e.getValue());
|
138
|
-
}
|
139
|
-
}
|
140
|
-
|
141
|
-
return tags;
|
142
|
-
}
|
143
|
-
|
144
|
-
private static CharSequence
|
145
|
-
getNamespaceDecl(final String prefix, NamedNodeMap nodeMap)
|
146
|
-
{
|
147
|
-
for (int i = 0; i < nodeMap.getLength(); i++) {
|
148
|
-
Attr attr = (Attr) nodeMap.item(i);
|
149
|
-
if (prefix.equals(attr.getLocalName())) {
|
150
|
-
return new StringBuilder().
|
151
|
-
append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"');
|
152
|
-
}
|
153
|
-
}
|
154
|
-
return null;
|
155
|
-
}
|
156
|
-
|
157
69
|
@Override
|
158
70
|
public void
|
159
71
|
relink_namespace(ThreadContext context)
|
@@ -141,7 +141,7 @@ public class XmlDtd extends XmlNode
|
|
141
141
|
*
|
142
142
|
* NekoDTD parser returns a new document node containing elements
|
143
143
|
* representing the dtd declarations. The plan is to get the root
|
144
|
-
* element and adopt it into the correct document,
|
144
|
+
* element and adopt it into the correct document, stripping the
|
145
145
|
* Document provided by NekoDTD.
|
146
146
|
*
|
147
147
|
*/
|
@@ -454,7 +454,7 @@ public class XmlDtd extends XmlNode
|
|
454
454
|
* This recursive function will not descend into an
|
455
455
|
* 'externalSubset' node, thus for an internal subset it only
|
456
456
|
* extracts nodes in the internal subset, and for an external
|
457
|
-
* subset it extracts everything and
|
457
|
+
* subset it extracts everything and assumes <code>node</code>
|
458
458
|
* and all children are part of the external subset.
|
459
459
|
*/
|
460
460
|
protected IRubyObject[]
|
@@ -61,19 +61,23 @@ public class XmlEntityReference extends XmlNode
|
|
61
61
|
public void
|
62
62
|
accept(ThreadContext context, SaveContextVisitor visitor)
|
63
63
|
{
|
64
|
+
//
|
65
|
+
// Note that when noEnt is set, we call setFeature(FEATURE_NOT_EXPAND_ENTITY, false) in
|
66
|
+
// XmlDomParserContext.
|
67
|
+
//
|
68
|
+
// See https://xerces.apache.org/xerces-j/features.html section on `create-entity-ref-nodes`
|
69
|
+
//
|
70
|
+
// When set to true (the default), then EntityReference nodes are present in the DOM tree, and
|
71
|
+
// its children represent the replacement text. When set to false, then the EntityReference is
|
72
|
+
// not present in the tree, and instead the replacement text nodes are present.
|
73
|
+
//
|
74
|
+
// So: if we are here, then noEnt must be true, and we should just serialize the EntityReference
|
75
|
+
// and not worry about the replacement text. When noEnt is false, we would never reach this and
|
76
|
+
// instead would be serializing the replacement text.
|
77
|
+
//
|
78
|
+
// https://github.com/sparklemotion/nokogiri/issues/3270
|
79
|
+
//
|
64
80
|
visitor.enter(node);
|
65
|
-
Node child = node.getFirstChild();
|
66
|
-
while (child != null) {
|
67
|
-
IRubyObject nokoNode = getCachedNodeOrCreate(context.getRuntime(), child);
|
68
|
-
if (nokoNode instanceof XmlNode) {
|
69
|
-
XmlNode cur = (XmlNode) nokoNode;
|
70
|
-
cur.accept(context, visitor);
|
71
|
-
} else if (nokoNode instanceof XmlNamespace) {
|
72
|
-
XmlNamespace cur = (XmlNamespace) nokoNode;
|
73
|
-
cur.accept(context, visitor);
|
74
|
-
}
|
75
|
-
child = child.getNextSibling();
|
76
|
-
}
|
77
81
|
visitor.leave(node);
|
78
82
|
}
|
79
83
|
}
|