nokogiri 1.16.8-java → 1.17.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/java/nokogiri/Html4Document.java +3 -3
  6. data/ext/java/nokogiri/Html4SaxParserContext.java +47 -175
  7. data/ext/java/nokogiri/NokogiriService.java +2 -2
  8. data/ext/java/nokogiri/XmlCdata.java +3 -0
  9. data/ext/java/nokogiri/XmlDocument.java +7 -14
  10. data/ext/java/nokogiri/XmlDocumentFragment.java +4 -92
  11. data/ext/java/nokogiri/XmlDtd.java +2 -2
  12. data/ext/java/nokogiri/XmlEntityReference.java +16 -12
  13. data/ext/java/nokogiri/XmlNode.java +26 -47
  14. data/ext/java/nokogiri/XmlNodeSet.java +10 -1
  15. data/ext/java/nokogiri/XmlSaxParserContext.java +73 -36
  16. data/ext/java/nokogiri/XmlSchema.java +15 -16
  17. data/ext/java/nokogiri/XsltStylesheet.java +1 -1
  18. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +1 -1
  19. data/ext/java/nokogiri/internals/NokogiriDomParser.java +3 -3
  20. data/ext/java/nokogiri/internals/NokogiriHandler.java +59 -15
  21. data/ext/java/nokogiri/internals/NokogiriHelpers.java +1 -1
  22. data/ext/java/nokogiri/internals/ParserContext.java +51 -21
  23. data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
  24. data/ext/java/nokogiri/internals/XmlDomParserContext.java +8 -19
  25. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +1 -1
  26. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +1 -1
  27. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +10 -11
  28. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +5 -5
  29. data/ext/java/nokogiri/internals/c14n/{UtfHelpper.java → UtfHelper.java} +2 -2
  30. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +8 -8
  31. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +2 -2
  32. data/ext/nokogiri/extconf.rb +191 -137
  33. data/ext/nokogiri/gumbo.c +69 -53
  34. data/ext/nokogiri/html4_document.c +10 -4
  35. data/ext/nokogiri/html4_element_description.c +18 -18
  36. data/ext/nokogiri/html4_sax_parser.c +40 -0
  37. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  38. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  39. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  40. data/ext/nokogiri/nokogiri.c +9 -2
  41. data/ext/nokogiri/xml_attr.c +1 -1
  42. data/ext/nokogiri/xml_cdata.c +2 -10
  43. data/ext/nokogiri/xml_comment.c +3 -8
  44. data/ext/nokogiri/xml_document.c +163 -156
  45. data/ext/nokogiri/xml_document_fragment.c +10 -25
  46. data/ext/nokogiri/xml_dtd.c +1 -1
  47. data/ext/nokogiri/xml_element_content.c +9 -9
  48. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  49. data/ext/nokogiri/xml_namespace.c +6 -6
  50. data/ext/nokogiri/xml_node.c +130 -104
  51. data/ext/nokogiri/xml_node_set.c +46 -44
  52. data/ext/nokogiri/xml_reader.c +54 -58
  53. data/ext/nokogiri/xml_relax_ng.c +35 -56
  54. data/ext/nokogiri/xml_sax_parser.c +156 -88
  55. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  56. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  57. data/ext/nokogiri/xml_schema.c +50 -85
  58. data/ext/nokogiri/xml_syntax_error.c +19 -11
  59. data/ext/nokogiri/xml_text.c +2 -4
  60. data/ext/nokogiri/xml_xpath_context.c +2 -2
  61. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  62. data/lib/nokogiri/class_resolver.rb +1 -1
  63. data/lib/nokogiri/css/node.rb +6 -2
  64. data/lib/nokogiri/css/parser.rb +6 -4
  65. data/lib/nokogiri/css/parser.y +2 -2
  66. data/lib/nokogiri/css/parser_extras.rb +6 -66
  67. data/lib/nokogiri/css/selector_cache.rb +38 -0
  68. data/lib/nokogiri/css/tokenizer.rb +4 -4
  69. data/lib/nokogiri/css/tokenizer.rex +9 -8
  70. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  71. data/lib/nokogiri/css.rb +86 -20
  72. data/lib/nokogiri/decorators/slop.rb +3 -5
  73. data/lib/nokogiri/encoding_handler.rb +2 -2
  74. data/lib/nokogiri/html4/document.rb +44 -23
  75. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  76. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  77. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  78. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  79. data/lib/nokogiri/html4.rb +9 -14
  80. data/lib/nokogiri/html5/builder.rb +40 -0
  81. data/lib/nokogiri/html5/document.rb +61 -30
  82. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  83. data/lib/nokogiri/html5/node.rb +4 -4
  84. data/lib/nokogiri/html5.rb +114 -72
  85. data/lib/nokogiri/nokogiri.jar +0 -0
  86. data/lib/nokogiri/version/constant.rb +1 -1
  87. data/lib/nokogiri/xml/builder.rb +8 -1
  88. data/lib/nokogiri/xml/document.rb +70 -26
  89. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  90. data/lib/nokogiri/xml/node.rb +82 -11
  91. data/lib/nokogiri/xml/node_set.rb +9 -7
  92. data/lib/nokogiri/xml/parse_options.rb +1 -1
  93. data/lib/nokogiri/xml/pp/node.rb +6 -1
  94. data/lib/nokogiri/xml/reader.rb +46 -13
  95. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  96. data/lib/nokogiri/xml/sax/document.rb +174 -83
  97. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  98. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  99. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  100. data/lib/nokogiri/xml/sax.rb +48 -0
  101. data/lib/nokogiri/xml/schema.rb +112 -45
  102. data/lib/nokogiri/xml/searchable.rb +6 -8
  103. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  104. data/lib/nokogiri/xml.rb +13 -24
  105. data/lib/nokogiri/xslt.rb +3 -9
  106. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  107. metadata +9 -5
  108. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d738514ea45f87c34b937caf79fe740ee4e854c3977168bf92a9b9b0b5aa43ba
4
- data.tar.gz: d0a5904665e3376d5da0ef4edd765aecad5f4023834e4b5d4dd5ea4d344ae40c
3
+ metadata.gz: e72cb3083fa1cff82029cbbf2fec76f1a8dd39937c017633b0d944925d72ccaa
4
+ data.tar.gz: 45760600fd16493478685f69e2d122bd807f869052e1a3d2f640e2ba2d0a1451
5
5
  SHA512:
6
- metadata.gz: c26332a0911cbddad1bba94100c87267287339346ead9e9676d275de52363a50a73c73bfd5a123e21762ad828bbe69f0a3d933b14c54b78e443cac57245717d3
7
- data.tar.gz: bb99ef98745b0c546a1556afa2c2309dad123240a675db7d6615e810df6dbb786d9b5edbbafad2d4b33f5b21b51a3b7c1f44eb0541d7efb57a5f0a4e3ed9dbf4
6
+ metadata.gz: db57663703ce10b22d3aa59ab5aa2a308ac5fda082cea1a4bb01245277b1b2df7b20eb103966b1fc814465276d921e52ff971adca42ea7e71b67fd20219fbeba
7
+ data.tar.gz: 979db2b053dc00919e9ea60a548af51eb30d2b447a8ae46bdcf5d66acba204ab5019b277614a321f03a0d8b7f6a62171d113b8d392f172c56af305ce343c5bdd
data/Gemfile CHANGED
@@ -5,44 +5,34 @@ source "https://rubygems.org"
5
5
  gemspec
6
6
 
7
7
  group :development do
8
- # ruby 3.4.0-dev removed some gems from the default set
9
- #
10
- # TODO: we should be able to remove these as our gem dependencies sort it out and we pull them in
11
- # transitively.
12
- gem "mutex_m"
13
-
14
8
  # bootstrapping
15
9
  gem "bundler", "~> 2.3"
16
- gem "rake", "13.1.0"
10
+ gem "rake", "13.2.1"
17
11
 
18
12
  # building extensions
19
- gem "rake-compiler", "1.2.6"
20
- gem "rake-compiler-dock", "1.4.0"
13
+ gem "rake-compiler", "1.2.8"
14
+ gem "rake-compiler-dock", "1.5.2"
21
15
 
22
16
  # parser generator
23
- gem "rexical", "= 1.0.7"
17
+ gem "rexical", "1.0.8"
24
18
 
25
19
  # tests
26
- gem "minitest", "5.21.2"
20
+ gem "minitest", "5.25.2"
27
21
  gem "minitest-parallel_fork", "2.0.0"
28
- gem "ruby_memcheck", "2.3.0"
22
+ gem "ruby_memcheck", "3.0.0"
29
23
  gem "rubyzip", "~> 2.3.2"
30
24
  gem "simplecov", "= 0.21.2"
31
25
 
32
26
  # rubocop
33
- if Gem::Requirement.new("~> 3.0").satisfied_by?(Gem::Version.new(RUBY_VERSION))
34
- gem "rubocop", "1.60.2"
35
- gem "rubocop-minitest", "0.34.5"
36
- gem "rubocop-packaging", "0.5.2"
37
- gem "rubocop-performance", "1.20.2"
38
- gem "rubocop-rake", "= 0.6.0"
39
- gem "rubocop-shopify", "2.14.0"
40
- end
27
+ gem "standard", "1.42.1"
28
+ gem "rubocop-minitest", "0.36.0"
29
+ gem "rubocop-packaging", "0.5.2"
30
+ gem "rubocop-rake", "0.6.0"
41
31
  end
42
32
 
43
33
  # If Psych doesn't build, you can disable this group locally by running
44
34
  # `bundle config set --local without rdoc`
45
35
  # Then re-run `bundle install`.
46
36
  group :rdoc do
47
- gem "rdoc", "6.6.2"
37
+ gem "rdoc", "6.8.1"
48
38
  end
data/README.md CHANGED
@@ -117,6 +117,10 @@ Requirements:
117
117
  - Ruby >= 3.0
118
118
  - JRuby >= 9.4.0.0
119
119
 
120
+ If you are compiling the native extension against a system version of libxml2:
121
+
122
+ - libxml2 >= 2.9.2 (recommended >= 2.12.0)
123
+
120
124
 
121
125
  ### Native Gems: Faster, more reliable installation
122
126
 
data/dependencies.yml CHANGED
@@ -1,13 +1,13 @@
1
1
  ---
2
2
  libxml2:
3
- version: "2.12.9"
4
- sha256: "59912db536ab56a3996489ea0299768c7bcffe57169f0235e7f962a91f483590"
5
- # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.12/libxml2-2.12.9.sha256sum
3
+ version: "2.13.5"
4
+ sha256: "74fc163217a3964257d3be39af943e08861263c4231f9ef5b496b6f6d4c7b2b6"
5
+ # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.13/libxml2-2.13.5.sha256sum
6
6
 
7
7
  libxslt:
8
- version: "1.1.39"
9
- sha256: "2a20ad621148339b0759c4d4e96719362dee64c9a096dbba625ba053846349f0"
10
- # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.39.sha256sum
8
+ version: "1.1.42"
9
+ sha256: "85ca62cac0d41fc77d3f6033da9df6fd73d20ea2fc18b0a3609ffb4110e1baeb"
10
+ # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.42.sha256sum
11
11
 
12
12
  zlib:
13
13
  version: "1.3.1"
@@ -30,7 +30,7 @@ public class Html4Document extends XmlDocument
30
30
 
31
31
  private static final String DEFAULT_CONTENT_TYPE = "html";
32
32
  private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN";
33
- private static final String DEFAULT_SYTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";
33
+ private static final String DEFAULT_SYSTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";
34
34
 
35
35
  private String parsed_encoding = null;
36
36
 
@@ -88,7 +88,7 @@ public class Html4Document extends XmlDocument
88
88
  getDocument(),
89
89
  context.getRuntime().newString(DEFAULT_CONTENT_TYPE),
90
90
  context.getRuntime().newString(DEFAULT_PUBLIC_ID),
91
- context.getRuntime().newString(DEFAULT_SYTEM_ID));
91
+ context.getRuntime().newString(DEFAULT_SYSTEM_ID));
92
92
  setInternalSubset(internalSubset);
93
93
  }
94
94
 
@@ -132,7 +132,7 @@ public class Html4Document extends XmlDocument
132
132
  }
133
133
 
134
134
  public String
135
- getPraedEncoding()
135
+ getParsedEncoding()
136
136
  {
137
137
  return parsed_encoding;
138
138
  }
@@ -2,16 +2,12 @@ package nokogiri;
2
2
 
3
3
  import java.io.ByteArrayInputStream;
4
4
  import java.io.InputStream;
5
- import java.nio.charset.Charset;
6
- import java.nio.charset.IllegalCharsetNameException;
7
- import java.nio.charset.UnsupportedCharsetException;
8
- import java.util.regex.Matcher;
9
- import java.util.regex.Pattern;
10
5
 
11
6
  import org.apache.xerces.parsers.AbstractSAXParser;
12
7
  import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
13
8
  import org.jruby.Ruby;
14
9
  import org.jruby.RubyClass;
10
+ import org.jruby.RubyEncoding;
15
11
  import org.jruby.RubyFixnum;
16
12
  import org.jruby.RubyString;
17
13
  import org.jruby.anno.JRubyClass;
@@ -23,6 +19,8 @@ import org.xml.sax.SAXException;
23
19
  import nokogiri.internals.NokogiriHandler;
24
20
  import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
25
21
 
22
+ import static org.jruby.runtime.Helpers.invoke;
23
+
26
24
  /**
27
25
  * Class for Nokogiri::HTML4::SAX::ParserContext.
28
26
  *
@@ -56,10 +54,9 @@ public class Html4SaxParserContext extends XmlSaxParserContext
56
54
  SAXParser parser = new SAXParser();
57
55
 
58
56
  try {
59
- parser.setProperty(
60
- "http://cyberneko.org/html/properties/names/elems", "lower");
61
- parser.setProperty(
62
- "http://cyberneko.org/html/properties/names/attrs", "lower");
57
+ parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
58
+ parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
59
+ parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
63
60
 
64
61
  // NekoHTML should not try to guess the encoding based on the meta
65
62
  // tags or other information in the document. This is already
@@ -72,198 +69,73 @@ public class Html4SaxParserContext extends XmlSaxParserContext
72
69
  }
73
70
  }
74
71
 
75
- @JRubyMethod(name = "memory", meta = true)
72
+ @JRubyMethod(name = "native_memory", meta = true)
76
73
  public static IRubyObject
77
- parse_memory(ThreadContext context,
78
- IRubyObject klazz,
79
- IRubyObject data,
80
- IRubyObject encoding)
74
+ parse_memory(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
81
75
  {
82
- Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
83
- String javaEncoding = findEncodingName(context, encoding);
84
- if (javaEncoding != null) {
85
- CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
86
- ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
87
- ctx.setInputSource(istream);
88
- ctx.getInputSource().setEncoding(javaEncoding);
89
- }
90
- return ctx;
91
- }
92
-
93
- public enum EncodingType {
94
- NONE(0, "NONE"),
95
- UTF_8(1, "UTF-8"),
96
- UTF16LE(2, "UTF16LE"),
97
- UTF16BE(3, "UTF16BE"),
98
- UCS4LE(4, "UCS4LE"),
99
- UCS4BE(5, "UCS4BE"),
100
- EBCDIC(6, "EBCDIC"),
101
- UCS4_2143(7, "ICS4-2143"),
102
- UCS4_3412(8, "UCS4-3412"),
103
- UCS2(9, "UCS2"),
104
- ISO_8859_1(10, "ISO-8859-1"),
105
- ISO_8859_2(11, "ISO-8859-2"),
106
- ISO_8859_3(12, "ISO-8859-3"),
107
- ISO_8859_4(13, "ISO-8859-4"),
108
- ISO_8859_5(14, "ISO-8859-5"),
109
- ISO_8859_6(15, "ISO-8859-6"),
110
- ISO_8859_7(16, "ISO-8859-7"),
111
- ISO_8859_8(17, "ISO-8859-8"),
112
- ISO_8859_9(18, "ISO-8859-9"),
113
- ISO_2022_JP(19, "ISO-2022-JP"),
114
- SHIFT_JIS(20, "SHIFT-JIS"),
115
- EUC_JP(21, "EUC-JP"),
116
- ASCII(22, "ASCII");
117
-
118
- private final int value;
119
- private final String name;
120
-
121
- EncodingType(int value, String name)
122
- {
123
- this.value = value;
124
- this.name = name;
125
- }
126
-
127
- public int getValue()
128
- {
129
- return value;
130
- }
131
-
132
- public String toString()
133
- {
134
- return name;
135
- }
136
-
137
- private static transient EncodingType[] values;
138
-
139
- // NOTE: assuming ordinal == value
140
- static EncodingType get(final int ordinal)
141
- {
142
- EncodingType[] values = EncodingType.values;
143
- if (values == null) {
144
- values = EncodingType.values();
145
- EncodingType.values = values;
76
+ String java_encoding = null;
77
+ if (encoding != context.runtime.getNil()) {
78
+ if (!(encoding instanceof RubyEncoding)) {
79
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
146
80
  }
147
- if (ordinal >= 0 && ordinal < values.length) {
148
- return values[ordinal];
149
- }
150
- return null;
81
+ java_encoding = ((RubyEncoding)encoding).toString();
151
82
  }
152
83
 
153
- }
154
-
155
- private static String
156
- findEncodingName(final int value)
157
- {
158
- EncodingType type = EncodingType.get(value);
159
- if (type == null) { return null; }
160
- assert type.value == value;
161
- return type.name;
162
- }
163
-
164
- private static String
165
- findEncodingName(ThreadContext context, IRubyObject encoding)
166
- {
167
- String rubyEncoding = null;
168
- if (encoding instanceof RubyString) {
169
- rubyEncoding = rubyStringToString((RubyString) encoding);
170
- } else if (encoding instanceof RubyFixnum) {
171
- rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
172
- }
173
- if (rubyEncoding == null) { return null; }
174
- try {
175
- return Charset.forName(rubyEncoding).displayName();
176
- } catch (UnsupportedCharsetException e) {
177
- throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
178
- } catch (IllegalCharsetNameException e) {
179
- throw context.getRuntime().newEncodingError(e.getMessage());
180
- }
181
- }
182
-
183
- private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
184
- Pattern.CASE_INSENSITIVE);
84
+ Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
85
+ ctx.setStringInputSourceNoEnc(context, data, context.runtime.getNil());
185
86
 
186
- private static CharSequence
187
- applyEncoding(final String input, final String enc)
188
- {
189
- int start_pos = 0;
190
- int end_pos = 0;
191
- if (containsIgnoreCase(input, "charset")) {
192
- Matcher m = CHARSET_PATTERN.matcher(input);
193
- while (m.find()) {
194
- start_pos = m.start();
195
- end_pos = m.end();
196
- }
87
+ if (java_encoding != null) {
88
+ ctx.getInputSource().setEncoding(java_encoding);
197
89
  }
198
- if (start_pos != end_pos) {
199
- return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
200
- }
201
- return input;
202
- }
203
90
 
204
- private static boolean
205
- containsIgnoreCase(final String str, final String sub)
206
- {
207
- final int len = sub.length();
208
- final int max = str.length() - len;
209
-
210
- if (len == 0) { return true; }
211
- final char c0Lower = Character.toLowerCase(sub.charAt(0));
212
- final char c0Upper = Character.toUpperCase(sub.charAt(0));
213
-
214
- for (int i = 0; i <= max; i++) {
215
- final char ch = str.charAt(i);
216
- if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
217
- continue; // first char doesn't match
218
- }
219
-
220
- if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
221
- return true;
222
- }
223
- }
224
- return false;
91
+ return ctx;
225
92
  }
226
93
 
227
- @JRubyMethod(name = "file", meta = true)
94
+ @JRubyMethod(name = "native_file", meta = true)
228
95
  public static IRubyObject
229
- parse_file(ThreadContext context,
230
- IRubyObject klass,
231
- IRubyObject data,
232
- IRubyObject encoding)
96
+ parse_file(ThreadContext context, IRubyObject klass, IRubyObject data, IRubyObject encoding)
233
97
  {
234
- if (!(data instanceof RubyString)) {
235
- throw context.getRuntime().newTypeError("data must be kind_of String");
236
- }
237
- if (!(encoding instanceof RubyString)) {
238
- throw context.getRuntime().newTypeError("data must be kind_of String");
98
+ String java_encoding = null;
99
+ if (encoding != context.runtime.getNil()) {
100
+ if (!(encoding instanceof RubyEncoding)) {
101
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
102
+ }
103
+ java_encoding = ((RubyEncoding)encoding).toString();
239
104
  }
240
105
 
241
106
  Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
242
107
  ctx.setInputSourceFile(context, data);
243
- String javaEncoding = findEncodingName(context, encoding);
244
- if (javaEncoding != null) {
245
- ctx.getInputSource().setEncoding(javaEncoding);
108
+
109
+ if (java_encoding != null) {
110
+ ctx.getInputSource().setEncoding(java_encoding);
246
111
  }
112
+
247
113
  return ctx;
248
114
  }
249
115
 
250
- @JRubyMethod(name = "io", meta = true)
116
+ @JRubyMethod(name = "native_io", meta = true)
251
117
  public static IRubyObject
252
- parse_io(ThreadContext context,
253
- IRubyObject klass,
254
- IRubyObject data,
255
- IRubyObject encoding)
118
+ parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
256
119
  {
257
- if (!(encoding instanceof RubyFixnum)) {
258
- throw context.getRuntime().newTypeError("encoding must be kind_of String");
120
+ if (!invoke(context, data, "respond_to?", context.runtime.newSymbol("read")).isTrue()) {
121
+ throw context.runtime.newTypeError("argument expected to respond to :read");
259
122
  }
260
123
 
261
- Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
124
+ String java_encoding = null;
125
+ if (encoding != context.runtime.getNil()) {
126
+ if (!(encoding instanceof RubyEncoding)) {
127
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
128
+ }
129
+ java_encoding = ((RubyEncoding)encoding).toString();
130
+ }
131
+
132
+ Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
262
133
  ctx.setIOInputSource(context, data, context.nil);
263
- String javaEncoding = findEncodingName(context, encoding);
264
- if (javaEncoding != null) {
265
- ctx.getInputSource().setEncoding(javaEncoding);
134
+
135
+ if (java_encoding != null) {
136
+ ctx.getInputSource().setEncoding(java_encoding);
266
137
  }
138
+
267
139
  return ctx;
268
140
  }
269
141
 
@@ -37,7 +37,7 @@ public class NokogiriService implements BasicLibraryService
37
37
  }
38
38
 
39
39
  private static Map<String, RubyClass>
40
- populateNokogiriClassCahce(Ruby ruby)
40
+ populateNokogiriClassCache(Ruby ruby)
41
41
  {
42
42
  Map<String, RubyClass> nokogiriClassCache = new HashMap<String, RubyClass>();
43
43
  nokogiriClassCache.put("Nokogiri::HTML4::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::Document"));
@@ -91,7 +91,7 @@ public class NokogiriService implements BasicLibraryService
91
91
  createDocuments(ruby, xmlModule, htmlModule, xmlNode);
92
92
  createSaxModule(ruby, xmlSaxModule, htmlSaxModule);
93
93
  createXsltModule(ruby, xsltModule);
94
- nokogiri.setInternalVariable("cache", populateNokogiriClassCahce(ruby));
94
+ nokogiri.setInternalVariable("cache", populateNokogiriClassCache(ruby));
95
95
  }
96
96
 
97
97
  private void
@@ -46,6 +46,9 @@ public class XmlCdata extends XmlText
46
46
  IRubyObject rbDocument = args[0];
47
47
  content = args[1];
48
48
 
49
+ if (content.isNil()) {
50
+ throw context.runtime.newTypeError("expected second parameter to be a String, received NilClass");
51
+ }
49
52
  if (!(rbDocument instanceof XmlNode)) {
50
53
  String msg = "expected first parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass();
51
54
  throw context.runtime.newTypeError(msg);
@@ -414,20 +414,13 @@ public class XmlDocument extends XmlNode
414
414
  return getCachedNodeOrCreate(context.runtime, rootNode);
415
415
  }
416
416
 
417
- protected IRubyObject
418
- dup_implementation(Ruby runtime, boolean deep)
419
- {
420
- XmlDocument doc = (XmlDocument) super.dup_implementation(runtime, deep);
421
- // Avoid creating a new XmlDocument since we cloned one
422
- // already. Otherwise the following test will fail:
423
- //
424
- // dup = doc.dup
425
- // dup.equal?(dup.children[0].document)
426
- //
427
- // Since `dup.children[0].document' will end up creating a new
428
- // XmlDocument. See #1060.
429
- doc.resetCache();
430
- return doc;
417
+ @JRubyMethod(visibility = Visibility.PROTECTED)
418
+ public IRubyObject
419
+ initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level)
420
+ {
421
+ super.initialize_copy_with_args(context, other, level, null);
422
+ resetCache();
423
+ return this;
431
424
  }
432
425
 
433
426
  @JRubyMethod(name = "root=")
@@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode
48
48
  super(ruby, klazz);
49
49
  }
50
50
 
51
- @JRubyMethod(name = "new", meta = true, required = 1, optional = 3)
51
+ @JRubyMethod(name = "native_new", meta = true)
52
52
  public static IRubyObject
53
- rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block)
53
+ rbNew(ThreadContext context, IRubyObject cls, IRubyObject value)
54
54
  {
55
- if (args.length < 1) {
56
- throw context.runtime.newArgumentError(args.length, 1);
57
- }
58
-
59
- if (!(args[0] instanceof XmlDocument)) {
55
+ if (!(value instanceof XmlDocument)) {
60
56
  throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance");
61
57
  }
62
58
 
63
- XmlDocument doc = (XmlDocument) args[0];
64
-
65
- // make wellformed fragment, ignore invalid namespace, or add appropriate namespace to parse
66
- if (args.length > 1 && args[1] instanceof RubyString) {
67
- final RubyString arg1 = (RubyString) args[1];
68
- if (XmlDocumentFragment.isTag(arg1)) {
69
- args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1)));
70
- }
71
- }
59
+ XmlDocument doc = (XmlDocument) value;
72
60
 
73
61
  XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate(
74
62
  context.runtime, (RubyClass)cls);
75
63
  fragment.setDocument(context, doc);
76
64
  fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment());
77
65
 
78
- Helpers.invoke(context, fragment, "initialize", args, block);
79
66
  return fragment;
80
67
  }
81
68
 
82
- private static final ByteList TAG_BEG = ByteList.create("<");
83
- private static final ByteList TAG_END = ByteList.create(">");
84
-
85
- private static boolean
86
- isTag(final RubyString str)
87
- {
88
- return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END);
89
- }
90
-
91
- private static boolean
92
- isNamespaceDefined(String qName, NamedNodeMap nodeMap)
93
- {
94
- if (isNamespace(qName.intern())) { return true; }
95
- for (int i = 0; i < nodeMap.getLength(); i++) {
96
- Attr attr = (Attr)nodeMap.item(i);
97
- if (isNamespace(attr.getNodeName())) {
98
- String localPart = getLocalNameForNamespace(attr.getNodeName(), null);
99
- if (getPrefix(qName).equals(localPart)) {
100
- return true;
101
- }
102
- }
103
- }
104
- return false;
105
- }
106
-
107
- private static final Pattern QNAME_RE = Pattern.compile("[^</:>\\s]+:[^</:>=\\s]+");
108
- private static final Pattern START_TAG_RE = Pattern.compile("<[^</>]+>");
109
-
110
- private static String
111
- addNamespaceDeclIfNeeded(XmlDocument doc, String tags)
112
- {
113
- if (doc.getDocument() == null) { return tags; }
114
- if (doc.getDocument().getDocumentElement() == null) { return tags; }
115
- Matcher matcher = START_TAG_RE.matcher(tags);
116
- Map<CharSequence, CharSequence> rewriteTable = null;
117
- while (matcher.find()) {
118
- String start_tag = matcher.group();
119
- Matcher matcher2 = QNAME_RE.matcher(start_tag);
120
- while (matcher2.find()) {
121
- String qName = matcher2.group();
122
- NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes();
123
- if (isNamespaceDefined(qName, nodeMap)) {
124
- CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap);
125
- if (namespaceDecl != null) {
126
- if (rewriteTable == null) { rewriteTable = new HashMap<CharSequence, CharSequence>(8, 1); }
127
- StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3);
128
- String key = str.append('<').append(qName).append('>').toString();
129
- str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' '
130
- rewriteTable.put(key, str.append(namespaceDecl).append('>'));
131
- }
132
- }
133
- }
134
- }
135
- if (rewriteTable != null) {
136
- for (Map.Entry<CharSequence, CharSequence> e : rewriteTable.entrySet()) {
137
- tags = tags.replace(e.getKey(), e.getValue());
138
- }
139
- }
140
-
141
- return tags;
142
- }
143
-
144
- private static CharSequence
145
- getNamespaceDecl(final String prefix, NamedNodeMap nodeMap)
146
- {
147
- for (int i = 0; i < nodeMap.getLength(); i++) {
148
- Attr attr = (Attr) nodeMap.item(i);
149
- if (prefix.equals(attr.getLocalName())) {
150
- return new StringBuilder().
151
- append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"');
152
- }
153
- }
154
- return null;
155
- }
156
-
157
69
  @Override
158
70
  public void
159
71
  relink_namespace(ThreadContext context)
@@ -141,7 +141,7 @@ public class XmlDtd extends XmlNode
141
141
  *
142
142
  * NekoDTD parser returns a new document node containing elements
143
143
  * representing the dtd declarations. The plan is to get the root
144
- * element and adopt it into the correct document, stipping the
144
+ * element and adopt it into the correct document, stripping the
145
145
  * Document provided by NekoDTD.
146
146
  *
147
147
  */
@@ -454,7 +454,7 @@ public class XmlDtd extends XmlNode
454
454
  * This recursive function will not descend into an
455
455
  * 'externalSubset' node, thus for an internal subset it only
456
456
  * extracts nodes in the internal subset, and for an external
457
- * subset it extracts everything and assumess <code>node</code>
457
+ * subset it extracts everything and assumes <code>node</code>
458
458
  * and all children are part of the external subset.
459
459
  */
460
460
  protected IRubyObject[]
@@ -61,19 +61,23 @@ public class XmlEntityReference extends XmlNode
61
61
  public void
62
62
  accept(ThreadContext context, SaveContextVisitor visitor)
63
63
  {
64
+ //
65
+ // Note that when noEnt is set, we call setFeature(FEATURE_NOT_EXPAND_ENTITY, false) in
66
+ // XmlDomParserContext.
67
+ //
68
+ // See https://xerces.apache.org/xerces-j/features.html section on `create-entity-ref-nodes`
69
+ //
70
+ // When set to true (the default), then EntityReference nodes are present in the DOM tree, and
71
+ // its children represent the replacement text. When set to false, then the EntityReference is
72
+ // not present in the tree, and instead the replacement text nodes are present.
73
+ //
74
+ // So: if we are here, then noEnt must be true, and we should just serialize the EntityReference
75
+ // and not worry about the replacement text. When noEnt is false, we would never this and
76
+ // instead would be serializing the replacement text.
77
+ //
78
+ // https://github.com/sparklemotion/nokogiri/issues/3270
79
+ //
64
80
  visitor.enter(node);
65
- Node child = node.getFirstChild();
66
- while (child != null) {
67
- IRubyObject nokoNode = getCachedNodeOrCreate(context.getRuntime(), child);
68
- if (nokoNode instanceof XmlNode) {
69
- XmlNode cur = (XmlNode) nokoNode;
70
- cur.accept(context, visitor);
71
- } else if (nokoNode instanceof XmlNamespace) {
72
- XmlNamespace cur = (XmlNamespace) nokoNode;
73
- cur.accept(context, visitor);
74
- }
75
- child = child.getNextSibling();
76
- }
77
81
  visitor.leave(node);
78
82
  }
79
83
  }