nokogiri 1.16.8-java → 1.17.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (108)
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/java/nokogiri/Html4Document.java +3 -3
  6. data/ext/java/nokogiri/Html4SaxParserContext.java +47 -175
  7. data/ext/java/nokogiri/NokogiriService.java +2 -2
  8. data/ext/java/nokogiri/XmlCdata.java +3 -0
  9. data/ext/java/nokogiri/XmlDocument.java +7 -14
  10. data/ext/java/nokogiri/XmlDocumentFragment.java +4 -92
  11. data/ext/java/nokogiri/XmlDtd.java +2 -2
  12. data/ext/java/nokogiri/XmlEntityReference.java +16 -12
  13. data/ext/java/nokogiri/XmlNode.java +26 -47
  14. data/ext/java/nokogiri/XmlNodeSet.java +10 -1
  15. data/ext/java/nokogiri/XmlSaxParserContext.java +73 -36
  16. data/ext/java/nokogiri/XmlSchema.java +15 -16
  17. data/ext/java/nokogiri/XsltStylesheet.java +1 -1
  18. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +1 -1
  19. data/ext/java/nokogiri/internals/NokogiriDomParser.java +3 -3
  20. data/ext/java/nokogiri/internals/NokogiriHandler.java +59 -15
  21. data/ext/java/nokogiri/internals/NokogiriHelpers.java +1 -1
  22. data/ext/java/nokogiri/internals/ParserContext.java +51 -21
  23. data/ext/java/nokogiri/internals/ReaderNode.java +1 -1
  24. data/ext/java/nokogiri/internals/XmlDomParserContext.java +8 -19
  25. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +1 -1
  26. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +1 -1
  27. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +10 -11
  28. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +5 -5
  29. data/ext/java/nokogiri/internals/c14n/{UtfHelpper.java → UtfHelper.java} +2 -2
  30. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +8 -8
  31. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +2 -2
  32. data/ext/nokogiri/extconf.rb +191 -137
  33. data/ext/nokogiri/gumbo.c +69 -53
  34. data/ext/nokogiri/html4_document.c +10 -4
  35. data/ext/nokogiri/html4_element_description.c +18 -18
  36. data/ext/nokogiri/html4_sax_parser.c +40 -0
  37. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  38. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  39. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  40. data/ext/nokogiri/nokogiri.c +9 -2
  41. data/ext/nokogiri/xml_attr.c +1 -1
  42. data/ext/nokogiri/xml_cdata.c +2 -10
  43. data/ext/nokogiri/xml_comment.c +3 -8
  44. data/ext/nokogiri/xml_document.c +163 -156
  45. data/ext/nokogiri/xml_document_fragment.c +10 -25
  46. data/ext/nokogiri/xml_dtd.c +1 -1
  47. data/ext/nokogiri/xml_element_content.c +9 -9
  48. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  49. data/ext/nokogiri/xml_namespace.c +6 -6
  50. data/ext/nokogiri/xml_node.c +130 -104
  51. data/ext/nokogiri/xml_node_set.c +46 -44
  52. data/ext/nokogiri/xml_reader.c +54 -58
  53. data/ext/nokogiri/xml_relax_ng.c +35 -56
  54. data/ext/nokogiri/xml_sax_parser.c +156 -88
  55. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  56. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  57. data/ext/nokogiri/xml_schema.c +50 -85
  58. data/ext/nokogiri/xml_syntax_error.c +19 -11
  59. data/ext/nokogiri/xml_text.c +2 -4
  60. data/ext/nokogiri/xml_xpath_context.c +2 -2
  61. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  62. data/lib/nokogiri/class_resolver.rb +1 -1
  63. data/lib/nokogiri/css/node.rb +6 -2
  64. data/lib/nokogiri/css/parser.rb +6 -4
  65. data/lib/nokogiri/css/parser.y +2 -2
  66. data/lib/nokogiri/css/parser_extras.rb +6 -66
  67. data/lib/nokogiri/css/selector_cache.rb +38 -0
  68. data/lib/nokogiri/css/tokenizer.rb +4 -4
  69. data/lib/nokogiri/css/tokenizer.rex +9 -8
  70. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  71. data/lib/nokogiri/css.rb +86 -20
  72. data/lib/nokogiri/decorators/slop.rb +3 -5
  73. data/lib/nokogiri/encoding_handler.rb +2 -2
  74. data/lib/nokogiri/html4/document.rb +44 -23
  75. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  76. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  77. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  78. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  79. data/lib/nokogiri/html4.rb +9 -14
  80. data/lib/nokogiri/html5/builder.rb +40 -0
  81. data/lib/nokogiri/html5/document.rb +61 -30
  82. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  83. data/lib/nokogiri/html5/node.rb +4 -4
  84. data/lib/nokogiri/html5.rb +114 -72
  85. data/lib/nokogiri/nokogiri.jar +0 -0
  86. data/lib/nokogiri/version/constant.rb +1 -1
  87. data/lib/nokogiri/xml/builder.rb +8 -1
  88. data/lib/nokogiri/xml/document.rb +70 -26
  89. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  90. data/lib/nokogiri/xml/node.rb +82 -11
  91. data/lib/nokogiri/xml/node_set.rb +9 -7
  92. data/lib/nokogiri/xml/parse_options.rb +1 -1
  93. data/lib/nokogiri/xml/pp/node.rb +6 -1
  94. data/lib/nokogiri/xml/reader.rb +46 -13
  95. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  96. data/lib/nokogiri/xml/sax/document.rb +174 -83
  97. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  98. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  99. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  100. data/lib/nokogiri/xml/sax.rb +48 -0
  101. data/lib/nokogiri/xml/schema.rb +112 -45
  102. data/lib/nokogiri/xml/searchable.rb +6 -8
  103. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  104. data/lib/nokogiri/xml.rb +13 -24
  105. data/lib/nokogiri/xslt.rb +3 -9
  106. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  107. metadata +9 -5
  108. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d738514ea45f87c34b937caf79fe740ee4e854c3977168bf92a9b9b0b5aa43ba
4
- data.tar.gz: d0a5904665e3376d5da0ef4edd765aecad5f4023834e4b5d4dd5ea4d344ae40c
3
+ metadata.gz: e72cb3083fa1cff82029cbbf2fec76f1a8dd39937c017633b0d944925d72ccaa
4
+ data.tar.gz: 45760600fd16493478685f69e2d122bd807f869052e1a3d2f640e2ba2d0a1451
5
5
  SHA512:
6
- metadata.gz: c26332a0911cbddad1bba94100c87267287339346ead9e9676d275de52363a50a73c73bfd5a123e21762ad828bbe69f0a3d933b14c54b78e443cac57245717d3
7
- data.tar.gz: bb99ef98745b0c546a1556afa2c2309dad123240a675db7d6615e810df6dbb786d9b5edbbafad2d4b33f5b21b51a3b7c1f44eb0541d7efb57a5f0a4e3ed9dbf4
6
+ metadata.gz: db57663703ce10b22d3aa59ab5aa2a308ac5fda082cea1a4bb01245277b1b2df7b20eb103966b1fc814465276d921e52ff971adca42ea7e71b67fd20219fbeba
7
+ data.tar.gz: 979db2b053dc00919e9ea60a548af51eb30d2b447a8ae46bdcf5d66acba204ab5019b277614a321f03a0d8b7f6a62171d113b8d392f172c56af305ce343c5bdd
data/Gemfile CHANGED
@@ -5,44 +5,34 @@ source "https://rubygems.org"
5
5
  gemspec
6
6
 
7
7
  group :development do
8
- # ruby 3.4.0-dev removed some gems from the default set
9
- #
10
- # TODO: we should be able to remove these as our gem dependencies sort it out and we pull them in
11
- # transitively.
12
- gem "mutex_m"
13
-
14
8
  # bootstrapping
15
9
  gem "bundler", "~> 2.3"
16
- gem "rake", "13.1.0"
10
+ gem "rake", "13.2.1"
17
11
 
18
12
  # building extensions
19
- gem "rake-compiler", "1.2.6"
20
- gem "rake-compiler-dock", "1.4.0"
13
+ gem "rake-compiler", "1.2.8"
14
+ gem "rake-compiler-dock", "1.5.2"
21
15
 
22
16
  # parser generator
23
- gem "rexical", "= 1.0.7"
17
+ gem "rexical", "1.0.8"
24
18
 
25
19
  # tests
26
- gem "minitest", "5.21.2"
20
+ gem "minitest", "5.25.2"
27
21
  gem "minitest-parallel_fork", "2.0.0"
28
- gem "ruby_memcheck", "2.3.0"
22
+ gem "ruby_memcheck", "3.0.0"
29
23
  gem "rubyzip", "~> 2.3.2"
30
24
  gem "simplecov", "= 0.21.2"
31
25
 
32
26
  # rubocop
33
- if Gem::Requirement.new("~> 3.0").satisfied_by?(Gem::Version.new(RUBY_VERSION))
34
- gem "rubocop", "1.60.2"
35
- gem "rubocop-minitest", "0.34.5"
36
- gem "rubocop-packaging", "0.5.2"
37
- gem "rubocop-performance", "1.20.2"
38
- gem "rubocop-rake", "= 0.6.0"
39
- gem "rubocop-shopify", "2.14.0"
40
- end
27
+ gem "standard", "1.42.1"
28
+ gem "rubocop-minitest", "0.36.0"
29
+ gem "rubocop-packaging", "0.5.2"
30
+ gem "rubocop-rake", "0.6.0"
41
31
  end
42
32
 
43
33
  # If Psych doesn't build, you can disable this group locally by running
44
34
  # `bundle config set --local without rdoc`
45
35
  # Then re-run `bundle install`.
46
36
  group :rdoc do
47
- gem "rdoc", "6.6.2"
37
+ gem "rdoc", "6.8.1"
48
38
  end
data/README.md CHANGED
@@ -117,6 +117,10 @@ Requirements:
117
117
  - Ruby >= 3.0
118
118
  - JRuby >= 9.4.0.0
119
119
 
120
+ If you are compiling the native extension against a system version of libxml2:
121
+
122
+ - libxml2 >= 2.9.2 (recommended >= 2.12.0)
123
+
120
124
 
121
125
  ### Native Gems: Faster, more reliable installation
122
126
 
data/dependencies.yml CHANGED
@@ -1,13 +1,13 @@
1
1
  ---
2
2
  libxml2:
3
- version: "2.12.9"
4
- sha256: "59912db536ab56a3996489ea0299768c7bcffe57169f0235e7f962a91f483590"
5
- # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.12/libxml2-2.12.9.sha256sum
3
+ version: "2.13.5"
4
+ sha256: "74fc163217a3964257d3be39af943e08861263c4231f9ef5b496b6f6d4c7b2b6"
5
+ # sha-256 hash provided in https://download.gnome.org/sources/libxml2/2.13/libxml2-2.13.5.sha256sum
6
6
 
7
7
  libxslt:
8
- version: "1.1.39"
9
- sha256: "2a20ad621148339b0759c4d4e96719362dee64c9a096dbba625ba053846349f0"
10
- # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.39.sha256sum
8
+ version: "1.1.42"
9
+ sha256: "85ca62cac0d41fc77d3f6033da9df6fd73d20ea2fc18b0a3609ffb4110e1baeb"
10
+ # sha-256 hash provided in https://download.gnome.org/sources/libxslt/1.1/libxslt-1.1.42.sha256sum
11
11
 
12
12
  zlib:
13
13
  version: "1.3.1"
@@ -30,7 +30,7 @@ public class Html4Document extends XmlDocument
30
30
 
31
31
  private static final String DEFAULT_CONTENT_TYPE = "html";
32
32
  private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN";
33
- private static final String DEFAULT_SYTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";
33
+ private static final String DEFAULT_SYSTEM_ID = "http://www.w3.org/TR/html4/strict.dtd";
34
34
 
35
35
  private String parsed_encoding = null;
36
36
 
@@ -88,7 +88,7 @@ public class Html4Document extends XmlDocument
88
88
  getDocument(),
89
89
  context.getRuntime().newString(DEFAULT_CONTENT_TYPE),
90
90
  context.getRuntime().newString(DEFAULT_PUBLIC_ID),
91
- context.getRuntime().newString(DEFAULT_SYTEM_ID));
91
+ context.getRuntime().newString(DEFAULT_SYSTEM_ID));
92
92
  setInternalSubset(internalSubset);
93
93
  }
94
94
 
@@ -132,7 +132,7 @@ public class Html4Document extends XmlDocument
132
132
  }
133
133
 
134
134
  public String
135
- getPraedEncoding()
135
+ getParsedEncoding()
136
136
  {
137
137
  return parsed_encoding;
138
138
  }
@@ -2,16 +2,12 @@ package nokogiri;
2
2
 
3
3
  import java.io.ByteArrayInputStream;
4
4
  import java.io.InputStream;
5
- import java.nio.charset.Charset;
6
- import java.nio.charset.IllegalCharsetNameException;
7
- import java.nio.charset.UnsupportedCharsetException;
8
- import java.util.regex.Matcher;
9
- import java.util.regex.Pattern;
10
5
 
11
6
  import org.apache.xerces.parsers.AbstractSAXParser;
12
7
  import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
13
8
  import org.jruby.Ruby;
14
9
  import org.jruby.RubyClass;
10
+ import org.jruby.RubyEncoding;
15
11
  import org.jruby.RubyFixnum;
16
12
  import org.jruby.RubyString;
17
13
  import org.jruby.anno.JRubyClass;
@@ -23,6 +19,8 @@ import org.xml.sax.SAXException;
23
19
  import nokogiri.internals.NokogiriHandler;
24
20
  import static nokogiri.internals.NokogiriHelpers.rubyStringToString;
25
21
 
22
+ import static org.jruby.runtime.Helpers.invoke;
23
+
26
24
  /**
27
25
  * Class for Nokogiri::HTML4::SAX::ParserContext.
28
26
  *
@@ -56,10 +54,9 @@ public class Html4SaxParserContext extends XmlSaxParserContext
56
54
  SAXParser parser = new SAXParser();
57
55
 
58
56
  try {
59
- parser.setProperty(
60
- "http://cyberneko.org/html/properties/names/elems", "lower");
61
- parser.setProperty(
62
- "http://cyberneko.org/html/properties/names/attrs", "lower");
57
+ parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
58
+ parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
59
+ parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
63
60
 
64
61
  // NekoHTML should not try to guess the encoding based on the meta
65
62
  // tags or other information in the document. This is already
@@ -72,198 +69,73 @@ public class Html4SaxParserContext extends XmlSaxParserContext
72
69
  }
73
70
  }
74
71
 
75
- @JRubyMethod(name = "memory", meta = true)
72
+ @JRubyMethod(name = "native_memory", meta = true)
76
73
  public static IRubyObject
77
- parse_memory(ThreadContext context,
78
- IRubyObject klazz,
79
- IRubyObject data,
80
- IRubyObject encoding)
74
+ parse_memory(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
81
75
  {
82
- Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
83
- String javaEncoding = findEncodingName(context, encoding);
84
- if (javaEncoding != null) {
85
- CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
86
- ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
87
- ctx.setInputSource(istream);
88
- ctx.getInputSource().setEncoding(javaEncoding);
89
- }
90
- return ctx;
91
- }
92
-
93
- public enum EncodingType {
94
- NONE(0, "NONE"),
95
- UTF_8(1, "UTF-8"),
96
- UTF16LE(2, "UTF16LE"),
97
- UTF16BE(3, "UTF16BE"),
98
- UCS4LE(4, "UCS4LE"),
99
- UCS4BE(5, "UCS4BE"),
100
- EBCDIC(6, "EBCDIC"),
101
- UCS4_2143(7, "ICS4-2143"),
102
- UCS4_3412(8, "UCS4-3412"),
103
- UCS2(9, "UCS2"),
104
- ISO_8859_1(10, "ISO-8859-1"),
105
- ISO_8859_2(11, "ISO-8859-2"),
106
- ISO_8859_3(12, "ISO-8859-3"),
107
- ISO_8859_4(13, "ISO-8859-4"),
108
- ISO_8859_5(14, "ISO-8859-5"),
109
- ISO_8859_6(15, "ISO-8859-6"),
110
- ISO_8859_7(16, "ISO-8859-7"),
111
- ISO_8859_8(17, "ISO-8859-8"),
112
- ISO_8859_9(18, "ISO-8859-9"),
113
- ISO_2022_JP(19, "ISO-2022-JP"),
114
- SHIFT_JIS(20, "SHIFT-JIS"),
115
- EUC_JP(21, "EUC-JP"),
116
- ASCII(22, "ASCII");
117
-
118
- private final int value;
119
- private final String name;
120
-
121
- EncodingType(int value, String name)
122
- {
123
- this.value = value;
124
- this.name = name;
125
- }
126
-
127
- public int getValue()
128
- {
129
- return value;
130
- }
131
-
132
- public String toString()
133
- {
134
- return name;
135
- }
136
-
137
- private static transient EncodingType[] values;
138
-
139
- // NOTE: assuming ordinal == value
140
- static EncodingType get(final int ordinal)
141
- {
142
- EncodingType[] values = EncodingType.values;
143
- if (values == null) {
144
- values = EncodingType.values();
145
- EncodingType.values = values;
76
+ String java_encoding = null;
77
+ if (encoding != context.runtime.getNil()) {
78
+ if (!(encoding instanceof RubyEncoding)) {
79
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
146
80
  }
147
- if (ordinal >= 0 && ordinal < values.length) {
148
- return values[ordinal];
149
- }
150
- return null;
81
+ java_encoding = ((RubyEncoding)encoding).toString();
151
82
  }
152
83
 
153
- }
154
-
155
- private static String
156
- findEncodingName(final int value)
157
- {
158
- EncodingType type = EncodingType.get(value);
159
- if (type == null) { return null; }
160
- assert type.value == value;
161
- return type.name;
162
- }
163
-
164
- private static String
165
- findEncodingName(ThreadContext context, IRubyObject encoding)
166
- {
167
- String rubyEncoding = null;
168
- if (encoding instanceof RubyString) {
169
- rubyEncoding = rubyStringToString((RubyString) encoding);
170
- } else if (encoding instanceof RubyFixnum) {
171
- rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
172
- }
173
- if (rubyEncoding == null) { return null; }
174
- try {
175
- return Charset.forName(rubyEncoding).displayName();
176
- } catch (UnsupportedCharsetException e) {
177
- throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
178
- } catch (IllegalCharsetNameException e) {
179
- throw context.getRuntime().newEncodingError(e.getMessage());
180
- }
181
- }
182
-
183
- private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
184
- Pattern.CASE_INSENSITIVE);
84
+ Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
85
+ ctx.setStringInputSourceNoEnc(context, data, context.runtime.getNil());
185
86
 
186
- private static CharSequence
187
- applyEncoding(final String input, final String enc)
188
- {
189
- int start_pos = 0;
190
- int end_pos = 0;
191
- if (containsIgnoreCase(input, "charset")) {
192
- Matcher m = CHARSET_PATTERN.matcher(input);
193
- while (m.find()) {
194
- start_pos = m.start();
195
- end_pos = m.end();
196
- }
87
+ if (java_encoding != null) {
88
+ ctx.getInputSource().setEncoding(java_encoding);
197
89
  }
198
- if (start_pos != end_pos) {
199
- return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
200
- }
201
- return input;
202
- }
203
90
 
204
- private static boolean
205
- containsIgnoreCase(final String str, final String sub)
206
- {
207
- final int len = sub.length();
208
- final int max = str.length() - len;
209
-
210
- if (len == 0) { return true; }
211
- final char c0Lower = Character.toLowerCase(sub.charAt(0));
212
- final char c0Upper = Character.toUpperCase(sub.charAt(0));
213
-
214
- for (int i = 0; i <= max; i++) {
215
- final char ch = str.charAt(i);
216
- if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
217
- continue; // first char doesn't match
218
- }
219
-
220
- if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
221
- return true;
222
- }
223
- }
224
- return false;
91
+ return ctx;
225
92
  }
226
93
 
227
- @JRubyMethod(name = "file", meta = true)
94
+ @JRubyMethod(name = "native_file", meta = true)
228
95
  public static IRubyObject
229
- parse_file(ThreadContext context,
230
- IRubyObject klass,
231
- IRubyObject data,
232
- IRubyObject encoding)
96
+ parse_file(ThreadContext context, IRubyObject klass, IRubyObject data, IRubyObject encoding)
233
97
  {
234
- if (!(data instanceof RubyString)) {
235
- throw context.getRuntime().newTypeError("data must be kind_of String");
236
- }
237
- if (!(encoding instanceof RubyString)) {
238
- throw context.getRuntime().newTypeError("data must be kind_of String");
98
+ String java_encoding = null;
99
+ if (encoding != context.runtime.getNil()) {
100
+ if (!(encoding instanceof RubyEncoding)) {
101
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
102
+ }
103
+ java_encoding = ((RubyEncoding)encoding).toString();
239
104
  }
240
105
 
241
106
  Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
242
107
  ctx.setInputSourceFile(context, data);
243
- String javaEncoding = findEncodingName(context, encoding);
244
- if (javaEncoding != null) {
245
- ctx.getInputSource().setEncoding(javaEncoding);
108
+
109
+ if (java_encoding != null) {
110
+ ctx.getInputSource().setEncoding(java_encoding);
246
111
  }
112
+
247
113
  return ctx;
248
114
  }
249
115
 
250
- @JRubyMethod(name = "io", meta = true)
116
+ @JRubyMethod(name = "native_io", meta = true)
251
117
  public static IRubyObject
252
- parse_io(ThreadContext context,
253
- IRubyObject klass,
254
- IRubyObject data,
255
- IRubyObject encoding)
118
+ parse_io(ThreadContext context, IRubyObject klazz, IRubyObject data, IRubyObject encoding)
256
119
  {
257
- if (!(encoding instanceof RubyFixnum)) {
258
- throw context.getRuntime().newTypeError("encoding must be kind_of String");
120
+ if (!invoke(context, data, "respond_to?", context.runtime.newSymbol("read")).isTrue()) {
121
+ throw context.runtime.newTypeError("argument expected to respond to :read");
259
122
  }
260
123
 
261
- Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
124
+ String java_encoding = null;
125
+ if (encoding != context.runtime.getNil()) {
126
+ if (!(encoding instanceof RubyEncoding)) {
127
+ throw context.runtime.newTypeError("encoding must be kind_of Encoding");
128
+ }
129
+ java_encoding = ((RubyEncoding)encoding).toString();
130
+ }
131
+
132
+ Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
262
133
  ctx.setIOInputSource(context, data, context.nil);
263
- String javaEncoding = findEncodingName(context, encoding);
264
- if (javaEncoding != null) {
265
- ctx.getInputSource().setEncoding(javaEncoding);
134
+
135
+ if (java_encoding != null) {
136
+ ctx.getInputSource().setEncoding(java_encoding);
266
137
  }
138
+
267
139
  return ctx;
268
140
  }
269
141
 
@@ -37,7 +37,7 @@ public class NokogiriService implements BasicLibraryService
37
37
  }
38
38
 
39
39
  private static Map<String, RubyClass>
40
- populateNokogiriClassCahce(Ruby ruby)
40
+ populateNokogiriClassCache(Ruby ruby)
41
41
  {
42
42
  Map<String, RubyClass> nokogiriClassCache = new HashMap<String, RubyClass>();
43
43
  nokogiriClassCache.put("Nokogiri::HTML4::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::Document"));
@@ -91,7 +91,7 @@ public class NokogiriService implements BasicLibraryService
91
91
  createDocuments(ruby, xmlModule, htmlModule, xmlNode);
92
92
  createSaxModule(ruby, xmlSaxModule, htmlSaxModule);
93
93
  createXsltModule(ruby, xsltModule);
94
- nokogiri.setInternalVariable("cache", populateNokogiriClassCahce(ruby));
94
+ nokogiri.setInternalVariable("cache", populateNokogiriClassCache(ruby));
95
95
  }
96
96
 
97
97
  private void
@@ -46,6 +46,9 @@ public class XmlCdata extends XmlText
46
46
  IRubyObject rbDocument = args[0];
47
47
  content = args[1];
48
48
 
49
+ if (content.isNil()) {
50
+ throw context.runtime.newTypeError("expected second parameter to be a String, received NilClass");
51
+ }
49
52
  if (!(rbDocument instanceof XmlNode)) {
50
53
  String msg = "expected first parameter to be a Nokogiri::XML::Document, received " + rbDocument.getMetaClass();
51
54
  throw context.runtime.newTypeError(msg);
@@ -414,20 +414,13 @@ public class XmlDocument extends XmlNode
414
414
  return getCachedNodeOrCreate(context.runtime, rootNode);
415
415
  }
416
416
 
417
- protected IRubyObject
418
- dup_implementation(Ruby runtime, boolean deep)
419
- {
420
- XmlDocument doc = (XmlDocument) super.dup_implementation(runtime, deep);
421
- // Avoid creating a new XmlDocument since we cloned one
422
- // already. Otherwise the following test will fail:
423
- //
424
- // dup = doc.dup
425
- // dup.equal?(dup.children[0].document)
426
- //
427
- // Since `dup.children[0].document' will end up creating a new
428
- // XmlDocument. See #1060.
429
- doc.resetCache();
430
- return doc;
417
+ @JRubyMethod(visibility = Visibility.PROTECTED)
418
+ public IRubyObject
419
+ initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level)
420
+ {
421
+ super.initialize_copy_with_args(context, other, level, null);
422
+ resetCache();
423
+ return this;
431
424
  }
432
425
 
433
426
  @JRubyMethod(name = "root=")
@@ -48,112 +48,24 @@ public class XmlDocumentFragment extends XmlNode
48
48
  super(ruby, klazz);
49
49
  }
50
50
 
51
- @JRubyMethod(name = "new", meta = true, required = 1, optional = 3)
51
+ @JRubyMethod(name = "native_new", meta = true)
52
52
  public static IRubyObject
53
- rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args, Block block)
53
+ rbNew(ThreadContext context, IRubyObject cls, IRubyObject value)
54
54
  {
55
- if (args.length < 1) {
56
- throw context.runtime.newArgumentError(args.length, 1);
57
- }
58
-
59
- if (!(args[0] instanceof XmlDocument)) {
55
+ if (!(value instanceof XmlDocument)) {
60
56
  throw context.runtime.newArgumentError("first parameter must be a Nokogiri::XML::Document instance");
61
57
  }
62
58
 
63
- XmlDocument doc = (XmlDocument) args[0];
64
-
65
- // make wellformed fragment, ignore invalid namespace, or add appropriate namespace to parse
66
- if (args.length > 1 && args[1] instanceof RubyString) {
67
- final RubyString arg1 = (RubyString) args[1];
68
- if (XmlDocumentFragment.isTag(arg1)) {
69
- args[1] = RubyString.newString(context.runtime, addNamespaceDeclIfNeeded(doc, rubyStringToString(arg1)));
70
- }
71
- }
59
+ XmlDocument doc = (XmlDocument) value;
72
60
 
73
61
  XmlDocumentFragment fragment = (XmlDocumentFragment) NokogiriService.XML_DOCUMENT_FRAGMENT_ALLOCATOR.allocate(
74
62
  context.runtime, (RubyClass)cls);
75
63
  fragment.setDocument(context, doc);
76
64
  fragment.setNode(context.runtime, doc.getDocument().createDocumentFragment());
77
65
 
78
- Helpers.invoke(context, fragment, "initialize", args, block);
79
66
  return fragment;
80
67
  }
81
68
 
82
- private static final ByteList TAG_BEG = ByteList.create("<");
83
- private static final ByteList TAG_END = ByteList.create(">");
84
-
85
- private static boolean
86
- isTag(final RubyString str)
87
- {
88
- return str.getByteList().startsWith(TAG_BEG) && str.getByteList().endsWith(TAG_END);
89
- }
90
-
91
- private static boolean
92
- isNamespaceDefined(String qName, NamedNodeMap nodeMap)
93
- {
94
- if (isNamespace(qName.intern())) { return true; }
95
- for (int i = 0; i < nodeMap.getLength(); i++) {
96
- Attr attr = (Attr)nodeMap.item(i);
97
- if (isNamespace(attr.getNodeName())) {
98
- String localPart = getLocalNameForNamespace(attr.getNodeName(), null);
99
- if (getPrefix(qName).equals(localPart)) {
100
- return true;
101
- }
102
- }
103
- }
104
- return false;
105
- }
106
-
107
- private static final Pattern QNAME_RE = Pattern.compile("[^</:>\\s]+:[^</:>=\\s]+");
108
- private static final Pattern START_TAG_RE = Pattern.compile("<[^</>]+>");
109
-
110
- private static String
111
- addNamespaceDeclIfNeeded(XmlDocument doc, String tags)
112
- {
113
- if (doc.getDocument() == null) { return tags; }
114
- if (doc.getDocument().getDocumentElement() == null) { return tags; }
115
- Matcher matcher = START_TAG_RE.matcher(tags);
116
- Map<CharSequence, CharSequence> rewriteTable = null;
117
- while (matcher.find()) {
118
- String start_tag = matcher.group();
119
- Matcher matcher2 = QNAME_RE.matcher(start_tag);
120
- while (matcher2.find()) {
121
- String qName = matcher2.group();
122
- NamedNodeMap nodeMap = doc.getDocument().getDocumentElement().getAttributes();
123
- if (isNamespaceDefined(qName, nodeMap)) {
124
- CharSequence namespaceDecl = getNamespaceDecl(getPrefix(qName), nodeMap);
125
- if (namespaceDecl != null) {
126
- if (rewriteTable == null) { rewriteTable = new HashMap<CharSequence, CharSequence>(8, 1); }
127
- StringBuilder str = new StringBuilder(qName.length() + namespaceDecl.length() + 3);
128
- String key = str.append('<').append(qName).append('>').toString();
129
- str.setCharAt(key.length() - 1, ' '); // (last) '>' -> ' '
130
- rewriteTable.put(key, str.append(namespaceDecl).append('>'));
131
- }
132
- }
133
- }
134
- }
135
- if (rewriteTable != null) {
136
- for (Map.Entry<CharSequence, CharSequence> e : rewriteTable.entrySet()) {
137
- tags = tags.replace(e.getKey(), e.getValue());
138
- }
139
- }
140
-
141
- return tags;
142
- }
143
-
144
- private static CharSequence
145
- getNamespaceDecl(final String prefix, NamedNodeMap nodeMap)
146
- {
147
- for (int i = 0; i < nodeMap.getLength(); i++) {
148
- Attr attr = (Attr) nodeMap.item(i);
149
- if (prefix.equals(attr.getLocalName())) {
150
- return new StringBuilder().
151
- append(attr.getName()).append('=').append('"').append(attr.getValue()).append('"');
152
- }
153
- }
154
- return null;
155
- }
156
-
157
69
  @Override
158
70
  public void
159
71
  relink_namespace(ThreadContext context)
@@ -141,7 +141,7 @@ public class XmlDtd extends XmlNode
141
141
  *
142
142
  * NekoDTD parser returns a new document node containing elements
143
143
  * representing the dtd declarations. The plan is to get the root
144
- * element and adopt it into the correct document, stipping the
144
+ * element and adopt it into the correct document, stripping the
145
145
  * Document provided by NekoDTD.
146
146
  *
147
147
  */
@@ -454,7 +454,7 @@ public class XmlDtd extends XmlNode
454
454
  * This recursive function will not descend into an
455
455
  * 'externalSubset' node, thus for an internal subset it only
456
456
  * extracts nodes in the internal subset, and for an external
457
- * subset it extracts everything and assumess <code>node</code>
457
+ * subset it extracts everything and assumes <code>node</code>
458
458
  * and all children are part of the external subset.
459
459
  */
460
460
  protected IRubyObject[]
@@ -61,19 +61,23 @@ public class XmlEntityReference extends XmlNode
61
61
  public void
62
62
  accept(ThreadContext context, SaveContextVisitor visitor)
63
63
  {
64
+ //
65
+ // Note that when noEnt is set, we call setFeature(FEATURE_NOT_EXPAND_ENTITY, false) in
66
+ // XmlDomParserContext.
67
+ //
68
+ // See https://xerces.apache.org/xerces-j/features.html section on `create-entity-ref-nodes`
69
+ //
70
+ // When set to true (the default), then EntityReference nodes are present in the DOM tree, and
71
+ // its children represent the replacement text. When set to false, then the EntityReference is
72
+ // not present in the tree, and instead the replacement text nodes are present.
73
+ //
74
+ // So: if we are here, then noEnt must be true, and we should just serialize the EntityReference
75
+ // and not worry about the replacement text. When noEnt is false, we would never reach this and
76
+ // instead would be serializing the replacement text.
77
+ //
78
+ // https://github.com/sparklemotion/nokogiri/issues/3270
79
+ //
64
80
  visitor.enter(node);
65
- Node child = node.getFirstChild();
66
- while (child != null) {
67
- IRubyObject nokoNode = getCachedNodeOrCreate(context.getRuntime(), child);
68
- if (nokoNode instanceof XmlNode) {
69
- XmlNode cur = (XmlNode) nokoNode;
70
- cur.accept(context, visitor);
71
- } else if (nokoNode instanceof XmlNamespace) {
72
- XmlNamespace cur = (XmlNamespace) nokoNode;
73
- cur.accept(context, visitor);
74
- }
75
- child = child.getNextSibling();
76
- }
77
81
  visitor.leave(node);
78
82
  }
79
83
  }