nokogiri 1.13.8 → 1.15.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (120)
  1. checksums.yaml +4 -4
  2. data/Gemfile +40 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +33 -15
  7. data/ext/nokogiri/extconf.rb +164 -46
  8. data/ext/nokogiri/gumbo.c +20 -10
  9. data/ext/nokogiri/html4_document.c +3 -4
  10. data/ext/nokogiri/html4_element_description.c +20 -15
  11. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  12. data/ext/nokogiri/html4_sax_parser_context.c +11 -22
  13. data/ext/nokogiri/html4_sax_push_parser.c +3 -3
  14. data/ext/nokogiri/nokogiri.c +84 -75
  15. data/ext/nokogiri/nokogiri.h +31 -16
  16. data/ext/nokogiri/test_global_handlers.c +1 -1
  17. data/ext/nokogiri/xml_attr.c +2 -2
  18. data/ext/nokogiri/xml_attribute_decl.c +2 -2
  19. data/ext/nokogiri/xml_cdata.c +32 -18
  20. data/ext/nokogiri/xml_comment.c +2 -2
  21. data/ext/nokogiri/xml_document.c +127 -34
  22. data/ext/nokogiri/xml_document_fragment.c +2 -2
  23. data/ext/nokogiri/xml_dtd.c +2 -2
  24. data/ext/nokogiri/xml_element_content.c +34 -31
  25. data/ext/nokogiri/xml_element_decl.c +7 -7
  26. data/ext/nokogiri/xml_encoding_handler.c +15 -7
  27. data/ext/nokogiri/xml_entity_decl.c +1 -1
  28. data/ext/nokogiri/xml_entity_reference.c +2 -2
  29. data/ext/nokogiri/xml_namespace.c +79 -14
  30. data/ext/nokogiri/xml_node.c +300 -34
  31. data/ext/nokogiri/xml_node_set.c +125 -107
  32. data/ext/nokogiri/xml_processing_instruction.c +2 -2
  33. data/ext/nokogiri/xml_reader.c +81 -48
  34. data/ext/nokogiri/xml_relax_ng.c +66 -81
  35. data/ext/nokogiri/xml_sax_parser.c +45 -20
  36. data/ext/nokogiri/xml_sax_parser_context.c +46 -30
  37. data/ext/nokogiri/xml_sax_push_parser.c +30 -11
  38. data/ext/nokogiri/xml_schema.c +95 -117
  39. data/ext/nokogiri/xml_syntax_error.c +1 -1
  40. data/ext/nokogiri/xml_text.c +28 -14
  41. data/ext/nokogiri/xml_xpath_context.c +216 -136
  42. data/ext/nokogiri/xslt_stylesheet.c +118 -64
  43. data/gumbo-parser/Makefile +10 -0
  44. data/gumbo-parser/src/attribute.h +1 -1
  45. data/gumbo-parser/src/error.c +10 -6
  46. data/gumbo-parser/src/error.h +1 -1
  47. data/gumbo-parser/src/foreign_attrs.c +15 -16
  48. data/gumbo-parser/src/foreign_attrs.gperf +1 -1
  49. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  50. data/gumbo-parser/src/parser.c +21 -5
  51. data/gumbo-parser/src/replacement.h +1 -1
  52. data/gumbo-parser/src/string_buffer.h +1 -1
  53. data/gumbo-parser/src/string_piece.c +1 -1
  54. data/gumbo-parser/src/svg_attrs.c +2 -2
  55. data/gumbo-parser/src/svg_tags.c +2 -2
  56. data/gumbo-parser/src/tag.c +2 -1
  57. data/gumbo-parser/src/tag_lookup.c +7 -7
  58. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  59. data/gumbo-parser/src/tag_lookup.h +1 -1
  60. data/gumbo-parser/src/token_buffer.h +1 -1
  61. data/gumbo-parser/src/tokenizer.c +1 -1
  62. data/gumbo-parser/src/tokenizer.h +1 -1
  63. data/gumbo-parser/src/utf8.c +1 -1
  64. data/gumbo-parser/src/utf8.h +1 -1
  65. data/gumbo-parser/src/util.c +1 -3
  66. data/gumbo-parser/src/util.h +4 -0
  67. data/gumbo-parser/src/vector.h +1 -1
  68. data/lib/nokogiri/css/node.rb +2 -2
  69. data/lib/nokogiri/css/xpath_visitor.rb +7 -5
  70. data/lib/nokogiri/css.rb +6 -0
  71. data/lib/nokogiri/decorators/slop.rb +1 -1
  72. data/lib/nokogiri/encoding_handler.rb +57 -0
  73. data/lib/nokogiri/extension.rb +4 -3
  74. data/lib/nokogiri/html4/document.rb +2 -121
  75. data/lib/nokogiri/html4/document_fragment.rb +1 -1
  76. data/lib/nokogiri/html4/element_description_defaults.rb +1827 -365
  77. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  78. data/lib/nokogiri/html4.rb +1 -0
  79. data/lib/nokogiri/html5/document.rb +113 -36
  80. data/lib/nokogiri/html5/document_fragment.rb +10 -3
  81. data/lib/nokogiri/html5/node.rb +8 -5
  82. data/lib/nokogiri/html5.rb +130 -216
  83. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  84. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  85. data/lib/nokogiri/version/constant.rb +1 -1
  86. data/lib/nokogiri/version/info.rb +11 -10
  87. data/lib/nokogiri/xml/attr.rb +49 -0
  88. data/lib/nokogiri/xml/attribute_decl.rb +4 -2
  89. data/lib/nokogiri/xml/builder.rb +1 -1
  90. data/lib/nokogiri/xml/document.rb +102 -55
  91. data/lib/nokogiri/xml/document_fragment.rb +50 -7
  92. data/lib/nokogiri/xml/element_content.rb +10 -2
  93. data/lib/nokogiri/xml/element_decl.rb +4 -2
  94. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  95. data/lib/nokogiri/xml/namespace.rb +42 -0
  96. data/lib/nokogiri/xml/node/save_options.rb +14 -4
  97. data/lib/nokogiri/xml/node.rb +212 -48
  98. data/lib/nokogiri/xml/node_set.rb +88 -9
  99. data/lib/nokogiri/xml/parse_options.rb +129 -50
  100. data/lib/nokogiri/xml/pp/node.rb +28 -15
  101. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  102. data/lib/nokogiri/xml/sax/document.rb +1 -1
  103. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  104. data/lib/nokogiri/xml/searchable.rb +18 -10
  105. data/lib/nokogiri/xslt.rb +74 -4
  106. data/lib/nokogiri.rb +15 -15
  107. data/lib/xsd/xmlparser/nokogiri.rb +4 -2
  108. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  109. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  110. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  111. data/ports/archives/libxml2-2.11.7.tar.xz +0 -0
  112. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
  113. metadata +19 -242
  114. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  115. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  116. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
  117. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
  118. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +0 -3037
  119. data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
  120. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
data/ext/nokogiri/gumbo.c CHANGED
@@ -23,13 +23,13 @@
23
23
  //
24
24
  // Processing starts by calling gumbo_parse_with_options. The resulting document tree
25
25
  // is then walked, a parallel libxml2 tree is constructed, and the final document is
26
- // then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
26
+ // then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
27
27
  // requirements as Ruby objects are only built when necessary.
28
28
  //
29
29
 
30
30
  #include <nokogiri.h>
31
31
 
32
- #include "gumbo.h"
32
+ #include "nokogiri_gumbo.h"
33
33
 
34
34
  VALUE cNokogiriHtml5Document;
35
35
 
@@ -281,12 +281,12 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
281
281
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
282
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
283
283
  rb_iv_set(syntax_error, "@file", url);
284
- rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
284
+ rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
285
285
  rb_iv_set(syntax_error, "@str1", str1);
286
286
  rb_iv_set(syntax_error, "@str2", Qnil);
287
287
  rb_iv_set(syntax_error, "@str3", Qnil);
288
288
  rb_iv_set(syntax_error, "@int1", INT2NUM(0));
289
- rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
289
+ rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
290
290
  rb_ary_push(rerrors, syntax_error);
291
291
  }
292
292
  rb_iv_set(rdoc, "@errors", rerrors);
@@ -297,6 +297,7 @@ typedef struct {
297
297
  GumboOutput *output;
298
298
  VALUE input;
299
299
  VALUE url_or_frag;
300
+ VALUE klass;
300
301
  xmlDocPtr doc;
301
302
  } ParseArgs;
302
303
 
@@ -321,7 +322,7 @@ static VALUE parse_continue(VALUE parse_args);
321
322
  * @!visibility protected
322
323
  */
323
324
  static VALUE
324
- parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
325
+ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
325
326
  {
326
327
  GumboOptions options = kGumboDefaultOptions;
327
328
  options.max_attributes = NUM2INT(max_attributes);
@@ -333,6 +334,7 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
333
334
  .output = output,
334
335
  .input = input,
335
336
  .url_or_frag = url,
337
+ .klass = klass,
336
338
  .doc = NULL,
337
339
  };
338
340
 
@@ -357,7 +359,9 @@ parse_continue(VALUE parse_args)
357
359
  }
358
360
  args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
359
361
  build_tree(doc, (xmlNodePtr)doc, output->document);
360
- VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
362
+ VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
363
+ rb_iv_set(rdoc, "@url", args->url_or_frag);
364
+ rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
361
365
  args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
362
366
  add_errors(output, rdoc, args->input, args->url_or_frag);
363
367
  return rdoc;
@@ -498,9 +502,11 @@ error:
498
502
  }
499
503
 
500
504
  // Encoding.
501
- if (RSTRING_LEN(tag_name) == 14
505
+ if (ctx_ns == GUMBO_NAMESPACE_MATHML
506
+ && RSTRING_LEN(tag_name) == 14
502
507
  && !st_strcasecmp(ctx_tag, "annotation-xml")) {
503
508
  VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
509
+ 1,
504
510
  rb_utf8_str_new_static("encoding", 8));
505
511
  if (RTEST(enc)) {
506
512
  Check_Type(enc, T_STRING);
@@ -512,8 +518,11 @@ error:
512
518
  // Quirks mode.
513
519
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
514
520
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
515
- if (NIL_P(dtd)) {
521
+ VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
522
+ if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
516
523
  quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
524
+ } else if (NIL_P(dtd)) {
525
+ quirks_mode = GUMBO_DOCTYPE_QUIRKS;
517
526
  } else {
518
527
  VALUE dtd_name = rb_funcall(dtd, name, 0);
519
528
  VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
@@ -560,13 +569,14 @@ fragment_continue(VALUE parse_args)
560
569
  args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
561
570
  xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
562
571
  build_tree(xml_doc, xml_frag, output->root);
572
+ rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
563
573
  add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
564
574
  return Qnil;
565
575
  }
566
576
 
567
577
  // Initialize the Nokogumbo class and fetch constants we will use later.
568
578
  void
569
- noko_init_gumbo()
579
+ noko_init_gumbo(void)
570
580
  {
571
581
  // Class constants.
572
582
  cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
@@ -577,7 +587,7 @@ noko_init_gumbo()
577
587
  parent = rb_intern_const("parent");
578
588
 
579
589
  // Define Nokogumbo module with parse and fragment methods.
580
- rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
590
+ rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
581
591
  rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
582
592
  }
583
593
 
data/ext/nokogiri/html4_document.c CHANGED
@@ -144,13 +144,12 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
144
144
  static VALUE
145
145
  rb_html_document_type(VALUE self)
146
146
  {
147
- htmlDocPtr doc;
148
- Data_Get_Struct(self, xmlDoc, doc);
149
- return INT2NUM((long)doc->type);
147
+ htmlDocPtr doc = noko_xml_document_unwrap(self);
148
+ return INT2NUM(doc->type);
150
149
  }
151
150
 
152
151
  void
153
- noko_init_html_document()
152
+ noko_init_html_document(void)
154
153
  {
155
154
  assert(cNokogiriXmlDocument);
156
155
  cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
data/ext/nokogiri/html4_element_description.c CHANGED
@@ -1,5 +1,10 @@
1
1
  #include <nokogiri.h>
2
2
 
3
+ static const rb_data_type_t html4_element_description_type = {
4
+ .wrap_struct_name = "Nokogiri::HTML4::ElementDescription",
5
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
6
+ };
7
+
3
8
  VALUE cNokogiriHtml4ElementDescription ;
4
9
 
5
10
  /*
@@ -15,7 +20,7 @@ required_attributes(VALUE self)
15
20
  VALUE list;
16
21
  int i;
17
22
 
18
- Data_Get_Struct(self, htmlElemDesc, description);
23
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
19
24
 
20
25
  list = rb_ary_new();
21
26
 
@@ -41,7 +46,7 @@ deprecated_attributes(VALUE self)
41
46
  VALUE list;
42
47
  int i;
43
48
 
44
- Data_Get_Struct(self, htmlElemDesc, description);
49
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
45
50
 
46
51
  list = rb_ary_new();
47
52
 
@@ -67,7 +72,7 @@ optional_attributes(VALUE self)
67
72
  VALUE list;
68
73
  int i;
69
74
 
70
- Data_Get_Struct(self, htmlElemDesc, description);
75
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
71
76
 
72
77
  list = rb_ary_new();
73
78
 
@@ -90,7 +95,7 @@ static VALUE
90
95
  default_sub_element(VALUE self)
91
96
  {
92
97
  const htmlElemDesc *description;
93
- Data_Get_Struct(self, htmlElemDesc, description);
98
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
94
99
 
95
100
  if (description->defaultsubelt) {
96
101
  return NOKOGIRI_STR_NEW2(description->defaultsubelt);
@@ -112,7 +117,7 @@ sub_elements(VALUE self)
112
117
  VALUE list;
113
118
  int i;
114
119
 
115
- Data_Get_Struct(self, htmlElemDesc, description);
120
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
116
121
 
117
122
  list = rb_ary_new();
118
123
 
@@ -135,7 +140,7 @@ static VALUE
135
140
  description(VALUE self)
136
141
  {
137
142
  const htmlElemDesc *description;
138
- Data_Get_Struct(self, htmlElemDesc, description);
143
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
139
144
 
140
145
  return NOKOGIRI_STR_NEW2(description->desc);
141
146
  }
@@ -150,7 +155,7 @@ static VALUE
150
155
  inline_eh(VALUE self)
151
156
  {
152
157
  const htmlElemDesc *description;
153
- Data_Get_Struct(self, htmlElemDesc, description);
158
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
154
159
 
155
160
  if (description->isinline) { return Qtrue; }
156
161
  return Qfalse;
@@ -166,7 +171,7 @@ static VALUE
166
171
  deprecated_eh(VALUE self)
167
172
  {
168
173
  const htmlElemDesc *description;
169
- Data_Get_Struct(self, htmlElemDesc, description);
174
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
170
175
 
171
176
  if (description->depr) { return Qtrue; }
172
177
  return Qfalse;
@@ -182,7 +187,7 @@ static VALUE
182
187
  empty_eh(VALUE self)
183
188
  {
184
189
  const htmlElemDesc *description;
185
- Data_Get_Struct(self, htmlElemDesc, description);
190
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
186
191
 
187
192
  if (description->empty) { return Qtrue; }
188
193
  return Qfalse;
@@ -198,7 +203,7 @@ static VALUE
198
203
  save_end_tag_eh(VALUE self)
199
204
  {
200
205
  const htmlElemDesc *description;
201
- Data_Get_Struct(self, htmlElemDesc, description);
206
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
202
207
 
203
208
  if (description->saveEndTag) { return Qtrue; }
204
209
  return Qfalse;
@@ -214,7 +219,7 @@ static VALUE
214
219
  implied_end_tag_eh(VALUE self)
215
220
  {
216
221
  const htmlElemDesc *description;
217
- Data_Get_Struct(self, htmlElemDesc, description);
222
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
218
223
 
219
224
  if (description->endTag) { return Qtrue; }
220
225
  return Qfalse;
@@ -230,7 +235,7 @@ static VALUE
230
235
  implied_start_tag_eh(VALUE self)
231
236
  {
232
237
  const htmlElemDesc *description;
233
- Data_Get_Struct(self, htmlElemDesc, description);
238
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
234
239
 
235
240
  if (description->startTag) { return Qtrue; }
236
241
  return Qfalse;
@@ -246,7 +251,7 @@ static VALUE
246
251
  name(VALUE self)
247
252
  {
248
253
  const htmlElemDesc *description;
249
- Data_Get_Struct(self, htmlElemDesc, description);
254
+ TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
250
255
 
251
256
  if (NULL == description->name) { return Qnil; }
252
257
  return NOKOGIRI_STR_NEW2(description->name);
@@ -266,11 +271,11 @@ get_description(VALUE klass, VALUE tag_name)
266
271
  );
267
272
 
268
273
  if (NULL == description) { return Qnil; }
269
- return Data_Wrap_Struct(klass, 0, 0, DISCARD_CONST_QUAL(void *, description));
274
+ return TypedData_Wrap_Struct(klass, &html4_element_description_type, DISCARD_CONST_QUAL(void *, description));
270
275
  }
271
276
 
272
277
  void
273
- noko_init_html_element_description()
278
+ noko_init_html_element_description(void)
274
279
  {
275
280
  cNokogiriHtml4ElementDescription = rb_define_class_under(mNokogiriHtml4, "ElementDescription", rb_cObject);
276
281
 
data/ext/nokogiri/html4_entity_lookup.c CHANGED
@@ -20,7 +20,7 @@ get(VALUE _, VALUE rb_entity_name)
20
20
  return Qnil;
21
21
  }
22
22
 
23
- rb_constructor_args[0] = INT2NUM((long)c_entity_desc->value);
23
+ rb_constructor_args[0] = UINT2NUM(c_entity_desc->value);
24
24
  rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name);
25
25
  rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc);
26
26
 
@@ -29,7 +29,7 @@ get(VALUE _, VALUE rb_entity_name)
29
29
  }
30
30
 
31
31
  void
32
- noko_init_html_entity_lookup()
32
+ noko_init_html_entity_lookup(void)
33
33
  {
34
34
  cNokogiriHtml4EntityLookup = rb_define_class_under(mNokogiriHtml4, "EntityLookup", rb_cObject);
35
35
 
data/ext/nokogiri/html4_sax_parser_context.c CHANGED
@@ -2,18 +2,6 @@
2
2
 
3
3
  VALUE cNokogiriHtml4SaxParserContext ;
4
4
 
5
- static void
6
- deallocate(xmlParserCtxtPtr ctxt)
7
- {
8
- NOKOGIRI_DEBUG_START(ctxt);
9
-
10
- ctxt->sax = NULL;
11
-
12
- htmlFreeParserCtxt(ctxt);
13
-
14
- NOKOGIRI_DEBUG_END(ctxt);
15
- }
16
-
17
5
  static VALUE
18
6
  parse_memory(VALUE klass, VALUE data, VALUE encoding)
19
7
  {
@@ -43,7 +31,7 @@ parse_memory(VALUE klass, VALUE data, VALUE encoding)
43
31
  }
44
32
  }
45
33
 
46
- return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
34
+ return noko_xml_sax_parser_context_wrap(klass, ctxt);
47
35
  }
48
36
 
49
37
  static VALUE
@@ -53,7 +41,13 @@ parse_file(VALUE klass, VALUE filename, VALUE encoding)
53
41
  StringValueCStr(filename),
54
42
  StringValueCStr(encoding)
55
43
  );
56
- return Data_Wrap_Struct(klass, NULL, deallocate, ctxt);
44
+
45
+ if (ctxt->sax) {
46
+ xmlFree(ctxt->sax);
47
+ ctxt->sax = NULL;
48
+ }
49
+
50
+ return noko_xml_sax_parser_context_wrap(klass, ctxt);
57
51
  }
58
52
 
59
53
  static VALUE
@@ -87,13 +81,8 @@ parse_with(VALUE self, VALUE sax_handler)
87
81
  rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
88
82
  }
89
83
 
90
- Data_Get_Struct(self, htmlParserCtxt, ctxt);
91
- Data_Get_Struct(sax_handler, htmlSAXHandler, sax);
92
-
93
- /* Free the sax handler since we'll assign our own */
94
- if (ctxt->sax && ctxt->sax != (xmlSAXHandlerPtr)&xmlDefaultSAXHandler) {
95
- xmlFree(ctxt->sax);
96
- }
84
+ ctxt = noko_xml_sax_parser_context_unwrap(self);
85
+ sax = noko_sax_handler_unwrap(sax_handler);
97
86
 
98
87
  ctxt->sax = sax;
99
88
  ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
@@ -106,7 +95,7 @@ parse_with(VALUE self, VALUE sax_handler)
106
95
  }
107
96
 
108
97
  void
109
- noko_init_html_sax_parser_context()
98
+ noko_init_html_sax_parser_context(void)
110
99
  {
111
100
  assert(cNokogiriXmlSaxParserContext);
112
101
  cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
data/ext/nokogiri/html4_sax_push_parser.c CHANGED
@@ -17,7 +17,7 @@ native_write(VALUE self, VALUE _chunk, VALUE _last_chunk)
17
17
  int status = 0;
18
18
  libxmlStructuredErrorHandlerState handler_state;
19
19
 
20
- Data_Get_Struct(self, xmlParserCtxt, ctx);
20
+ ctx = noko_xml_sax_push_parser_unwrap(self);
21
21
 
22
22
  if (Qnil != _chunk) {
23
23
  chunk = StringValuePtr(_chunk);
@@ -54,7 +54,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
54
54
  htmlParserCtxtPtr ctx;
55
55
  xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;
56
56
 
57
- Data_Get_Struct(_xml_sax, xmlSAXHandler, sax);
57
+ sax = noko_sax_handler_unwrap(_xml_sax);
58
58
 
59
59
  if (_filename != Qnil) { filename = StringValueCStr(_filename); }
60
60
 
@@ -85,7 +85,7 @@ initialize_native(VALUE self, VALUE _xml_sax, VALUE _filename,
85
85
  }
86
86
 
87
87
  void
88
- noko_init_html_sax_push_parser()
88
+ noko_init_html_sax_push_parser(void)
89
89
  {
90
90
  assert(cNokogiriXmlSaxPushParser);
91
91
  cNokogiriHtml4SaxPushParser = rb_define_class_under(mNokogiriHtml4Sax, "PushParser", cNokogiriXmlSaxPushParser);
data/ext/nokogiri/nokogiri.c CHANGED
@@ -49,34 +49,11 @@ void noko_init_html_sax_push_parser(void);
49
49
  void noko_init_gumbo(void);
50
50
  void noko_init_test_global_handlers(void);
51
51
 
52
- static ID id_read, id_write;
53
-
54
-
55
- #ifndef HAVE_VASPRINTF
56
- /*
57
- * Thank you Geoffroy Couprie for this implementation of vasprintf!
58
- */
59
- int
60
- vasprintf(char **strp, const char *fmt, va_list ap)
61
- {
62
- /* Mingw32/64 have a broken vsnprintf implementation that fails when
63
- * using a zero-byte limit in order to retrieve the required size for malloc.
64
- * So we use a one byte buffer instead.
65
- */
66
- char tmp[1];
67
- int len = vsnprintf(tmp, 1, fmt, ap) + 1;
68
- char *res = (char *)malloc((unsigned int)len);
69
- if (res == NULL) {
70
- return -1;
71
- }
72
- *strp = res;
73
- return vsnprintf(res, (unsigned int)len, fmt, ap);
74
- }
75
- #endif
52
+ static ID id_read, id_write, id_external_encoding;
76
53
 
77
54
 
78
55
  static VALUE
79
- read_check(VALUE val)
56
+ noko_io_read_check(VALUE val)
80
57
  {
81
58
  VALUE *args = (VALUE *)val;
82
59
  return rb_funcall(args[0], id_read, 1, args[1]);
@@ -84,75 +61,126 @@ read_check(VALUE val)
84
61
 
85
62
 
86
63
  static VALUE
87
- read_failed(VALUE arg, VALUE exc)
64
+ noko_io_read_failed(VALUE arg, VALUE exc)
88
65
  {
89
66
  return Qundef;
90
67
  }
91
68
 
92
69
 
93
70
  int
94
- noko_io_read(void *ctx, char *buffer, int len)
71
+ noko_io_read(void *io, char *c_buffer, int c_buffer_len)
95
72
  {
96
- VALUE string, args[2];
97
- size_t str_len, safe_len;
73
+ VALUE rb_io = (VALUE)io;
74
+ VALUE rb_read_string, rb_args[2];
75
+ size_t n_bytes_read, safe_len;
98
76
 
99
- args[0] = (VALUE)ctx;
100
- args[1] = INT2NUM(len);
77
+ rb_args[0] = rb_io;
78
+ rb_args[1] = INT2NUM(c_buffer_len);
101
79
 
102
- string = rb_rescue(read_check, (VALUE)args, read_failed, 0);
80
+ rb_read_string = rb_rescue(noko_io_read_check, (VALUE)rb_args, noko_io_read_failed, 0);
103
81
 
104
- if (NIL_P(string)) { return 0; }
105
- if (string == Qundef) { return -1; }
106
- if (TYPE(string) != T_STRING) { return -1; }
82
+ if (NIL_P(rb_read_string)) { return 0; }
83
+ if (rb_read_string == Qundef) { return -1; }
84
+ if (TYPE(rb_read_string) != T_STRING) { return -1; }
107
85
 
108
- str_len = (size_t)RSTRING_LEN(string);
109
- safe_len = str_len > (size_t)len ? (size_t)len : str_len;
110
- memcpy(buffer, StringValuePtr(string), safe_len);
86
+ n_bytes_read = (size_t)RSTRING_LEN(rb_read_string);
87
+ safe_len = (n_bytes_read > (size_t)c_buffer_len) ? (size_t)c_buffer_len : n_bytes_read;
88
+ memcpy(c_buffer, StringValuePtr(rb_read_string), safe_len);
111
89
 
112
90
  return (int)safe_len;
113
91
  }
114
92
 
115
93
 
116
94
  static VALUE
117
- write_check(VALUE val)
95
+ noko_io_write_check(VALUE rb_args)
118
96
  {
119
- VALUE *args = (VALUE *)val;
120
- return rb_funcall(args[0], id_write, 1, args[1]);
97
+ VALUE rb_io = ((VALUE *)rb_args)[0];
98
+ VALUE rb_output = ((VALUE *)rb_args)[1];
99
+ return rb_funcall(rb_io, id_write, 1, rb_output);
121
100
  }
122
101
 
123
102
 
124
103
  static VALUE
125
- write_failed(VALUE arg, VALUE exc)
104
+ noko_io_write_failed(VALUE arg, VALUE exc)
126
105
  {
127
106
  return Qundef;
128
107
  }
129
108
 
130
109
 
131
110
  int
132
- noko_io_write(void *ctx, char *buffer, int len)
111
+ noko_io_write(void *io, char *c_buffer, int c_buffer_len)
133
112
  {
134
- VALUE args[2], size;
113
+ VALUE rb_args[2], rb_n_bytes_written;
114
+ VALUE rb_io = (VALUE)io;
115
+ VALUE rb_enc = Qnil;
116
+ rb_encoding *io_encoding;
135
117
 
136
- args[0] = (VALUE)ctx;
137
- args[1] = rb_str_new(buffer, (long)len);
118
+ if (rb_respond_to(rb_io, id_external_encoding)) {
119
+ rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
120
+ }
121
+ io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
138
122
 
139
- size = rb_rescue(write_check, (VALUE)args, write_failed, 0);
123
+ rb_args[0] = rb_io;
124
+ rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
140
125
 
141
- if (size == Qundef) { return -1; }
126
+ rb_n_bytes_written = rb_rescue(noko_io_write_check, (VALUE)rb_args, noko_io_write_failed, 0);
127
+ if (rb_n_bytes_written == Qundef) { return -1; }
142
128
 
143
- return NUM2INT(size);
129
+ return NUM2INT(rb_n_bytes_written);
144
130
  }
145
131
 
146
132
 
147
133
  int
148
- noko_io_close(void *ctx)
134
+ noko_io_close(void *io)
149
135
  {
150
136
  return 0;
151
137
  }
152
138
 
153
139
 
140
+ #if defined(_WIN32) && !defined(NOKOGIRI_PACKAGED_LIBRARIES)
141
+ # define NOKOGIRI_WINDOWS_DLLS 1
142
+ #else
143
+ # define NOKOGIRI_WINDOWS_DLLS 0
144
+ #endif
145
+
146
+ //
147
+ // | dlls || true | false |
148
+ // | nlmm || | |
149
+ // |-----------++---------+---------|
150
+ // | NULL || default | ruby |
151
+ // | "random" || default | ruby |
152
+ // | "ruby" || ruby | ruby |
153
+ // | "default" || default | default |
154
+ //
155
+ // We choose *not* to use Ruby's memory management functions with windows DLLs because of this
156
+ // issue: https://github.com/sparklemotion/nokogiri/issues/2241
157
+ //
158
+ static void
159
+ set_libxml_memory_management(void)
160
+ {
161
+ const char *nlmm = getenv("NOKOGIRI_LIBXML_MEMORY_MANAGEMENT");
162
+ if (nlmm) {
163
+ if (strcmp(nlmm, "default") == 0) {
164
+ goto libxml_uses_default_memory_management;
165
+ } else if (strcmp(nlmm, "ruby") == 0) {
166
+ goto libxml_uses_ruby_memory_management;
167
+ }
168
+ }
169
+ if (NOKOGIRI_WINDOWS_DLLS) {
170
+ libxml_uses_default_memory_management:
171
+ rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("default"));
172
+ return;
173
+ } else {
174
+ libxml_uses_ruby_memory_management:
175
+ rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("ruby"));
176
+ xmlMemSetup((xmlFreeFunc)ruby_xfree, (xmlMallocFunc)ruby_xmalloc, (xmlReallocFunc)ruby_xrealloc, ruby_strdup);
177
+ return;
178
+ }
179
+ }
180
+
181
+
154
182
  void
155
- Init_nokogiri()
183
+ Init_nokogiri(void)
156
184
  {
157
185
  mNokogiri = rb_define_module("Nokogiri");
158
186
  mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo");
@@ -164,6 +192,10 @@ Init_nokogiri()
164
192
  mNokogiriXmlXpath = rb_define_module_under(mNokogiriXml, "XPath");
165
193
  mNokogiriXslt = rb_define_module_under(mNokogiri, "XSLT");
166
194
 
195
+ set_libxml_memory_management(); /* must be before any function calls that might invoke xmlInitParser() */
196
+ xmlInitParser();
197
+ exsltRegisterAll();
198
+
167
199
  rb_const_set(mNokogiri, rb_intern("LIBXML_COMPILED_VERSION"), NOKOGIRI_STR_NEW2(LIBXML_DOTTED_VERSION));
168
200
  rb_const_set(mNokogiri, rb_intern("LIBXML_LOADED_VERSION"), NOKOGIRI_STR_NEW2(xmlParserVersion));
169
201
 
@@ -196,30 +228,6 @@ Init_nokogiri()
196
228
  rb_const_set(mNokogiri, rb_intern("OTHER_LIBRARY_VERSIONS"), NOKOGIRI_STR_NEW2(NOKOGIRI_OTHER_LIBRARY_VERSIONS));
197
229
  #endif
198
230
 
199
- #if defined(_WIN32) && !defined(NOKOGIRI_PACKAGED_LIBRARIES)
200
- /*
201
- * We choose *not* to do use Ruby's memory management functions with windows DLLs because of this
202
- * issue in libxml 2.9.12:
203
- *
204
- * https://github.com/sparklemotion/nokogiri/issues/2241
205
- *
206
- * If the atexit() issue gets fixed in a future version of libxml2, then we may be able to skip
207
- * this config only for the specific libxml2 versions 2.9.12.
208
- *
209
- * Alternatively, now that Ruby has a generational GC, it might be OK to let libxml2 use its
210
- * default memory management functions (recall that this config was introduced to reduce memory
211
- * bloat and allow Ruby to GC more often); but we should *really* test with production workloads
212
- * before making that kind of a potentially-invasive change.
213
- */
214
- rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("default"));
215
- #else
216
- rb_const_set(mNokogiri, rb_intern("LIBXML_MEMORY_MANAGEMENT"), NOKOGIRI_STR_NEW2("ruby"));
217
- xmlMemSetup((xmlFreeFunc)ruby_xfree, (xmlMallocFunc)ruby_xmalloc, (xmlReallocFunc)ruby_xrealloc, ruby_strdup);
218
- #endif
219
-
220
- xmlInitParser();
221
- exsltRegisterAll();
222
-
223
231
  if (xsltExtModuleFunctionLookup((const xmlChar *)"date-time", EXSLT_DATE_NAMESPACE)) {
224
232
  rb_const_set(mNokogiri, rb_intern("LIBXSLT_DATETIME_ENABLED"), Qtrue);
225
233
  } else {
@@ -275,4 +283,5 @@ Init_nokogiri()
275
283
 
276
284
  id_read = rb_intern("read");
277
285
  id_write = rb_intern("write");
286
+ id_external_encoding = rb_intern("external_encoding");
278
287
  }