nokogiri 1.16.8 → 1.18.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (95)
  1. checksums.yaml +4 -4
  2. data/Gemfile +14 -22
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +8 -5
  5. data/dependencies.yml +6 -6
  6. data/ext/nokogiri/extconf.rb +188 -142
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +18 -33
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +2 -10
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +163 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -6
  25. data/ext/nokogiri/xml_node.c +134 -103
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +54 -58
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +219 -131
  31. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  32. data/ext/nokogiri/xml_schema.c +50 -85
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +2 -4
  35. data/ext/nokogiri/xml_xpath_context.c +103 -100
  36. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  37. data/gumbo-parser/src/ascii.c +2 -2
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +63 -25
  42. data/gumbo-parser/src/tokenizer.c +6 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +44 -23
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -72
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/xml/builder.rb +8 -1
  68. data/lib/nokogiri/xml/document.rb +70 -26
  69. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  70. data/lib/nokogiri/xml/node.rb +82 -11
  71. data/lib/nokogiri/xml/node_set.rb +9 -7
  72. data/lib/nokogiri/xml/parse_options.rb +1 -1
  73. data/lib/nokogiri/xml/pp/node.rb +6 -1
  74. data/lib/nokogiri/xml/reader.rb +46 -13
  75. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  76. data/lib/nokogiri/xml/sax/document.rb +174 -83
  77. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  78. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  79. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  80. data/lib/nokogiri/xml/sax.rb +48 -0
  81. data/lib/nokogiri/xml/schema.rb +112 -45
  82. data/lib/nokogiri/xml/searchable.rb +38 -42
  83. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  84. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  85. data/lib/nokogiri/xml.rb +13 -24
  86. data/lib/nokogiri/xslt.rb +3 -9
  87. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  88. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  89. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  90. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  91. metadata +13 -12
  92. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  93. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  94. data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
  95. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/ext/nokogiri/gumbo.c CHANGED
@@ -37,30 +37,6 @@ VALUE cNokogiriHtml5Document;
37
37
  static ID internal_subset;
38
38
  static ID parent;
39
39
 
40
- /* Backwards compatibility to Ruby 2.1.0 */
41
- #if RUBY_API_VERSION_CODE < 20200
42
- #define ONIG_ESCAPE_UCHAR_COLLISION 1
43
- #include <ruby/encoding.h>
44
-
45
- static VALUE
46
- rb_utf8_str_new(const char *str, long length)
47
- {
48
- return rb_enc_str_new(str, length, rb_utf8_encoding());
49
- }
50
-
51
- static VALUE
52
- rb_utf8_str_new_cstr(const char *str)
53
- {
54
- return rb_enc_str_new_cstr(str, rb_utf8_encoding());
55
- }
56
-
57
- static VALUE
58
- rb_utf8_str_new_static(const char *str, long length)
59
- {
60
- return rb_enc_str_new(str, length, rb_utf8_encoding());
61
- }
62
- #endif
63
-
64
40
  #include <nokogiri.h>
65
41
  #include <libxml/tree.h>
66
42
  #include <libxml/HTMLtree.h>
@@ -94,7 +70,7 @@ perform_parse(const GumboOptions *options, VALUE input)
94
70
  GumboOutput *output = gumbo_parse_with_options(
95
71
  options,
96
72
  RSTRING_PTR(input),
97
- RSTRING_LEN(input)
73
+ (size_t)RSTRING_LEN(input)
98
74
  );
99
75
 
100
76
  const char *status_string = gumbo_status_to_string(output->status);
@@ -260,7 +236,7 @@ static void
260
236
  add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
261
237
  {
262
238
  const char *input_str = RSTRING_PTR(input);
263
- size_t input_len = RSTRING_LEN(input);
239
+ size_t input_len = (size_t)RSTRING_LEN(input);
264
240
 
265
241
  // Add parse errors to rdoc.
266
242
  if (output->errors.length) {
@@ -272,11 +248,11 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
272
248
  GumboSourcePosition position = gumbo_error_position(err);
273
249
  char *msg;
274
250
  size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
275
- VALUE err_str = rb_utf8_str_new(msg, size);
251
+ VALUE err_str = rb_utf8_str_new(msg, (int)size);
276
252
  free(msg);
277
253
  VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
278
254
  const char *error_code = gumbo_error_code(err);
279
- VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
255
+ VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, (int)strlen(error_code)) : Qnil;
280
256
  rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
281
257
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
258
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
@@ -316,18 +292,58 @@ parse_cleanup(VALUE parse_args)
316
292
  return Qnil;
317
293
  }
318
294
 
295
+ // Scan the keyword arguments for options common to the document and fragment
296
+ // parse.
297
+ static GumboOptions
298
+ common_options(VALUE kwargs)
299
+ {
300
+ // The order of the keywords determines the order of the values below.
301
+ // If this order is changed, then setting the options below must change as
302
+ // well.
303
+ ID keywords[] = {
304
+ // Required keywords.
305
+ rb_intern_const("max_attributes"),
306
+ rb_intern_const("max_errors"),
307
+ rb_intern_const("max_tree_depth"),
308
+
309
+ // Optional keywords.
310
+ rb_intern_const("parse_noscript_content_as_text"),
311
+ };
312
+ VALUE values[sizeof keywords / sizeof keywords[0]];
313
+
314
+ // Extract the values corresponding to the required keywords. Raise an error
315
+ // if required arguments are missing.
316
+ rb_get_kwargs(kwargs, keywords, 3, 1, values);
317
+
318
+ GumboOptions options = kGumboDefaultOptions;
319
+ options.max_attributes = NUM2INT(values[0]);
320
+ options.max_errors = NUM2INT(values[1]);
321
+
322
+ // handle negative values
323
+ int depth = NUM2INT(values[2]);
324
+ options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
325
+
326
+ options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
327
+
328
+ return options;
329
+ }
330
+
319
331
  static VALUE parse_continue(VALUE parse_args);
320
332
 
321
333
  /*
322
334
  * @!visibility protected
323
335
  */
324
336
  static VALUE
325
- parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
337
+ noko_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
326
338
  {
327
- GumboOptions options = kGumboDefaultOptions;
328
- options.max_attributes = NUM2INT(max_attributes);
329
- options.max_errors = NUM2INT(max_errors);
330
- options.max_tree_depth = NUM2INT(max_depth);
339
+ VALUE input, url, klass, kwargs;
340
+
341
+ rb_scan_args(argc, argv, "3:", &input, &url, &klass, &kwargs);
342
+ if (NIL_P(kwargs)) {
343
+ kwargs = rb_hash_new();
344
+ }
345
+
346
+ GumboOptions options = common_options(kwargs);
331
347
 
332
348
  GumboOutput *output = perform_parse(&options, input);
333
349
  ParseArgs args = {
@@ -383,7 +399,7 @@ lookup_namespace(VALUE node, bool require_known_ns)
383
399
  Check_Type(ns, T_STRING);
384
400
 
385
401
  const char *href_ptr = RSTRING_PTR(ns);
386
- size_t href_len = RSTRING_LEN(ns);
402
+ size_t href_len = (size_t)RSTRING_LEN(ns);
387
403
  #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
388
404
  if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
389
405
  return GUMBO_NAMESPACE_HTML;
@@ -415,16 +431,12 @@ static VALUE fragment_continue(VALUE parse_args);
415
431
  * @!visibility protected
416
432
  */
417
433
  static VALUE
418
- fragment(
419
- VALUE self,
420
- VALUE doc_fragment,
421
- VALUE tags,
422
- VALUE ctx,
423
- VALUE max_attributes,
424
- VALUE max_errors,
425
- VALUE max_depth
426
- )
434
+ noko_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
427
435
  {
436
+ VALUE doc_fragment;
437
+ VALUE tags;
438
+ VALUE ctx;
439
+ VALUE kwargs;
428
440
  ID name = rb_intern_const("name");
429
441
  const char *ctx_tag;
430
442
  GumboNamespaceEnum ctx_ns;
@@ -432,13 +444,20 @@ fragment(
432
444
  bool form = false;
433
445
  const char *encoding = NULL;
434
446
 
447
+ rb_scan_args(argc, argv, "3:", &doc_fragment, &tags, &ctx, &kwargs);
448
+ if (NIL_P(kwargs)) {
449
+ kwargs = rb_hash_new();
450
+ }
451
+
452
+ GumboOptions options = common_options(kwargs);
453
+
435
454
  if (NIL_P(ctx)) {
436
455
  ctx_tag = "body";
437
456
  ctx_ns = GUMBO_NAMESPACE_HTML;
438
457
  } else if (TYPE(ctx) == T_STRING) {
439
458
  ctx_tag = StringValueCStr(ctx);
440
459
  ctx_ns = GUMBO_NAMESPACE_HTML;
441
- size_t len = RSTRING_LEN(ctx);
460
+ size_t len = (size_t)RSTRING_LEN(ctx);
442
461
  const char *colon = memchr(ctx_tag, ':', len);
443
462
  if (colon) {
444
463
  switch (colon - ctx_tag) {
@@ -519,7 +538,7 @@ error:
519
538
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
520
539
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
521
540
  VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
522
- if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
541
+ if (NIL_P(ctx) || (TYPE(ctx) == T_STRING) || NIL_P(doc_quirks_mode)) {
523
542
  quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
524
543
  } else if (NIL_P(dtd)) {
525
544
  quirks_mode = GUMBO_DOCTYPE_QUIRKS;
@@ -535,18 +554,15 @@ error:
535
554
  }
536
555
 
537
556
  // Perform a fragment parse.
538
- int depth = NUM2INT(max_depth);
539
- GumboOptions options = kGumboDefaultOptions;
540
- options.max_attributes = NUM2INT(max_attributes);
541
- options.max_errors = NUM2INT(max_errors);
542
- // Add one to account for the HTML element.
543
- options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
544
557
  options.fragment_context = ctx_tag;
545
558
  options.fragment_namespace = ctx_ns;
546
559
  options.fragment_encoding = encoding;
547
560
  options.quirks_mode = quirks_mode;
548
561
  options.fragment_context_has_form_ancestor = form;
549
562
 
563
+ // Add one to the max tree depth to account for the HTML element.
564
+ if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }
565
+
550
566
  GumboOutput *output = perform_parse(&options, tags);
551
567
  ParseArgs args = {
552
568
  .output = output,
@@ -587,8 +603,8 @@ noko_init_gumbo(void)
587
603
  parent = rb_intern_const("parent");
588
604
 
589
605
  // Define Nokogumbo module with parse and fragment methods.
590
- rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
591
- rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
606
+ rb_define_singleton_method(mNokogiriGumbo, "parse", noko_gumbo_s_parse, -1);
607
+ rb_define_singleton_method(mNokogiriGumbo, "fragment", noko_gumbo_s_fragment, -1);
592
608
  }
593
609
 
594
610
  // vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -7,9 +7,9 @@ static ID id_to_s;
7
7
 
8
8
  /*
9
9
  * call-seq:
10
- * new
10
+ * new(uri=nil, external_id=nil) → HTML4::Document
11
11
  *
12
- * Create a new document
12
+ * Create a new empty document with base URI +uri+ and external ID +external_id+.
13
13
  */
14
14
  static VALUE
15
15
  rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
@@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
46
46
  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
47
47
  int options = NUM2INT(rb_options);
48
48
 
49
- xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
49
+ xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
50
50
 
51
51
  c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);
52
52
 
@@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
106
106
  int html_len = (int)RSTRING_LEN(rb_html);
107
107
  int options = NUM2INT(rb_options);
108
108
 
109
- xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
109
+ xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
110
110
 
111
111
  c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
112
112
 
@@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
151
151
  void
152
152
  noko_init_html_document(void)
153
153
  {
154
+ /* this is here so that rdoc doesn't ignore this file. */
155
+ /*
156
+ mNokogiri = rb_define_module("Nokogiri");
157
+ mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
158
+ */
159
+
154
160
  assert(cNokogiriXmlDocument);
155
161
  cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
156
162
 
@@ -1,7 +1,7 @@
1
1
  #include <nokogiri.h>
2
2
 
3
- static const rb_data_type_t html4_element_description_type = {
4
- .wrap_struct_name = "Nokogiri::HTML4::ElementDescription",
3
+ static const rb_data_type_t html_elem_desc_type = {
4
+ .wrap_struct_name = "htmlElemDesc",
5
5
  .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
6
6
  };
7
7
 
@@ -20,7 +20,7 @@ required_attributes(VALUE self)
20
20
  VALUE list;
21
21
  int i;
22
22
 
23
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
23
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
24
24
 
25
25
  list = rb_ary_new();
26
26
 
@@ -46,7 +46,7 @@ deprecated_attributes(VALUE self)
46
46
  VALUE list;
47
47
  int i;
48
48
 
49
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
49
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
50
50
 
51
51
  list = rb_ary_new();
52
52
 
@@ -72,7 +72,7 @@ optional_attributes(VALUE self)
72
72
  VALUE list;
73
73
  int i;
74
74
 
75
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
75
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
76
76
 
77
77
  list = rb_ary_new();
78
78
 
@@ -95,7 +95,7 @@ static VALUE
95
95
  default_sub_element(VALUE self)
96
96
  {
97
97
  const htmlElemDesc *description;
98
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
98
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
99
99
 
100
100
  if (description->defaultsubelt) {
101
101
  return NOKOGIRI_STR_NEW2(description->defaultsubelt);
@@ -117,7 +117,7 @@ sub_elements(VALUE self)
117
117
  VALUE list;
118
118
  int i;
119
119
 
120
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
120
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
121
121
 
122
122
  list = rb_ary_new();
123
123
 
@@ -140,7 +140,7 @@ static VALUE
140
140
  description(VALUE self)
141
141
  {
142
142
  const htmlElemDesc *description;
143
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
143
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
144
144
 
145
145
  return NOKOGIRI_STR_NEW2(description->desc);
146
146
  }
@@ -155,7 +155,7 @@ static VALUE
155
155
  inline_eh(VALUE self)
156
156
  {
157
157
  const htmlElemDesc *description;
158
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
158
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
159
159
 
160
160
  if (description->isinline) { return Qtrue; }
161
161
  return Qfalse;
@@ -171,7 +171,7 @@ static VALUE
171
171
  deprecated_eh(VALUE self)
172
172
  {
173
173
  const htmlElemDesc *description;
174
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
174
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
175
175
 
176
176
  if (description->depr) { return Qtrue; }
177
177
  return Qfalse;
@@ -187,7 +187,7 @@ static VALUE
187
187
  empty_eh(VALUE self)
188
188
  {
189
189
  const htmlElemDesc *description;
190
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
190
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
191
191
 
192
192
  if (description->empty) { return Qtrue; }
193
193
  return Qfalse;
@@ -203,7 +203,7 @@ static VALUE
203
203
  save_end_tag_eh(VALUE self)
204
204
  {
205
205
  const htmlElemDesc *description;
206
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
206
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
207
207
 
208
208
  if (description->saveEndTag) { return Qtrue; }
209
209
  return Qfalse;
@@ -219,7 +219,7 @@ static VALUE
219
219
  implied_end_tag_eh(VALUE self)
220
220
  {
221
221
  const htmlElemDesc *description;
222
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
222
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
223
223
 
224
224
  if (description->endTag) { return Qtrue; }
225
225
  return Qfalse;
@@ -235,7 +235,7 @@ static VALUE
235
235
  implied_start_tag_eh(VALUE self)
236
236
  {
237
237
  const htmlElemDesc *description;
238
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
238
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
239
239
 
240
240
  if (description->startTag) { return Qtrue; }
241
241
  return Qfalse;
@@ -245,13 +245,13 @@ implied_start_tag_eh(VALUE self)
245
245
  * call-seq:
246
246
  * name
247
247
  *
248
- * Get the tag name for this ElemementDescription
248
+ * Get the tag name for this ElementDescription
249
249
  */
250
250
  static VALUE
251
251
  name(VALUE self)
252
252
  {
253
253
  const htmlElemDesc *description;
254
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
254
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
255
255
 
256
256
  if (NULL == description->name) { return Qnil; }
257
257
  return NOKOGIRI_STR_NEW2(description->name);
@@ -261,7 +261,7 @@ name(VALUE self)
261
261
  * call-seq:
262
262
  * [](tag_name)
263
263
  *
264
- * Get ElemementDescription for +tag_name+
264
+ * Get ElementDescription for +tag_name+
265
265
  */
266
266
  static VALUE
267
267
  get_description(VALUE klass, VALUE tag_name)
@@ -271,7 +271,7 @@ get_description(VALUE klass, VALUE tag_name)
271
271
  );
272
272
 
273
273
  if (NULL == description) { return Qnil; }
274
- return TypedData_Wrap_Struct(klass, &html4_element_description_type, DISCARD_CONST_QUAL(void *, description));
274
+ return TypedData_Wrap_Struct(klass, &html_elem_desc_type, DISCARD_CONST_QUAL(void *, description));
275
275
  }
276
276
 
277
277
  void
@@ -0,0 +1,40 @@
1
+ #include <nokogiri.h>
2
+
3
+ VALUE cNokogiriHtml4SaxParser;
4
+
5
+ static ID id_start_document;
6
+
7
+ static void
8
+ noko_html4_sax_parser_start_document(void *ctx)
9
+ {
10
+ xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
11
+ VALUE self = (VALUE)ctxt->_private;
12
+ VALUE doc = rb_iv_get(self, "@document");
13
+
14
+ xmlSAX2StartDocument(ctx);
15
+
16
+ rb_funcall(doc, id_start_document, 0);
17
+ }
18
+
19
+ static VALUE
20
+ noko_html4_sax_parser_initialize(VALUE self)
21
+ {
22
+ xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self);
23
+
24
+ rb_call_super(0, NULL);
25
+
26
+ handler->startDocument = noko_html4_sax_parser_start_document;
27
+
28
+ return self;
29
+ }
30
+
31
+ void
32
+ noko_init_html4_sax_parser(void)
33
+ {
34
+ cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser);
35
+
36
+ rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native",
37
+ noko_html4_sax_parser_initialize, 0);
38
+
39
+ id_start_document = rb_intern("start_document");
40
+ }
@@ -2,96 +2,83 @@
2
2
 
3
3
  VALUE cNokogiriHtml4SaxParserContext ;
4
4
 
5
+ /* :nodoc: */
5
6
  static VALUE
6
- parse_memory(VALUE klass, VALUE data, VALUE encoding)
7
+ noko_html4_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALUE rb_encoding)
7
8
  {
8
- htmlParserCtxtPtr ctxt;
9
-
10
- Check_Type(data, T_STRING);
11
-
12
- if (!(int)RSTRING_LEN(data)) {
13
- rb_raise(rb_eRuntimeError, "data cannot be empty");
9
+ Check_Type(rb_input, T_STRING);
10
+ if (!(int)RSTRING_LEN(rb_input)) {
11
+ rb_raise(rb_eRuntimeError, "input string cannot be empty");
14
12
  }
15
13
 
16
- ctxt = htmlCreateMemoryParserCtxt(StringValuePtr(data),
17
- (int)RSTRING_LEN(data));
18
- if (ctxt->sax) {
19
- xmlFree(ctxt->sax);
20
- ctxt->sax = NULL;
14
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
15
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
21
16
  }
22
17
 
23
- if (RTEST(encoding)) {
24
- xmlCharEncodingHandlerPtr enc = xmlFindCharEncodingHandler(StringValueCStr(encoding));
25
- if (enc != NULL) {
26
- xmlSwitchToEncoding(ctxt, enc);
27
- if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
28
- rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
29
- StringValueCStr(encoding));
30
- }
31
- }
18
+ htmlParserCtxtPtr c_context =
19
+ htmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input));
20
+ if (!c_context) {
21
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
32
22
  }
33
23
 
34
- return noko_xml_sax_parser_context_wrap(klass, ctxt);
35
- }
24
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
36
25
 
37
- static VALUE
38
- parse_file(VALUE klass, VALUE filename, VALUE encoding)
39
- {
40
- htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
41
- StringValueCStr(filename),
42
- StringValueCStr(encoding)
43
- );
44
-
45
- if (ctxt->sax) {
46
- xmlFree(ctxt->sax);
47
- ctxt->sax = NULL;
26
+ if (c_context->sax) {
27
+ xmlFree(c_context->sax);
28
+ c_context->sax = NULL;
48
29
  }
49
30
 
50
- return noko_xml_sax_parser_context_wrap(klass, ctxt);
31
+ return noko_xml_sax_parser_context_wrap(rb_class, c_context);
51
32
  }
52
33
 
34
+ /* :nodoc: */
53
35
  static VALUE
54
- parse_doc(VALUE ctxt_val)
36
+ noko_html4_sax_parser_context_s_native_file(VALUE rb_class, VALUE rb_filename, VALUE rb_encoding)
55
37
  {
56
- htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
57
- htmlParseDocument(ctxt);
58
- return Qnil;
59
- }
38
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
39
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
40
+ }
60
41
 
61
- static VALUE
62
- parse_doc_finalize(VALUE ctxt_val)
63
- {
64
- htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
42
+ htmlParserCtxtPtr c_context = htmlCreateFileParserCtxt(StringValueCStr(rb_filename), NULL);
43
+ if (!c_context) {
44
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
45
+ }
65
46
 
66
- if (ctxt->myDoc) {
67
- xmlFreeDoc(ctxt->myDoc);
47
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
48
+
49
+ if (c_context->sax) {
50
+ xmlFree(c_context->sax);
51
+ c_context->sax = NULL;
68
52
  }
69
53
 
70
- NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
71
- return Qnil;
54
+ return noko_xml_sax_parser_context_wrap(rb_class, c_context);
72
55
  }
73
56
 
74
57
  static VALUE
75
- parse_with(VALUE self, VALUE sax_handler)
58
+ noko_html4_sax_parser_context__parse_with(VALUE rb_context, VALUE rb_sax_parser)
76
59
  {
77
60
  htmlParserCtxtPtr ctxt;
78
61
  htmlSAXHandlerPtr sax;
79
62
 
80
- if (!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser)) {
63
+ if (!rb_obj_is_kind_of(rb_sax_parser, cNokogiriXmlSaxParser)) {
81
64
  rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
82
65
  }
83
66
 
84
- ctxt = noko_xml_sax_parser_context_unwrap(self);
85
- sax = noko_sax_handler_unwrap(sax_handler);
67
+ ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
68
+ sax = noko_xml_sax_parser_unwrap(rb_sax_parser);
86
69
 
87
70
  ctxt->sax = sax;
88
- ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
71
+ ctxt->userData = ctxt; /* so we can use libxml2/SAX2.c handlers if we want to */
72
+ ctxt->_private = (void *)rb_sax_parser;
89
73
 
90
74
  xmlSetStructuredErrorFunc(NULL, NULL);
91
75
 
92
- rb_ensure(parse_doc, (VALUE)ctxt, parse_doc_finalize, (VALUE)ctxt);
76
+ /* although we're calling back into Ruby here, we don't need to worry about exceptions, because we
77
+ * don't have any cleanup to do. The only memory we need to free is handled by
78
+ * xml_sax_parser_context_type_free */
79
+ htmlParseDocument(ctxt);
93
80
 
94
- return self;
81
+ return Qnil;
95
82
  }
96
83
 
97
84
  void
@@ -101,8 +88,11 @@ noko_init_html_sax_parser_context(void)
101
88
  cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
102
89
  cNokogiriXmlSaxParserContext);
103
90
 
104
- rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "memory", parse_memory, 2);
105
- rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "file", parse_file, 2);
91
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_memory",
92
+ noko_html4_sax_parser_context_s_native_memory, 2);
93
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_file",
94
+ noko_html4_sax_parser_context_s_native_file, 2);
106
95
 
107
- rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with", parse_with, 1);
96
+ rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
97
+ noko_html4_sax_parser_context__parse_with, 1);
108
98
  }