nokogiri 1.16.7 → 1.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/nokogiri/extconf.rb +191 -137
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  12. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  13. data/ext/nokogiri/nokogiri.c +9 -2
  14. data/ext/nokogiri/nokogiri.h +18 -33
  15. data/ext/nokogiri/xml_attr.c +1 -1
  16. data/ext/nokogiri/xml_cdata.c +2 -10
  17. data/ext/nokogiri/xml_comment.c +3 -8
  18. data/ext/nokogiri/xml_document.c +163 -156
  19. data/ext/nokogiri/xml_document_fragment.c +10 -25
  20. data/ext/nokogiri/xml_dtd.c +1 -1
  21. data/ext/nokogiri/xml_element_content.c +9 -9
  22. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  23. data/ext/nokogiri/xml_namespace.c +6 -6
  24. data/ext/nokogiri/xml_node.c +141 -104
  25. data/ext/nokogiri/xml_node_set.c +46 -44
  26. data/ext/nokogiri/xml_reader.c +54 -58
  27. data/ext/nokogiri/xml_relax_ng.c +35 -56
  28. data/ext/nokogiri/xml_sax_parser.c +156 -88
  29. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  30. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  31. data/ext/nokogiri/xml_schema.c +50 -85
  32. data/ext/nokogiri/xml_syntax_error.c +19 -11
  33. data/ext/nokogiri/xml_text.c +2 -4
  34. data/ext/nokogiri/xml_xpath_context.c +2 -2
  35. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  36. data/gumbo-parser/src/error.c +76 -48
  37. data/gumbo-parser/src/error.h +5 -1
  38. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  39. data/gumbo-parser/src/parser.c +61 -23
  40. data/gumbo-parser/src/tokenizer.c +6 -6
  41. data/lib/nokogiri/class_resolver.rb +1 -1
  42. data/lib/nokogiri/css/node.rb +6 -2
  43. data/lib/nokogiri/css/parser.rb +6 -4
  44. data/lib/nokogiri/css/parser.y +2 -2
  45. data/lib/nokogiri/css/parser_extras.rb +6 -66
  46. data/lib/nokogiri/css/selector_cache.rb +38 -0
  47. data/lib/nokogiri/css/tokenizer.rb +4 -4
  48. data/lib/nokogiri/css/tokenizer.rex +9 -8
  49. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  50. data/lib/nokogiri/css.rb +86 -20
  51. data/lib/nokogiri/decorators/slop.rb +3 -5
  52. data/lib/nokogiri/encoding_handler.rb +2 -2
  53. data/lib/nokogiri/html4/document.rb +44 -23
  54. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  55. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  56. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  57. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  58. data/lib/nokogiri/html4.rb +9 -14
  59. data/lib/nokogiri/html5/builder.rb +40 -0
  60. data/lib/nokogiri/html5/document.rb +61 -30
  61. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  62. data/lib/nokogiri/html5/node.rb +4 -4
  63. data/lib/nokogiri/html5.rb +114 -72
  64. data/lib/nokogiri/version/constant.rb +1 -1
  65. data/lib/nokogiri/xml/builder.rb +8 -1
  66. data/lib/nokogiri/xml/document.rb +70 -26
  67. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  68. data/lib/nokogiri/xml/node.rb +82 -11
  69. data/lib/nokogiri/xml/node_set.rb +9 -7
  70. data/lib/nokogiri/xml/parse_options.rb +1 -1
  71. data/lib/nokogiri/xml/pp/node.rb +6 -1
  72. data/lib/nokogiri/xml/reader.rb +46 -13
  73. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  74. data/lib/nokogiri/xml/sax/document.rb +174 -83
  75. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  76. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  77. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  78. data/lib/nokogiri/xml/sax.rb +48 -0
  79. data/lib/nokogiri/xml/schema.rb +112 -45
  80. data/lib/nokogiri/xml/searchable.rb +6 -8
  81. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  82. data/lib/nokogiri/xml.rb +13 -24
  83. data/lib/nokogiri/xslt.rb +3 -9
  84. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  85. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  86. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  87. metadata +10 -9
  88. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  89. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  90. data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
  91. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/ext/nokogiri/gumbo.c CHANGED
@@ -37,30 +37,6 @@ VALUE cNokogiriHtml5Document;
37
37
  static ID internal_subset;
38
38
  static ID parent;
39
39
 
40
- /* Backwards compatibility to Ruby 2.1.0 */
41
- #if RUBY_API_VERSION_CODE < 20200
42
- #define ONIG_ESCAPE_UCHAR_COLLISION 1
43
- #include <ruby/encoding.h>
44
-
45
- static VALUE
46
- rb_utf8_str_new(const char *str, long length)
47
- {
48
- return rb_enc_str_new(str, length, rb_utf8_encoding());
49
- }
50
-
51
- static VALUE
52
- rb_utf8_str_new_cstr(const char *str)
53
- {
54
- return rb_enc_str_new_cstr(str, rb_utf8_encoding());
55
- }
56
-
57
- static VALUE
58
- rb_utf8_str_new_static(const char *str, long length)
59
- {
60
- return rb_enc_str_new(str, length, rb_utf8_encoding());
61
- }
62
- #endif
63
-
64
40
  #include <nokogiri.h>
65
41
  #include <libxml/tree.h>
66
42
  #include <libxml/HTMLtree.h>
@@ -94,7 +70,7 @@ perform_parse(const GumboOptions *options, VALUE input)
94
70
  GumboOutput *output = gumbo_parse_with_options(
95
71
  options,
96
72
  RSTRING_PTR(input),
97
- RSTRING_LEN(input)
73
+ (size_t)RSTRING_LEN(input)
98
74
  );
99
75
 
100
76
  const char *status_string = gumbo_status_to_string(output->status);
@@ -260,7 +236,7 @@ static void
260
236
  add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
261
237
  {
262
238
  const char *input_str = RSTRING_PTR(input);
263
- size_t input_len = RSTRING_LEN(input);
239
+ size_t input_len = (size_t)RSTRING_LEN(input);
264
240
 
265
241
  // Add parse errors to rdoc.
266
242
  if (output->errors.length) {
@@ -272,11 +248,11 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
272
248
  GumboSourcePosition position = gumbo_error_position(err);
273
249
  char *msg;
274
250
  size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
275
- VALUE err_str = rb_utf8_str_new(msg, size);
251
+ VALUE err_str = rb_utf8_str_new(msg, (int)size);
276
252
  free(msg);
277
253
  VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
278
254
  const char *error_code = gumbo_error_code(err);
279
- VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
255
+ VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, (int)strlen(error_code)) : Qnil;
280
256
  rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
281
257
  rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
258
  rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
@@ -316,18 +292,58 @@ parse_cleanup(VALUE parse_args)
316
292
  return Qnil;
317
293
  }
318
294
 
295
+ // Scan the keyword arguments for options common to the document and fragment
296
+ // parse.
297
+ static GumboOptions
298
+ common_options(VALUE kwargs)
299
+ {
300
+ // The order of the keywords determines the order of the values below.
301
+ // If this order is changed, then setting the options below must change as
302
+ // well.
303
+ ID keywords[] = {
304
+ // Required keywords.
305
+ rb_intern_const("max_attributes"),
306
+ rb_intern_const("max_errors"),
307
+ rb_intern_const("max_tree_depth"),
308
+
309
+ // Optional keywords.
310
+ rb_intern_const("parse_noscript_content_as_text"),
311
+ };
312
+ VALUE values[sizeof keywords / sizeof keywords[0]];
313
+
314
+ // Extract the values corresponding to the required keywords. Raise an error
315
+ // if required arguments are missing.
316
+ rb_get_kwargs(kwargs, keywords, 3, 1, values);
317
+
318
+ GumboOptions options = kGumboDefaultOptions;
319
+ options.max_attributes = NUM2INT(values[0]);
320
+ options.max_errors = NUM2INT(values[1]);
321
+
322
+ // handle negative values
323
+ int depth = NUM2INT(values[2]);
324
+ options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
325
+
326
+ options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
327
+
328
+ return options;
329
+ }
330
+
319
331
  static VALUE parse_continue(VALUE parse_args);
320
332
 
321
333
  /*
322
334
  * @!visibility protected
323
335
  */
324
336
  static VALUE
325
- parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth, VALUE klass)
337
+ noko_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
326
338
  {
327
- GumboOptions options = kGumboDefaultOptions;
328
- options.max_attributes = NUM2INT(max_attributes);
329
- options.max_errors = NUM2INT(max_errors);
330
- options.max_tree_depth = NUM2INT(max_depth);
339
+ VALUE input, url, klass, kwargs;
340
+
341
+ rb_scan_args(argc, argv, "3:", &input, &url, &klass, &kwargs);
342
+ if (NIL_P(kwargs)) {
343
+ kwargs = rb_hash_new();
344
+ }
345
+
346
+ GumboOptions options = common_options(kwargs);
331
347
 
332
348
  GumboOutput *output = perform_parse(&options, input);
333
349
  ParseArgs args = {
@@ -383,7 +399,7 @@ lookup_namespace(VALUE node, bool require_known_ns)
383
399
  Check_Type(ns, T_STRING);
384
400
 
385
401
  const char *href_ptr = RSTRING_PTR(ns);
386
- size_t href_len = RSTRING_LEN(ns);
402
+ size_t href_len = (size_t)RSTRING_LEN(ns);
387
403
  #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
388
404
  if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
389
405
  return GUMBO_NAMESPACE_HTML;
@@ -415,16 +431,12 @@ static VALUE fragment_continue(VALUE parse_args);
415
431
  * @!visibility protected
416
432
  */
417
433
  static VALUE
418
- fragment(
419
- VALUE self,
420
- VALUE doc_fragment,
421
- VALUE tags,
422
- VALUE ctx,
423
- VALUE max_attributes,
424
- VALUE max_errors,
425
- VALUE max_depth
426
- )
434
+ noko_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
427
435
  {
436
+ VALUE doc_fragment;
437
+ VALUE tags;
438
+ VALUE ctx;
439
+ VALUE kwargs;
428
440
  ID name = rb_intern_const("name");
429
441
  const char *ctx_tag;
430
442
  GumboNamespaceEnum ctx_ns;
@@ -432,13 +444,20 @@ fragment(
432
444
  bool form = false;
433
445
  const char *encoding = NULL;
434
446
 
447
+ rb_scan_args(argc, argv, "3:", &doc_fragment, &tags, &ctx, &kwargs);
448
+ if (NIL_P(kwargs)) {
449
+ kwargs = rb_hash_new();
450
+ }
451
+
452
+ GumboOptions options = common_options(kwargs);
453
+
435
454
  if (NIL_P(ctx)) {
436
455
  ctx_tag = "body";
437
456
  ctx_ns = GUMBO_NAMESPACE_HTML;
438
457
  } else if (TYPE(ctx) == T_STRING) {
439
458
  ctx_tag = StringValueCStr(ctx);
440
459
  ctx_ns = GUMBO_NAMESPACE_HTML;
441
- size_t len = RSTRING_LEN(ctx);
460
+ size_t len = (size_t)RSTRING_LEN(ctx);
442
461
  const char *colon = memchr(ctx_tag, ':', len);
443
462
  if (colon) {
444
463
  switch (colon - ctx_tag) {
@@ -519,7 +538,7 @@ error:
519
538
  VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
520
539
  VALUE dtd = rb_funcall(doc, internal_subset, 0);
521
540
  VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
522
- if (NIL_P(ctx) || NIL_P(doc_quirks_mode)) {
541
+ if (NIL_P(ctx) || (TYPE(ctx) == T_STRING) || NIL_P(doc_quirks_mode)) {
523
542
  quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
524
543
  } else if (NIL_P(dtd)) {
525
544
  quirks_mode = GUMBO_DOCTYPE_QUIRKS;
@@ -535,18 +554,15 @@ error:
535
554
  }
536
555
 
537
556
  // Perform a fragment parse.
538
- int depth = NUM2INT(max_depth);
539
- GumboOptions options = kGumboDefaultOptions;
540
- options.max_attributes = NUM2INT(max_attributes);
541
- options.max_errors = NUM2INT(max_errors);
542
- // Add one to account for the HTML element.
543
- options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
544
557
  options.fragment_context = ctx_tag;
545
558
  options.fragment_namespace = ctx_ns;
546
559
  options.fragment_encoding = encoding;
547
560
  options.quirks_mode = quirks_mode;
548
561
  options.fragment_context_has_form_ancestor = form;
549
562
 
563
+ // Add one to the max tree depth to account for the HTML element.
564
+ if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }
565
+
550
566
  GumboOutput *output = perform_parse(&options, tags);
551
567
  ParseArgs args = {
552
568
  .output = output,
@@ -587,8 +603,8 @@ noko_init_gumbo(void)
587
603
  parent = rb_intern_const("parent");
588
604
 
589
605
  // Define Nokogumbo module with parse and fragment methods.
590
- rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 6);
591
- rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
606
+ rb_define_singleton_method(mNokogiriGumbo, "parse", noko_gumbo_s_parse, -1);
607
+ rb_define_singleton_method(mNokogiriGumbo, "fragment", noko_gumbo_s_fragment, -1);
592
608
  }
593
609
 
594
610
  // vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
data/ext/nokogiri/html4_document.c CHANGED
@@ -7,9 +7,9 @@ static ID id_to_s;
7
7
 
8
8
  /*
9
9
  * call-seq:
10
- * new
10
+ * new(uri=nil, external_id=nil) → HTML4::Document
11
11
  *
12
- * Create a new document
12
+ * Create a new empty document with base URI +uri+ and external ID +external_id+.
13
13
  */
14
14
  static VALUE
15
15
  rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
@@ -46,7 +46,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco
46
46
  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
47
47
  int options = NUM2INT(rb_options);
48
48
 
49
- xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
49
+ xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
50
50
 
51
51
  c_doc = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, options);
52
52
 
@@ -106,7 +106,7 @@ rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE r
106
106
  int html_len = (int)RSTRING_LEN(rb_html);
107
107
  int options = NUM2INT(rb_options);
108
108
 
109
- xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
109
+ xmlSetStructuredErrorFunc((void *)rb_error_list, noko__error_array_pusher);
110
110
 
111
111
  c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
112
112
 
@@ -151,6 +151,12 @@ rb_html_document_type(VALUE self)
151
151
  void
152
152
  noko_init_html_document(void)
153
153
  {
154
+ /* this is here so that rdoc doesn't ignore this file. */
155
+ /*
156
+ mNokogiri = rb_define_module("Nokogiri");
157
+ mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4");
158
+ */
159
+
154
160
  assert(cNokogiriXmlDocument);
155
161
  cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);
156
162
 
data/ext/nokogiri/html4_element_description.c CHANGED
@@ -1,7 +1,7 @@
1
1
  #include <nokogiri.h>
2
2
 
3
- static const rb_data_type_t html4_element_description_type = {
4
- .wrap_struct_name = "Nokogiri::HTML4::ElementDescription",
3
+ static const rb_data_type_t html_elem_desc_type = {
4
+ .wrap_struct_name = "htmlElemDesc",
5
5
  .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
6
6
  };
7
7
 
@@ -20,7 +20,7 @@ required_attributes(VALUE self)
20
20
  VALUE list;
21
21
  int i;
22
22
 
23
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
23
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
24
24
 
25
25
  list = rb_ary_new();
26
26
 
@@ -46,7 +46,7 @@ deprecated_attributes(VALUE self)
46
46
  VALUE list;
47
47
  int i;
48
48
 
49
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
49
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
50
50
 
51
51
  list = rb_ary_new();
52
52
 
@@ -72,7 +72,7 @@ optional_attributes(VALUE self)
72
72
  VALUE list;
73
73
  int i;
74
74
 
75
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
75
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
76
76
 
77
77
  list = rb_ary_new();
78
78
 
@@ -95,7 +95,7 @@ static VALUE
95
95
  default_sub_element(VALUE self)
96
96
  {
97
97
  const htmlElemDesc *description;
98
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
98
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
99
99
 
100
100
  if (description->defaultsubelt) {
101
101
  return NOKOGIRI_STR_NEW2(description->defaultsubelt);
@@ -117,7 +117,7 @@ sub_elements(VALUE self)
117
117
  VALUE list;
118
118
  int i;
119
119
 
120
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
120
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
121
121
 
122
122
  list = rb_ary_new();
123
123
 
@@ -140,7 +140,7 @@ static VALUE
140
140
  description(VALUE self)
141
141
  {
142
142
  const htmlElemDesc *description;
143
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
143
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
144
144
 
145
145
  return NOKOGIRI_STR_NEW2(description->desc);
146
146
  }
@@ -155,7 +155,7 @@ static VALUE
155
155
  inline_eh(VALUE self)
156
156
  {
157
157
  const htmlElemDesc *description;
158
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
158
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
159
159
 
160
160
  if (description->isinline) { return Qtrue; }
161
161
  return Qfalse;
@@ -171,7 +171,7 @@ static VALUE
171
171
  deprecated_eh(VALUE self)
172
172
  {
173
173
  const htmlElemDesc *description;
174
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
174
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
175
175
 
176
176
  if (description->depr) { return Qtrue; }
177
177
  return Qfalse;
@@ -187,7 +187,7 @@ static VALUE
187
187
  empty_eh(VALUE self)
188
188
  {
189
189
  const htmlElemDesc *description;
190
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
190
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
191
191
 
192
192
  if (description->empty) { return Qtrue; }
193
193
  return Qfalse;
@@ -203,7 +203,7 @@ static VALUE
203
203
  save_end_tag_eh(VALUE self)
204
204
  {
205
205
  const htmlElemDesc *description;
206
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
206
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
207
207
 
208
208
  if (description->saveEndTag) { return Qtrue; }
209
209
  return Qfalse;
@@ -219,7 +219,7 @@ static VALUE
219
219
  implied_end_tag_eh(VALUE self)
220
220
  {
221
221
  const htmlElemDesc *description;
222
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
222
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
223
223
 
224
224
  if (description->endTag) { return Qtrue; }
225
225
  return Qfalse;
@@ -235,7 +235,7 @@ static VALUE
235
235
  implied_start_tag_eh(VALUE self)
236
236
  {
237
237
  const htmlElemDesc *description;
238
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
238
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
239
239
 
240
240
  if (description->startTag) { return Qtrue; }
241
241
  return Qfalse;
@@ -245,13 +245,13 @@ implied_start_tag_eh(VALUE self)
245
245
  * call-seq:
246
246
  * name
247
247
  *
248
- * Get the tag name for this ElemementDescription
248
+ * Get the tag name for this ElementDescription
249
249
  */
250
250
  static VALUE
251
251
  name(VALUE self)
252
252
  {
253
253
  const htmlElemDesc *description;
254
- TypedData_Get_Struct(self, htmlElemDesc, &html4_element_description_type, description);
254
+ TypedData_Get_Struct(self, htmlElemDesc, &html_elem_desc_type, description);
255
255
 
256
256
  if (NULL == description->name) { return Qnil; }
257
257
  return NOKOGIRI_STR_NEW2(description->name);
@@ -261,7 +261,7 @@ name(VALUE self)
261
261
  * call-seq:
262
262
  * [](tag_name)
263
263
  *
264
- * Get ElemementDescription for +tag_name+
264
+ * Get ElementDescription for +tag_name+
265
265
  */
266
266
  static VALUE
267
267
  get_description(VALUE klass, VALUE tag_name)
@@ -271,7 +271,7 @@ get_description(VALUE klass, VALUE tag_name)
271
271
  );
272
272
 
273
273
  if (NULL == description) { return Qnil; }
274
- return TypedData_Wrap_Struct(klass, &html4_element_description_type, DISCARD_CONST_QUAL(void *, description));
274
+ return TypedData_Wrap_Struct(klass, &html_elem_desc_type, DISCARD_CONST_QUAL(void *, description));
275
275
  }
276
276
 
277
277
  void
data/ext/nokogiri/html4_sax_parser.c ADDED
@@ -0,0 +1,40 @@
1
+ #include <nokogiri.h>
2
+
3
+ VALUE cNokogiriHtml4SaxParser;
4
+
5
+ static ID id_start_document;
6
+
7
+ static void
8
+ noko_html4_sax_parser_start_document(void *ctx)
9
+ {
10
+ xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
11
+ VALUE self = (VALUE)ctxt->_private;
12
+ VALUE doc = rb_iv_get(self, "@document");
13
+
14
+ xmlSAX2StartDocument(ctx);
15
+
16
+ rb_funcall(doc, id_start_document, 0);
17
+ }
18
+
19
+ static VALUE
20
+ noko_html4_sax_parser_initialize(VALUE self)
21
+ {
22
+ xmlSAXHandlerPtr handler = noko_xml_sax_parser_unwrap(self);
23
+
24
+ rb_call_super(0, NULL);
25
+
26
+ handler->startDocument = noko_html4_sax_parser_start_document;
27
+
28
+ return self;
29
+ }
30
+
31
+ void
32
+ noko_init_html4_sax_parser(void)
33
+ {
34
+ cNokogiriHtml4SaxParser = rb_define_class_under(mNokogiriHtml4Sax, "Parser", cNokogiriXmlSaxParser);
35
+
36
+ rb_define_private_method(cNokogiriHtml4SaxParser, "initialize_native",
37
+ noko_html4_sax_parser_initialize, 0);
38
+
39
+ id_start_document = rb_intern("start_document");
40
+ }
data/ext/nokogiri/html4_sax_parser_context.c CHANGED
@@ -2,96 +2,83 @@
2
2
 
3
3
  VALUE cNokogiriHtml4SaxParserContext ;
4
4
 
5
+ /* :nodoc: */
5
6
  static VALUE
6
- parse_memory(VALUE klass, VALUE data, VALUE encoding)
7
+ noko_html4_sax_parser_context_s_native_memory(VALUE rb_class, VALUE rb_input, VALUE rb_encoding)
7
8
  {
8
- htmlParserCtxtPtr ctxt;
9
-
10
- Check_Type(data, T_STRING);
11
-
12
- if (!(int)RSTRING_LEN(data)) {
13
- rb_raise(rb_eRuntimeError, "data cannot be empty");
9
+ Check_Type(rb_input, T_STRING);
10
+ if (!(int)RSTRING_LEN(rb_input)) {
11
+ rb_raise(rb_eRuntimeError, "input string cannot be empty");
14
12
  }
15
13
 
16
- ctxt = htmlCreateMemoryParserCtxt(StringValuePtr(data),
17
- (int)RSTRING_LEN(data));
18
- if (ctxt->sax) {
19
- xmlFree(ctxt->sax);
20
- ctxt->sax = NULL;
14
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
15
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
21
16
  }
22
17
 
23
- if (RTEST(encoding)) {
24
- xmlCharEncodingHandlerPtr enc = xmlFindCharEncodingHandler(StringValueCStr(encoding));
25
- if (enc != NULL) {
26
- xmlSwitchToEncoding(ctxt, enc);
27
- if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
28
- rb_raise(rb_eRuntimeError, "Unsupported encoding %s",
29
- StringValueCStr(encoding));
30
- }
31
- }
18
+ htmlParserCtxtPtr c_context =
19
+ htmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input));
20
+ if (!c_context) {
21
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
32
22
  }
33
23
 
34
- return noko_xml_sax_parser_context_wrap(klass, ctxt);
35
- }
24
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
36
25
 
37
- static VALUE
38
- parse_file(VALUE klass, VALUE filename, VALUE encoding)
39
- {
40
- htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(
41
- StringValueCStr(filename),
42
- StringValueCStr(encoding)
43
- );
44
-
45
- if (ctxt->sax) {
46
- xmlFree(ctxt->sax);
47
- ctxt->sax = NULL;
26
+ if (c_context->sax) {
27
+ xmlFree(c_context->sax);
28
+ c_context->sax = NULL;
48
29
  }
49
30
 
50
- return noko_xml_sax_parser_context_wrap(klass, ctxt);
31
+ return noko_xml_sax_parser_context_wrap(rb_class, c_context);
51
32
  }
52
33
 
34
+ /* :nodoc: */
53
35
  static VALUE
54
- parse_doc(VALUE ctxt_val)
36
+ noko_html4_sax_parser_context_s_native_file(VALUE rb_class, VALUE rb_filename, VALUE rb_encoding)
55
37
  {
56
- htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
57
- htmlParseDocument(ctxt);
58
- return Qnil;
59
- }
38
+ if (!NIL_P(rb_encoding) && !rb_obj_is_kind_of(rb_encoding, rb_cEncoding)) {
39
+ rb_raise(rb_eTypeError, "argument must be an Encoding object");
40
+ }
60
41
 
61
- static VALUE
62
- parse_doc_finalize(VALUE ctxt_val)
63
- {
64
- htmlParserCtxtPtr ctxt = (htmlParserCtxtPtr)ctxt_val;
42
+ htmlParserCtxtPtr c_context = htmlCreateFileParserCtxt(StringValueCStr(rb_filename), NULL);
43
+ if (!c_context) {
44
+ rb_raise(rb_eRuntimeError, "failed to create xml sax parser context");
45
+ }
65
46
 
66
- if (ctxt->myDoc) {
67
- xmlFreeDoc(ctxt->myDoc);
47
+ noko_xml_sax_parser_context_set_encoding(c_context, rb_encoding);
48
+
49
+ if (c_context->sax) {
50
+ xmlFree(c_context->sax);
51
+ c_context->sax = NULL;
68
52
  }
69
53
 
70
- NOKOGIRI_SAX_TUPLE_DESTROY(ctxt->userData);
71
- return Qnil;
54
+ return noko_xml_sax_parser_context_wrap(rb_class, c_context);
72
55
  }
73
56
 
74
57
  static VALUE
75
- parse_with(VALUE self, VALUE sax_handler)
58
+ noko_html4_sax_parser_context__parse_with(VALUE rb_context, VALUE rb_sax_parser)
76
59
  {
77
60
  htmlParserCtxtPtr ctxt;
78
61
  htmlSAXHandlerPtr sax;
79
62
 
80
- if (!rb_obj_is_kind_of(sax_handler, cNokogiriXmlSaxParser)) {
63
+ if (!rb_obj_is_kind_of(rb_sax_parser, cNokogiriXmlSaxParser)) {
81
64
  rb_raise(rb_eArgError, "argument must be a Nokogiri::XML::SAX::Parser");
82
65
  }
83
66
 
84
- ctxt = noko_xml_sax_parser_context_unwrap(self);
85
- sax = noko_sax_handler_unwrap(sax_handler);
67
+ ctxt = noko_xml_sax_parser_context_unwrap(rb_context);
68
+ sax = noko_xml_sax_parser_unwrap(rb_sax_parser);
86
69
 
87
70
  ctxt->sax = sax;
88
- ctxt->userData = (void *)NOKOGIRI_SAX_TUPLE_NEW(ctxt, sax_handler);
71
+ ctxt->userData = ctxt; /* so we can use libxml2/SAX2.c handlers if we want to */
72
+ ctxt->_private = (void *)rb_sax_parser;
89
73
 
90
74
  xmlSetStructuredErrorFunc(NULL, NULL);
91
75
 
92
- rb_ensure(parse_doc, (VALUE)ctxt, parse_doc_finalize, (VALUE)ctxt);
76
+ /* although we're calling back into Ruby here, we don't need to worry about exceptions, because we
77
+ * don't have any cleanup to do. The only memory we need to free is handled by
78
+ * xml_sax_parser_context_type_free */
79
+ htmlParseDocument(ctxt);
93
80
 
94
- return self;
81
+ return Qnil;
95
82
  }
96
83
 
97
84
  void
@@ -101,8 +88,11 @@ noko_init_html_sax_parser_context(void)
101
88
  cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext",
102
89
  cNokogiriXmlSaxParserContext);
103
90
 
104
- rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "memory", parse_memory, 2);
105
- rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "file", parse_file, 2);
91
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_memory",
92
+ noko_html4_sax_parser_context_s_native_memory, 2);
93
+ rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "native_file",
94
+ noko_html4_sax_parser_context_s_native_file, 2);
106
95
 
107
- rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with", parse_with, 1);
96
+ rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with",
97
+ noko_html4_sax_parser_context__parse_with, 1);
108
98
  }