nokogiri 1.15.4 → 1.17.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (98) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +12 -19
  3. data/README.md +8 -1
  4. data/dependencies.yml +9 -8
  5. data/ext/nokogiri/extconf.rb +194 -141
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +26 -25
  12. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  13. data/ext/nokogiri/nokogiri.c +9 -2
  14. data/ext/nokogiri/nokogiri.h +25 -33
  15. data/ext/nokogiri/test_global_handlers.c +1 -1
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +3 -12
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +167 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -10
  25. data/ext/nokogiri/xml_node.c +142 -108
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +74 -100
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +214 -128
  31. data/ext/nokogiri/xml_sax_push_parser.c +69 -50
  32. data/ext/nokogiri/xml_schema.c +51 -87
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +3 -6
  35. data/ext/nokogiri/xml_xpath_context.c +4 -7
  36. data/ext/nokogiri/xslt_stylesheet.c +16 -11
  37. data/gumbo-parser/Makefile +18 -0
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +64 -23
  42. data/gumbo-parser/src/tokenizer.c +7 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +43 -27
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +45 -24
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +2 -2
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -138
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/version/info.rb +6 -5
  68. data/lib/nokogiri/xml/attr.rb +2 -2
  69. data/lib/nokogiri/xml/builder.rb +8 -1
  70. data/lib/nokogiri/xml/document.rb +74 -31
  71. data/lib/nokogiri/xml/document_fragment.rb +86 -15
  72. data/lib/nokogiri/xml/namespace.rb +1 -2
  73. data/lib/nokogiri/xml/node.rb +113 -35
  74. data/lib/nokogiri/xml/node_set.rb +12 -10
  75. data/lib/nokogiri/xml/parse_options.rb +1 -1
  76. data/lib/nokogiri/xml/pp/node.rb +6 -1
  77. data/lib/nokogiri/xml/reader.rb +51 -17
  78. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  79. data/lib/nokogiri/xml/sax/document.rb +174 -83
  80. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  81. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  82. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  83. data/lib/nokogiri/xml/sax.rb +48 -0
  84. data/lib/nokogiri/xml/schema.rb +112 -45
  85. data/lib/nokogiri/xml/searchable.rb +9 -11
  86. data/lib/nokogiri/xml/syntax_error.rb +23 -1
  87. data/lib/nokogiri/xml.rb +14 -25
  88. data/lib/nokogiri/xslt/stylesheet.rb +29 -7
  89. data/lib/nokogiri/xslt.rb +4 -10
  90. data/lib/nokogiri.rb +1 -1
  91. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  92. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  93. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  94. metadata +15 -14
  95. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  96. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  97. data/ports/archives/libxml2-2.11.5.tar.xz +0 -0
  98. data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
@@ -1,6 +1,6 @@
1
1
  #include <nokogiri.h>
2
2
 
3
- VALUE cNokogiriXsltStylesheet ;
3
+ VALUE cNokogiriXsltStylesheet;
4
4
 
5
5
  static void
6
6
  mark(void *data)
@@ -18,8 +18,8 @@ dealloc(void *data)
18
18
  ruby_xfree(wrapper);
19
19
  }
20
20
 
21
- static const rb_data_type_t xslt_stylesheet_type = {
22
- .wrap_struct_name = "Nokogiri::XSLT::Stylesheet",
21
+ static const rb_data_type_t nokogiri_xslt_stylesheet_tuple_type = {
22
+ .wrap_struct_name = "nokogiriXsltStylesheetTuple",
23
23
  .function = {
24
24
  .dmark = mark,
25
25
  .dfree = dealloc,
@@ -56,7 +56,7 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
56
56
  self = TypedData_Make_Struct(
57
57
  cNokogiriXsltStylesheet,
58
58
  nokogiriXsltStylesheetTuple,
59
- &xslt_stylesheet_type,
59
+ &nokogiri_xslt_stylesheet_tuple_type,
60
60
  wrapper
61
61
  );
62
62
 
@@ -71,7 +71,12 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
71
71
  * call-seq:
72
72
  * parse_stylesheet_doc(document)
73
73
  *
74
- * Parse a stylesheet from +document+.
74
+ * Parse an XSLT::Stylesheet from +document+.
75
+ *
76
+ * [Parameters]
77
+ * - +document+ (Nokogiri::XML::Document) the document to be parsed.
78
+ *
79
+ * [Returns] Nokogiri::XSLT::Stylesheet
75
80
  */
76
81
  static VALUE
77
82
  parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
@@ -104,7 +109,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
104
109
  * call-seq:
105
110
  * serialize(document)
106
111
  *
107
- * Serialize +document+ to an xml string.
112
+ * Serialize +document+ to an xml string, as specified by the +method+ parameter in the Stylesheet.
108
113
  */
109
114
  static VALUE
110
115
  rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
@@ -119,7 +124,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
119
124
  TypedData_Get_Struct(
120
125
  self,
121
126
  nokogiriXsltStylesheetTuple,
122
- &xslt_stylesheet_type,
127
+ &nokogiri_xslt_stylesheet_tuple_type,
123
128
  wrapper
124
129
  );
125
130
  xsltSaveResultToString(&doc_ptr, &doc_len, xml, wrapper->ss);
@@ -133,7 +138,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
133
138
  * transform(document)
134
139
  * transform(document, params = {})
135
140
  *
136
- * Apply an XSLT stylesheet to an XML::Document.
141
+ * Transform an XML::Document as defined by an XSLT::Stylesheet.
137
142
  *
138
143
  * [Parameters]
139
144
  * - +document+ (Nokogiri::XML::Document) the document to be transformed.
@@ -268,7 +273,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self)
268
273
  Check_Type(rb_param, T_ARRAY);
269
274
 
270
275
  c_document = noko_xml_document_unwrap(rb_document);
271
- TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper);
276
+ TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &nokogiri_xslt_stylesheet_tuple_type, wrapper);
272
277
 
273
278
  param_len = RARRAY_LEN(rb_param);
274
279
  params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *));
@@ -357,7 +362,7 @@ initFunc(xsltTransformContextPtr ctxt, const xmlChar *uri)
357
362
  TypedData_Get_Struct(
358
363
  (VALUE)ctxt->style->_private,
359
364
  nokogiriXsltStylesheetTuple,
360
- &xslt_stylesheet_type,
365
+ &nokogiri_xslt_stylesheet_tuple_type,
361
366
  wrapper
362
367
  );
363
368
  inst = rb_class_new_instance(0, NULL, obj);
@@ -375,7 +380,7 @@ shutdownFunc(xsltTransformContextPtr ctxt,
375
380
  TypedData_Get_Struct(
376
381
  (VALUE)ctxt->style->_private,
377
382
  nokogiriXsltStylesheetTuple,
378
- &xslt_stylesheet_type,
383
+ &nokogiri_xslt_stylesheet_tuple_type,
379
384
  wrapper
380
385
  );
381
386
 
@@ -13,6 +13,23 @@ LDFLAGS := -pthread
13
13
 
14
14
  all: check
15
15
 
16
+ oss-fuzz:
17
+ ./fuzzer/build-ossfuzz.sh
18
+
19
+ fuzzers: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan
20
+
21
+ fuzzer-normal:
22
+ ./fuzzer/build.sh
23
+
24
+ fuzzer-asan:
25
+ SANITIZER=asan ./fuzzer/build.sh
26
+
27
+ fuzzer-ubsan:
28
+ SANITIZER=ubsan ./fuzzer/build.sh
29
+
30
+ fuzzer-msan:
31
+ SANITIZER=msan ./fuzzer/build.sh
32
+
16
33
  # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
17
34
  # the generated files should be committed to SCM
18
35
  ifneq ($(CI),true)
@@ -81,6 +98,7 @@ coverage:
81
98
 
82
99
  clean:
83
100
  $(RM) -r build
101
+ $(RM) -r fuzzer/build fuzzer/src-* fuzzer/gumbo_corpus
84
102
 
85
103
  build/src/flags: | build/src
86
104
  @echo 'old_CC := $(CC)' > $@
@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
46
46
  args
47
47
  );
48
48
  va_end(args);
49
- #if _MSC_VER && _MSC_VER < 1900
49
+
50
+ #if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
50
51
  if (bytes_written == -1) {
51
52
  // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
52
53
  // instead of returning the number of bytes that would've been written had
53
- // there been enough. In this case, we'll double the buffer size and hope
54
- // it fits when we retry (letting it fail and returning 0 if it doesn't),
55
- // since there's no way to smartly resize the buffer.
56
- gumbo_string_buffer_reserve(output->capacity * 2, output);
54
+ // there been enough. In this case, we can call vsnprintf() again but
55
+ // with a count of 0 to get the number of bytes written, not including
56
+ // the null terminator.
57
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
58
+
57
59
  va_start(args, format);
58
- int result = vsnprintf (
59
- output->data + output->length,
60
- remaining_capacity,
60
+ bytes_written = vsnprintf (
61
+ NULL,
62
+ 0,
61
63
  format,
62
64
  args
63
65
  );
64
66
  va_end(args);
65
- return result == -1 ? 0 : result;
66
67
  }
67
- #else
68
+ #endif
69
+
68
70
  // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
69
71
  if (bytes_written == -1) {
70
72
  return 0;
71
73
  }
72
- #endif
73
74
 
74
75
  if (bytes_written >= remaining_capacity) {
75
- gumbo_string_buffer_reserve(output->capacity + bytes_written, output);
76
+ // At least double the size of the buffer.
77
+ size_t new_capacity = output->capacity * 2;
78
+ if (new_capacity < output->length + bytes_written + 1) {
79
+ // The +1 is for the null terminator.
80
+ new_capacity = output->length + bytes_written + 1;
81
+ }
82
+ gumbo_string_buffer_reserve(new_capacity, output);
76
83
  remaining_capacity = output->capacity - output->length;
77
84
  va_start(args, format);
78
85
  bytes_written = vsnprintf (
@@ -96,8 +103,14 @@ static void print_tag_stack (
96
103
  if (i) {
97
104
  print_message(output, ", ");
98
105
  }
99
- GumboTag tag = (GumboTag)(intptr_t) error->tag_stack.data[i];
100
- print_message(output, "%s", gumbo_normalized_tagname(tag));
106
+ uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
107
+ const char* tag_name;
108
+ if (tag > GUMBO_TAG_UNKNOWN) {
109
+ tag_name = error->tag_stack.data[i];
110
+ } else {
111
+ tag_name = gumbo_normalized_tagname((GumboTag)tag);
112
+ }
113
+ print_message(output, "%s", tag_name);
101
114
  }
102
115
  gumbo_string_buffer_append_codepoint('.', output);
103
116
  }
@@ -326,41 +339,45 @@ static void handle_parser_error (
326
339
  }
327
340
 
328
341
  switch (error->input_type) {
329
- case GUMBO_TOKEN_DOCTYPE:
330
- print_message(output, "This is not a legal doctype");
331
- return;
332
- case GUMBO_TOKEN_COMMENT:
333
- // Should never happen; comments are always legal.
334
- assert(0);
335
- // But just in case...
336
- print_message(output, "Comments aren't legal here");
337
- return;
338
- case GUMBO_TOKEN_CDATA:
339
- case GUMBO_TOKEN_WHITESPACE:
340
- case GUMBO_TOKEN_CHARACTER:
341
- print_message(output, "Character tokens aren't legal here");
342
- return;
343
- case GUMBO_TOKEN_NULL:
344
- print_message(output, "Null bytes are not allowed in HTML5");
345
- return;
346
- case GUMBO_TOKEN_EOF:
347
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
348
- print_message(output, "You must provide a doctype");
349
- } else {
350
- print_message(output, "Premature end of file.");
351
- print_tag_stack(error, output);
352
- }
353
- return;
354
- case GUMBO_TOKEN_START_TAG:
355
- print_message(output, "Start tag '%s' isn't allowed here.",
356
- gumbo_normalized_tagname(error->input_tag));
357
- print_tag_stack(error, output);
358
- return;
359
- case GUMBO_TOKEN_END_TAG:
360
- print_message(output, "End tag '%s' isn't allowed here.",
361
- gumbo_normalized_tagname(error->input_tag));
342
+ case GUMBO_TOKEN_DOCTYPE:
343
+ print_message(output, "This is not a legal doctype");
344
+ return;
345
+ case GUMBO_TOKEN_COMMENT:
346
+ // Should never happen; comments are always legal.
347
+ assert(0);
348
+ // But just in case...
349
+ print_message(output, "Comments aren't legal here");
350
+ return;
351
+ case GUMBO_TOKEN_CDATA:
352
+ case GUMBO_TOKEN_WHITESPACE:
353
+ case GUMBO_TOKEN_CHARACTER:
354
+ print_message(output, "Character tokens aren't legal here");
355
+ return;
356
+ case GUMBO_TOKEN_NULL:
357
+ print_message(output, "Null bytes are not allowed in HTML5");
358
+ return;
359
+ case GUMBO_TOKEN_EOF:
360
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
361
+ print_message(output, "You must provide a doctype");
362
+ } else {
363
+ print_message(output, "Premature end of file.");
362
364
  print_tag_stack(error, output);
363
- return;
365
+ }
366
+ return;
367
+ case GUMBO_TOKEN_START_TAG:
368
+ case GUMBO_TOKEN_END_TAG:
369
+ {
370
+ const char* tag_name;
371
+ const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
372
+ if (error->input_name) {
373
+ tag_name = error->input_name;
374
+ } else {
375
+ tag_name = gumbo_normalized_tagname(error->input_tag);
376
+ }
377
+ print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
378
+ print_tag_stack(error, output);
379
+ return;
380
+ }
364
381
  }
365
382
  }
366
383
 
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
613
630
 
614
631
  void gumbo_error_destroy(GumboError* error) {
615
632
  if (error->type == GUMBO_ERR_PARSER) {
633
+ // Free the tag name.
634
+ if (error->v.parser.input_name) {
635
+ gumbo_free(error->v.parser.input_name);
636
+ }
637
+
638
+ for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
639
+ intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
640
+ if (tag > GUMBO_TAG_UNKNOWN) {
641
+ gumbo_free(error->v.parser.tag_stack.data[i]);
642
+ }
643
+ }
616
644
  gumbo_vector_destroy(&error->v.parser.tag_stack);
617
645
  }
618
646
  gumbo_free(error);
@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
95
95
  // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
96
96
  GumboTag input_tag;
97
97
 
98
+ // The HTML tag of the input token if it was a nonstandard tag token. NULL otherwise.
99
+ char *input_name;
100
+
98
101
  // The insertion mode that the parser was in at the time.
99
102
  GumboInsertionMode parser_state;
100
103
 
101
104
  // The tag stack at the point of the error. Note that this is a GumboVector
102
105
  // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
103
- // get at the tag.
106
+ // get at the tag. For nonstandard tags, this is a pointer to an owned char *
107
+ // containing the tag name.
104
108
  GumboVector /* GumboTag */ tag_stack;
105
109
  } GumboParserError;
106
110
 
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
780
780
  * Default: `false`.
781
781
  */
782
782
  bool fragment_context_has_form_ancestor;
783
+
784
+ /**
785
+ * Parse `noscript` elements as if scripting was enabled. This causes the
786
+ * contents of the `noscript` element to be parsed as raw text, rather
787
+ * than as HTML elements.
788
+ *
789
+ * Default: `false`.
790
+ */
791
+ bool parse_noscript_content_as_text;
783
792
  } GumboOptions;
784
793
 
785
794
  /** Default options struct; use this with gumbo_parse_with_options. */
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
791
800
  */
792
801
  typedef enum {
793
802
  /**
794
- * Indicates that parsing completed successfuly. The resulting tree
803
+ * Indicates that parsing completed successfully. The resulting tree
795
804
  * will be a complete document.
796
805
  */
797
806
  GUMBO_STATUS_OK,
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
841
850
  GumboVector /* GumboError */ errors;
842
851
 
843
852
  /**
844
- * True if the parser encounted an error.
853
+ * True if the parser encountered an error.
845
854
  *
846
855
  * This can be true and `errors` an empty `GumboVector` if the `max_errors`
847
856
  * option was set to 0.
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
56
56
  .fragment_encoding = NULL,
57
57
  .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
58
  .fragment_context_has_form_ancestor = false,
59
+ .parse_noscript_content_as_text = false,
59
60
  };
60
61
 
61
62
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
749
750
  GumboParserError* extra_data = &error->v.parser;
750
751
  extra_data->input_type = token->type;
751
752
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
- if (token->type == GUMBO_TOKEN_START_TAG) {
753
+ extra_data->input_name = NULL;
754
+ if (token->type == GUMBO_TOKEN_START_TAG)
755
+ {
753
756
  extra_data->input_tag = token->v.start_tag.tag;
754
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
757
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
758
+ extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
759
+ }
760
+ }
761
+ else if (token->type == GUMBO_TOKEN_END_TAG)
762
+ {
755
763
  extra_data->input_tag = token->v.end_tag.tag;
764
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
765
+ extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
766
+ }
756
767
  }
757
768
  const GumboParserState* state = parser->_parser_state;
758
769
  extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
763
774
  node->type == GUMBO_NODE_ELEMENT
764
775
  || node->type == GUMBO_NODE_TEMPLATE
765
776
  );
766
- gumbo_vector_add (
767
- (void*) node->v.element.tag,
768
- &extra_data->tag_stack
769
- );
777
+ void *tag;
778
+ if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
779
+ tag = gumbo_strdup(node->v.element.name);
780
+ } else {
781
+ tag = (void *)(uintptr_t)node->v.element.tag;
782
+ }
783
+ gumbo_vector_add(tag, &extra_data->tag_stack);
770
784
  }
771
785
  }
772
786
 
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
1187
1201
  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
1202
  insert_element(parser, element, false);
1189
1203
  gumbo_debug (
1190
- "Inserting %s element (@%p) from tag type.\n",
1204
+ "Inserting <%s> element (@%p) from tag type.\n",
1191
1205
  gumbo_normalized_tagname(tag),
1192
1206
  (void*)element
1193
1207
  );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
1204
1218
  assert(token->type == GUMBO_TOKEN_START_TAG);
1205
1219
  GumboNode* element = create_element_from_token(token, tag_namespace);
1206
1220
  insert_element(parser, element, false);
1221
+ gumbo_debug (
1222
+ "Inserting <%s> foreign element (@%p).\n",
1223
+ gumbo_normalized_tagname(element->v.element.tag),
1224
+ (void*)element
1225
+ );
1207
1226
  if (
1208
1227
  token_has_attribute(token, "xmlns")
1209
1228
  && !attribute_matches_case_sensitive (
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
2066
2085
 
2067
2086
  // This is here to clean up memory when the spec says "Ignore current token."
2068
2087
  static void ignore_token(GumboParser* parser) {
2088
+ gumbo_debug("Ignoring token.\n");
2069
2089
  GumboToken* token = parser->_parser_state->_current_token;
2070
2090
  // Ownership of the token's internal buffers is normally transferred to the
2071
2091
  // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2430
2450
 
2431
2451
  // https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
2452
  static void finish_parsing(GumboParser* parser) {
2433
- gumbo_debug("Finishing parsing");
2453
+ gumbo_debug("Finishing parsing\n");
2434
2454
  maybe_flush_text_node_buffer(parser);
2435
2455
  GumboParserState* state = parser->_parser_state;
2436
2456
  for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2608
2628
  }
2609
2629
  if (
2610
2630
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2631
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
2611
2632
  ) {
2612
2633
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
2634
  return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
3313
3334
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3314
3335
  return;
3315
3336
  }
3316
- if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3337
+ if (
3338
+ tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
3339
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
3340
+ ) {
3317
3341
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3318
3342
  return;
3319
3343
  }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4389
4413
 
4390
4414
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4391
4415
  static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4392
- gumbo_debug("Handling foreign content");
4416
+ gumbo_debug("Handling foreign content.\n");
4393
4417
  switch (token->type) {
4394
4418
  case GUMBO_TOKEN_NULL:
4395
4419
  parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4507
4531
  if (i == 0)
4508
4532
  return;
4509
4533
  // We can't call handle_token directly because the current node is still in
4510
- // a foriegn namespace, so it would re-enter this and result in infinite
4534
+ // a foreign namespace, so it would re-enter this and result in infinite
4511
4535
  // recursion.
4512
4536
  handle_html_content(parser, token);
4513
4537
  }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
4627
4651
  const char* fragment_encoding = options->fragment_encoding;
4628
4652
  GumboQuirksModeEnum quirks = options->quirks_mode;
4629
4653
  bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4630
-
4631
4654
  GumboNode* root;
4632
- // 2.
4655
+
4656
+ // 1. [Create a new Document node, and mark it as being an HTML document.]
4657
+ // 2. [If the node document of the context element is in quirks mode, then
4658
+ // let the Document be in quirks mode. Otherwise, if the node document of
4659
+ // the context element is in limited-quirks mode, then let the Document
4660
+ // be in limited-quirks mode. Otherwise, leave the Document in no-quirks
4661
+ // mode.]
4633
4662
  get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4634
4663
 
4635
- // 3.
4664
+ // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
4665
+ // declarative shadow roots to true.]
4666
+ // 4. [Create a new HTML parser, and associate it with the just created Document node.]
4667
+ // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
4636
4668
  parser->_parser_state->_fragment_ctx =
4637
4669
  create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4638
4670
  GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
4659
4691
  break;
4660
4692
 
4661
4693
  case GUMBO_TAG_NOSCRIPT:
4662
- /* scripting is disabled in Gumbo, so leave the tokenizer
4663
- * in the default data state */
4694
+ if (options->parse_noscript_content_as_text)
4695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4664
4696
  break;
4665
4697
 
4666
4698
  case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
4762
4794
  adjusted_current_node &&
4763
4795
  adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4764
4796
  );
4765
- gumbo_lex(&parser, &token);
4797
+ // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
4798
+ //
4799
+ // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
4800
+ // the parse can leave the document in an inconsistent state.
4801
+ if (unlikely(state->_open_elements.length > max_tree_depth)) {
4802
+ parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4803
+ gumbo_debug("Tree depth limit exceeded.\n");
4804
+ token.type = GUMBO_TOKEN_EOF;
4805
+ } else {
4806
+ gumbo_lex(&parser, &token);
4807
+ }
4808
+
4766
4809
  }
4767
4810
 
4768
4811
  const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
4786
4829
  break;
4787
4830
  }
4788
4831
  gumbo_debug (
4789
- "Handling %s token @%lu:%lu in state %u.\n",
4832
+ "Handling %s token @%lu:%lu in insertion mode %u.\n",
4790
4833
  (char*) token_type,
4791
4834
  (unsigned long)token.position.line,
4792
4835
  (unsigned long)token.position.column,
@@ -4826,14 +4869,12 @@ GumboOutput* gumbo_parse_with_options (
4826
4869
  // to a token.
4827
4870
  if (token.type == GUMBO_TOKEN_END_TAG &&
4828
4871
  token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4872
+ {
4829
4873
  gumbo_free(token.v.end_tag.name);
4874
+ token.v.end_tag.name = NULL;
4875
+ }
4830
4876
  }
4831
4877
 
4832
- if (unlikely(state->_open_elements.length > max_tree_depth)) {
4833
- parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4834
- gumbo_debug("Tree depth limit exceeded.\n");
4835
- break;
4836
- }
4837
4878
 
4838
4879
  ++loop_count;
4839
4880
  assert(loop_count < 1000000000UL);
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
340
340
 
341
341
  // Sets the tag buffer original text and start point to the current iterator
342
342
  // position. This is necessary because attribute names & values may have
343
- // whitespace preceeding them, and so we can't assume that the actual token
343
+ // whitespace preceding them, and so we can't assume that the actual token
344
344
  // starting point was the end of the last tag buffer usage.
345
345
  static void reset_tag_buffer_start_point(GumboParser* parser) {
346
346
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -506,6 +506,7 @@ static void abandon_current_tag(GumboParser* parser) {
506
506
  for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
507
507
  gumbo_destroy_attribute(tag_state->_attributes.data[i]);
508
508
  }
509
+ gumbo_free(tag_state->_name);
509
510
  gumbo_free(tag_state->_attributes.data);
510
511
  mark_tag_state_as_empty(tag_state);
511
512
  gumbo_string_buffer_destroy(&tag_state->_buffer);
@@ -568,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
568
569
  }
569
570
 
570
571
  // Appends a codepoint to the current tag buffer. If
571
- // reinitilize_position_on_first is set, this also initializes the tag buffer
572
+ // reinitialize_position_on_first is set, this also initializes the tag buffer
572
573
  // start point; the only time you would *not* want to pass true for this
573
574
  // parameter is if you want the original_text to include a character (like an
574
575
  // opening quote) that doesn't appear in the value.
575
576
  static void append_char_to_tag_buffer (
576
577
  GumboParser* parser,
577
578
  int codepoint,
578
- bool reinitilize_position_on_first
579
+ bool reinitialize_position_on_first
579
580
  ) {
580
581
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
581
- if (buffer->length == 0 && reinitilize_position_on_first) {
582
+ if (buffer->length == 0 && reinitialize_position_on_first) {
582
583
  reset_tag_buffer_start_point(parser);
583
584
  }
584
585
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -588,10 +589,10 @@ static void append_char_to_tag_buffer (
588
589
  static void append_string_to_tag_buffer (
589
590
  GumboParser* parser,
590
591
  GumboStringPiece* str,
591
- bool reinitilize_position_on_first
592
+ bool reinitialize_position_on_first
592
593
  ) {
593
594
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
594
- if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ if (buffer->length == 0 && reinitialize_position_on_first) {
595
596
  reset_tag_buffer_start_point(parser);
596
597
  }
597
598
  gumbo_string_buffer_append_string(str, buffer);
@@ -18,7 +18,7 @@ module Nokogiri
18
18
  #
19
19
  module ClassResolver
20
20
  # #related_class restricts matching namespaces to those matching this set.
21
- VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
21
+ VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
22
22
 
23
23
  # :call-seq:
24
24
  # related_class(class_name) → Class
@@ -23,8 +23,12 @@ module Nokogiri
23
23
 
24
24
  ###
25
25
  # Convert this CSS node to xpath with +prefix+ using +visitor+
26
- def to_xpath(prefix, visitor)
27
- prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
26
+ def to_xpath(visitor)
27
+ prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
28
+ "."
29
+ else
30
+ visitor.prefix
31
+ end
28
32
  prefix + visitor.accept(self)
29
33
  end
30
34
 
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
  #
3
3
  # DO NOT MODIFY!!!!
4
- # This file is automatically generated by Racc 1.6.0
5
- # from Racc grammar file "".
4
+ # This file is automatically generated by Racc 1.8.0
5
+ # from Racc grammar file "parser.y".
6
6
  #
7
7
 
8
8
  require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
291
291
  racc_shift_n,
292
292
  racc_reduce_n,
293
293
  racc_use_result_var ]
294
+ Ractor.make_shareable(Racc_arg) if defined?(Ractor)
294
295
 
295
296
  Racc_token_to_s_table = [
296
297
  "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
351
352
  "negation",
352
353
  "eql_incl_dash",
353
354
  "negation_arg" ]
355
+ Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
354
356
 
355
357
  Racc_debug_parser = false
356
358
 
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
468
470
  end
469
471
 
470
472
  def _reduce_24(val, _values, result)
471
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
473
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
472
474
  result
473
475
  end
474
476
 
475
477
  def _reduce_25(val, _values, result)
476
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
478
+ name = val[0]
477
479
  result = Node.new(:ELEMENT_NAME, [name])
478
480
 
479
481
  result