RubyGems - nokogiri - Versions diffs - 1.15.4 → 1.17.2 - Mend

nokogiri 1.15.4 → 1.17.2

Potentially problematic release.

This version of nokogiri might be problematic. Click here for more details.

Files changed (98)

checksums.yaml +4 -4
data/Gemfile +12 -19
data/README.md +8 -1
data/dependencies.yml +9 -8
data/ext/nokogiri/extconf.rb +194 -141
data/ext/nokogiri/gumbo.c +69 -53
data/ext/nokogiri/html4_document.c +10 -4
data/ext/nokogiri/html4_element_description.c +18 -18
data/ext/nokogiri/html4_sax_parser.c +40 -0
data/ext/nokogiri/html4_sax_parser_context.c +48 -58
data/ext/nokogiri/html4_sax_push_parser.c +26 -25
data/ext/nokogiri/libxml2_polyfill.c +114 -0
data/ext/nokogiri/nokogiri.c +9 -2
data/ext/nokogiri/nokogiri.h +25 -33
data/ext/nokogiri/test_global_handlers.c +1 -1
data/ext/nokogiri/xml_attr.c +1 -1
data/ext/nokogiri/xml_cdata.c +3 -12
data/ext/nokogiri/xml_comment.c +3 -8
data/ext/nokogiri/xml_document.c +167 -156
data/ext/nokogiri/xml_document_fragment.c +10 -25
data/ext/nokogiri/xml_dtd.c +1 -1
data/ext/nokogiri/xml_element_content.c +9 -9
data/ext/nokogiri/xml_encoding_handler.c +4 -4
data/ext/nokogiri/xml_namespace.c +6 -10
data/ext/nokogiri/xml_node.c +142 -108
data/ext/nokogiri/xml_node_set.c +46 -44
data/ext/nokogiri/xml_reader.c +74 -100
data/ext/nokogiri/xml_relax_ng.c +35 -56
data/ext/nokogiri/xml_sax_parser.c +156 -88
data/ext/nokogiri/xml_sax_parser_context.c +214 -128
data/ext/nokogiri/xml_sax_push_parser.c +69 -50
data/ext/nokogiri/xml_schema.c +51 -87
data/ext/nokogiri/xml_syntax_error.c +19 -11
data/ext/nokogiri/xml_text.c +3 -6
data/ext/nokogiri/xml_xpath_context.c +4 -7
data/ext/nokogiri/xslt_stylesheet.c +16 -11
data/gumbo-parser/Makefile +18 -0
data/gumbo-parser/src/error.c +76 -48
data/gumbo-parser/src/error.h +5 -1
data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
data/gumbo-parser/src/parser.c +64 -23
data/gumbo-parser/src/tokenizer.c +7 -6
data/lib/nokogiri/class_resolver.rb +1 -1
data/lib/nokogiri/css/node.rb +6 -2
data/lib/nokogiri/css/parser.rb +6 -4
data/lib/nokogiri/css/parser.y +2 -2
data/lib/nokogiri/css/parser_extras.rb +6 -66
data/lib/nokogiri/css/selector_cache.rb +38 -0
data/lib/nokogiri/css/tokenizer.rb +4 -4
data/lib/nokogiri/css/tokenizer.rex +9 -8
data/lib/nokogiri/css/xpath_visitor.rb +43 -27
data/lib/nokogiri/css.rb +86 -20
data/lib/nokogiri/decorators/slop.rb +3 -5
data/lib/nokogiri/encoding_handler.rb +2 -2
data/lib/nokogiri/html4/document.rb +45 -24
data/lib/nokogiri/html4/document_fragment.rb +124 -12
data/lib/nokogiri/html4/encoding_reader.rb +2 -2
data/lib/nokogiri/html4/sax/parser.rb +23 -38
data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
data/lib/nokogiri/html4.rb +9 -14
data/lib/nokogiri/html5/builder.rb +40 -0
data/lib/nokogiri/html5/document.rb +61 -30
data/lib/nokogiri/html5/document_fragment.rb +130 -20
data/lib/nokogiri/html5/node.rb +4 -4
data/lib/nokogiri/html5.rb +114 -138
data/lib/nokogiri/version/constant.rb +1 -1
data/lib/nokogiri/version/info.rb +6 -5
data/lib/nokogiri/xml/attr.rb +2 -2
data/lib/nokogiri/xml/builder.rb +8 -1
data/lib/nokogiri/xml/document.rb +74 -31
data/lib/nokogiri/xml/document_fragment.rb +86 -15
data/lib/nokogiri/xml/namespace.rb +1 -2
data/lib/nokogiri/xml/node.rb +113 -35
data/lib/nokogiri/xml/node_set.rb +12 -10
data/lib/nokogiri/xml/parse_options.rb +1 -1
data/lib/nokogiri/xml/pp/node.rb +6 -1
data/lib/nokogiri/xml/reader.rb +51 -17
data/lib/nokogiri/xml/relax_ng.rb +57 -20
data/lib/nokogiri/xml/sax/document.rb +174 -83
data/lib/nokogiri/xml/sax/parser.rb +115 -41
data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
data/lib/nokogiri/xml/sax.rb +48 -0
data/lib/nokogiri/xml/schema.rb +112 -45
data/lib/nokogiri/xml/searchable.rb +9 -11
data/lib/nokogiri/xml/syntax_error.rb +23 -1
data/lib/nokogiri/xml.rb +14 -25
data/lib/nokogiri/xslt/stylesheet.rb +29 -7
data/lib/nokogiri/xslt.rb +4 -10
data/lib/nokogiri.rb +1 -1
data/lib/xsd/xmlparser/nokogiri.rb +3 -4
data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
metadata +15 -14
data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
data/ports/archives/libxml2-2.11.5.tar.xz +0 -0
data/ports/archives/libxslt-1.1.38.tar.xz +0 -0

data/ext/nokogiri/xslt_stylesheet.c CHANGED Viewed

@@ -1,6 +1,6 @@
 #include <nokogiri.h>
-VALUE cNokogiriXsltStylesheet ;
+VALUE cNokogiriXsltStylesheet;
 static void
 mark(void *data)
@@ -18,8 +18,8 @@ dealloc(void *data)
   ruby_xfree(wrapper);
 }
-static const rb_data_type_t xslt_stylesheet_type = {
-  .wrap_struct_name = "Nokogiri::XSLT::Stylesheet",
+static const rb_data_type_t nokogiri_xslt_stylesheet_tuple_type = {
+  .wrap_struct_name = "nokogiriXsltStylesheetTuple",
   .function = {
     .dmark = mark,
     .dfree = dealloc,
@@ -56,7 +56,7 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
   self = TypedData_Make_Struct(
            cNokogiriXsltStylesheet,
            nokogiriXsltStylesheetTuple,
-           &xslt_stylesheet_type,
+           &nokogiri_xslt_stylesheet_tuple_type,
            wrapper
          );
@@ -71,7 +71,12 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
  * call-seq:
  *   parse_stylesheet_doc(document)
  *
- * Parse a stylesheet from +document+.
+ * Parse an XSLT::Stylesheet from +document+.
+ *
+ * [Parameters]
+ * - +document+ (Nokogiri::XML::Document) the document to be parsed.
+ *
+ * [Returns] Nokogiri::XSLT::Stylesheet
  */
 static VALUE
 parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
@@ -104,7 +109,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
  * call-seq:
  *   serialize(document)
  *
- * Serialize +document+ to an xml string.
+ * Serialize +document+ to an xml string, as specified by the +method+ parameter in the Stylesheet.
  */
 static VALUE
 rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
@@ -119,7 +124,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
   TypedData_Get_Struct(
     self,
     nokogiriXsltStylesheetTuple,
-    &xslt_stylesheet_type,
+    &nokogiri_xslt_stylesheet_tuple_type,
     wrapper
   );
   xsltSaveResultToString(&doc_ptr, &doc_len, xml, wrapper->ss);
@@ -133,7 +138,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
  *   transform(document)
  *   transform(document, params = {})
  *
- * Apply an XSLT stylesheet to an XML::Document.
+ * Transform an XML::Document as defined by an XSLT::Stylesheet.
  *
  * [Parameters]
  * - +document+ (Nokogiri::XML::Document) the document to be transformed.
@@ -268,7 +273,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self)
   Check_Type(rb_param, T_ARRAY);
   c_document = noko_xml_document_unwrap(rb_document);
-  TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &xslt_stylesheet_type, wrapper);
+  TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &nokogiri_xslt_stylesheet_tuple_type, wrapper);
   param_len = RARRAY_LEN(rb_param);
   params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *));
@@ -357,7 +362,7 @@ initFunc(xsltTransformContextPtr ctxt, const xmlChar *uri)
   TypedData_Get_Struct(
     (VALUE)ctxt->style->_private,
     nokogiriXsltStylesheetTuple,
-    &xslt_stylesheet_type,
+    &nokogiri_xslt_stylesheet_tuple_type,
     wrapper
   );
   inst = rb_class_new_instance(0, NULL, obj);
@@ -375,7 +380,7 @@ shutdownFunc(xsltTransformContextPtr ctxt,
   TypedData_Get_Struct(
     (VALUE)ctxt->style->_private,
     nokogiriXsltStylesheetTuple,
-    &xslt_stylesheet_type,
+    &nokogiri_xslt_stylesheet_tuple_type,
     wrapper
   );

data/gumbo-parser/Makefile CHANGED Viewed

@@ -13,6 +13,23 @@ LDFLAGS := -pthread
 all: check
+oss-fuzz:
+	./fuzzer/build-ossfuzz.sh
+fuzzers: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan
+fuzzer-normal:
+	./fuzzer/build.sh
+fuzzer-asan:
+	SANITIZER=asan ./fuzzer/build.sh
+fuzzer-ubsan:
+	SANITIZER=ubsan ./fuzzer/build.sh
+fuzzer-msan:
+	SANITIZER=msan ./fuzzer/build.sh
 # don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
 # the generated files should be committed to SCM
 ifneq ($(CI),true)
@@ -81,6 +98,7 @@ coverage:
 clean:
 	$(RM) -r build
+	$(RM) -r fuzzer/build fuzzer/src-* fuzzer/gumbo_corpus
 build/src/flags: | build/src
 	@echo 'old_CC := $(CC)' > $@

data/gumbo-parser/src/error.c CHANGED Viewed

@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
     args
   );
   va_end(args);
-#if _MSC_VER && _MSC_VER < 1900
+#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
   if (bytes_written == -1) {
     // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
     // instead of returning the number of bytes that would've been written had
-    // there been enough. In this case, we'll double the buffer size and hope
-    // it fits when we retry (letting it fail and returning 0 if it doesn't),
-    // since there's no way to smartly resize the buffer.
-    gumbo_string_buffer_reserve(output->capacity * 2, output);
+    // there been enough. In this case, we can call vsnprintf() again but
+    // with a count of 0 to get the number of bytes written, not including
+    // the null terminator.
+    // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
     va_start(args, format);
-    int result = vsnprintf (
-      output->data + output->length,
-      remaining_capacity,
+    bytes_written = vsnprintf (
+      NULL,
+      0,
       format,
       args
     );
     va_end(args);
-    return result == -1 ? 0 : result;
   }
-#else
+#endif
   // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
   if (bytes_written == -1) {
     return 0;
   }
-#endif
   if (bytes_written >= remaining_capacity) {
-    gumbo_string_buffer_reserve(output->capacity + bytes_written, output);
+    // At least double the size of the buffer.
+    size_t new_capacity = output->capacity * 2;
+    if (new_capacity < output->length + bytes_written + 1) {
+      // The +1 is for the null terminator.
+      new_capacity = output->length + bytes_written + 1;
+    }
+    gumbo_string_buffer_reserve(new_capacity, output);
     remaining_capacity = output->capacity - output->length;
     va_start(args, format);
     bytes_written = vsnprintf (
@@ -96,8 +103,14 @@ static void print_tag_stack (
     if (i) {
       print_message(output, ", ");
     }
-    GumboTag tag = (GumboTag)(intptr_t) error->tag_stack.data[i];
-    print_message(output, "%s", gumbo_normalized_tagname(tag));
+    uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
+    const char* tag_name;
+    if (tag > GUMBO_TAG_UNKNOWN) {
+      tag_name = error->tag_stack.data[i];
+    } else {
+      tag_name = gumbo_normalized_tagname((GumboTag)tag);
+    }
+    print_message(output, "%s", tag_name);
   }
   gumbo_string_buffer_append_codepoint('.', output);
 }
@@ -326,41 +339,45 @@ static void handle_parser_error (
   }
   switch (error->input_type) {
-    case GUMBO_TOKEN_DOCTYPE:
-      print_message(output, "This is not a legal doctype");
-      return;
-    case GUMBO_TOKEN_COMMENT:
-      // Should never happen; comments are always legal.
-      assert(0);
-      // But just in case...
-      print_message(output, "Comments aren't legal here");
-      return;
-    case GUMBO_TOKEN_CDATA:
-    case GUMBO_TOKEN_WHITESPACE:
-    case GUMBO_TOKEN_CHARACTER:
-      print_message(output, "Character tokens aren't legal here");
-      return;
-    case GUMBO_TOKEN_NULL:
-      print_message(output, "Null bytes are not allowed in HTML5");
-      return;
-    case GUMBO_TOKEN_EOF:
-      if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
-        print_message(output, "You must provide a doctype");
-      } else {
-        print_message(output, "Premature end of file.");
-        print_tag_stack(error, output);
-      }
-      return;
-    case GUMBO_TOKEN_START_TAG:
-      print_message(output, "Start tag '%s' isn't allowed here.",
-                    gumbo_normalized_tagname(error->input_tag));
-      print_tag_stack(error, output);
-      return;
-    case GUMBO_TOKEN_END_TAG:
-      print_message(output, "End tag '%s' isn't allowed here.",
-                    gumbo_normalized_tagname(error->input_tag));
+  case GUMBO_TOKEN_DOCTYPE:
+    print_message(output, "This is not a legal doctype");
+    return;
+  case GUMBO_TOKEN_COMMENT:
+    // Should never happen; comments are always legal.
+    assert(0);
+    // But just in case...
+    print_message(output, "Comments aren't legal here");
+    return;
+  case GUMBO_TOKEN_CDATA:
+  case GUMBO_TOKEN_WHITESPACE:
+  case GUMBO_TOKEN_CHARACTER:
+    print_message(output, "Character tokens aren't legal here");
+    return;
+  case GUMBO_TOKEN_NULL:
+    print_message(output, "Null bytes are not allowed in HTML5");
+    return;
+  case GUMBO_TOKEN_EOF:
+    if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
+      print_message(output, "You must provide a doctype");
+    } else {
+      print_message(output, "Premature end of file.");
       print_tag_stack(error, output);
-      return;
+    }
+    return;
+  case GUMBO_TOKEN_START_TAG:
+  case GUMBO_TOKEN_END_TAG:
+  {
+    const char* tag_name;
+    const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
+    if (error->input_name) {
+      tag_name = error->input_name;
+    } else {
+      tag_name = gumbo_normalized_tagname(error->input_tag);
+    }
+    print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
+    print_tag_stack(error, output);
+    return;
+  }
   }
 }
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
 void gumbo_error_destroy(GumboError* error) {
   if (error->type == GUMBO_ERR_PARSER) {
+    // Free the tag name.
+    if (error->v.parser.input_name) {
+      gumbo_free(error->v.parser.input_name);
+    }
+    for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
+      intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
+      if (tag > GUMBO_TAG_UNKNOWN) {
+        gumbo_free(error->v.parser.tag_stack.data[i]);
+      }
+    }
     gumbo_vector_destroy(&error->v.parser.tag_stack);
   }
   gumbo_free(error);

data/gumbo-parser/src/error.h CHANGED Viewed

@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
   // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
   GumboTag input_tag;
+  // The HTML tag of the input token if it was nonstandard tag token. NULL otherwise.
+  char *input_name;
   // The insertion mode that the parser was in at the time.
   GumboInsertionMode parser_state;
   // The tag stack at the point of the error. Note that this is an GumboVector
   // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
-  // get at the tag.
+  // get at the tag. For nonstandard tags, this is a pointer to an owned char *
+  // containing the tag name.
   GumboVector /* GumboTag */ tag_stack;
 } GumboParserError;

data/gumbo-parser/src/nokogiri_gumbo.h CHANGED Viewed

@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
    * Default: `false`.
    */
   bool fragment_context_has_form_ancestor;
+  /**
+   * Parse `noscript` elements as if scripting was enabled. This causes the
+   * contents of the `noscript` element to be parsed as raw text, rather
+   * than as HTML elements.
+   *
+   * Default: `false`.
+   */
+  bool parse_noscript_content_as_text;
 } GumboOptions;
 /** Default options struct; use this with gumbo_parse_with_options. */
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
  */
 typedef enum {
   /**
-   * Indicates that parsing completed successfuly. The resulting tree
+   * Indicates that parsing completed successfully. The resulting tree
    * will be a complete document.
    */
   GUMBO_STATUS_OK,
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
   GumboVector /* GumboError */ errors;
   /**
-   * True if the parser encounted an error.
+   * True if the parser encountered an error.
    *
    * This can be true and `errors` an empty `GumboVector` if the `max_errors`
    * option was set to 0.

data/gumbo-parser/src/parser.c CHANGED Viewed

@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
   .fragment_encoding = NULL,
   .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
   .fragment_context_has_form_ancestor = false,
+  .parse_noscript_content_as_text = false,
 };
 #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
   GumboParserError* extra_data = &error->v.parser;
   extra_data->input_type = token->type;
   extra_data->input_tag = GUMBO_TAG_UNKNOWN;
-  if (token->type == GUMBO_TOKEN_START_TAG) {
+  extra_data->input_name = NULL;
+  if (token->type == GUMBO_TOKEN_START_TAG)
+  {
     extra_data->input_tag = token->v.start_tag.tag;
-  } else if (token->type == GUMBO_TOKEN_END_TAG) {
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
+    }
+  }
+  else if (token->type == GUMBO_TOKEN_END_TAG)
+  {
     extra_data->input_tag = token->v.end_tag.tag;
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
+    }
   }
   const GumboParserState* state = parser->_parser_state;
   extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
       node->type == GUMBO_NODE_ELEMENT
       || node->type == GUMBO_NODE_TEMPLATE
     );
-    gumbo_vector_add (
-      (void*) node->v.element.tag,
-      &extra_data->tag_stack
-    );
+    void *tag;
+    if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
+      tag = gumbo_strdup(node->v.element.name);
+    } else {
+      tag = (void *)(uintptr_t)node->v.element.tag;
+    }
+    gumbo_vector_add(tag, &extra_data->tag_stack);
   }
 }
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
   element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
   insert_element(parser, element, false);
   gumbo_debug (
-    "Inserting %s element (@%p) from tag type.\n",
+    "Inserting <%s> element (@%p) from tag type.\n",
     gumbo_normalized_tagname(tag),
     (void*)element
   );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
   assert(token->type == GUMBO_TOKEN_START_TAG);
   GumboNode* element = create_element_from_token(token, tag_namespace);
   insert_element(parser, element, false);
+  gumbo_debug (
+    "Inserting <%s> foreign element (@%p).\n",
+    gumbo_normalized_tagname(element->v.element.tag),
+    (void*)element
+  );
   if (
     token_has_attribute(token, "xmlns")
     && !attribute_matches_case_sensitive (
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
 // This is here to clean up memory when the spec says "Ignore current token."
 static void ignore_token(GumboParser* parser) {
+  gumbo_debug("Ignoring token.\n");
   GumboToken* token = parser->_parser_state->_current_token;
   // Ownership of the token's internal buffers are normally transferred to the
   // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
 // https://html.spec.whatwg.org/multipage/parsing.html#the-end
 static void finish_parsing(GumboParser* parser) {
-  gumbo_debug("Finishing parsing");
+  gumbo_debug("Finishing parsing\n");
   maybe_flush_text_node_buffer(parser);
   GumboParserState* state = parser->_parser_state;
   for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
   }
   if (
     tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
   ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
-  if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
+  if (
+    tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
+  ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
 static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
-  gumbo_debug("Handling foreign content");
+  gumbo_debug("Handling foreign content.\n");
   switch (token->type) {
     case GUMBO_TOKEN_NULL:
       parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
   if (i == 0)
     return;
   // We can't call handle_token directly because the current node is still in
-  // a foriegn namespace, so it would re-enter this and result in infinite
+  // a foreign namespace, so it would re-enter this and result in infinite
   // recursion.
   handle_html_content(parser, token);
 }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
   const char* fragment_encoding = options->fragment_encoding;
   GumboQuirksModeEnum quirks = options->quirks_mode;
   bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
   GumboNode* root;
-  // 2.
+  // 1. [Create a new Document node, and mark it as being an HTML document.]
+  // 2. [If the node document of the context element is in quirks mode, then
+  //    let the Document be in quirks mode. Otherwise, the node document of
+  //    the context element is in limited-quirks mode, then let the Document
+  //    be in limited-quirks mode. Otherwise, leave the Document in no-quirks
+  //    mode.]
   get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
-  // 3.
+  // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
+  //    declarative shadow roots to true.]
+  // 4. [Create a new HTML parser, and associate it with the just created Document node.]
+  // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
   parser->_parser_state->_fragment_ctx =
     create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
   GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
         break;
       case GUMBO_TAG_NOSCRIPT:
-        /* scripting is disabled in Gumbo, so leave the tokenizer
-         * in the default data state */
+        if (options->parse_noscript_content_as_text)
+          gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
         break;
       case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
         adjusted_current_node &&
           adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
       );
-      gumbo_lex(&parser, &token);
+      // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
+      //
+      // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
+      // the parse can leave the document in an inconsistent state.
+      if (unlikely(state->_open_elements.length > max_tree_depth)) {
+        parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
+        gumbo_debug("Tree depth limit exceeded.\n");
+        token.type = GUMBO_TOKEN_EOF;
+      } else {
+        gumbo_lex(&parser, &token);
+      }
     }
     const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
         break;
     }
     gumbo_debug (
-      "Handling %s token @%lu:%lu in state %u.\n",
+      "Handling %s token @%lu:%lu in insertion mode %u.\n",
       (char*) token_type,
       (unsigned long)token.position.line,
       (unsigned long)token.position.column,
@@ -4826,14 +4869,12 @@ GumboOutput* gumbo_parse_with_options (
       // to a token.
       if (token.type == GUMBO_TOKEN_END_TAG &&
           token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
+      {
         gumbo_free(token.v.end_tag.name);
+        token.v.end_tag.name = NULL;
+      }
     }
-    if (unlikely(state->_open_elements.length > max_tree_depth)) {
-      parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
-      gumbo_debug("Tree depth limit exceeded.\n");
-      break;
-    }
     ++loop_count;
     assert(loop_count < 1000000000UL);

data/gumbo-parser/src/tokenizer.c CHANGED Viewed

@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
 // Sets the tag buffer original text and start point to the current iterator
 // position. This is necessary because attribute names & values may have
-// whitespace preceeding them, and so we can't assume that the actual token
+// whitespace preceding them, and so we can't assume that the actual token
 // starting point was the end of the last tag buffer usage.
 static void reset_tag_buffer_start_point(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -506,6 +506,7 @@ static void abandon_current_tag(GumboParser* parser) {
   for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
     gumbo_destroy_attribute(tag_state->_attributes.data[i]);
   }
+  gumbo_free(tag_state->_name);
   gumbo_free(tag_state->_attributes.data);
   mark_tag_state_as_empty(tag_state);
   gumbo_string_buffer_destroy(&tag_state->_buffer);
@@ -568,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
 }
 // Appends a codepoint to the current tag buffer. If
-// reinitilize_position_on_first is set, this also initializes the tag buffer
+// reinitialize_position_on_first is set, this also initializes the tag buffer
 // start point; the only time you would *not* want to pass true for this
 // parameter is if you want the original_text to include character (like an
 // opening quote) that doesn't appear in the value.
 static void append_char_to_tag_buffer (
   GumboParser* parser,
   int codepoint,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -588,10 +589,10 @@ static void append_char_to_tag_buffer (
 static void append_string_to_tag_buffer (
   GumboParser* parser,
   GumboStringPiece* str,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_string(str, buffer);

data/lib/nokogiri/class_resolver.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Nokogiri
   #
   module ClassResolver
     # #related_class restricts matching namespaces to those matching this set.
-    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
+    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
     # :call-seq:
     #   related_class(class_name) → Class

data/lib/nokogiri/css/node.rb CHANGED Viewed

@@ -23,8 +23,12 @@ module Nokogiri
       ###
       # Convert this CSS node to xpath with +prefix+ using +visitor+
-      def to_xpath(prefix, visitor)
-        prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+      def to_xpath(visitor)
+        prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+          "."
+        else
+          visitor.prefix
+        end
         prefix + visitor.accept(self)
       end

data/lib/nokogiri/css/parser.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 # frozen_string_literal: true
 #
 # DO NOT MODIFY!!!!
-# This file is automatically generated by Racc 1.6.0
-# from Racc grammar file "".
+# This file is automatically generated by Racc 1.8.0
+# from Racc grammar file "parser.y".
 #
 require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
   racc_shift_n,
   racc_reduce_n,
   racc_use_result_var ]
+Ractor.make_shareable(Racc_arg) if defined?(Ractor)
 Racc_token_to_s_table = [
   "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
   "negation",
   "eql_incl_dash",
   "negation_arg" ]
+Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
 Racc_debug_parser = false
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
 end
 def _reduce_24(val, _values, result)
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
     result
 end
 def _reduce_25(val, _values, result)
-      name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
+      name = val[0]
       result = Node.new(:ELEMENT_NAME, [name])
     result