RubyGems - nokogiri - Versions diffs - 1.16.8 → 1.17.1 - Mend

nokogiri 1.16.8 → 1.17.1

Files changed (91) hide show

checksums.yaml +4 -4
data/Gemfile +11 -21
data/README.md +4 -0
data/dependencies.yml +6 -6
data/ext/nokogiri/extconf.rb +191 -137
data/ext/nokogiri/gumbo.c +69 -53
data/ext/nokogiri/html4_document.c +10 -4
data/ext/nokogiri/html4_element_description.c +18 -18
data/ext/nokogiri/html4_sax_parser.c +40 -0
data/ext/nokogiri/html4_sax_parser_context.c +48 -58
data/ext/nokogiri/html4_sax_push_parser.c +25 -24
data/ext/nokogiri/libxml2_polyfill.c +114 -0
data/ext/nokogiri/nokogiri.c +9 -2
data/ext/nokogiri/nokogiri.h +18 -33
data/ext/nokogiri/xml_attr.c +1 -1
data/ext/nokogiri/xml_cdata.c +2 -10
data/ext/nokogiri/xml_comment.c +3 -8
data/ext/nokogiri/xml_document.c +163 -156
data/ext/nokogiri/xml_document_fragment.c +10 -25
data/ext/nokogiri/xml_dtd.c +1 -1
data/ext/nokogiri/xml_element_content.c +9 -9
data/ext/nokogiri/xml_encoding_handler.c +4 -4
data/ext/nokogiri/xml_namespace.c +6 -6
data/ext/nokogiri/xml_node.c +134 -103
data/ext/nokogiri/xml_node_set.c +46 -44
data/ext/nokogiri/xml_reader.c +54 -58
data/ext/nokogiri/xml_relax_ng.c +35 -56
data/ext/nokogiri/xml_sax_parser.c +156 -88
data/ext/nokogiri/xml_sax_parser_context.c +213 -131
data/ext/nokogiri/xml_sax_push_parser.c +68 -49
data/ext/nokogiri/xml_schema.c +50 -85
data/ext/nokogiri/xml_syntax_error.c +19 -11
data/ext/nokogiri/xml_text.c +2 -4
data/ext/nokogiri/xml_xpath_context.c +2 -2
data/ext/nokogiri/xslt_stylesheet.c +8 -8
data/gumbo-parser/src/error.c +76 -48
data/gumbo-parser/src/error.h +5 -1
data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
data/gumbo-parser/src/parser.c +61 -23
data/gumbo-parser/src/tokenizer.c +6 -6
data/lib/nokogiri/class_resolver.rb +1 -1
data/lib/nokogiri/css/node.rb +6 -2
data/lib/nokogiri/css/parser.rb +6 -4
data/lib/nokogiri/css/parser.y +2 -2
data/lib/nokogiri/css/parser_extras.rb +6 -66
data/lib/nokogiri/css/selector_cache.rb +38 -0
data/lib/nokogiri/css/tokenizer.rb +4 -4
data/lib/nokogiri/css/tokenizer.rex +9 -8
data/lib/nokogiri/css/xpath_visitor.rb +42 -6
data/lib/nokogiri/css.rb +86 -20
data/lib/nokogiri/decorators/slop.rb +3 -5
data/lib/nokogiri/encoding_handler.rb +2 -2
data/lib/nokogiri/html4/document.rb +44 -23
data/lib/nokogiri/html4/document_fragment.rb +124 -12
data/lib/nokogiri/html4/encoding_reader.rb +1 -1
data/lib/nokogiri/html4/sax/parser.rb +23 -38
data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
data/lib/nokogiri/html4.rb +9 -14
data/lib/nokogiri/html5/builder.rb +40 -0
data/lib/nokogiri/html5/document.rb +61 -30
data/lib/nokogiri/html5/document_fragment.rb +130 -20
data/lib/nokogiri/html5/node.rb +4 -4
data/lib/nokogiri/html5.rb +114 -72
data/lib/nokogiri/version/constant.rb +1 -1
data/lib/nokogiri/xml/builder.rb +8 -1
data/lib/nokogiri/xml/document.rb +70 -26
data/lib/nokogiri/xml/document_fragment.rb +84 -13
data/lib/nokogiri/xml/node.rb +82 -11
data/lib/nokogiri/xml/node_set.rb +9 -7
data/lib/nokogiri/xml/parse_options.rb +1 -1
data/lib/nokogiri/xml/pp/node.rb +6 -1
data/lib/nokogiri/xml/reader.rb +46 -13
data/lib/nokogiri/xml/relax_ng.rb +57 -20
data/lib/nokogiri/xml/sax/document.rb +174 -83
data/lib/nokogiri/xml/sax/parser.rb +115 -41
data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
data/lib/nokogiri/xml/sax.rb +48 -0
data/lib/nokogiri/xml/schema.rb +112 -45
data/lib/nokogiri/xml/searchable.rb +6 -8
data/lib/nokogiri/xml/syntax_error.rb +22 -0
data/lib/nokogiri/xml.rb +13 -24
data/lib/nokogiri/xslt.rb +3 -9
data/lib/xsd/xmlparser/nokogiri.rb +3 -4
data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
metadata +10 -7
data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
data/ports/archives/libxslt-1.1.39.tar.xz +0 -0

data/gumbo-parser/src/error.c CHANGED Viewed

@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
     args
   );
   va_end(args);
-#if _MSC_VER && _MSC_VER < 1900
+#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
   if (bytes_written == -1) {
     // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
     // instead of returning the number of bytes that would've been written had
-    // there been enough. In this case, we'll double the buffer size and hope
-    // it fits when we retry (letting it fail and returning 0 if it doesn't),
-    // since there's no way to smartly resize the buffer.
-    gumbo_string_buffer_reserve(output->capacity * 2, output);
+    // there been enough. In this case, we can call vsnprintf() again but
+    // with a count of 0 to get the number of bytes that would be written,
+    // not including the null terminator.
+    // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
     va_start(args, format);
-    int result = vsnprintf (
-      output->data + output->length,
-      remaining_capacity,
+    bytes_written = vsnprintf (
+      NULL,
+      0,
       format,
       args
     );
     va_end(args);
-    return result == -1 ? 0 : result;
   }
-#else
+#endif
   // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
   if (bytes_written == -1) {
     return 0;
   }
-#endif
   if (bytes_written >= remaining_capacity) {
-    gumbo_string_buffer_reserve(output->capacity + bytes_written, output);
+    // At least double the size of the buffer.
+    size_t new_capacity = output->capacity * 2;
+    if (new_capacity < output->length + bytes_written + 1) {
+      // The +1 is for the null terminator.
+      new_capacity = output->length + bytes_written + 1;
+    }
+    gumbo_string_buffer_reserve(new_capacity, output);
     remaining_capacity = output->capacity - output->length;
     va_start(args, format);
     bytes_written = vsnprintf (
@@ -96,8 +103,14 @@ static void print_tag_stack (
     if (i) {
       print_message(output, ", ");
     }
-    GumboTag tag = (GumboTag)(intptr_t) error->tag_stack.data[i];
-    print_message(output, "%s", gumbo_normalized_tagname(tag));
+    uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
+    const char* tag_name;
+    if (tag > GUMBO_TAG_UNKNOWN) {
+      tag_name = error->tag_stack.data[i];
+    } else {
+      tag_name = gumbo_normalized_tagname((GumboTag)tag);
+    }
+    print_message(output, "%s", tag_name);
   }
   gumbo_string_buffer_append_codepoint('.', output);
 }
@@ -326,41 +339,45 @@ static void handle_parser_error (
   }
   switch (error->input_type) {
-    case GUMBO_TOKEN_DOCTYPE:
-      print_message(output, "This is not a legal doctype");
-      return;
-    case GUMBO_TOKEN_COMMENT:
-      // Should never happen; comments are always legal.
-      assert(0);
-      // But just in case...
-      print_message(output, "Comments aren't legal here");
-      return;
-    case GUMBO_TOKEN_CDATA:
-    case GUMBO_TOKEN_WHITESPACE:
-    case GUMBO_TOKEN_CHARACTER:
-      print_message(output, "Character tokens aren't legal here");
-      return;
-    case GUMBO_TOKEN_NULL:
-      print_message(output, "Null bytes are not allowed in HTML5");
-      return;
-    case GUMBO_TOKEN_EOF:
-      if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
-        print_message(output, "You must provide a doctype");
-      } else {
-        print_message(output, "Premature end of file.");
-        print_tag_stack(error, output);
-      }
-      return;
-    case GUMBO_TOKEN_START_TAG:
-      print_message(output, "Start tag '%s' isn't allowed here.",
-                    gumbo_normalized_tagname(error->input_tag));
-      print_tag_stack(error, output);
-      return;
-    case GUMBO_TOKEN_END_TAG:
-      print_message(output, "End tag '%s' isn't allowed here.",
-                    gumbo_normalized_tagname(error->input_tag));
+  case GUMBO_TOKEN_DOCTYPE:
+    print_message(output, "This is not a legal doctype");
+    return;
+  case GUMBO_TOKEN_COMMENT:
+    // Should never happen; comments are always legal.
+    assert(0);
+    // But just in case...
+    print_message(output, "Comments aren't legal here");
+    return;
+  case GUMBO_TOKEN_CDATA:
+  case GUMBO_TOKEN_WHITESPACE:
+  case GUMBO_TOKEN_CHARACTER:
+    print_message(output, "Character tokens aren't legal here");
+    return;
+  case GUMBO_TOKEN_NULL:
+    print_message(output, "Null bytes are not allowed in HTML5");
+    return;
+  case GUMBO_TOKEN_EOF:
+    if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
+      print_message(output, "You must provide a doctype");
+    } else {
+      print_message(output, "Premature end of file.");
       print_tag_stack(error, output);
-      return;
+    }
+    return;
+  case GUMBO_TOKEN_START_TAG:
+  case GUMBO_TOKEN_END_TAG:
+  {
+    const char* tag_name;
+    const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
+    if (error->input_name) {
+      tag_name = error->input_name;
+    } else {
+      tag_name = gumbo_normalized_tagname(error->input_tag);
+    }
+    print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
+    print_tag_stack(error, output);
+    return;
+  }
   }
 }
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
 void gumbo_error_destroy(GumboError* error) {
   if (error->type == GUMBO_ERR_PARSER) {
+    // Free the tag name.
+    if (error->v.parser.input_name) {
+      gumbo_free(error->v.parser.input_name);
+    }
+    for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
+      intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
+      if (tag > GUMBO_TAG_UNKNOWN) {
+        gumbo_free(error->v.parser.tag_stack.data[i]);
+      }
+    }
     gumbo_vector_destroy(&error->v.parser.tag_stack);
   }
   gumbo_free(error);

data/gumbo-parser/src/error.h CHANGED Viewed

@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
   // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
   GumboTag input_tag;
+  // The tag name of the input token if it was a nonstandard tag token. NULL otherwise.
+  char *input_name;
   // The insertion mode that the parser was in at the time.
   GumboInsertionMode parser_state;
   // The tag stack at the point of the error. Note that this is an GumboVector
   // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
-  // get at the tag.
+  // get at the tag. For nonstandard tags, this is a pointer to an owned char *
+  // containing the tag name.
   GumboVector /* GumboTag */ tag_stack;
 } GumboParserError;

data/gumbo-parser/src/nokogiri_gumbo.h CHANGED Viewed

@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
    * Default: `false`.
    */
   bool fragment_context_has_form_ancestor;
+  /**
+   * Parse `noscript` elements as if scripting was enabled. This causes the
+   * contents of the `noscript` element to be parsed as raw text, rather
+   * than as HTML elements.
+   *
+   * Default: `false`.
+   */
+  bool parse_noscript_content_as_text;
 } GumboOptions;
 /** Default options struct; use this with gumbo_parse_with_options. */
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
  */
 typedef enum {
   /**
-   * Indicates that parsing completed successfuly. The resulting tree
+   * Indicates that parsing completed successfully. The resulting tree
    * will be a complete document.
    */
   GUMBO_STATUS_OK,
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
   GumboVector /* GumboError */ errors;
   /**
-   * True if the parser encounted an error.
+   * True if the parser encountered an error.
    *
    * This can be true and `errors` an empty `GumboVector` if the `max_errors`
    * option was set to 0.

data/gumbo-parser/src/parser.c CHANGED Viewed

@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
   .fragment_encoding = NULL,
   .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
   .fragment_context_has_form_ancestor = false,
+  .parse_noscript_content_as_text = false,
 };
 #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
   GumboParserError* extra_data = &error->v.parser;
   extra_data->input_type = token->type;
   extra_data->input_tag = GUMBO_TAG_UNKNOWN;
-  if (token->type == GUMBO_TOKEN_START_TAG) {
+  extra_data->input_name = NULL;
+  if (token->type == GUMBO_TOKEN_START_TAG)
+  {
     extra_data->input_tag = token->v.start_tag.tag;
-  } else if (token->type == GUMBO_TOKEN_END_TAG) {
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
+    }
+  }
+  else if (token->type == GUMBO_TOKEN_END_TAG)
+  {
     extra_data->input_tag = token->v.end_tag.tag;
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
+    }
   }
   const GumboParserState* state = parser->_parser_state;
   extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
       node->type == GUMBO_NODE_ELEMENT
       || node->type == GUMBO_NODE_TEMPLATE
     );
-    gumbo_vector_add (
-      (void*) node->v.element.tag,
-      &extra_data->tag_stack
-    );
+    void *tag;
+    if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
+      tag = gumbo_strdup(node->v.element.name);
+    } else {
+      tag = (void *)(uintptr_t)node->v.element.tag;
+    }
+    gumbo_vector_add(tag, &extra_data->tag_stack);
   }
 }
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
   element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
   insert_element(parser, element, false);
   gumbo_debug (
-    "Inserting %s element (@%p) from tag type.\n",
+    "Inserting <%s> element (@%p) from tag type.\n",
     gumbo_normalized_tagname(tag),
     (void*)element
   );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
   assert(token->type == GUMBO_TOKEN_START_TAG);
   GumboNode* element = create_element_from_token(token, tag_namespace);
   insert_element(parser, element, false);
+  gumbo_debug (
+    "Inserting <%s> foreign element (@%p).\n",
+    gumbo_normalized_tagname(element->v.element.tag),
+    (void*)element
+  );
   if (
     token_has_attribute(token, "xmlns")
     && !attribute_matches_case_sensitive (
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
 // This is here to clean up memory when the spec says "Ignore current token."
 static void ignore_token(GumboParser* parser) {
+  gumbo_debug("Ignoring token.\n");
   GumboToken* token = parser->_parser_state->_current_token;
   // Ownership of the token's internal buffers are normally transferred to the
   // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
 // https://html.spec.whatwg.org/multipage/parsing.html#the-end
 static void finish_parsing(GumboParser* parser) {
-  gumbo_debug("Finishing parsing");
+  gumbo_debug("Finishing parsing\n");
   maybe_flush_text_node_buffer(parser);
   GumboParserState* state = parser->_parser_state;
   for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
   }
   if (
     tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
   ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
-  if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
+  if (
+    tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
+  ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
 static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
-  gumbo_debug("Handling foreign content");
+  gumbo_debug("Handling foreign content.\n");
   switch (token->type) {
     case GUMBO_TOKEN_NULL:
       parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
   if (i == 0)
     return;
   // We can't call handle_token directly because the current node is still in
-  // a foriegn namespace, so it would re-enter this and result in infinite
+  // a foreign namespace, so it would re-enter this and result in infinite
   // recursion.
   handle_html_content(parser, token);
 }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
   const char* fragment_encoding = options->fragment_encoding;
   GumboQuirksModeEnum quirks = options->quirks_mode;
   bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
   GumboNode* root;
-  // 2.
+  // 1. [Create a new Document node, and mark it as being an HTML document.]
+  // 2. [If the node document of the context element is in quirks mode, then
+  //    let the Document be in quirks mode. Otherwise, if the node document of
+  //    the context element is in limited-quirks mode, then let the Document
+  //    be in limited-quirks mode. Otherwise, leave the Document in no-quirks
+  //    mode.]
   get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
-  // 3.
+  // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
+  //    declarative shadow roots to true.]
+  // 4. [Create a new HTML parser, and associate it with the just created Document node.]
+  // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
   parser->_parser_state->_fragment_ctx =
     create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
   GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
         break;
       case GUMBO_TAG_NOSCRIPT:
-        /* scripting is disabled in Gumbo, so leave the tokenizer
-         * in the default data state */
+        if (options->parse_noscript_content_as_text)
+          gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
         break;
       case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
         adjusted_current_node &&
           adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
       );
-      gumbo_lex(&parser, &token);
+      // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
+      //
+      // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
+      // the parse can leave the document in an inconsistent state.
+      if (unlikely(state->_open_elements.length > max_tree_depth)) {
+        parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
+        gumbo_debug("Tree depth limit exceeded.\n");
+        token.type = GUMBO_TOKEN_EOF;
+      } else {
+        gumbo_lex(&parser, &token);
+      }
     }
     const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
         break;
     }
     gumbo_debug (
-      "Handling %s token @%lu:%lu in state %u.\n",
+      "Handling %s token @%lu:%lu in insertion mode %u.\n",
       (char*) token_type,
       (unsigned long)token.position.line,
       (unsigned long)token.position.column,
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
         gumbo_free(token.v.end_tag.name);
         token.v.end_tag.name = NULL;
       }
-      if (unlikely(state->_open_elements.length > max_tree_depth)) {
-        parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
-        gumbo_debug("Tree depth limit exceeded.\n");
-        break;
-      }
     }

data/gumbo-parser/src/tokenizer.c CHANGED Viewed

@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
 // Sets the tag buffer original text and start point to the current iterator
 // position. This is necessary because attribute names & values may have
-// whitespace preceeding them, and so we can't assume that the actual token
+// whitespace preceding them, and so we can't assume that the actual token
 // starting point was the end of the last tag buffer usage.
 static void reset_tag_buffer_start_point(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
 }
 // Appends a codepoint to the current tag buffer. If
-// reinitilize_position_on_first is set, this also initializes the tag buffer
+// reinitialize_position_on_first is set, this also initializes the tag buffer
 // start point; the only time you would *not* want to pass true for this
 // parameter is if you want the original_text to include character (like an
 // opening quote) that doesn't appear in the value.
 static void append_char_to_tag_buffer (
   GumboParser* parser,
   int codepoint,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
 static void append_string_to_tag_buffer (
   GumboParser* parser,
   GumboStringPiece* str,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_string(str, buffer);

data/lib/nokogiri/class_resolver.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Nokogiri
   #
   module ClassResolver
     # #related_class restricts matching namespaces to those matching this set.
-    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
+    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
     # :call-seq:
     #   related_class(class_name) → Class

data/lib/nokogiri/css/node.rb CHANGED Viewed

@@ -23,8 +23,12 @@ module Nokogiri
       ###
       # Convert this CSS node to xpath with +prefix+ using +visitor+
-      def to_xpath(prefix, visitor)
-        prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+      def to_xpath(visitor)
+        prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+          "."
+        else
+          visitor.prefix
+        end
         prefix + visitor.accept(self)
       end

data/lib/nokogiri/css/parser.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 # frozen_string_literal: true
 #
 # DO NOT MODIFY!!!!
-# This file is automatically generated by Racc 1.6.0
-# from Racc grammar file "".
+# This file is automatically generated by Racc 1.8.0
+# from Racc grammar file "parser.y".
 #
 require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
   racc_shift_n,
   racc_reduce_n,
   racc_use_result_var ]
+Ractor.make_shareable(Racc_arg) if defined?(Ractor)
 Racc_token_to_s_table = [
   "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
   "negation",
   "eql_incl_dash",
   "negation_arg" ]
+Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
 Racc_debug_parser = false
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
 end
 def _reduce_24(val, _values, result)
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
     result
 end
 def _reduce_25(val, _values, result)
-      name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
+      name = val[0]
       result = Node.new(:ELEMENT_NAME, [name])
     result

data/lib/nokogiri/css/parser.y CHANGED Viewed

@@ -64,9 +64,9 @@ rule
   ;
   namespaced_ident:
-    namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')]) }
+    namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
   | IDENT {
-      name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
+      name = val[0]
       result = Node.new(:ELEMENT_NAME, [name])
     }
   ;

data/lib/nokogiri/css/parser_extras.rb CHANGED Viewed

@@ -5,62 +5,9 @@ require "thread"
 module Nokogiri
   module CSS
     class Parser < Racc::Parser # :nodoc:
-      CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
-      @cache = {}
-      @mutex = Mutex.new
-      class << self
-        # Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
-        def cache_on?
-          !Thread.current[CACHE_SWITCH_NAME]
-        end
-        # Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
-        def set_cache(value) # rubocop:disable Naming/AccessorMethodName
-          Thread.current[CACHE_SWITCH_NAME] = !value
-        end
-        # Get the css selector in +string+ from the cache
-        def [](string)
-          return unless cache_on?
-          @mutex.synchronize { @cache[string] }
-        end
-        # Set the css selector in +string+ in the cache to +value+
-        def []=(string, value)
-          return value unless cache_on?
-          @mutex.synchronize { @cache[string] = value }
-        end
-        # Clear the cache
-        def clear_cache(create_new_object = false)
-          @mutex.synchronize do
-            if create_new_object
-              @cache = {}
-            else
-              @cache.clear
-            end
-          end
-        end
-        # Execute +block+ without cache
-        def without_cache(&block)
-          original_cache_setting = cache_on?
-          set_cache(false)
-          yield
-        ensure
-          set_cache(original_cache_setting)
-        end
-      end
-      # Create a new CSS parser with respect to +namespaces+
-      def initialize(namespaces = {})
+      def initialize
         @tokenizer = Tokenizer.new
-        @namespaces = namespaces
-        super()
+        super
       end
       def parse(string)
@@ -72,11 +19,10 @@ module Nokogiri
         @tokenizer.next_token
       end
-      # Get the xpath for +string+ using +options+
-      def xpath_for(string, prefix, visitor)
-        key = cache_key(string, prefix, visitor)
-        self.class[key] ||= parse(string).map do |ast|
-          ast.to_xpath(prefix, visitor)
+      # Get the xpath for +selector+ using +visitor+
+      def xpath_for(selector, visitor)
+        parse(selector).map do |ast|
+          ast.to_xpath(visitor)
         end
       end
@@ -85,12 +31,6 @@ module Nokogiri
         after = value_stack.compact.last
         raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
       end
-      def cache_key(query, prefix, visitor)
-        if self.class.cache_on?
-          [query, prefix, @namespaces, visitor.config]
-        end
-      end
     end
   end
 end

data/lib/nokogiri/css/selector_cache.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+module Nokogiri
+  module CSS
+    module SelectorCache # :nodoc:
+      @cache = {}
+      @mutex = Mutex.new
+      class << self
+        # Retrieve the cached XPath expressions for the key
+        def [](key)
+          @mutex.synchronize { @cache[key] }
+        end
+        # Insert the XPath expressions `value` at the cache key
+        def []=(key, value)
+          @mutex.synchronize { @cache[key] = value }
+        end
+        # Clear the cache
+        def clear_cache(create_new_object = false)
+          @mutex.synchronize do
+            if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
+              @cache = {}
+            else
+              @cache.clear
+            end
+          end
+        end
+        # Construct a unique cache key
+        def key(selector:, visitor:)
+          [selector, visitor.config]
+        end
+      end
+    end
+  end
+end