RubyGems - nokogiri - Versions diffs - 1.16.3 → 1.18.1 - Mend

nokogiri 1.16.3 → 1.18.1

Potentially problematic release.

This version of nokogiri might be problematic. Click here for more details.

Files changed (95)

checksums.yaml +4 -4
data/Gemfile +14 -22
data/LICENSE-DEPENDENCIES.md +6 -6
data/README.md +8 -5
data/dependencies.yml +9 -9
data/ext/nokogiri/extconf.rb +188 -142
data/ext/nokogiri/gumbo.c +69 -53
data/ext/nokogiri/html4_document.c +10 -4
data/ext/nokogiri/html4_element_description.c +18 -18
data/ext/nokogiri/html4_sax_parser.c +40 -0
data/ext/nokogiri/html4_sax_parser_context.c +48 -58
data/ext/nokogiri/html4_sax_push_parser.c +25 -24
data/ext/nokogiri/libxml2_polyfill.c +114 -0
data/ext/nokogiri/nokogiri.c +9 -2
data/ext/nokogiri/nokogiri.h +18 -33
data/ext/nokogiri/xml_attr.c +1 -1
data/ext/nokogiri/xml_cdata.c +2 -10
data/ext/nokogiri/xml_comment.c +3 -8
data/ext/nokogiri/xml_document.c +163 -156
data/ext/nokogiri/xml_document_fragment.c +10 -25
data/ext/nokogiri/xml_dtd.c +1 -1
data/ext/nokogiri/xml_element_content.c +9 -9
data/ext/nokogiri/xml_encoding_handler.c +4 -4
data/ext/nokogiri/xml_namespace.c +6 -6
data/ext/nokogiri/xml_node.c +141 -104
data/ext/nokogiri/xml_node_set.c +46 -44
data/ext/nokogiri/xml_reader.c +54 -58
data/ext/nokogiri/xml_relax_ng.c +35 -56
data/ext/nokogiri/xml_sax_parser.c +156 -88
data/ext/nokogiri/xml_sax_parser_context.c +219 -131
data/ext/nokogiri/xml_sax_push_parser.c +68 -49
data/ext/nokogiri/xml_schema.c +50 -85
data/ext/nokogiri/xml_syntax_error.c +19 -11
data/ext/nokogiri/xml_text.c +2 -4
data/ext/nokogiri/xml_xpath_context.c +103 -100
data/ext/nokogiri/xslt_stylesheet.c +8 -8
data/gumbo-parser/src/ascii.c +2 -2
data/gumbo-parser/src/error.c +76 -48
data/gumbo-parser/src/error.h +5 -1
data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
data/gumbo-parser/src/parser.c +63 -25
data/gumbo-parser/src/tokenizer.c +6 -6
data/lib/nokogiri/class_resolver.rb +1 -1
data/lib/nokogiri/css/node.rb +6 -2
data/lib/nokogiri/css/parser.rb +6 -4
data/lib/nokogiri/css/parser.y +2 -2
data/lib/nokogiri/css/parser_extras.rb +6 -66
data/lib/nokogiri/css/selector_cache.rb +38 -0
data/lib/nokogiri/css/tokenizer.rb +4 -4
data/lib/nokogiri/css/tokenizer.rex +9 -8
data/lib/nokogiri/css/xpath_visitor.rb +42 -6
data/lib/nokogiri/css.rb +86 -20
data/lib/nokogiri/decorators/slop.rb +3 -5
data/lib/nokogiri/encoding_handler.rb +2 -2
data/lib/nokogiri/html4/document.rb +44 -23
data/lib/nokogiri/html4/document_fragment.rb +124 -12
data/lib/nokogiri/html4/encoding_reader.rb +1 -1
data/lib/nokogiri/html4/sax/parser.rb +23 -38
data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
data/lib/nokogiri/html4.rb +9 -14
data/lib/nokogiri/html5/builder.rb +40 -0
data/lib/nokogiri/html5/document.rb +61 -30
data/lib/nokogiri/html5/document_fragment.rb +130 -20
data/lib/nokogiri/html5/node.rb +4 -4
data/lib/nokogiri/html5.rb +114 -72
data/lib/nokogiri/version/constant.rb +1 -1
data/lib/nokogiri/xml/builder.rb +8 -1
data/lib/nokogiri/xml/document.rb +70 -26
data/lib/nokogiri/xml/document_fragment.rb +84 -13
data/lib/nokogiri/xml/node.rb +82 -11
data/lib/nokogiri/xml/node_set.rb +9 -7
data/lib/nokogiri/xml/parse_options.rb +1 -1
data/lib/nokogiri/xml/pp/node.rb +6 -1
data/lib/nokogiri/xml/reader.rb +46 -13
data/lib/nokogiri/xml/relax_ng.rb +57 -20
data/lib/nokogiri/xml/sax/document.rb +174 -83
data/lib/nokogiri/xml/sax/parser.rb +115 -41
data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
data/lib/nokogiri/xml/sax.rb +48 -0
data/lib/nokogiri/xml/schema.rb +112 -45
data/lib/nokogiri/xml/searchable.rb +38 -42
data/lib/nokogiri/xml/syntax_error.rb +22 -0
data/lib/nokogiri/xml/xpath_context.rb +14 -3
data/lib/nokogiri/xml.rb +13 -24
data/lib/nokogiri/xslt.rb +3 -9
data/lib/xsd/xmlparser/nokogiri.rb +3 -4
data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
metadata +13 -14
data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
data/ports/archives/libxml2-2.12.6.tar.xz +0 -0
data/ports/archives/libxslt-1.1.39.tar.xz +0 -0

data/gumbo-parser/src/parser.c CHANGED Viewed

@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
   .fragment_encoding = NULL,
   .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
   .fragment_context_has_form_ancestor = false,
+  .parse_noscript_content_as_text = false,
 };
 #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -317,7 +318,7 @@ static GumboNode* create_node(GumboNodeType type) {
   return node;
 }
-static GumboNode* new_document_node() {
+static GumboNode* new_document_node(void) {
   GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
   document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
   gumbo_vector_init(1, &document_node->v.document.children);
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
   GumboParserError* extra_data = &error->v.parser;
   extra_data->input_type = token->type;
   extra_data->input_tag = GUMBO_TAG_UNKNOWN;
-  if (token->type == GUMBO_TOKEN_START_TAG) {
+  extra_data->input_name = NULL;
+  if (token->type == GUMBO_TOKEN_START_TAG)
+  {
     extra_data->input_tag = token->v.start_tag.tag;
-  } else if (token->type == GUMBO_TOKEN_END_TAG) {
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
+    }
+  }
+  else if (token->type == GUMBO_TOKEN_END_TAG)
+  {
     extra_data->input_tag = token->v.end_tag.tag;
+    if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
+      extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
+    }
   }
   const GumboParserState* state = parser->_parser_state;
   extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
       node->type == GUMBO_NODE_ELEMENT
       || node->type == GUMBO_NODE_TEMPLATE
     );
-    gumbo_vector_add (
-      (void*) node->v.element.tag,
-      &extra_data->tag_stack
-    );
+    void *tag;
+    if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
+      tag = gumbo_strdup(node->v.element.name);
+    } else {
+      tag = (void *)(uintptr_t)node->v.element.tag;
+    }
+    gumbo_vector_add(tag, &extra_data->tag_stack);
   }
 }
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
   element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
   insert_element(parser, element, false);
   gumbo_debug (
-    "Inserting %s element (@%p) from tag type.\n",
+    "Inserting <%s> element (@%p) from tag type.\n",
     gumbo_normalized_tagname(tag),
     (void*)element
   );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
   assert(token->type == GUMBO_TOKEN_START_TAG);
   GumboNode* element = create_element_from_token(token, tag_namespace);
   insert_element(parser, element, false);
+  gumbo_debug (
+    "Inserting <%s> foreign element (@%p).\n",
+    gumbo_normalized_tagname(element->v.element.tag),
+    (void*)element
+  );
   if (
     token_has_attribute(token, "xmlns")
     && !attribute_matches_case_sensitive (
@@ -1978,7 +1997,7 @@ static void adjust_svg_tag(GumboToken* token) {
   assert(token->type == GUMBO_TOKEN_START_TAG);
   if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
     assert(token->v.start_tag.name == NULL);
-    token->v.start_tag.name = "foreignObject";
+    token->v.start_tag.name = (char *)"foreignObject";
   } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
     assert(token->v.start_tag.name);
     const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
 // This is here to clean up memory when the spec says "Ignore current token."
 static void ignore_token(GumboParser* parser) {
+  gumbo_debug("Ignoring token.\n");
   GumboToken* token = parser->_parser_state->_current_token;
   // Ownership of the token's internal buffers is normally transferred to the
   // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
 // https://html.spec.whatwg.org/multipage/parsing.html#the-end
 static void finish_parsing(GumboParser* parser) {
-  gumbo_debug("Finishing parsing");
+  gumbo_debug("Finishing parsing\n");
   maybe_flush_text_node_buffer(parser);
   GumboParserState* state = parser->_parser_state;
   for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
   }
   if (
     tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
   ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
-  if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
+  if (
+    tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
+    || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
+  ) {
     run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
     return;
   }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
 static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
-  gumbo_debug("Handling foreign content");
+  gumbo_debug("Handling foreign content.\n");
   switch (token->type) {
     case GUMBO_TOKEN_NULL:
       parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
   if (i == 0)
     return;
   // We can't call handle_token directly because the current node is still in
-  // a foriegn namespace, so it would re-enter this and result in infinite
+  // a foreign namespace, so it would re-enter this and result in infinite
   // recursion.
   handle_html_content(parser, token);
 }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
   const char* fragment_encoding = options->fragment_encoding;
   GumboQuirksModeEnum quirks = options->quirks_mode;
   bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
   GumboNode* root;
-  // 2.
+  // 1. [Create a new Document node, and mark it as being an HTML document.]
+  // 2. [If the node document of the context element is in quirks mode, then
+  //    let the Document be in quirks mode. Otherwise, the node document of
+  //    the context element is in limited-quirks mode, then let the Document
+  //    be in limited-quirks mode. Otherwise, leave the Document in no-quirks
+  //    mode.]
   get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
-  // 3.
+  // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
+  //    declarative shadow roots to true.]
+  // 4. [Create a new HTML parser, and associate it with the just created Document node.]
+  // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
   parser->_parser_state->_fragment_ctx =
     create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
   GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
         break;
       case GUMBO_TAG_NOSCRIPT:
-        /* scripting is disabled in Gumbo, so leave the tokenizer
-         * in the default data state */
+        if (options->parse_noscript_content_as_text)
+          gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
         break;
       case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
         adjusted_current_node &&
           adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
       );
-      gumbo_lex(&parser, &token);
+      // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
+      //
+      // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
+      // the parse can leave the document in an inconsistent state.
+      if (unlikely(state->_open_elements.length > max_tree_depth)) {
+        parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
+        gumbo_debug("Tree depth limit exceeded.\n");
+        token.type = GUMBO_TOKEN_EOF;
+      } else {
+        gumbo_lex(&parser, &token);
+      }
     }
     const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
         break;
     }
     gumbo_debug (
-      "Handling %s token @%lu:%lu in state %u.\n",
+      "Handling %s token @%lu:%lu in insertion mode %u.\n",
       (char*) token_type,
       (unsigned long)token.position.line,
       (unsigned long)token.position.column,
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
         gumbo_free(token.v.end_tag.name);
         token.v.end_tag.name = NULL;
       }
-      if (unlikely(state->_open_elements.length > max_tree_depth)) {
-        parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
-        gumbo_debug("Tree depth limit exceeded.\n");
-        break;
-      }
     }

data/gumbo-parser/src/tokenizer.c CHANGED Viewed

@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
 // Sets the tag buffer original text and start point to the current iterator
 // position. This is necessary because attribute names & values may have
-// whitespace preceeding them, and so we can't assume that the actual token
+// whitespace preceding them, and so we can't assume that the actual token
 // starting point was the end of the last tag buffer usage.
 static void reset_tag_buffer_start_point(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
 }
 // Appends a codepoint to the current tag buffer. If
-// reinitilize_position_on_first is set, this also initializes the tag buffer
+// reinitialize_position_on_first is set, this also initializes the tag buffer
 // start point; the only time you would *not* want to pass true for this
 // parameter is if you want the original_text to include character (like an
 // opening quote) that doesn't appear in the value.
 static void append_char_to_tag_buffer (
   GumboParser* parser,
   int codepoint,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
 static void append_string_to_tag_buffer (
   GumboParser* parser,
   GumboStringPiece* str,
-  bool reinitilize_position_on_first
+  bool reinitialize_position_on_first
 ) {
   GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
-  if (buffer->length == 0 && reinitilize_position_on_first) {
+  if (buffer->length == 0 && reinitialize_position_on_first) {
     reset_tag_buffer_start_point(parser);
   }
   gumbo_string_buffer_append_string(str, buffer);

data/lib/nokogiri/class_resolver.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Nokogiri
   #
   module ClassResolver
     # #related_class restricts matching namespaces to those matching this set.
-    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
+    VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
     # :call-seq:
     #   related_class(class_name) → Class

data/lib/nokogiri/css/node.rb CHANGED Viewed

@@ -23,8 +23,12 @@ module Nokogiri
       ###
       # Convert this CSS node to xpath with +prefix+ using +visitor+
-      def to_xpath(prefix, visitor)
-        prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+      def to_xpath(visitor)
+        prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
+          "."
+        else
+          visitor.prefix
+        end
         prefix + visitor.accept(self)
       end

data/lib/nokogiri/css/parser.rb CHANGED Viewed

@@ -1,8 +1,8 @@
 # frozen_string_literal: true
 #
 # DO NOT MODIFY!!!!
-# This file is automatically generated by Racc 1.6.0
-# from Racc grammar file "".
+# This file is automatically generated by Racc 1.8.0
+# from Racc grammar file "parser.y".
 #
 require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
   racc_shift_n,
   racc_reduce_n,
   racc_use_result_var ]
+Ractor.make_shareable(Racc_arg) if defined?(Ractor)
 Racc_token_to_s_table = [
   "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
   "negation",
   "eql_incl_dash",
   "negation_arg" ]
+Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
 Racc_debug_parser = false
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
 end
 def _reduce_24(val, _values, result)
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
     result
 end
 def _reduce_25(val, _values, result)
-      name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
+      name = val[0]
       result = Node.new(:ELEMENT_NAME, [name])
     result

data/lib/nokogiri/css/parser.y CHANGED Viewed

@@ -64,9 +64,9 @@ rule
   ;
   namespaced_ident:
-    namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')]) }
+    namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
   | IDENT {
-      name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
+      name = val[0]
       result = Node.new(:ELEMENT_NAME, [name])
     }
   ;

data/lib/nokogiri/css/parser_extras.rb CHANGED Viewed

@@ -5,62 +5,9 @@ require "thread"
 module Nokogiri
   module CSS
     class Parser < Racc::Parser # :nodoc:
-      CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
-      @cache = {}
-      @mutex = Mutex.new
-      class << self
-        # Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
-        def cache_on?
-          !Thread.current[CACHE_SWITCH_NAME]
-        end
-        # Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
-        def set_cache(value) # rubocop:disable Naming/AccessorMethodName
-          Thread.current[CACHE_SWITCH_NAME] = !value
-        end
-        # Get the css selector in +string+ from the cache
-        def [](string)
-          return unless cache_on?
-          @mutex.synchronize { @cache[string] }
-        end
-        # Set the css selector in +string+ in the cache to +value+
-        def []=(string, value)
-          return value unless cache_on?
-          @mutex.synchronize { @cache[string] = value }
-        end
-        # Clear the cache
-        def clear_cache(create_new_object = false)
-          @mutex.synchronize do
-            if create_new_object
-              @cache = {}
-            else
-              @cache.clear
-            end
-          end
-        end
-        # Execute +block+ without cache
-        def without_cache(&block)
-          original_cache_setting = cache_on?
-          set_cache(false)
-          yield
-        ensure
-          set_cache(original_cache_setting)
-        end
-      end
-      # Create a new CSS parser with respect to +namespaces+
-      def initialize(namespaces = {})
+      def initialize
         @tokenizer = Tokenizer.new
-        @namespaces = namespaces
-        super()
+        super
       end
       def parse(string)
@@ -72,11 +19,10 @@ module Nokogiri
         @tokenizer.next_token
       end
-      # Get the xpath for +string+ using +options+
-      def xpath_for(string, prefix, visitor)
-        key = cache_key(string, prefix, visitor)
-        self.class[key] ||= parse(string).map do |ast|
-          ast.to_xpath(prefix, visitor)
+      # Get the xpath for +selector+ using +visitor+
+      def xpath_for(selector, visitor)
+        parse(selector).map do |ast|
+          ast.to_xpath(visitor)
         end
       end
@@ -85,12 +31,6 @@ module Nokogiri
         after = value_stack.compact.last
         raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
       end
-      def cache_key(query, prefix, visitor)
-        if self.class.cache_on?
-          [query, prefix, @namespaces, visitor.config]
-        end
-      end
     end
   end
 end

data/lib/nokogiri/css/selector_cache.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+module Nokogiri
+  module CSS
+    module SelectorCache # :nodoc:
+      @cache = {}
+      @mutex = Mutex.new
+      class << self
+        # Retrieve the cached XPath expressions for the key
+        def [](key)
+          @mutex.synchronize { @cache[key] }
+        end
+        # Insert the XPath expressions `value` at the cache key
+        def []=(key, value)
+          @mutex.synchronize { @cache[key] = value }
+        end
+        # Clear the cache
+        def clear_cache(create_new_object = false)
+          @mutex.synchronize do
+            if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
+              @cache = {}
+            else
+              @cache.clear
+            end
+          end
+        end
+        # Construct a unique cache key
+        def key(selector:, visitor:)
+          [selector, visitor.config]
+        end
+      end
+    end
+  end
+end

data/lib/nokogiri/css/tokenizer.rb CHANGED Viewed

@@ -63,13 +63,13 @@ class Tokenizer
                   when (text = @ss.scan(/has\([\s]*/))
                      action { [:HAS, text] }
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
+                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
                      action { [:FUNCTION, text] }
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
+                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
                      action { [:IDENT, text] }
-                  when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
+                  when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
                      action { [:HASH, text] }
                   when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -132,7 +132,7 @@ class Tokenizer
                   when (text = @ss.scan(/[\s]+/))
                      action { [:S, text] }
-                  when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
+                  when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
                      action { [:STRING, text] }
                   when (text = @ss.scan(/./))

data/lib/nokogiri/css/tokenizer.rex CHANGED Viewed

@@ -4,20 +4,21 @@ module CSS
 class Tokenizer
 macro
-  nl        \n|\r\n|\r|\f
+  nl        (\n|\r\n|\r|\f)
   w         [\s]*
   nonascii  [^\0-\177]
   num       -?([0-9]+|[0-9]*\.[0-9]+)
   unicode   \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
-  escape    {unicode}|\\[^\n\r\f0-9A-Fa-f]
-  nmchar    [_A-Za-z0-9-]|{nonascii}|{escape}
-  nmstart   [_A-Za-z]|{nonascii}|{escape}
-  ident     -?({nmstart})({nmchar})*
-  name      ({nmchar})+
+  escape    ({unicode}|\\[^\n\r\f0-9A-Fa-f])
+  nmchar    ([_A-Za-z0-9-]|{nonascii}|{escape})
+  nmstart   ([_A-Za-z]|{nonascii}|{escape})
+  name      {nmstart}{nmchar}*
+  ident     -?{name}
+  charref   {nmchar}+
   string1   "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
   string2   '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
-  string    {string1}|{string2}
+  string    ({string1}|{string2})
 rule
@@ -26,7 +27,7 @@ rule
             has\({w}         { [:HAS, text] }
             {ident}\({w}     { [:FUNCTION, text] }
             {ident}          { [:IDENT, text] }
-            \#{name}         { [:HASH, text] }
+            \#{charref}      { [:HASH, text] }
             {w}~={w}         { [:INCLUDES, text] }
             {w}\|={w}        { [:DASHMATCH, text] }
             {w}\^={w}        { [:PREFIXMATCH, text] }

data/lib/nokogiri/css/xpath_visitor.rb CHANGED Viewed

@@ -44,6 +44,18 @@ module Nokogiri
         VALUES = [XML, HTML4, HTML5]
       end
+      # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
+      attr_reader :builtins
+      # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
+      attr_reader :doctype
+      # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
+      attr_reader :prefix
+      # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
+      attr_reader :namespaces
       # :call-seq:
       #   new() → XPathVisitor
       #   new(builtins:, doctype:) → XPathVisitor
@@ -54,7 +66,12 @@ module Nokogiri
       #
       # [Returns] XPathVisitor
       #
-      def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML)
+      def initialize(
+        builtins: BuiltinsConfig::NEVER,
+        doctype: DoctypeConfig::XML,
+        prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
+        namespaces: nil
+      )
         unless BuiltinsConfig::VALUES.include?(builtins)
           raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
         end
@@ -64,6 +81,8 @@ module Nokogiri
         @builtins = builtins
         @doctype = doctype
+        @prefix = prefix
+        @namespaces = namespaces
       end
       # :call-seq: config() → Hash
@@ -72,7 +91,7 @@ module Nokogiri
       #   a Hash representing the configuration of the XPathVisitor, suitable for use as
       #   part of the CSS cache key.
       def config
-        { builtins: @builtins, doctype: @doctype }
+        { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
       end
       # :stopdoc:
@@ -128,6 +147,8 @@ module Nokogiri
           is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
           ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
         else
+          validate_xpath_function_name(node.value.first)
           # xpath function call, let's marshal those arguments
           args = ["."]
           args += node.value[1..-1].map do |n|
@@ -207,6 +228,7 @@ module Nokogiri
           when "parent" then "node()"
           when "root" then "not(parent::*)"
           else
+            validate_xpath_function_name(node.value.first)
             "nokogiri:#{node.value.first}(.)"
           end
         end
@@ -255,6 +277,14 @@ module Nokogiri
           else
             "*[local-name()='#{node.value.first}']"
           end
+        elsif node.value.length == 2 # has a namespace prefix
+          if node.value.first.nil? # namespace prefix is empty
+            node.value.last
+          else
+            node.value.join(":")
+          end
+        elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
+          "xmlns:#{node.value.first}"
         else
           node.value.first
         end
@@ -270,11 +300,17 @@ module Nokogiri
       private
+      def validate_xpath_function_name(name)
+        if name.start_with?("-")
+          raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
+        end
+      end
       def html5_element_name_needs_namespace_handling(node)
-        # if this is the wildcard selector "*", use it as normal
-        node.value.first != "*" &&
-          # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
-          !node.value.first.include?(":")
+        # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
+        node.value.length == 1 &&
+          # if this is the wildcard selector "*", use it as normal
+          node.value.first != "*"
       end
       def nth(node, options = {})