nokogiri 1.16.8 → 1.18.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +8 -5
  5. data/dependencies.yml +6 -6
  6. data/ext/nokogiri/extconf.rb +188 -142
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +18 -33
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +2 -10
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +163 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -6
  25. data/ext/nokogiri/xml_node.c +134 -103
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +54 -58
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +219 -131
  31. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  32. data/ext/nokogiri/xml_schema.c +50 -85
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +2 -4
  35. data/ext/nokogiri/xml_xpath_context.c +103 -100
  36. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  37. data/gumbo-parser/src/ascii.c +2 -2
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +63 -25
  42. data/gumbo-parser/src/tokenizer.c +6 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +43 -6
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +44 -23
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -72
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/xml/builder.rb +8 -1
  68. data/lib/nokogiri/xml/document.rb +70 -26
  69. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  70. data/lib/nokogiri/xml/node.rb +82 -11
  71. data/lib/nokogiri/xml/node_set.rb +9 -7
  72. data/lib/nokogiri/xml/parse_options.rb +1 -1
  73. data/lib/nokogiri/xml/pp/node.rb +6 -1
  74. data/lib/nokogiri/xml/reader.rb +46 -13
  75. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  76. data/lib/nokogiri/xml/sax/document.rb +174 -83
  77. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  78. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  79. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  80. data/lib/nokogiri/xml/sax.rb +48 -0
  81. data/lib/nokogiri/xml/schema.rb +112 -45
  82. data/lib/nokogiri/xml/searchable.rb +38 -42
  83. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  84. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  85. data/lib/nokogiri/xml.rb +13 -24
  86. data/lib/nokogiri/xslt.rb +3 -9
  87. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  88. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  89. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  90. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  91. metadata +13 -12
  92. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  93. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  94. data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
  95. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
56
56
  .fragment_encoding = NULL,
57
57
  .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
58
  .fragment_context_has_form_ancestor = false,
59
+ .parse_noscript_content_as_text = false,
59
60
  };
60
61
 
61
62
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -317,7 +318,7 @@ static GumboNode* create_node(GumboNodeType type) {
317
318
  return node;
318
319
  }
319
320
 
320
- static GumboNode* new_document_node() {
321
+ static GumboNode* new_document_node(void) {
321
322
  GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
322
323
  document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
323
324
  gumbo_vector_init(1, &document_node->v.document.children);
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
749
750
  GumboParserError* extra_data = &error->v.parser;
750
751
  extra_data->input_type = token->type;
751
752
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
- if (token->type == GUMBO_TOKEN_START_TAG) {
753
+ extra_data->input_name = NULL;
754
+ if (token->type == GUMBO_TOKEN_START_TAG)
755
+ {
753
756
  extra_data->input_tag = token->v.start_tag.tag;
754
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
757
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
758
+ extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
759
+ }
760
+ }
761
+ else if (token->type == GUMBO_TOKEN_END_TAG)
762
+ {
755
763
  extra_data->input_tag = token->v.end_tag.tag;
764
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
765
+ extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
766
+ }
756
767
  }
757
768
  const GumboParserState* state = parser->_parser_state;
758
769
  extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
763
774
  node->type == GUMBO_NODE_ELEMENT
764
775
  || node->type == GUMBO_NODE_TEMPLATE
765
776
  );
766
- gumbo_vector_add (
767
- (void*) node->v.element.tag,
768
- &extra_data->tag_stack
769
- );
777
+ void *tag;
778
+ if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
779
+ tag = gumbo_strdup(node->v.element.name);
780
+ } else {
781
+ tag = (void *)(uintptr_t)node->v.element.tag;
782
+ }
783
+ gumbo_vector_add(tag, &extra_data->tag_stack);
770
784
  }
771
785
  }
772
786
 
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
1187
1201
  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
1202
  insert_element(parser, element, false);
1189
1203
  gumbo_debug (
1190
- "Inserting %s element (@%p) from tag type.\n",
1204
+ "Inserting <%s> element (@%p) from tag type.\n",
1191
1205
  gumbo_normalized_tagname(tag),
1192
1206
  (void*)element
1193
1207
  );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
1204
1218
  assert(token->type == GUMBO_TOKEN_START_TAG);
1205
1219
  GumboNode* element = create_element_from_token(token, tag_namespace);
1206
1220
  insert_element(parser, element, false);
1221
+ gumbo_debug (
1222
+ "Inserting <%s> foreign element (@%p).\n",
1223
+ gumbo_normalized_tagname(element->v.element.tag),
1224
+ (void*)element
1225
+ );
1207
1226
  if (
1208
1227
  token_has_attribute(token, "xmlns")
1209
1228
  && !attribute_matches_case_sensitive (
@@ -1978,7 +1997,7 @@ static void adjust_svg_tag(GumboToken* token) {
1978
1997
  assert(token->type == GUMBO_TOKEN_START_TAG);
1979
1998
  if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
1980
1999
  assert(token->v.start_tag.name == NULL);
1981
- token->v.start_tag.name = "foreignObject";
2000
+ token->v.start_tag.name = (char *)"foreignObject";
1982
2001
  } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
1983
2002
  assert(token->v.start_tag.name);
1984
2003
  const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
2066
2085
 
2067
2086
  // This is here to clean up memory when the spec says "Ignore current token."
2068
2087
  static void ignore_token(GumboParser* parser) {
2088
+ gumbo_debug("Ignoring token.\n");
2069
2089
  GumboToken* token = parser->_parser_state->_current_token;
2070
2090
  // Ownership of the token's internal buffers are normally transferred to the
2071
2091
  // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2430
2450
 
2431
2451
  // https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
2452
  static void finish_parsing(GumboParser* parser) {
2433
- gumbo_debug("Finishing parsing");
2453
+ gumbo_debug("Finishing parsing\n");
2434
2454
  maybe_flush_text_node_buffer(parser);
2435
2455
  GumboParserState* state = parser->_parser_state;
2436
2456
  for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2608
2628
  }
2609
2629
  if (
2610
2630
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2631
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
2611
2632
  ) {
2612
2633
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
2634
  return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
3313
3334
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3314
3335
  return;
3315
3336
  }
3316
- if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3337
+ if (
3338
+ tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
3339
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
3340
+ ) {
3317
3341
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3318
3342
  return;
3319
3343
  }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4389
4413
 
4390
4414
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4391
4415
  static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4392
- gumbo_debug("Handling foreign content");
4416
+ gumbo_debug("Handling foreign content.\n");
4393
4417
  switch (token->type) {
4394
4418
  case GUMBO_TOKEN_NULL:
4395
4419
  parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4507
4531
  if (i == 0)
4508
4532
  return;
4509
4533
  // We can't call handle_token directly because the current node is still in
4510
- // a foriegn namespace, so it would re-enter this and result in infinite
4534
+ // a foreign namespace, so it would re-enter this and result in infinite
4511
4535
  // recursion.
4512
4536
  handle_html_content(parser, token);
4513
4537
  }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
4627
4651
  const char* fragment_encoding = options->fragment_encoding;
4628
4652
  GumboQuirksModeEnum quirks = options->quirks_mode;
4629
4653
  bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4630
-
4631
4654
  GumboNode* root;
4632
- // 2.
4655
+
4656
+ // 1. [Create a new Document node, and mark it as being an HTML document.]
4657
+ // 2. [If the node document of the context element is in quirks mode, then
4658
+ // let the Document be in quirks mode. Otherwise, the node document of
4659
+ // the context element is in limited-quirks mode, then let the Document
4660
+ // be in limited-quirks mode. Otherwise, leave the Document in no-quirks
4661
+ // mode.]
4633
4662
  get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4634
4663
 
4635
- // 3.
4664
+ // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
4665
+ // declarative shadow roots to true.]
4666
+ // 4. [Create a new HTML parser, and associate it with the just created Document node.]
4667
+ // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
4636
4668
  parser->_parser_state->_fragment_ctx =
4637
4669
  create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4638
4670
  GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
4659
4691
  break;
4660
4692
 
4661
4693
  case GUMBO_TAG_NOSCRIPT:
4662
- /* scripting is disabled in Gumbo, so leave the tokenizer
4663
- * in the default data state */
4694
+ if (options->parse_noscript_content_as_text)
4695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4664
4696
  break;
4665
4697
 
4666
4698
  case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
4762
4794
  adjusted_current_node &&
4763
4795
  adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4764
4796
  );
4765
- gumbo_lex(&parser, &token);
4797
+ // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
4798
+ //
4799
+ // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
4800
+ // the parse can leave the document in an inconsistent state.
4801
+ if (unlikely(state->_open_elements.length > max_tree_depth)) {
4802
+ parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4803
+ gumbo_debug("Tree depth limit exceeded.\n");
4804
+ token.type = GUMBO_TOKEN_EOF;
4805
+ } else {
4806
+ gumbo_lex(&parser, &token);
4807
+ }
4808
+
4766
4809
  }
4767
4810
 
4768
4811
  const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
4786
4829
  break;
4787
4830
  }
4788
4831
  gumbo_debug (
4789
- "Handling %s token @%lu:%lu in state %u.\n",
4832
+ "Handling %s token @%lu:%lu in insertion mode %u.\n",
4790
4833
  (char*) token_type,
4791
4834
  (unsigned long)token.position.line,
4792
4835
  (unsigned long)token.position.column,
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
4830
4873
  gumbo_free(token.v.end_tag.name);
4831
4874
  token.v.end_tag.name = NULL;
4832
4875
  }
4833
- if (unlikely(state->_open_elements.length > max_tree_depth)) {
4834
- parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4835
- gumbo_debug("Tree depth limit exceeded.\n");
4836
- break;
4837
- }
4838
4876
  }
4839
4877
 
4840
4878
 
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
340
340
 
341
341
  // Sets the tag buffer original text and start point to the current iterator
342
342
  // position. This is necessary because attribute names & values may have
343
- // whitespace preceeding them, and so we can't assume that the actual token
343
+ // whitespace preceding them, and so we can't assume that the actual token
344
344
  // starting point was the end of the last tag buffer usage.
345
345
  static void reset_tag_buffer_start_point(GumboParser* parser) {
346
346
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
569
569
  }
570
570
 
571
571
  // Appends a codepoint to the current tag buffer. If
572
- // reinitilize_position_on_first is set, this also initializes the tag buffer
572
+ // reinitialize_position_on_first is set, this also initializes the tag buffer
573
573
  // start point; the only time you would *not* want to pass true for this
574
574
  // parameter is if you want the original_text to include character (like an
575
575
  // opening quote) that doesn't appear in the value.
576
576
  static void append_char_to_tag_buffer (
577
577
  GumboParser* parser,
578
578
  int codepoint,
579
- bool reinitilize_position_on_first
579
+ bool reinitialize_position_on_first
580
580
  ) {
581
581
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
582
- if (buffer->length == 0 && reinitilize_position_on_first) {
582
+ if (buffer->length == 0 && reinitialize_position_on_first) {
583
583
  reset_tag_buffer_start_point(parser);
584
584
  }
585
585
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
589
589
  static void append_string_to_tag_buffer (
590
590
  GumboParser* parser,
591
591
  GumboStringPiece* str,
592
- bool reinitilize_position_on_first
592
+ bool reinitialize_position_on_first
593
593
  ) {
594
594
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
595
- if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ if (buffer->length == 0 && reinitialize_position_on_first) {
596
596
  reset_tag_buffer_start_point(parser);
597
597
  }
598
598
  gumbo_string_buffer_append_string(str, buffer);
@@ -18,7 +18,7 @@ module Nokogiri
18
18
  #
19
19
  module ClassResolver
20
20
  # #related_class restricts matching namespaces to those matching this set.
21
- VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
21
+ VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
22
22
 
23
23
  # :call-seq:
24
24
  # related_class(class_name) → Class
@@ -23,8 +23,12 @@ module Nokogiri
23
23
 
24
24
  ###
25
25
  # Convert this CSS node to xpath with +prefix+ using +visitor+
26
- def to_xpath(prefix, visitor)
27
- prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
26
+ def to_xpath(visitor)
27
+ prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
28
+ "."
29
+ else
30
+ visitor.prefix
31
+ end
28
32
  prefix + visitor.accept(self)
29
33
  end
30
34
 
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
  #
3
3
  # DO NOT MODIFY!!!!
4
- # This file is automatically generated by Racc 1.6.0
5
- # from Racc grammar file "".
4
+ # This file is automatically generated by Racc 1.8.0
5
+ # from Racc grammar file "parser.y".
6
6
  #
7
7
 
8
8
  require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
291
291
  racc_shift_n,
292
292
  racc_reduce_n,
293
293
  racc_use_result_var ]
294
+ Ractor.make_shareable(Racc_arg) if defined?(Ractor)
294
295
 
295
296
  Racc_token_to_s_table = [
296
297
  "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
351
352
  "negation",
352
353
  "eql_incl_dash",
353
354
  "negation_arg" ]
355
+ Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
354
356
 
355
357
  Racc_debug_parser = false
356
358
 
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
468
470
  end
469
471
 
470
472
  def _reduce_24(val, _values, result)
471
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
473
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
472
474
  result
473
475
  end
474
476
 
475
477
  def _reduce_25(val, _values, result)
476
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
478
+ name = val[0]
477
479
  result = Node.new(:ELEMENT_NAME, [name])
478
480
 
479
481
  result
@@ -64,9 +64,9 @@ rule
64
64
  ;
65
65
 
66
66
  namespaced_ident:
67
- namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')]) }
67
+ namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
68
68
  | IDENT {
69
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
69
+ name = val[0]
70
70
  result = Node.new(:ELEMENT_NAME, [name])
71
71
  }
72
72
  ;
@@ -5,62 +5,9 @@ require "thread"
5
5
  module Nokogiri
6
6
  module CSS
7
7
  class Parser < Racc::Parser # :nodoc:
8
- CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
9
-
10
- @cache = {}
11
- @mutex = Mutex.new
12
-
13
- class << self
14
- # Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
15
- def cache_on?
16
- !Thread.current[CACHE_SWITCH_NAME]
17
- end
18
-
19
- # Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
20
- def set_cache(value) # rubocop:disable Naming/AccessorMethodName
21
- Thread.current[CACHE_SWITCH_NAME] = !value
22
- end
23
-
24
- # Get the css selector in +string+ from the cache
25
- def [](string)
26
- return unless cache_on?
27
-
28
- @mutex.synchronize { @cache[string] }
29
- end
30
-
31
- # Set the css selector in +string+ in the cache to +value+
32
- def []=(string, value)
33
- return value unless cache_on?
34
-
35
- @mutex.synchronize { @cache[string] = value }
36
- end
37
-
38
- # Clear the cache
39
- def clear_cache(create_new_object = false)
40
- @mutex.synchronize do
41
- if create_new_object
42
- @cache = {}
43
- else
44
- @cache.clear
45
- end
46
- end
47
- end
48
-
49
- # Execute +block+ without cache
50
- def without_cache(&block)
51
- original_cache_setting = cache_on?
52
- set_cache(false)
53
- yield
54
- ensure
55
- set_cache(original_cache_setting)
56
- end
57
- end
58
-
59
- # Create a new CSS parser with respect to +namespaces+
60
- def initialize(namespaces = {})
8
+ def initialize
61
9
  @tokenizer = Tokenizer.new
62
- @namespaces = namespaces
63
- super()
10
+ super
64
11
  end
65
12
 
66
13
  def parse(string)
@@ -72,11 +19,10 @@ module Nokogiri
72
19
  @tokenizer.next_token
73
20
  end
74
21
 
75
- # Get the xpath for +string+ using +options+
76
- def xpath_for(string, prefix, visitor)
77
- key = cache_key(string, prefix, visitor)
78
- self.class[key] ||= parse(string).map do |ast|
79
- ast.to_xpath(prefix, visitor)
22
+ # Get the xpath for +selector+ using +visitor+
23
+ def xpath_for(selector, visitor)
24
+ parse(selector).map do |ast|
25
+ ast.to_xpath(visitor)
80
26
  end
81
27
  end
82
28
 
@@ -85,12 +31,6 @@ module Nokogiri
85
31
  after = value_stack.compact.last
86
32
  raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
87
33
  end
88
-
89
- def cache_key(query, prefix, visitor)
90
- if self.class.cache_on?
91
- [query, prefix, @namespaces, visitor.config]
92
- end
93
- end
94
34
  end
95
35
  end
96
36
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module CSS
5
+ module SelectorCache # :nodoc:
6
+ @cache = {}
7
+ @mutex = Mutex.new
8
+
9
+ class << self
10
+ # Retrieve the cached XPath expressions for the key
11
+ def [](key)
12
+ @mutex.synchronize { @cache[key] }
13
+ end
14
+
15
+ # Insert the XPath expressions `value` at the cache key
16
+ def []=(key, value)
17
+ @mutex.synchronize { @cache[key] = value }
18
+ end
19
+
20
+ # Clear the cache
21
+ def clear_cache(create_new_object = false)
22
+ @mutex.synchronize do
23
+ if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
24
+ @cache = {}
25
+ else
26
+ @cache.clear
27
+ end
28
+ end
29
+ end
30
+
31
+ # Construct a unique key cache key
32
+ def key(selector:, visitor:)
33
+ [selector, visitor.config]
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
@@ -63,13 +63,13 @@ class Tokenizer
63
63
  when (text = @ss.scan(/has\([\s]*/))
64
64
  action { [:HAS, text] }
65
65
 
66
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
66
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
67
67
  action { [:FUNCTION, text] }
68
68
 
69
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
69
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
70
70
  action { [:IDENT, text] }
71
71
 
72
- when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
72
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
73
73
  action { [:HASH, text] }
74
74
 
75
75
  when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -132,7 +132,7 @@ class Tokenizer
132
132
  when (text = @ss.scan(/[\s]+/))
133
133
  action { [:S, text] }
134
134
 
135
- when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
135
+ when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
136
136
  action { [:STRING, text] }
137
137
 
138
138
  when (text = @ss.scan(/./))
@@ -4,20 +4,21 @@ module CSS
4
4
  class Tokenizer
5
5
 
6
6
  macro
7
- nl \n|\r\n|\r|\f
7
+ nl (\n|\r\n|\r|\f)
8
8
  w [\s]*
9
9
  nonascii [^\0-\177]
10
10
  num -?([0-9]+|[0-9]*\.[0-9]+)
11
11
  unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
12
12
 
13
- escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
14
- nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
15
- nmstart [_A-Za-z]|{nonascii}|{escape}
16
- ident -?({nmstart})({nmchar})*
17
- name ({nmchar})+
13
+ escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
14
+ nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
15
+ nmstart ([_A-Za-z]|{nonascii}|{escape})
16
+ name {nmstart}{nmchar}*
17
+ ident -?{name}
18
+ charref {nmchar}+
18
19
  string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
19
20
  string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
20
- string {string1}|{string2}
21
+ string ({string1}|{string2})
21
22
 
22
23
  rule
23
24
 
@@ -26,7 +27,7 @@ rule
26
27
  has\({w} { [:HAS, text] }
27
28
  {ident}\({w} { [:FUNCTION, text] }
28
29
  {ident} { [:IDENT, text] }
29
- \#{name} { [:HASH, text] }
30
+ \#{charref} { [:HASH, text] }
30
31
  {w}~={w} { [:INCLUDES, text] }
31
32
  {w}\|={w} { [:DASHMATCH, text] }
32
33
  {w}\^={w} { [:PREFIXMATCH, text] }
@@ -44,6 +44,18 @@ module Nokogiri
44
44
  VALUES = [XML, HTML4, HTML5]
45
45
  end
46
46
 
47
+ # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
48
+ attr_reader :builtins
49
+
50
+ # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
51
+ attr_reader :doctype
52
+
53
+ # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
54
+ attr_reader :prefix
55
+
56
+ # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
57
+ attr_reader :namespaces
58
+
47
59
  # :call-seq:
48
60
  # new() → XPathVisitor
49
61
  # new(builtins:, doctype:) → XPathVisitor
@@ -54,7 +66,12 @@ module Nokogiri
54
66
  #
55
67
  # [Returns] XPathVisitor
56
68
  #
57
- def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML)
69
+ def initialize(
70
+ builtins: BuiltinsConfig::NEVER,
71
+ doctype: DoctypeConfig::XML,
72
+ prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
73
+ namespaces: nil
74
+ )
58
75
  unless BuiltinsConfig::VALUES.include?(builtins)
59
76
  raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
60
77
  end
@@ -64,6 +81,8 @@ module Nokogiri
64
81
 
65
82
  @builtins = builtins
66
83
  @doctype = doctype
84
+ @prefix = prefix
85
+ @namespaces = namespaces
67
86
  end
68
87
 
69
88
  # :call-seq: config() → Hash
@@ -72,7 +91,7 @@ module Nokogiri
72
91
  # a Hash representing the configuration of the XPathVisitor, suitable for use as
73
92
  # part of the CSS cache key.
74
93
  def config
75
- { builtins: @builtins, doctype: @doctype }
94
+ { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
76
95
  end
77
96
 
78
97
  # :stopdoc:
@@ -128,6 +147,8 @@ module Nokogiri
128
147
  is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
129
148
  ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
130
149
  else
150
+ validate_xpath_function_name(node.value.first)
151
+
131
152
  # xpath function call, let's marshal those arguments
132
153
  args = ["."]
133
154
  args += node.value[1..-1].map do |n|
@@ -207,6 +228,7 @@ module Nokogiri
207
228
  when "parent" then "node()"
208
229
  when "root" then "not(parent::*)"
209
230
  else
231
+ validate_xpath_function_name(node.value.first)
210
232
  "nokogiri:#{node.value.first}(.)"
211
233
  end
212
234
  end
@@ -255,6 +277,15 @@ module Nokogiri
255
277
  else
256
278
  "*[local-name()='#{node.value.first}']"
257
279
  end
280
+ elsif node.value.length == 2 # has a namespace prefix
281
+ if node.value.first.nil? # namespace prefix is empty
282
+ node.value.last
283
+ else
284
+ node.value.join(":")
285
+ end
286
+ elsif node.value.first != "*" && @namespaces&.key?("xmlns")
287
+ # apply the default namespace (if one is present) to a non-wildcard selector
288
+ "xmlns:#{node.value.first}"
258
289
  else
259
290
  node.value.first
260
291
  end
@@ -270,11 +301,17 @@ module Nokogiri
270
301
 
271
302
  private
272
303
 
304
+ def validate_xpath_function_name(name)
305
+ if name.start_with?("-")
306
+ raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
307
+ end
308
+ end
309
+
273
310
  def html5_element_name_needs_namespace_handling(node)
274
- # if this is the wildcard selector "*", use it as normal
275
- node.value.first != "*" &&
276
- # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
277
- !node.value.first.include?(":")
311
+ # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
312
+ node.value.length == 1 &&
313
+ # if this is the wildcard selector "*", use it as normal
314
+ node.value.first != "*"
278
315
  end
279
316
 
280
317
  def nth(node, options = {})