nokogiri 1.16.3 → 1.18.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (95) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +14 -22
  3. data/LICENSE-DEPENDENCIES.md +6 -6
  4. data/README.md +8 -5
  5. data/dependencies.yml +9 -9
  6. data/ext/nokogiri/extconf.rb +188 -142
  7. data/ext/nokogiri/gumbo.c +69 -53
  8. data/ext/nokogiri/html4_document.c +10 -4
  9. data/ext/nokogiri/html4_element_description.c +18 -18
  10. data/ext/nokogiri/html4_sax_parser.c +40 -0
  11. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  12. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  13. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  14. data/ext/nokogiri/nokogiri.c +9 -2
  15. data/ext/nokogiri/nokogiri.h +18 -33
  16. data/ext/nokogiri/xml_attr.c +1 -1
  17. data/ext/nokogiri/xml_cdata.c +2 -10
  18. data/ext/nokogiri/xml_comment.c +3 -8
  19. data/ext/nokogiri/xml_document.c +163 -156
  20. data/ext/nokogiri/xml_document_fragment.c +10 -25
  21. data/ext/nokogiri/xml_dtd.c +1 -1
  22. data/ext/nokogiri/xml_element_content.c +9 -9
  23. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  24. data/ext/nokogiri/xml_namespace.c +6 -6
  25. data/ext/nokogiri/xml_node.c +141 -104
  26. data/ext/nokogiri/xml_node_set.c +46 -44
  27. data/ext/nokogiri/xml_reader.c +54 -58
  28. data/ext/nokogiri/xml_relax_ng.c +35 -56
  29. data/ext/nokogiri/xml_sax_parser.c +156 -88
  30. data/ext/nokogiri/xml_sax_parser_context.c +219 -131
  31. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  32. data/ext/nokogiri/xml_schema.c +50 -85
  33. data/ext/nokogiri/xml_syntax_error.c +19 -11
  34. data/ext/nokogiri/xml_text.c +2 -4
  35. data/ext/nokogiri/xml_xpath_context.c +103 -100
  36. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  37. data/gumbo-parser/src/ascii.c +2 -2
  38. data/gumbo-parser/src/error.c +76 -48
  39. data/gumbo-parser/src/error.h +5 -1
  40. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  41. data/gumbo-parser/src/parser.c +63 -25
  42. data/gumbo-parser/src/tokenizer.c +6 -6
  43. data/lib/nokogiri/class_resolver.rb +1 -1
  44. data/lib/nokogiri/css/node.rb +6 -2
  45. data/lib/nokogiri/css/parser.rb +6 -4
  46. data/lib/nokogiri/css/parser.y +2 -2
  47. data/lib/nokogiri/css/parser_extras.rb +6 -66
  48. data/lib/nokogiri/css/selector_cache.rb +38 -0
  49. data/lib/nokogiri/css/tokenizer.rb +4 -4
  50. data/lib/nokogiri/css/tokenizer.rex +9 -8
  51. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  52. data/lib/nokogiri/css.rb +86 -20
  53. data/lib/nokogiri/decorators/slop.rb +3 -5
  54. data/lib/nokogiri/encoding_handler.rb +2 -2
  55. data/lib/nokogiri/html4/document.rb +44 -23
  56. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  57. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  58. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  59. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  60. data/lib/nokogiri/html4.rb +9 -14
  61. data/lib/nokogiri/html5/builder.rb +40 -0
  62. data/lib/nokogiri/html5/document.rb +61 -30
  63. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  64. data/lib/nokogiri/html5/node.rb +4 -4
  65. data/lib/nokogiri/html5.rb +114 -72
  66. data/lib/nokogiri/version/constant.rb +1 -1
  67. data/lib/nokogiri/xml/builder.rb +8 -1
  68. data/lib/nokogiri/xml/document.rb +70 -26
  69. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  70. data/lib/nokogiri/xml/node.rb +82 -11
  71. data/lib/nokogiri/xml/node_set.rb +9 -7
  72. data/lib/nokogiri/xml/parse_options.rb +1 -1
  73. data/lib/nokogiri/xml/pp/node.rb +6 -1
  74. data/lib/nokogiri/xml/reader.rb +46 -13
  75. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  76. data/lib/nokogiri/xml/sax/document.rb +174 -83
  77. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  78. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  79. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  80. data/lib/nokogiri/xml/sax.rb +48 -0
  81. data/lib/nokogiri/xml/schema.rb +112 -45
  82. data/lib/nokogiri/xml/searchable.rb +38 -42
  83. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  84. data/lib/nokogiri/xml/xpath_context.rb +14 -3
  85. data/lib/nokogiri/xml.rb +13 -24
  86. data/lib/nokogiri/xslt.rb +3 -9
  87. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  88. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  89. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  90. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  91. metadata +13 -14
  92. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  93. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  94. data/ports/archives/libxml2-2.12.6.tar.xz +0 -0
  95. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
56
56
  .fragment_encoding = NULL,
57
57
  .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
58
  .fragment_context_has_form_ancestor = false,
59
+ .parse_noscript_content_as_text = false,
59
60
  };
60
61
 
61
62
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -317,7 +318,7 @@ static GumboNode* create_node(GumboNodeType type) {
317
318
  return node;
318
319
  }
319
320
 
320
- static GumboNode* new_document_node() {
321
+ static GumboNode* new_document_node(void) {
321
322
  GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
322
323
  document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
323
324
  gumbo_vector_init(1, &document_node->v.document.children);
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
749
750
  GumboParserError* extra_data = &error->v.parser;
750
751
  extra_data->input_type = token->type;
751
752
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
- if (token->type == GUMBO_TOKEN_START_TAG) {
753
+ extra_data->input_name = NULL;
754
+ if (token->type == GUMBO_TOKEN_START_TAG)
755
+ {
753
756
  extra_data->input_tag = token->v.start_tag.tag;
754
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
757
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
758
+ extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
759
+ }
760
+ }
761
+ else if (token->type == GUMBO_TOKEN_END_TAG)
762
+ {
755
763
  extra_data->input_tag = token->v.end_tag.tag;
764
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
765
+ extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
766
+ }
756
767
  }
757
768
  const GumboParserState* state = parser->_parser_state;
758
769
  extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
763
774
  node->type == GUMBO_NODE_ELEMENT
764
775
  || node->type == GUMBO_NODE_TEMPLATE
765
776
  );
766
- gumbo_vector_add (
767
- (void*) node->v.element.tag,
768
- &extra_data->tag_stack
769
- );
777
+ void *tag;
778
+ if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
779
+ tag = gumbo_strdup(node->v.element.name);
780
+ } else {
781
+ tag = (void *)(uintptr_t)node->v.element.tag;
782
+ }
783
+ gumbo_vector_add(tag, &extra_data->tag_stack);
770
784
  }
771
785
  }
772
786
 
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
1187
1201
  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
1202
  insert_element(parser, element, false);
1189
1203
  gumbo_debug (
1190
- "Inserting %s element (@%p) from tag type.\n",
1204
+ "Inserting <%s> element (@%p) from tag type.\n",
1191
1205
  gumbo_normalized_tagname(tag),
1192
1206
  (void*)element
1193
1207
  );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
1204
1218
  assert(token->type == GUMBO_TOKEN_START_TAG);
1205
1219
  GumboNode* element = create_element_from_token(token, tag_namespace);
1206
1220
  insert_element(parser, element, false);
1221
+ gumbo_debug (
1222
+ "Inserting <%s> foreign element (@%p).\n",
1223
+ gumbo_normalized_tagname(element->v.element.tag),
1224
+ (void*)element
1225
+ );
1207
1226
  if (
1208
1227
  token_has_attribute(token, "xmlns")
1209
1228
  && !attribute_matches_case_sensitive (
@@ -1978,7 +1997,7 @@ static void adjust_svg_tag(GumboToken* token) {
1978
1997
  assert(token->type == GUMBO_TOKEN_START_TAG);
1979
1998
  if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
1980
1999
  assert(token->v.start_tag.name == NULL);
1981
- token->v.start_tag.name = "foreignObject";
2000
+ token->v.start_tag.name = (char *)"foreignObject";
1982
2001
  } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
1983
2002
  assert(token->v.start_tag.name);
1984
2003
  const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
2066
2085
 
2067
2086
  // This is here to clean up memory when the spec says "Ignore current token."
2068
2087
  static void ignore_token(GumboParser* parser) {
2088
+ gumbo_debug("Ignoring token.\n");
2069
2089
  GumboToken* token = parser->_parser_state->_current_token;
2070
2090
  // Ownership of the token's internal buffers is normally transferred to the
2071
2091
  // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2430
2450
 
2431
2451
  // https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
2452
  static void finish_parsing(GumboParser* parser) {
2433
- gumbo_debug("Finishing parsing");
2453
+ gumbo_debug("Finishing parsing\n");
2434
2454
  maybe_flush_text_node_buffer(parser);
2435
2455
  GumboParserState* state = parser->_parser_state;
2436
2456
  for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2608
2628
  }
2609
2629
  if (
2610
2630
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2631
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
2611
2632
  ) {
2612
2633
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
2634
  return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
3313
3334
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3314
3335
  return;
3315
3336
  }
3316
- if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3337
+ if (
3338
+ tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
3339
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
3340
+ ) {
3317
3341
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3318
3342
  return;
3319
3343
  }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4389
4413
 
4390
4414
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4391
4415
  static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4392
- gumbo_debug("Handling foreign content");
4416
+ gumbo_debug("Handling foreign content.\n");
4393
4417
  switch (token->type) {
4394
4418
  case GUMBO_TOKEN_NULL:
4395
4419
  parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4507
4531
  if (i == 0)
4508
4532
  return;
4509
4533
  // We can't call handle_token directly because the current node is still in
4510
- // a foriegn namespace, so it would re-enter this and result in infinite
4534
+ // a foreign namespace, so it would re-enter this and result in infinite
4511
4535
  // recursion.
4512
4536
  handle_html_content(parser, token);
4513
4537
  }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
4627
4651
  const char* fragment_encoding = options->fragment_encoding;
4628
4652
  GumboQuirksModeEnum quirks = options->quirks_mode;
4629
4653
  bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4630
-
4631
4654
  GumboNode* root;
4632
- // 2.
4655
+
4656
+ // 1. [Create a new Document node, and mark it as being an HTML document.]
4657
+ // 2. [If the node document of the context element is in quirks mode, then
4658
+ // let the Document be in quirks mode. Otherwise, the node document of
4659
+ // the context element is in limited-quirks mode, then let the Document
4660
+ // be in limited-quirks mode. Otherwise, leave the Document in no-quirks
4661
+ // mode.]
4633
4662
  get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4634
4663
 
4635
- // 3.
4664
+ // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
4665
+ // declarative shadow roots to true.]
4666
+ // 4. [Create a new HTML parser, and associate it with the just created Document node.]
4667
+ // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
4636
4668
  parser->_parser_state->_fragment_ctx =
4637
4669
  create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4638
4670
  GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
4659
4691
  break;
4660
4692
 
4661
4693
  case GUMBO_TAG_NOSCRIPT:
4662
- /* scripting is disabled in Gumbo, so leave the tokenizer
4663
- * in the default data state */
4694
+ if (options->parse_noscript_content_as_text)
4695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4664
4696
  break;
4665
4697
 
4666
4698
  case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
4762
4794
  adjusted_current_node &&
4763
4795
  adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4764
4796
  );
4765
- gumbo_lex(&parser, &token);
4797
+ // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
4798
+ //
4799
+ // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
4800
+ // the parse can leave the document in an inconsistent state.
4801
+ if (unlikely(state->_open_elements.length > max_tree_depth)) {
4802
+ parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4803
+ gumbo_debug("Tree depth limit exceeded.\n");
4804
+ token.type = GUMBO_TOKEN_EOF;
4805
+ } else {
4806
+ gumbo_lex(&parser, &token);
4807
+ }
4808
+
4766
4809
  }
4767
4810
 
4768
4811
  const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
4786
4829
  break;
4787
4830
  }
4788
4831
  gumbo_debug (
4789
- "Handling %s token @%lu:%lu in state %u.\n",
4832
+ "Handling %s token @%lu:%lu in insertion mode %u.\n",
4790
4833
  (char*) token_type,
4791
4834
  (unsigned long)token.position.line,
4792
4835
  (unsigned long)token.position.column,
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
4830
4873
  gumbo_free(token.v.end_tag.name);
4831
4874
  token.v.end_tag.name = NULL;
4832
4875
  }
4833
- if (unlikely(state->_open_elements.length > max_tree_depth)) {
4834
- parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4835
- gumbo_debug("Tree depth limit exceeded.\n");
4836
- break;
4837
- }
4838
4876
  }
4839
4877
 
4840
4878
 
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
340
340
 
341
341
  // Sets the tag buffer original text and start point to the current iterator
342
342
  // position. This is necessary because attribute names & values may have
343
- // whitespace preceeding them, and so we can't assume that the actual token
343
+ // whitespace preceding them, and so we can't assume that the actual token
344
344
  // starting point was the end of the last tag buffer usage.
345
345
  static void reset_tag_buffer_start_point(GumboParser* parser) {
346
346
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
569
569
  }
570
570
 
571
571
  // Appends a codepoint to the current tag buffer. If
572
- // reinitilize_position_on_first is set, this also initializes the tag buffer
572
+ // reinitialize_position_on_first is set, this also initializes the tag buffer
573
573
  // start point; the only time you would *not* want to pass true for this
574
574
  // parameter is if you want the original_text to include character (like an
575
575
  // opening quote) that doesn't appear in the value.
576
576
  static void append_char_to_tag_buffer (
577
577
  GumboParser* parser,
578
578
  int codepoint,
579
- bool reinitilize_position_on_first
579
+ bool reinitialize_position_on_first
580
580
  ) {
581
581
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
582
- if (buffer->length == 0 && reinitilize_position_on_first) {
582
+ if (buffer->length == 0 && reinitialize_position_on_first) {
583
583
  reset_tag_buffer_start_point(parser);
584
584
  }
585
585
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
589
589
  static void append_string_to_tag_buffer (
590
590
  GumboParser* parser,
591
591
  GumboStringPiece* str,
592
- bool reinitilize_position_on_first
592
+ bool reinitialize_position_on_first
593
593
  ) {
594
594
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
595
- if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ if (buffer->length == 0 && reinitialize_position_on_first) {
596
596
  reset_tag_buffer_start_point(parser);
597
597
  }
598
598
  gumbo_string_buffer_append_string(str, buffer);
@@ -18,7 +18,7 @@ module Nokogiri
18
18
  #
19
19
  module ClassResolver
20
20
  # #related_class restricts matching namespaces to those matching this set.
21
- VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
21
+ VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
22
22
 
23
23
  # :call-seq:
24
24
  # related_class(class_name) → Class
@@ -23,8 +23,12 @@ module Nokogiri
23
23
 
24
24
  ###
25
25
  # Convert this CSS node to xpath with +prefix+ using +visitor+
26
- def to_xpath(prefix, visitor)
27
- prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
26
+ def to_xpath(visitor)
27
+ prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
28
+ "."
29
+ else
30
+ visitor.prefix
31
+ end
28
32
  prefix + visitor.accept(self)
29
33
  end
30
34
 
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
  #
3
3
  # DO NOT MODIFY!!!!
4
- # This file is automatically generated by Racc 1.6.0
5
- # from Racc grammar file "".
4
+ # This file is automatically generated by Racc 1.8.0
5
+ # from Racc grammar file "parser.y".
6
6
  #
7
7
 
8
8
  require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
291
291
  racc_shift_n,
292
292
  racc_reduce_n,
293
293
  racc_use_result_var ]
294
+ Ractor.make_shareable(Racc_arg) if defined?(Ractor)
294
295
 
295
296
  Racc_token_to_s_table = [
296
297
  "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
351
352
  "negation",
352
353
  "eql_incl_dash",
353
354
  "negation_arg" ]
355
+ Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
354
356
 
355
357
  Racc_debug_parser = false
356
358
 
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
468
470
  end
469
471
 
470
472
  def _reduce_24(val, _values, result)
471
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
473
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
472
474
  result
473
475
  end
474
476
 
475
477
  def _reduce_25(val, _values, result)
476
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
478
+ name = val[0]
477
479
  result = Node.new(:ELEMENT_NAME, [name])
478
480
 
479
481
  result
@@ -64,9 +64,9 @@ rule
64
64
  ;
65
65
 
66
66
  namespaced_ident:
67
- namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')]) }
67
+ namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
68
68
  | IDENT {
69
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
69
+ name = val[0]
70
70
  result = Node.new(:ELEMENT_NAME, [name])
71
71
  }
72
72
  ;
@@ -5,62 +5,9 @@ require "thread"
5
5
  module Nokogiri
6
6
  module CSS
7
7
  class Parser < Racc::Parser # :nodoc:
8
- CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
9
-
10
- @cache = {}
11
- @mutex = Mutex.new
12
-
13
- class << self
14
- # Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
15
- def cache_on?
16
- !Thread.current[CACHE_SWITCH_NAME]
17
- end
18
-
19
- # Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
20
- def set_cache(value) # rubocop:disable Naming/AccessorMethodName
21
- Thread.current[CACHE_SWITCH_NAME] = !value
22
- end
23
-
24
- # Get the css selector in +string+ from the cache
25
- def [](string)
26
- return unless cache_on?
27
-
28
- @mutex.synchronize { @cache[string] }
29
- end
30
-
31
- # Set the css selector in +string+ in the cache to +value+
32
- def []=(string, value)
33
- return value unless cache_on?
34
-
35
- @mutex.synchronize { @cache[string] = value }
36
- end
37
-
38
- # Clear the cache
39
- def clear_cache(create_new_object = false)
40
- @mutex.synchronize do
41
- if create_new_object
42
- @cache = {}
43
- else
44
- @cache.clear
45
- end
46
- end
47
- end
48
-
49
- # Execute +block+ without cache
50
- def without_cache(&block)
51
- original_cache_setting = cache_on?
52
- set_cache(false)
53
- yield
54
- ensure
55
- set_cache(original_cache_setting)
56
- end
57
- end
58
-
59
- # Create a new CSS parser with respect to +namespaces+
60
- def initialize(namespaces = {})
8
+ def initialize
61
9
  @tokenizer = Tokenizer.new
62
- @namespaces = namespaces
63
- super()
10
+ super
64
11
  end
65
12
 
66
13
  def parse(string)
@@ -72,11 +19,10 @@ module Nokogiri
72
19
  @tokenizer.next_token
73
20
  end
74
21
 
75
- # Get the xpath for +string+ using +options+
76
- def xpath_for(string, prefix, visitor)
77
- key = cache_key(string, prefix, visitor)
78
- self.class[key] ||= parse(string).map do |ast|
79
- ast.to_xpath(prefix, visitor)
22
+ # Get the xpath for +selector+ using +visitor+
23
+ def xpath_for(selector, visitor)
24
+ parse(selector).map do |ast|
25
+ ast.to_xpath(visitor)
80
26
  end
81
27
  end
82
28
 
@@ -85,12 +31,6 @@ module Nokogiri
85
31
  after = value_stack.compact.last
86
32
  raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
87
33
  end
88
-
89
- def cache_key(query, prefix, visitor)
90
- if self.class.cache_on?
91
- [query, prefix, @namespaces, visitor.config]
92
- end
93
- end
94
34
  end
95
35
  end
96
36
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module CSS
5
+ module SelectorCache # :nodoc:
6
+ @cache = {}
7
+ @mutex = Mutex.new
8
+
9
+ class << self
10
+ # Retrieve the cached XPath expressions for the key
11
+ def [](key)
12
+ @mutex.synchronize { @cache[key] }
13
+ end
14
+
15
+ # Insert the XPath expressions `value` at the cache key
16
+ def []=(key, value)
17
+ @mutex.synchronize { @cache[key] = value }
18
+ end
19
+
20
+ # Clear the cache
21
+ def clear_cache(create_new_object = false)
22
+ @mutex.synchronize do
23
+ if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
24
+ @cache = {}
25
+ else
26
+ @cache.clear
27
+ end
28
+ end
29
+ end
30
+
31
+ # Construct a unique key cache key
32
+ def key(selector:, visitor:)
33
+ [selector, visitor.config]
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
@@ -63,13 +63,13 @@ class Tokenizer
63
63
  when (text = @ss.scan(/has\([\s]*/))
64
64
  action { [:HAS, text] }
65
65
 
66
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
66
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
67
67
  action { [:FUNCTION, text] }
68
68
 
69
- when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
69
+ when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
70
70
  action { [:IDENT, text] }
71
71
 
72
- when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
72
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
73
73
  action { [:HASH, text] }
74
74
 
75
75
  when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -132,7 +132,7 @@ class Tokenizer
132
132
  when (text = @ss.scan(/[\s]+/))
133
133
  action { [:S, text] }
134
134
 
135
- when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
135
+ when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
136
136
  action { [:STRING, text] }
137
137
 
138
138
  when (text = @ss.scan(/./))
@@ -4,20 +4,21 @@ module CSS
4
4
  class Tokenizer
5
5
 
6
6
  macro
7
- nl \n|\r\n|\r|\f
7
+ nl (\n|\r\n|\r|\f)
8
8
  w [\s]*
9
9
  nonascii [^\0-\177]
10
10
  num -?([0-9]+|[0-9]*\.[0-9]+)
11
11
  unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
12
12
 
13
- escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
14
- nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
15
- nmstart [_A-Za-z]|{nonascii}|{escape}
16
- ident -?({nmstart})({nmchar})*
17
- name ({nmchar})+
13
+ escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
14
+ nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
15
+ nmstart ([_A-Za-z]|{nonascii}|{escape})
16
+ name {nmstart}{nmchar}*
17
+ ident -?{name}
18
+ charref {nmchar}+
18
19
  string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
19
20
  string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
20
- string {string1}|{string2}
21
+ string ({string1}|{string2})
21
22
 
22
23
  rule
23
24
 
@@ -26,7 +27,7 @@ rule
26
27
  has\({w} { [:HAS, text] }
27
28
  {ident}\({w} { [:FUNCTION, text] }
28
29
  {ident} { [:IDENT, text] }
29
- \#{name} { [:HASH, text] }
30
+ \#{charref} { [:HASH, text] }
30
31
  {w}~={w} { [:INCLUDES, text] }
31
32
  {w}\|={w} { [:DASHMATCH, text] }
32
33
  {w}\^={w} { [:PREFIXMATCH, text] }
@@ -44,6 +44,18 @@ module Nokogiri
44
44
  VALUES = [XML, HTML4, HTML5]
45
45
  end
46
46
 
47
+ # The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
48
+ attr_reader :builtins
49
+
50
+ # The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
51
+ attr_reader :doctype
52
+
53
+ # The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
54
+ attr_reader :prefix
55
+
56
+ # The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
57
+ attr_reader :namespaces
58
+
47
59
  # :call-seq:
48
60
  # new() → XPathVisitor
49
61
  # new(builtins:, doctype:) → XPathVisitor
@@ -54,7 +66,12 @@ module Nokogiri
54
66
  #
55
67
  # [Returns] XPathVisitor
56
68
  #
57
- def initialize(builtins: BuiltinsConfig::NEVER, doctype: DoctypeConfig::XML)
69
+ def initialize(
70
+ builtins: BuiltinsConfig::NEVER,
71
+ doctype: DoctypeConfig::XML,
72
+ prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
73
+ namespaces: nil
74
+ )
58
75
  unless BuiltinsConfig::VALUES.include?(builtins)
59
76
  raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
60
77
  end
@@ -64,6 +81,8 @@ module Nokogiri
64
81
 
65
82
  @builtins = builtins
66
83
  @doctype = doctype
84
+ @prefix = prefix
85
+ @namespaces = namespaces
67
86
  end
68
87
 
69
88
  # :call-seq: config() → Hash
@@ -72,7 +91,7 @@ module Nokogiri
72
91
  # a Hash representing the configuration of the XPathVisitor, suitable for use as
73
92
  # part of the CSS cache key.
74
93
  def config
75
- { builtins: @builtins, doctype: @doctype }
94
+ { builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
76
95
  end
77
96
 
78
97
  # :stopdoc:
@@ -128,6 +147,8 @@ module Nokogiri
128
147
  is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
129
148
  ".#{"//" unless is_direct}#{node.value[1].accept(self)}"
130
149
  else
150
+ validate_xpath_function_name(node.value.first)
151
+
131
152
  # xpath function call, let's marshal those arguments
132
153
  args = ["."]
133
154
  args += node.value[1..-1].map do |n|
@@ -207,6 +228,7 @@ module Nokogiri
207
228
  when "parent" then "node()"
208
229
  when "root" then "not(parent::*)"
209
230
  else
231
+ validate_xpath_function_name(node.value.first)
210
232
  "nokogiri:#{node.value.first}(.)"
211
233
  end
212
234
  end
@@ -255,6 +277,14 @@ module Nokogiri
255
277
  else
256
278
  "*[local-name()='#{node.value.first}']"
257
279
  end
280
+ elsif node.value.length == 2 # has a namespace prefix
281
+ if node.value.first.nil? # namespace prefix is empty
282
+ node.value.last
283
+ else
284
+ node.value.join(":")
285
+ end
286
+ elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
287
+ "xmlns:#{node.value.first}"
258
288
  else
259
289
  node.value.first
260
290
  end
@@ -270,11 +300,17 @@ module Nokogiri
270
300
 
271
301
  private
272
302
 
303
+ def validate_xpath_function_name(name)
304
+ if name.start_with?("-")
305
+ raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
306
+ end
307
+ end
308
+
273
309
  def html5_element_name_needs_namespace_handling(node)
274
- # if this is the wildcard selector "*", use it as normal
275
- node.value.first != "*" &&
276
- # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
277
- !node.value.first.include?(":")
310
+ # if there is already a namespace (i.e., it is a prefixed QName), use it as normal
311
+ node.value.length == 1 &&
312
+ # if this is the wildcard selector "*", use it as normal
313
+ node.value.first != "*"
278
314
  end
279
315
 
280
316
  def nth(node, options = {})