nokogiri 1.16.8 → 1.17.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (91) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +11 -21
  3. data/README.md +4 -0
  4. data/dependencies.yml +6 -6
  5. data/ext/nokogiri/extconf.rb +191 -137
  6. data/ext/nokogiri/gumbo.c +69 -53
  7. data/ext/nokogiri/html4_document.c +10 -4
  8. data/ext/nokogiri/html4_element_description.c +18 -18
  9. data/ext/nokogiri/html4_sax_parser.c +40 -0
  10. data/ext/nokogiri/html4_sax_parser_context.c +48 -58
  11. data/ext/nokogiri/html4_sax_push_parser.c +25 -24
  12. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  13. data/ext/nokogiri/nokogiri.c +9 -2
  14. data/ext/nokogiri/nokogiri.h +18 -33
  15. data/ext/nokogiri/xml_attr.c +1 -1
  16. data/ext/nokogiri/xml_cdata.c +2 -10
  17. data/ext/nokogiri/xml_comment.c +3 -8
  18. data/ext/nokogiri/xml_document.c +163 -156
  19. data/ext/nokogiri/xml_document_fragment.c +10 -25
  20. data/ext/nokogiri/xml_dtd.c +1 -1
  21. data/ext/nokogiri/xml_element_content.c +9 -9
  22. data/ext/nokogiri/xml_encoding_handler.c +4 -4
  23. data/ext/nokogiri/xml_namespace.c +6 -6
  24. data/ext/nokogiri/xml_node.c +134 -103
  25. data/ext/nokogiri/xml_node_set.c +46 -44
  26. data/ext/nokogiri/xml_reader.c +54 -58
  27. data/ext/nokogiri/xml_relax_ng.c +35 -56
  28. data/ext/nokogiri/xml_sax_parser.c +156 -88
  29. data/ext/nokogiri/xml_sax_parser_context.c +213 -131
  30. data/ext/nokogiri/xml_sax_push_parser.c +68 -49
  31. data/ext/nokogiri/xml_schema.c +50 -85
  32. data/ext/nokogiri/xml_syntax_error.c +19 -11
  33. data/ext/nokogiri/xml_text.c +2 -4
  34. data/ext/nokogiri/xml_xpath_context.c +2 -2
  35. data/ext/nokogiri/xslt_stylesheet.c +8 -8
  36. data/gumbo-parser/src/error.c +76 -48
  37. data/gumbo-parser/src/error.h +5 -1
  38. data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
  39. data/gumbo-parser/src/parser.c +61 -23
  40. data/gumbo-parser/src/tokenizer.c +6 -6
  41. data/lib/nokogiri/class_resolver.rb +1 -1
  42. data/lib/nokogiri/css/node.rb +6 -2
  43. data/lib/nokogiri/css/parser.rb +6 -4
  44. data/lib/nokogiri/css/parser.y +2 -2
  45. data/lib/nokogiri/css/parser_extras.rb +6 -66
  46. data/lib/nokogiri/css/selector_cache.rb +38 -0
  47. data/lib/nokogiri/css/tokenizer.rb +4 -4
  48. data/lib/nokogiri/css/tokenizer.rex +9 -8
  49. data/lib/nokogiri/css/xpath_visitor.rb +42 -6
  50. data/lib/nokogiri/css.rb +86 -20
  51. data/lib/nokogiri/decorators/slop.rb +3 -5
  52. data/lib/nokogiri/encoding_handler.rb +2 -2
  53. data/lib/nokogiri/html4/document.rb +44 -23
  54. data/lib/nokogiri/html4/document_fragment.rb +124 -12
  55. data/lib/nokogiri/html4/encoding_reader.rb +1 -1
  56. data/lib/nokogiri/html4/sax/parser.rb +23 -38
  57. data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
  58. data/lib/nokogiri/html4.rb +9 -14
  59. data/lib/nokogiri/html5/builder.rb +40 -0
  60. data/lib/nokogiri/html5/document.rb +61 -30
  61. data/lib/nokogiri/html5/document_fragment.rb +130 -20
  62. data/lib/nokogiri/html5/node.rb +4 -4
  63. data/lib/nokogiri/html5.rb +114 -72
  64. data/lib/nokogiri/version/constant.rb +1 -1
  65. data/lib/nokogiri/xml/builder.rb +8 -1
  66. data/lib/nokogiri/xml/document.rb +70 -26
  67. data/lib/nokogiri/xml/document_fragment.rb +84 -13
  68. data/lib/nokogiri/xml/node.rb +82 -11
  69. data/lib/nokogiri/xml/node_set.rb +9 -7
  70. data/lib/nokogiri/xml/parse_options.rb +1 -1
  71. data/lib/nokogiri/xml/pp/node.rb +6 -1
  72. data/lib/nokogiri/xml/reader.rb +46 -13
  73. data/lib/nokogiri/xml/relax_ng.rb +57 -20
  74. data/lib/nokogiri/xml/sax/document.rb +174 -83
  75. data/lib/nokogiri/xml/sax/parser.rb +115 -41
  76. data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
  77. data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
  78. data/lib/nokogiri/xml/sax.rb +48 -0
  79. data/lib/nokogiri/xml/schema.rb +112 -45
  80. data/lib/nokogiri/xml/searchable.rb +6 -8
  81. data/lib/nokogiri/xml/syntax_error.rb +22 -0
  82. data/lib/nokogiri/xml.rb +13 -24
  83. data/lib/nokogiri/xslt.rb +3 -9
  84. data/lib/xsd/xmlparser/nokogiri.rb +3 -4
  85. data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
  86. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  87. metadata +10 -7
  88. data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
  89. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
  90. data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
  91. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
46
46
  args
47
47
  );
48
48
  va_end(args);
49
- #if _MSC_VER && _MSC_VER < 1900
49
+
50
+ #if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
50
51
  if (bytes_written == -1) {
51
52
  // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
52
53
  // instead of returning the number of bytes that would've been written had
53
- // there been enough. In this case, we'll double the buffer size and hope
54
- // it fits when we retry (letting it fail and returning 0 if it doesn't),
55
- // since there's no way to smartly resize the buffer.
56
- gumbo_string_buffer_reserve(output->capacity * 2, output);
54
+ // there been enough. In this case, we can call vsnprintf() again but
55
+ // with a count of 0 to get the number of bytes written, not including
56
+ // the null terminator.
57
+ // https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
58
+
57
59
  va_start(args, format);
58
- int result = vsnprintf (
59
- output->data + output->length,
60
- remaining_capacity,
60
+ bytes_written = vsnprintf (
61
+ NULL,
62
+ 0,
61
63
  format,
62
64
  args
63
65
  );
64
66
  va_end(args);
65
- return result == -1 ? 0 : result;
66
67
  }
67
- #else
68
+ #endif
69
+
68
70
  // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
69
71
  if (bytes_written == -1) {
70
72
  return 0;
71
73
  }
72
- #endif
73
74
 
74
75
  if (bytes_written >= remaining_capacity) {
75
- gumbo_string_buffer_reserve(output->capacity + bytes_written, output);
76
+ // At least double the size of the buffer.
77
+ size_t new_capacity = output->capacity * 2;
78
+ if (new_capacity < output->length + bytes_written + 1) {
79
+ // The +1 is for the null terminator.
80
+ new_capacity = output->length + bytes_written + 1;
81
+ }
82
+ gumbo_string_buffer_reserve(new_capacity, output);
76
83
  remaining_capacity = output->capacity - output->length;
77
84
  va_start(args, format);
78
85
  bytes_written = vsnprintf (
@@ -96,8 +103,14 @@ static void print_tag_stack (
96
103
  if (i) {
97
104
  print_message(output, ", ");
98
105
  }
99
- GumboTag tag = (GumboTag)(intptr_t) error->tag_stack.data[i];
100
- print_message(output, "%s", gumbo_normalized_tagname(tag));
106
+ uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
107
+ const char* tag_name;
108
+ if (tag > GUMBO_TAG_UNKNOWN) {
109
+ tag_name = error->tag_stack.data[i];
110
+ } else {
111
+ tag_name = gumbo_normalized_tagname((GumboTag)tag);
112
+ }
113
+ print_message(output, "%s", tag_name);
101
114
  }
102
115
  gumbo_string_buffer_append_codepoint('.', output);
103
116
  }
@@ -326,41 +339,45 @@ static void handle_parser_error (
326
339
  }
327
340
 
328
341
  switch (error->input_type) {
329
- case GUMBO_TOKEN_DOCTYPE:
330
- print_message(output, "This is not a legal doctype");
331
- return;
332
- case GUMBO_TOKEN_COMMENT:
333
- // Should never happen; comments are always legal.
334
- assert(0);
335
- // But just in case...
336
- print_message(output, "Comments aren't legal here");
337
- return;
338
- case GUMBO_TOKEN_CDATA:
339
- case GUMBO_TOKEN_WHITESPACE:
340
- case GUMBO_TOKEN_CHARACTER:
341
- print_message(output, "Character tokens aren't legal here");
342
- return;
343
- case GUMBO_TOKEN_NULL:
344
- print_message(output, "Null bytes are not allowed in HTML5");
345
- return;
346
- case GUMBO_TOKEN_EOF:
347
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
348
- print_message(output, "You must provide a doctype");
349
- } else {
350
- print_message(output, "Premature end of file.");
351
- print_tag_stack(error, output);
352
- }
353
- return;
354
- case GUMBO_TOKEN_START_TAG:
355
- print_message(output, "Start tag '%s' isn't allowed here.",
356
- gumbo_normalized_tagname(error->input_tag));
357
- print_tag_stack(error, output);
358
- return;
359
- case GUMBO_TOKEN_END_TAG:
360
- print_message(output, "End tag '%s' isn't allowed here.",
361
- gumbo_normalized_tagname(error->input_tag));
342
+ case GUMBO_TOKEN_DOCTYPE:
343
+ print_message(output, "This is not a legal doctype");
344
+ return;
345
+ case GUMBO_TOKEN_COMMENT:
346
+ // Should never happen; comments are always legal.
347
+ assert(0);
348
+ // But just in case...
349
+ print_message(output, "Comments aren't legal here");
350
+ return;
351
+ case GUMBO_TOKEN_CDATA:
352
+ case GUMBO_TOKEN_WHITESPACE:
353
+ case GUMBO_TOKEN_CHARACTER:
354
+ print_message(output, "Character tokens aren't legal here");
355
+ return;
356
+ case GUMBO_TOKEN_NULL:
357
+ print_message(output, "Null bytes are not allowed in HTML5");
358
+ return;
359
+ case GUMBO_TOKEN_EOF:
360
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
361
+ print_message(output, "You must provide a doctype");
362
+ } else {
363
+ print_message(output, "Premature end of file.");
362
364
  print_tag_stack(error, output);
363
- return;
365
+ }
366
+ return;
367
+ case GUMBO_TOKEN_START_TAG:
368
+ case GUMBO_TOKEN_END_TAG:
369
+ {
370
+ const char* tag_name;
371
+ const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
372
+ if (error->input_name) {
373
+ tag_name = error->input_name;
374
+ } else {
375
+ tag_name = gumbo_normalized_tagname(error->input_tag);
376
+ }
377
+ print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
378
+ print_tag_stack(error, output);
379
+ return;
380
+ }
364
381
  }
365
382
  }
366
383
 
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
613
630
 
614
631
  void gumbo_error_destroy(GumboError* error) {
615
632
  if (error->type == GUMBO_ERR_PARSER) {
633
+ // Free the tag name.
634
+ if (error->v.parser.input_name) {
635
+ gumbo_free(error->v.parser.input_name);
636
+ }
637
+
638
+ for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
639
+ intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
640
+ if (tag > GUMBO_TAG_UNKNOWN) {
641
+ gumbo_free(error->v.parser.tag_stack.data[i]);
642
+ }
643
+ }
616
644
  gumbo_vector_destroy(&error->v.parser.tag_stack);
617
645
  }
618
646
  gumbo_free(error);
@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
95
95
  // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
96
96
  GumboTag input_tag;
97
97
 
98
+ // The HTML tag of the input token if it was a nonstandard tag token. NULL otherwise.
99
+ char *input_name;
100
+
98
101
  // The insertion mode that the parser was in at the time.
99
102
  GumboInsertionMode parser_state;
100
103
 
101
104
  // The tag stack at the point of the error. Note that this is an GumboVector
102
105
  // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
103
- // get at the tag.
106
+ // get at the tag. For nonstandard tags, this is a pointer to an owned char *
107
+ // containing the tag name.
104
108
  GumboVector /* GumboTag */ tag_stack;
105
109
  } GumboParserError;
106
110
 
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
780
780
  * Default: `false`.
781
781
  */
782
782
  bool fragment_context_has_form_ancestor;
783
+
784
+ /**
785
+ * Parse `noscript` elements as if scripting was enabled. This causes the
786
+ * contents of the `noscript` element to be parsed as raw text, rather
787
+ * than as HTML elements.
788
+ *
789
+ * Default: `false`.
790
+ */
791
+ bool parse_noscript_content_as_text;
783
792
  } GumboOptions;
784
793
 
785
794
  /** Default options struct; use this with gumbo_parse_with_options. */
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
791
800
  */
792
801
  typedef enum {
793
802
  /**
794
- * Indicates that parsing completed successfuly. The resulting tree
803
+ * Indicates that parsing completed successfully. The resulting tree
795
804
  * will be a complete document.
796
805
  */
797
806
  GUMBO_STATUS_OK,
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
841
850
  GumboVector /* GumboError */ errors;
842
851
 
843
852
  /**
844
- * True if the parser encounted an error.
853
+ * True if the parser encountered an error.
845
854
  *
846
855
  * This can be true and `errors` an empty `GumboVector` if the `max_errors`
847
856
  * option was set to 0.
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
56
56
  .fragment_encoding = NULL,
57
57
  .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
58
  .fragment_context_has_form_ancestor = false,
59
+ .parse_noscript_content_as_text = false,
59
60
  };
60
61
 
61
62
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
749
750
  GumboParserError* extra_data = &error->v.parser;
750
751
  extra_data->input_type = token->type;
751
752
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
- if (token->type == GUMBO_TOKEN_START_TAG) {
753
+ extra_data->input_name = NULL;
754
+ if (token->type == GUMBO_TOKEN_START_TAG)
755
+ {
753
756
  extra_data->input_tag = token->v.start_tag.tag;
754
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
757
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
758
+ extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
759
+ }
760
+ }
761
+ else if (token->type == GUMBO_TOKEN_END_TAG)
762
+ {
755
763
  extra_data->input_tag = token->v.end_tag.tag;
764
+ if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
765
+ extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
766
+ }
756
767
  }
757
768
  const GumboParserState* state = parser->_parser_state;
758
769
  extra_data->parser_state = state->_insertion_mode;
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
763
774
  node->type == GUMBO_NODE_ELEMENT
764
775
  || node->type == GUMBO_NODE_TEMPLATE
765
776
  );
766
- gumbo_vector_add (
767
- (void*) node->v.element.tag,
768
- &extra_data->tag_stack
769
- );
777
+ void *tag;
778
+ if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
779
+ tag = gumbo_strdup(node->v.element.name);
780
+ } else {
781
+ tag = (void *)(uintptr_t)node->v.element.tag;
782
+ }
783
+ gumbo_vector_add(tag, &extra_data->tag_stack);
770
784
  }
771
785
  }
772
786
 
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
1187
1201
  element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
1202
  insert_element(parser, element, false);
1189
1203
  gumbo_debug (
1190
- "Inserting %s element (@%p) from tag type.\n",
1204
+ "Inserting <%s> element (@%p) from tag type.\n",
1191
1205
  gumbo_normalized_tagname(tag),
1192
1206
  (void*)element
1193
1207
  );
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
1204
1218
  assert(token->type == GUMBO_TOKEN_START_TAG);
1205
1219
  GumboNode* element = create_element_from_token(token, tag_namespace);
1206
1220
  insert_element(parser, element, false);
1221
+ gumbo_debug (
1222
+ "Inserting <%s> foreign element (@%p).\n",
1223
+ gumbo_normalized_tagname(element->v.element.tag),
1224
+ (void*)element
1225
+ );
1207
1226
  if (
1208
1227
  token_has_attribute(token, "xmlns")
1209
1228
  && !attribute_matches_case_sensitive (
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
2066
2085
 
2067
2086
  // This is here to clean up memory when the spec says "Ignore current token."
2068
2087
  static void ignore_token(GumboParser* parser) {
2088
+ gumbo_debug("Ignoring token.\n");
2069
2089
  GumboToken* token = parser->_parser_state->_current_token;
2070
2090
  // Ownership of the token's internal buffers are normally transferred to the
2071
2091
  // element, but if no element is emitted (as happens in non-verbatim-mode
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2430
2450
 
2431
2451
  // https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
2452
  static void finish_parsing(GumboParser* parser) {
2433
- gumbo_debug("Finishing parsing");
2453
+ gumbo_debug("Finishing parsing\n");
2434
2454
  maybe_flush_text_node_buffer(parser);
2435
2455
  GumboParserState* state = parser->_parser_state;
2436
2456
  for (
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2608
2628
  }
2609
2629
  if (
2610
2630
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2631
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
2611
2632
  ) {
2612
2633
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
2634
  return;
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
3313
3334
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3314
3335
  return;
3315
3336
  }
3316
- if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3337
+ if (
3338
+ tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
3339
+ || (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
3340
+ ) {
3317
3341
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3318
3342
  return;
3319
3343
  }
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4389
4413
 
4390
4414
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4391
4415
  static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4392
- gumbo_debug("Handling foreign content");
4416
+ gumbo_debug("Handling foreign content.\n");
4393
4417
  switch (token->type) {
4394
4418
  case GUMBO_TOKEN_NULL:
4395
4419
  parser_add_parse_error(parser, token);
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4507
4531
  if (i == 0)
4508
4532
  return;
4509
4533
  // We can't call handle_token directly because the current node is still in
4510
- // a foriegn namespace, so it would re-enter this and result in infinite
4534
+ // a foreign namespace, so it would re-enter this and result in infinite
4511
4535
  // recursion.
4512
4536
  handle_html_content(parser, token);
4513
4537
  }
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
4627
4651
  const char* fragment_encoding = options->fragment_encoding;
4628
4652
  GumboQuirksModeEnum quirks = options->quirks_mode;
4629
4653
  bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4630
-
4631
4654
  GumboNode* root;
4632
- // 2.
4655
+
4656
+ // 1. [Create a new Document node, and mark it as being an HTML document.]
4657
+ // 2. [If the node document of the context element is in quirks mode, then
4658
+ // let the Document be in quirks mode. Otherwise, the node document of
4659
+ // the context element is in limited-quirks mode, then let the Document
4660
+ // be in limited-quirks mode. Otherwise, leave the Document in no-quirks
4661
+ // mode.]
4633
4662
  get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4634
4663
 
4635
- // 3.
4664
+ // 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
4665
+ // declarative shadow roots to true.]
4666
+ // 4. [Create a new HTML parser, and associate it with the just created Document node.]
4667
+ // 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
4636
4668
  parser->_parser_state->_fragment_ctx =
4637
4669
  create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4638
4670
  GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
4659
4691
  break;
4660
4692
 
4661
4693
  case GUMBO_TAG_NOSCRIPT:
4662
- /* scripting is disabled in Gumbo, so leave the tokenizer
4663
- * in the default data state */
4694
+ if (options->parse_noscript_content_as_text)
4695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4664
4696
  break;
4665
4697
 
4666
4698
  case GUMBO_TAG_PLAINTEXT:
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
4762
4794
  adjusted_current_node &&
4763
4795
  adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4764
4796
  );
4765
- gumbo_lex(&parser, &token);
4797
+ // If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
4798
+ //
4799
+ // The parser is pretty fragile. Breaking out of the parsing loop in the middle of
4800
+ // the parse can leave the document in an inconsistent state.
4801
+ if (unlikely(state->_open_elements.length > max_tree_depth)) {
4802
+ parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4803
+ gumbo_debug("Tree depth limit exceeded.\n");
4804
+ token.type = GUMBO_TOKEN_EOF;
4805
+ } else {
4806
+ gumbo_lex(&parser, &token);
4807
+ }
4808
+
4766
4809
  }
4767
4810
 
4768
4811
  const char* token_type = "text";
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
4786
4829
  break;
4787
4830
  }
4788
4831
  gumbo_debug (
4789
- "Handling %s token @%lu:%lu in state %u.\n",
4832
+ "Handling %s token @%lu:%lu in insertion mode %u.\n",
4790
4833
  (char*) token_type,
4791
4834
  (unsigned long)token.position.line,
4792
4835
  (unsigned long)token.position.column,
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
4830
4873
  gumbo_free(token.v.end_tag.name);
4831
4874
  token.v.end_tag.name = NULL;
4832
4875
  }
4833
- if (unlikely(state->_open_elements.length > max_tree_depth)) {
4834
- parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4835
- gumbo_debug("Tree depth limit exceeded.\n");
4836
- break;
4837
- }
4838
4876
  }
4839
4877
 
4840
4878
 
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
340
340
 
341
341
  // Sets the tag buffer original text and start point to the current iterator
342
342
  // position. This is necessary because attribute names & values may have
343
- // whitespace preceeding them, and so we can't assume that the actual token
343
+ // whitespace preceding them, and so we can't assume that the actual token
344
344
  // starting point was the end of the last tag buffer usage.
345
345
  static void reset_tag_buffer_start_point(GumboParser* parser) {
346
346
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
569
569
  }
570
570
 
571
571
  // Appends a codepoint to the current tag buffer. If
572
- // reinitilize_position_on_first is set, this also initializes the tag buffer
572
+ // reinitialize_position_on_first is set, this also initializes the tag buffer
573
573
  // start point; the only time you would *not* want to pass true for this
574
574
  // parameter is if you want the original_text to include character (like an
575
575
  // opening quote) that doesn't appear in the value.
576
576
  static void append_char_to_tag_buffer (
577
577
  GumboParser* parser,
578
578
  int codepoint,
579
- bool reinitilize_position_on_first
579
+ bool reinitialize_position_on_first
580
580
  ) {
581
581
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
582
- if (buffer->length == 0 && reinitilize_position_on_first) {
582
+ if (buffer->length == 0 && reinitialize_position_on_first) {
583
583
  reset_tag_buffer_start_point(parser);
584
584
  }
585
585
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
589
589
  static void append_string_to_tag_buffer (
590
590
  GumboParser* parser,
591
591
  GumboStringPiece* str,
592
- bool reinitilize_position_on_first
592
+ bool reinitialize_position_on_first
593
593
  ) {
594
594
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
595
- if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ if (buffer->length == 0 && reinitialize_position_on_first) {
596
596
  reset_tag_buffer_start_point(parser);
597
597
  }
598
598
  gumbo_string_buffer_append_string(str, buffer);
@@ -18,7 +18,7 @@ module Nokogiri
18
18
  #
19
19
  module ClassResolver
20
20
  # #related_class restricts matching namespaces to those matching this set.
21
- VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
21
+ VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
22
22
 
23
23
  # :call-seq:
24
24
  # related_class(class_name) → Class
@@ -23,8 +23,12 @@ module Nokogiri
23
23
 
24
24
  ###
25
25
  # Convert this CSS node to xpath with +prefix+ using +visitor+
26
- def to_xpath(prefix, visitor)
27
- prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
26
+ def to_xpath(visitor)
27
+ prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
28
+ "."
29
+ else
30
+ visitor.prefix
31
+ end
28
32
  prefix + visitor.accept(self)
29
33
  end
30
34
 
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
  #
3
3
  # DO NOT MODIFY!!!!
4
- # This file is automatically generated by Racc 1.6.0
5
- # from Racc grammar file "".
4
+ # This file is automatically generated by Racc 1.8.0
5
+ # from Racc grammar file "parser.y".
6
6
  #
7
7
 
8
8
  require 'racc/parser.rb'
@@ -291,6 +291,7 @@ Racc_arg = [
291
291
  racc_shift_n,
292
292
  racc_reduce_n,
293
293
  racc_use_result_var ]
294
+ Ractor.make_shareable(Racc_arg) if defined?(Ractor)
294
295
 
295
296
  Racc_token_to_s_table = [
296
297
  "$end",
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
351
352
  "negation",
352
353
  "eql_incl_dash",
353
354
  "negation_arg" ]
355
+ Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
354
356
 
355
357
  Racc_debug_parser = false
356
358
 
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
468
470
  end
469
471
 
470
472
  def _reduce_24(val, _values, result)
471
- result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')])
473
+ result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
472
474
  result
473
475
  end
474
476
 
475
477
  def _reduce_25(val, _values, result)
476
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
478
+ name = val[0]
477
479
  result = Node.new(:ELEMENT_NAME, [name])
478
480
 
479
481
  result
@@ -64,9 +64,9 @@ rule
64
64
  ;
65
65
 
66
66
  namespaced_ident:
67
- namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [[val[0], val[2]].compact.join(':')]) }
67
+ namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
68
68
  | IDENT {
69
- name = @namespaces.key?('xmlns') ? "xmlns:#{val[0]}" : val[0]
69
+ name = val[0]
70
70
  result = Node.new(:ELEMENT_NAME, [name])
71
71
  }
72
72
  ;
@@ -5,62 +5,9 @@ require "thread"
5
5
  module Nokogiri
6
6
  module CSS
7
7
  class Parser < Racc::Parser # :nodoc:
8
- CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
9
-
10
- @cache = {}
11
- @mutex = Mutex.new
12
-
13
- class << self
14
- # Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
15
- def cache_on?
16
- !Thread.current[CACHE_SWITCH_NAME]
17
- end
18
-
19
- # Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
20
- def set_cache(value) # rubocop:disable Naming/AccessorMethodName
21
- Thread.current[CACHE_SWITCH_NAME] = !value
22
- end
23
-
24
- # Get the css selector in +string+ from the cache
25
- def [](string)
26
- return unless cache_on?
27
-
28
- @mutex.synchronize { @cache[string] }
29
- end
30
-
31
- # Set the css selector in +string+ in the cache to +value+
32
- def []=(string, value)
33
- return value unless cache_on?
34
-
35
- @mutex.synchronize { @cache[string] = value }
36
- end
37
-
38
- # Clear the cache
39
- def clear_cache(create_new_object = false)
40
- @mutex.synchronize do
41
- if create_new_object
42
- @cache = {}
43
- else
44
- @cache.clear
45
- end
46
- end
47
- end
48
-
49
- # Execute +block+ without cache
50
- def without_cache(&block)
51
- original_cache_setting = cache_on?
52
- set_cache(false)
53
- yield
54
- ensure
55
- set_cache(original_cache_setting)
56
- end
57
- end
58
-
59
- # Create a new CSS parser with respect to +namespaces+
60
- def initialize(namespaces = {})
8
+ def initialize
61
9
  @tokenizer = Tokenizer.new
62
- @namespaces = namespaces
63
- super()
10
+ super
64
11
  end
65
12
 
66
13
  def parse(string)
@@ -72,11 +19,10 @@ module Nokogiri
72
19
  @tokenizer.next_token
73
20
  end
74
21
 
75
- # Get the xpath for +string+ using +options+
76
- def xpath_for(string, prefix, visitor)
77
- key = cache_key(string, prefix, visitor)
78
- self.class[key] ||= parse(string).map do |ast|
79
- ast.to_xpath(prefix, visitor)
22
+ # Get the xpath for +selector+ using +visitor+
23
+ def xpath_for(selector, visitor)
24
+ parse(selector).map do |ast|
25
+ ast.to_xpath(visitor)
80
26
  end
81
27
  end
82
28
 
@@ -85,12 +31,6 @@ module Nokogiri
85
31
  after = value_stack.compact.last
86
32
  raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
87
33
  end
88
-
89
- def cache_key(query, prefix, visitor)
90
- if self.class.cache_on?
91
- [query, prefix, @namespaces, visitor.config]
92
- end
93
- end
94
34
  end
95
35
  end
96
36
  end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module CSS
5
+ module SelectorCache # :nodoc:
6
+ @cache = {}
7
+ @mutex = Mutex.new
8
+
9
+ class << self
10
+ # Retrieve the cached XPath expressions for the key
11
+ def [](key)
12
+ @mutex.synchronize { @cache[key] }
13
+ end
14
+
15
+ # Insert the XPath expressions `value` at the cache key
16
+ def []=(key, value)
17
+ @mutex.synchronize { @cache[key] = value }
18
+ end
19
+
20
+ # Clear the cache
21
+ def clear_cache(create_new_object = false)
22
+ @mutex.synchronize do
23
+ if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
24
+ @cache = {}
25
+ else
26
+ @cache.clear
27
+ end
28
+ end
29
+ end
30
+
31
+ # Construct a unique cache key
32
+ def key(selector:, visitor:)
33
+ [selector, visitor.config]
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end