nokogiri 1.16.7 → 1.18.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +14 -22
- data/LICENSE-DEPENDENCIES.md +6 -6
- data/README.md +8 -5
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +188 -142
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +141 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +219 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +103 -100
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/gumbo-parser/src/ascii.c +2 -2
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +63 -25
- data/gumbo-parser/src/tokenizer.c +6 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +38 -42
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml/xpath_context.rb +14 -3
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +13 -14
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/gumbo-parser/src/parser.c
CHANGED
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
|
|
56
56
|
.fragment_encoding = NULL,
|
57
57
|
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
|
58
58
|
.fragment_context_has_form_ancestor = false,
|
59
|
+
.parse_noscript_content_as_text = false,
|
59
60
|
};
|
60
61
|
|
61
62
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
@@ -317,7 +318,7 @@ static GumboNode* create_node(GumboNodeType type) {
|
|
317
318
|
return node;
|
318
319
|
}
|
319
320
|
|
320
|
-
static GumboNode* new_document_node() {
|
321
|
+
static GumboNode* new_document_node(void) {
|
321
322
|
GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
|
322
323
|
document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
|
323
324
|
gumbo_vector_init(1, &document_node->v.document.children);
|
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
|
|
749
750
|
GumboParserError* extra_data = &error->v.parser;
|
750
751
|
extra_data->input_type = token->type;
|
751
752
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
752
|
-
|
753
|
+
extra_data->input_name = NULL;
|
754
|
+
if (token->type == GUMBO_TOKEN_START_TAG)
|
755
|
+
{
|
753
756
|
extra_data->input_tag = token->v.start_tag.tag;
|
754
|
-
|
757
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
|
758
|
+
extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
|
759
|
+
}
|
760
|
+
}
|
761
|
+
else if (token->type == GUMBO_TOKEN_END_TAG)
|
762
|
+
{
|
755
763
|
extra_data->input_tag = token->v.end_tag.tag;
|
764
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
|
765
|
+
extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
|
766
|
+
}
|
756
767
|
}
|
757
768
|
const GumboParserState* state = parser->_parser_state;
|
758
769
|
extra_data->parser_state = state->_insertion_mode;
|
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
|
|
763
774
|
node->type == GUMBO_NODE_ELEMENT
|
764
775
|
|| node->type == GUMBO_NODE_TEMPLATE
|
765
776
|
);
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
777
|
+
void *tag;
|
778
|
+
if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
|
779
|
+
tag = gumbo_strdup(node->v.element.name);
|
780
|
+
} else {
|
781
|
+
tag = (void *)(uintptr_t)node->v.element.tag;
|
782
|
+
}
|
783
|
+
gumbo_vector_add(tag, &extra_data->tag_stack);
|
770
784
|
}
|
771
785
|
}
|
772
786
|
|
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
|
|
1187
1201
|
element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
|
1188
1202
|
insert_element(parser, element, false);
|
1189
1203
|
gumbo_debug (
|
1190
|
-
"Inserting
|
1204
|
+
"Inserting <%s> element (@%p) from tag type.\n",
|
1191
1205
|
gumbo_normalized_tagname(tag),
|
1192
1206
|
(void*)element
|
1193
1207
|
);
|
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
|
|
1204
1218
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1205
1219
|
GumboNode* element = create_element_from_token(token, tag_namespace);
|
1206
1220
|
insert_element(parser, element, false);
|
1221
|
+
gumbo_debug (
|
1222
|
+
"Inserting <%s> foreign element (@%p).\n",
|
1223
|
+
gumbo_normalized_tagname(element->v.element.tag),
|
1224
|
+
(void*)element
|
1225
|
+
);
|
1207
1226
|
if (
|
1208
1227
|
token_has_attribute(token, "xmlns")
|
1209
1228
|
&& !attribute_matches_case_sensitive (
|
@@ -1978,7 +1997,7 @@ static void adjust_svg_tag(GumboToken* token) {
|
|
1978
1997
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1979
1998
|
if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
|
1980
1999
|
assert(token->v.start_tag.name == NULL);
|
1981
|
-
token->v.start_tag.name = "foreignObject";
|
2000
|
+
token->v.start_tag.name = (char *)"foreignObject";
|
1982
2001
|
} else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
|
1983
2002
|
assert(token->v.start_tag.name);
|
1984
2003
|
const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
|
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
|
|
2066
2085
|
|
2067
2086
|
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
2087
|
static void ignore_token(GumboParser* parser) {
|
2088
|
+
gumbo_debug("Ignoring token.\n");
|
2069
2089
|
GumboToken* token = parser->_parser_state->_current_token;
|
2070
2090
|
// Ownership of the token's internal buffers are normally transferred to the
|
2071
2091
|
// element, but if no element is emitted (as happens in non-verbatim-mode
|
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
|
2430
2450
|
|
2431
2451
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
2432
2452
|
static void finish_parsing(GumboParser* parser) {
|
2433
|
-
gumbo_debug("Finishing parsing");
|
2453
|
+
gumbo_debug("Finishing parsing\n");
|
2434
2454
|
maybe_flush_text_node_buffer(parser);
|
2435
2455
|
GumboParserState* state = parser->_parser_state;
|
2436
2456
|
for (
|
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2608
2628
|
}
|
2609
2629
|
if (
|
2610
2630
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2631
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
2611
2632
|
) {
|
2612
2633
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2613
2634
|
return;
|
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3313
3334
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3314
3335
|
return;
|
3315
3336
|
}
|
3316
|
-
if (
|
3337
|
+
if (
|
3338
|
+
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|
3339
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
3340
|
+
) {
|
3317
3341
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3318
3342
|
return;
|
3319
3343
|
}
|
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
|
4389
4413
|
|
4390
4414
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4391
4415
|
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4392
|
-
gumbo_debug("Handling foreign content");
|
4416
|
+
gumbo_debug("Handling foreign content.\n");
|
4393
4417
|
switch (token->type) {
|
4394
4418
|
case GUMBO_TOKEN_NULL:
|
4395
4419
|
parser_add_parse_error(parser, token);
|
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4507
4531
|
if (i == 0)
|
4508
4532
|
return;
|
4509
4533
|
// We can't call handle_token directly because the current node is still in
|
4510
|
-
// a
|
4534
|
+
// a foreign namespace, so it would re-enter this and result in infinite
|
4511
4535
|
// recursion.
|
4512
4536
|
handle_html_content(parser, token);
|
4513
4537
|
}
|
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
|
|
4627
4651
|
const char* fragment_encoding = options->fragment_encoding;
|
4628
4652
|
GumboQuirksModeEnum quirks = options->quirks_mode;
|
4629
4653
|
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
|
4630
|
-
|
4631
4654
|
GumboNode* root;
|
4632
|
-
|
4655
|
+
|
4656
|
+
// 1. [Create a new Document node, and mark it as being an HTML document.]
|
4657
|
+
// 2. [If the node document of the context element is in quirks mode, then
|
4658
|
+
// let the Document be in quirks mode. Otherwise, the node document of
|
4659
|
+
// the context element is in limited-quirks mode, then let the Document
|
4660
|
+
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
|
4661
|
+
// mode.]
|
4633
4662
|
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
|
4634
4663
|
|
4635
|
-
// 3.
|
4664
|
+
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
|
4665
|
+
// declarative shadow roots to true.]
|
4666
|
+
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
|
4667
|
+
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
|
4636
4668
|
parser->_parser_state->_fragment_ctx =
|
4637
4669
|
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
|
4638
4670
|
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
|
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
|
|
4659
4691
|
break;
|
4660
4692
|
|
4661
4693
|
case GUMBO_TAG_NOSCRIPT:
|
4662
|
-
|
4663
|
-
|
4694
|
+
if (options->parse_noscript_content_as_text)
|
4695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4664
4696
|
break;
|
4665
4697
|
|
4666
4698
|
case GUMBO_TAG_PLAINTEXT:
|
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4762
4794
|
adjusted_current_node &&
|
4763
4795
|
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4764
4796
|
);
|
4765
|
-
|
4797
|
+
// If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
|
4798
|
+
//
|
4799
|
+
// The parser is pretty fragile. Breaking out of the parsing loop in the middle of
|
4800
|
+
// the parse can leave the document in an inconsistent state.
|
4801
|
+
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4802
|
+
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4803
|
+
gumbo_debug("Tree depth limit exceeded.\n");
|
4804
|
+
token.type = GUMBO_TOKEN_EOF;
|
4805
|
+
} else {
|
4806
|
+
gumbo_lex(&parser, &token);
|
4807
|
+
}
|
4808
|
+
|
4766
4809
|
}
|
4767
4810
|
|
4768
4811
|
const char* token_type = "text";
|
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4786
4829
|
break;
|
4787
4830
|
}
|
4788
4831
|
gumbo_debug (
|
4789
|
-
"Handling %s token @%lu:%lu in
|
4832
|
+
"Handling %s token @%lu:%lu in insertion mode %u.\n",
|
4790
4833
|
(char*) token_type,
|
4791
4834
|
(unsigned long)token.position.line,
|
4792
4835
|
(unsigned long)token.position.column,
|
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
|
|
4830
4873
|
gumbo_free(token.v.end_tag.name);
|
4831
4874
|
token.v.end_tag.name = NULL;
|
4832
4875
|
}
|
4833
|
-
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4834
|
-
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4835
|
-
gumbo_debug("Tree depth limit exceeded.\n");
|
4836
|
-
break;
|
4837
|
-
}
|
4838
4876
|
}
|
4839
4877
|
|
4840
4878
|
|
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
340
340
|
|
341
341
|
// Sets the tag buffer original text and start point to the current iterator
|
342
342
|
// position. This is necessary because attribute names & values may have
|
343
|
-
// whitespace
|
343
|
+
// whitespace preceding them, and so we can't assume that the actual token
|
344
344
|
// starting point was the end of the last tag buffer usage.
|
345
345
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
346
346
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
|
569
569
|
}
|
570
570
|
|
571
571
|
// Appends a codepoint to the current tag buffer. If
|
572
|
-
//
|
572
|
+
// reinitialize_position_on_first is set, this also initializes the tag buffer
|
573
573
|
// start point; the only time you would *not* want to pass true for this
|
574
574
|
// parameter is if you want the original_text to include character (like an
|
575
575
|
// opening quote) that doesn't appear in the value.
|
576
576
|
static void append_char_to_tag_buffer (
|
577
577
|
GumboParser* parser,
|
578
578
|
int codepoint,
|
579
|
-
bool
|
579
|
+
bool reinitialize_position_on_first
|
580
580
|
) {
|
581
581
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
582
|
-
if (buffer->length == 0 &&
|
582
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
583
583
|
reset_tag_buffer_start_point(parser);
|
584
584
|
}
|
585
585
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
|
|
589
589
|
static void append_string_to_tag_buffer (
|
590
590
|
GumboParser* parser,
|
591
591
|
GumboStringPiece* str,
|
592
|
-
bool
|
592
|
+
bool reinitialize_position_on_first
|
593
593
|
) {
|
594
594
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
595
|
-
if (buffer->length == 0 &&
|
595
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
596
596
|
reset_tag_buffer_start_point(parser);
|
597
597
|
}
|
598
598
|
gumbo_string_buffer_append_string(str, buffer);
|
@@ -18,7 +18,7 @@ module Nokogiri
|
|
18
18
|
#
|
19
19
|
module ClassResolver
|
20
20
|
# #related_class restricts matching namespaces to those matching this set.
|
21
|
-
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
|
21
|
+
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
|
22
22
|
|
23
23
|
# :call-seq:
|
24
24
|
# related_class(class_name) → Class
|
data/lib/nokogiri/css/node.rb
CHANGED
@@ -23,8 +23,12 @@ module Nokogiri
|
|
23
23
|
|
24
24
|
###
|
25
25
|
# Convert this CSS node to xpath with +prefix+ using +visitor+
|
26
|
-
def to_xpath(
|
27
|
-
prefix =
|
26
|
+
def to_xpath(visitor)
|
27
|
+
prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
|
28
|
+
"."
|
29
|
+
else
|
30
|
+
visitor.prefix
|
31
|
+
end
|
28
32
|
prefix + visitor.accept(self)
|
29
33
|
end
|
30
34
|
|
data/lib/nokogiri/css/parser.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
#
|
3
3
|
# DO NOT MODIFY!!!!
|
4
|
-
# This file is automatically generated by Racc 1.
|
5
|
-
# from Racc grammar file "".
|
4
|
+
# This file is automatically generated by Racc 1.8.0
|
5
|
+
# from Racc grammar file "parser.y".
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'racc/parser.rb'
|
@@ -291,6 +291,7 @@ Racc_arg = [
|
|
291
291
|
racc_shift_n,
|
292
292
|
racc_reduce_n,
|
293
293
|
racc_use_result_var ]
|
294
|
+
Ractor.make_shareable(Racc_arg) if defined?(Ractor)
|
294
295
|
|
295
296
|
Racc_token_to_s_table = [
|
296
297
|
"$end",
|
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
|
|
351
352
|
"negation",
|
352
353
|
"eql_incl_dash",
|
353
354
|
"negation_arg" ]
|
355
|
+
Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
|
354
356
|
|
355
357
|
Racc_debug_parser = false
|
356
358
|
|
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
|
|
468
470
|
end
|
469
471
|
|
470
472
|
def _reduce_24(val, _values, result)
|
471
|
-
result = Node.new(:ELEMENT_NAME, [
|
473
|
+
result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
|
472
474
|
result
|
473
475
|
end
|
474
476
|
|
475
477
|
def _reduce_25(val, _values, result)
|
476
|
-
name =
|
478
|
+
name = val[0]
|
477
479
|
result = Node.new(:ELEMENT_NAME, [name])
|
478
480
|
|
479
481
|
result
|
data/lib/nokogiri/css/parser.y
CHANGED
@@ -64,9 +64,9 @@ rule
|
|
64
64
|
;
|
65
65
|
|
66
66
|
namespaced_ident:
|
67
|
-
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [
|
67
|
+
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
|
68
68
|
| IDENT {
|
69
|
-
name =
|
69
|
+
name = val[0]
|
70
70
|
result = Node.new(:ELEMENT_NAME, [name])
|
71
71
|
}
|
72
72
|
;
|
@@ -5,62 +5,9 @@ require "thread"
|
|
5
5
|
module Nokogiri
|
6
6
|
module CSS
|
7
7
|
class Parser < Racc::Parser # :nodoc:
|
8
|
-
|
9
|
-
|
10
|
-
@cache = {}
|
11
|
-
@mutex = Mutex.new
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
|
15
|
-
def cache_on?
|
16
|
-
!Thread.current[CACHE_SWITCH_NAME]
|
17
|
-
end
|
18
|
-
|
19
|
-
# Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
|
20
|
-
def set_cache(value) # rubocop:disable Naming/AccessorMethodName
|
21
|
-
Thread.current[CACHE_SWITCH_NAME] = !value
|
22
|
-
end
|
23
|
-
|
24
|
-
# Get the css selector in +string+ from the cache
|
25
|
-
def [](string)
|
26
|
-
return unless cache_on?
|
27
|
-
|
28
|
-
@mutex.synchronize { @cache[string] }
|
29
|
-
end
|
30
|
-
|
31
|
-
# Set the css selector in +string+ in the cache to +value+
|
32
|
-
def []=(string, value)
|
33
|
-
return value unless cache_on?
|
34
|
-
|
35
|
-
@mutex.synchronize { @cache[string] = value }
|
36
|
-
end
|
37
|
-
|
38
|
-
# Clear the cache
|
39
|
-
def clear_cache(create_new_object = false)
|
40
|
-
@mutex.synchronize do
|
41
|
-
if create_new_object
|
42
|
-
@cache = {}
|
43
|
-
else
|
44
|
-
@cache.clear
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
# Execute +block+ without cache
|
50
|
-
def without_cache(&block)
|
51
|
-
original_cache_setting = cache_on?
|
52
|
-
set_cache(false)
|
53
|
-
yield
|
54
|
-
ensure
|
55
|
-
set_cache(original_cache_setting)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Create a new CSS parser with respect to +namespaces+
|
60
|
-
def initialize(namespaces = {})
|
8
|
+
def initialize
|
61
9
|
@tokenizer = Tokenizer.new
|
62
|
-
|
63
|
-
super()
|
10
|
+
super
|
64
11
|
end
|
65
12
|
|
66
13
|
def parse(string)
|
@@ -72,11 +19,10 @@ module Nokogiri
|
|
72
19
|
@tokenizer.next_token
|
73
20
|
end
|
74
21
|
|
75
|
-
# Get the xpath for +
|
76
|
-
def xpath_for(
|
77
|
-
|
78
|
-
|
79
|
-
ast.to_xpath(prefix, visitor)
|
22
|
+
# Get the xpath for +selector+ using +visitor+
|
23
|
+
def xpath_for(selector, visitor)
|
24
|
+
parse(selector).map do |ast|
|
25
|
+
ast.to_xpath(visitor)
|
80
26
|
end
|
81
27
|
end
|
82
28
|
|
@@ -85,12 +31,6 @@ module Nokogiri
|
|
85
31
|
after = value_stack.compact.last
|
86
32
|
raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
|
87
33
|
end
|
88
|
-
|
89
|
-
def cache_key(query, prefix, visitor)
|
90
|
-
if self.class.cache_on?
|
91
|
-
[query, prefix, @namespaces, visitor.config]
|
92
|
-
end
|
93
|
-
end
|
94
34
|
end
|
95
35
|
end
|
96
36
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module CSS
|
5
|
+
module SelectorCache # :nodoc:
|
6
|
+
@cache = {}
|
7
|
+
@mutex = Mutex.new
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Retrieve the cached XPath expressions for the key
|
11
|
+
def [](key)
|
12
|
+
@mutex.synchronize { @cache[key] }
|
13
|
+
end
|
14
|
+
|
15
|
+
# Insert the XPath expressions `value` at the cache key
|
16
|
+
def []=(key, value)
|
17
|
+
@mutex.synchronize { @cache[key] = value }
|
18
|
+
end
|
19
|
+
|
20
|
+
# Clear the cache
|
21
|
+
def clear_cache(create_new_object = false)
|
22
|
+
@mutex.synchronize do
|
23
|
+
if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
|
24
|
+
@cache = {}
|
25
|
+
else
|
26
|
+
@cache.clear
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Construct a unique key cache key
|
32
|
+
def key(selector:, visitor:)
|
33
|
+
[selector, visitor.config]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -63,13 +63,13 @@ class Tokenizer
|
|
63
63
|
when (text = @ss.scan(/has\([\s]*/))
|
64
64
|
action { [:HAS, text] }
|
65
65
|
|
66
|
-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]
|
66
|
+
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
|
67
67
|
action { [:FUNCTION, text] }
|
68
68
|
|
69
|
-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]
|
69
|
+
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
|
70
70
|
action { [:IDENT, text] }
|
71
71
|
|
72
|
-
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]
|
72
|
+
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
|
73
73
|
action { [:HASH, text] }
|
74
74
|
|
75
75
|
when (text = @ss.scan(/[\s]*~=[\s]*/))
|
@@ -132,7 +132,7 @@ class Tokenizer
|
|
132
132
|
when (text = @ss.scan(/[\s]+/))
|
133
133
|
action { [:S, text] }
|
134
134
|
|
135
|
-
when (text = @ss.scan(/"([^\n\r\f"]
|
135
|
+
when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
|
136
136
|
action { [:STRING, text] }
|
137
137
|
|
138
138
|
when (text = @ss.scan(/./))
|
@@ -4,20 +4,21 @@ module CSS
|
|
4
4
|
class Tokenizer
|
5
5
|
|
6
6
|
macro
|
7
|
-
nl \n|\r\n|\r|\f
|
7
|
+
nl (\n|\r\n|\r|\f)
|
8
8
|
w [\s]*
|
9
9
|
nonascii [^\0-\177]
|
10
10
|
num -?([0-9]+|[0-9]*\.[0-9]+)
|
11
11
|
unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
|
12
12
|
|
13
|
-
escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
|
14
|
-
nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
|
15
|
-
nmstart [_A-Za-z]|{nonascii}|{escape}
|
16
|
-
|
17
|
-
name
|
13
|
+
escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
|
14
|
+
nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
|
15
|
+
nmstart ([_A-Za-z]|{nonascii}|{escape})
|
16
|
+
name {nmstart}{nmchar}*
|
17
|
+
ident -?{name}
|
18
|
+
charref {nmchar}+
|
18
19
|
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
|
19
20
|
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
|
20
|
-
string {string1}|{string2}
|
21
|
+
string ({string1}|{string2})
|
21
22
|
|
22
23
|
rule
|
23
24
|
|
@@ -26,7 +27,7 @@ rule
|
|
26
27
|
has\({w} { [:HAS, text] }
|
27
28
|
{ident}\({w} { [:FUNCTION, text] }
|
28
29
|
{ident} { [:IDENT, text] }
|
29
|
-
\#{
|
30
|
+
\#{charref} { [:HASH, text] }
|
30
31
|
{w}~={w} { [:INCLUDES, text] }
|
31
32
|
{w}\|={w} { [:DASHMATCH, text] }
|
32
33
|
{w}\^={w} { [:PREFIXMATCH, text] }
|
@@ -44,6 +44,18 @@ module Nokogiri
|
|
44
44
|
VALUES = [XML, HTML4, HTML5]
|
45
45
|
end
|
46
46
|
|
47
|
+
# The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
|
48
|
+
attr_reader :builtins
|
49
|
+
|
50
|
+
# The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
|
51
|
+
attr_reader :doctype
|
52
|
+
|
53
|
+
# The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
|
54
|
+
attr_reader :prefix
|
55
|
+
|
56
|
+
# The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
|
57
|
+
attr_reader :namespaces
|
58
|
+
|
47
59
|
# :call-seq:
|
48
60
|
# new() → XPathVisitor
|
49
61
|
# new(builtins:, doctype:) → XPathVisitor
|
@@ -54,7 +66,12 @@ module Nokogiri
|
|
54
66
|
#
|
55
67
|
# [Returns] XPathVisitor
|
56
68
|
#
|
57
|
-
def initialize(
|
69
|
+
def initialize(
|
70
|
+
builtins: BuiltinsConfig::NEVER,
|
71
|
+
doctype: DoctypeConfig::XML,
|
72
|
+
prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
|
73
|
+
namespaces: nil
|
74
|
+
)
|
58
75
|
unless BuiltinsConfig::VALUES.include?(builtins)
|
59
76
|
raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
|
60
77
|
end
|
@@ -64,6 +81,8 @@ module Nokogiri
|
|
64
81
|
|
65
82
|
@builtins = builtins
|
66
83
|
@doctype = doctype
|
84
|
+
@prefix = prefix
|
85
|
+
@namespaces = namespaces
|
67
86
|
end
|
68
87
|
|
69
88
|
# :call-seq: config() → Hash
|
@@ -72,7 +91,7 @@ module Nokogiri
|
|
72
91
|
# a Hash representing the configuration of the XPathVisitor, suitable for use as
|
73
92
|
# part of the CSS cache key.
|
74
93
|
def config
|
75
|
-
{ builtins: @builtins, doctype: @doctype }
|
94
|
+
{ builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
|
76
95
|
end
|
77
96
|
|
78
97
|
# :stopdoc:
|
@@ -128,6 +147,8 @@ module Nokogiri
|
|
128
147
|
is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
|
129
148
|
".#{"//" unless is_direct}#{node.value[1].accept(self)}"
|
130
149
|
else
|
150
|
+
validate_xpath_function_name(node.value.first)
|
151
|
+
|
131
152
|
# xpath function call, let's marshal those arguments
|
132
153
|
args = ["."]
|
133
154
|
args += node.value[1..-1].map do |n|
|
@@ -207,6 +228,7 @@ module Nokogiri
|
|
207
228
|
when "parent" then "node()"
|
208
229
|
when "root" then "not(parent::*)"
|
209
230
|
else
|
231
|
+
validate_xpath_function_name(node.value.first)
|
210
232
|
"nokogiri:#{node.value.first}(.)"
|
211
233
|
end
|
212
234
|
end
|
@@ -255,6 +277,14 @@ module Nokogiri
|
|
255
277
|
else
|
256
278
|
"*[local-name()='#{node.value.first}']"
|
257
279
|
end
|
280
|
+
elsif node.value.length == 2 # has a namespace prefix
|
281
|
+
if node.value.first.nil? # namespace prefix is empty
|
282
|
+
node.value.last
|
283
|
+
else
|
284
|
+
node.value.join(":")
|
285
|
+
end
|
286
|
+
elsif @namespaces&.key?("xmlns") # apply the default namespace if it's declared
|
287
|
+
"xmlns:#{node.value.first}"
|
258
288
|
else
|
259
289
|
node.value.first
|
260
290
|
end
|
@@ -270,11 +300,17 @@ module Nokogiri
|
|
270
300
|
|
271
301
|
private
|
272
302
|
|
303
|
+
def validate_xpath_function_name(name)
|
304
|
+
if name.start_with?("-")
|
305
|
+
raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
|
306
|
+
end
|
307
|
+
end
|
308
|
+
|
273
309
|
def html5_element_name_needs_namespace_handling(node)
|
274
|
-
# if
|
275
|
-
node.value.
|
276
|
-
# if
|
277
|
-
|
310
|
+
# if there is already a namespace (i.e., it is a prefixed QName), use it as normal
|
311
|
+
node.value.length == 1 &&
|
312
|
+
# if this is the wildcard selector "*", use it as normal
|
313
|
+
node.value.first != "*"
|
278
314
|
end
|
279
315
|
|
280
316
|
def nth(node, options = {})
|