nokogiri 1.16.8 → 1.18.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/LICENSE-DEPENDENCIES.md +6 -6
- data/README.md +8 -5
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +188 -142
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +134 -103
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +219 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +103 -100
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/gumbo-parser/src/ascii.c +2 -2
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +63 -25
- data/gumbo-parser/src/tokenizer.c +6 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +43 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +38 -42
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml/xpath_context.rb +14 -3
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +13 -12
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/gumbo-parser/src/parser.c
CHANGED
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
|
|
56
56
|
.fragment_encoding = NULL,
|
57
57
|
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
|
58
58
|
.fragment_context_has_form_ancestor = false,
|
59
|
+
.parse_noscript_content_as_text = false,
|
59
60
|
};
|
60
61
|
|
61
62
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
@@ -317,7 +318,7 @@ static GumboNode* create_node(GumboNodeType type) {
|
|
317
318
|
return node;
|
318
319
|
}
|
319
320
|
|
320
|
-
static GumboNode* new_document_node() {
|
321
|
+
static GumboNode* new_document_node(void) {
|
321
322
|
GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
|
322
323
|
document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
|
323
324
|
gumbo_vector_init(1, &document_node->v.document.children);
|
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
|
|
749
750
|
GumboParserError* extra_data = &error->v.parser;
|
750
751
|
extra_data->input_type = token->type;
|
751
752
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
752
|
-
|
753
|
+
extra_data->input_name = NULL;
|
754
|
+
if (token->type == GUMBO_TOKEN_START_TAG)
|
755
|
+
{
|
753
756
|
extra_data->input_tag = token->v.start_tag.tag;
|
754
|
-
|
757
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
|
758
|
+
extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
|
759
|
+
}
|
760
|
+
}
|
761
|
+
else if (token->type == GUMBO_TOKEN_END_TAG)
|
762
|
+
{
|
755
763
|
extra_data->input_tag = token->v.end_tag.tag;
|
764
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
|
765
|
+
extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
|
766
|
+
}
|
756
767
|
}
|
757
768
|
const GumboParserState* state = parser->_parser_state;
|
758
769
|
extra_data->parser_state = state->_insertion_mode;
|
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
|
|
763
774
|
node->type == GUMBO_NODE_ELEMENT
|
764
775
|
|| node->type == GUMBO_NODE_TEMPLATE
|
765
776
|
);
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
777
|
+
void *tag;
|
778
|
+
if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
|
779
|
+
tag = gumbo_strdup(node->v.element.name);
|
780
|
+
} else {
|
781
|
+
tag = (void *)(uintptr_t)node->v.element.tag;
|
782
|
+
}
|
783
|
+
gumbo_vector_add(tag, &extra_data->tag_stack);
|
770
784
|
}
|
771
785
|
}
|
772
786
|
|
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
|
|
1187
1201
|
element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
|
1188
1202
|
insert_element(parser, element, false);
|
1189
1203
|
gumbo_debug (
|
1190
|
-
"Inserting
|
1204
|
+
"Inserting <%s> element (@%p) from tag type.\n",
|
1191
1205
|
gumbo_normalized_tagname(tag),
|
1192
1206
|
(void*)element
|
1193
1207
|
);
|
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
|
|
1204
1218
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1205
1219
|
GumboNode* element = create_element_from_token(token, tag_namespace);
|
1206
1220
|
insert_element(parser, element, false);
|
1221
|
+
gumbo_debug (
|
1222
|
+
"Inserting <%s> foreign element (@%p).\n",
|
1223
|
+
gumbo_normalized_tagname(element->v.element.tag),
|
1224
|
+
(void*)element
|
1225
|
+
);
|
1207
1226
|
if (
|
1208
1227
|
token_has_attribute(token, "xmlns")
|
1209
1228
|
&& !attribute_matches_case_sensitive (
|
@@ -1978,7 +1997,7 @@ static void adjust_svg_tag(GumboToken* token) {
|
|
1978
1997
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1979
1998
|
if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
|
1980
1999
|
assert(token->v.start_tag.name == NULL);
|
1981
|
-
token->v.start_tag.name = "foreignObject";
|
2000
|
+
token->v.start_tag.name = (char *)"foreignObject";
|
1982
2001
|
} else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
|
1983
2002
|
assert(token->v.start_tag.name);
|
1984
2003
|
const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
|
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
|
|
2066
2085
|
|
2067
2086
|
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
2087
|
static void ignore_token(GumboParser* parser) {
|
2088
|
+
gumbo_debug("Ignoring token.\n");
|
2069
2089
|
GumboToken* token = parser->_parser_state->_current_token;
|
2070
2090
|
// Ownership of the token's internal buffers is normally transferred to the
|
2071
2091
|
// element, but if no element is emitted (as happens in non-verbatim-mode
|
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
|
2430
2450
|
|
2431
2451
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
2432
2452
|
static void finish_parsing(GumboParser* parser) {
|
2433
|
-
gumbo_debug("Finishing parsing");
|
2453
|
+
gumbo_debug("Finishing parsing\n");
|
2434
2454
|
maybe_flush_text_node_buffer(parser);
|
2435
2455
|
GumboParserState* state = parser->_parser_state;
|
2436
2456
|
for (
|
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2608
2628
|
}
|
2609
2629
|
if (
|
2610
2630
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2631
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
2611
2632
|
) {
|
2612
2633
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2613
2634
|
return;
|
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3313
3334
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3314
3335
|
return;
|
3315
3336
|
}
|
3316
|
-
if (
|
3337
|
+
if (
|
3338
|
+
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|
3339
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
3340
|
+
) {
|
3317
3341
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3318
3342
|
return;
|
3319
3343
|
}
|
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
|
4389
4413
|
|
4390
4414
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4391
4415
|
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4392
|
-
gumbo_debug("Handling foreign content");
|
4416
|
+
gumbo_debug("Handling foreign content.\n");
|
4393
4417
|
switch (token->type) {
|
4394
4418
|
case GUMBO_TOKEN_NULL:
|
4395
4419
|
parser_add_parse_error(parser, token);
|
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4507
4531
|
if (i == 0)
|
4508
4532
|
return;
|
4509
4533
|
// We can't call handle_token directly because the current node is still in
|
4510
|
-
// a
|
4534
|
+
// a foreign namespace, so it would re-enter this and result in infinite
|
4511
4535
|
// recursion.
|
4512
4536
|
handle_html_content(parser, token);
|
4513
4537
|
}
|
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
|
|
4627
4651
|
const char* fragment_encoding = options->fragment_encoding;
|
4628
4652
|
GumboQuirksModeEnum quirks = options->quirks_mode;
|
4629
4653
|
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
|
4630
|
-
|
4631
4654
|
GumboNode* root;
|
4632
|
-
|
4655
|
+
|
4656
|
+
// 1. [Create a new Document node, and mark it as being an HTML document.]
|
4657
|
+
// 2. [If the node document of the context element is in quirks mode, then
|
4658
|
+
// let the Document be in quirks mode. Otherwise, if the node document of
|
4659
|
+
// the context element is in limited-quirks mode, then let the Document
|
4660
|
+
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
|
4661
|
+
// mode.]
|
4633
4662
|
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
|
4634
4663
|
|
4635
|
-
// 3.
|
4664
|
+
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
|
4665
|
+
// declarative shadow roots to true.]
|
4666
|
+
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
|
4667
|
+
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
|
4636
4668
|
parser->_parser_state->_fragment_ctx =
|
4637
4669
|
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
|
4638
4670
|
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
|
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
|
|
4659
4691
|
break;
|
4660
4692
|
|
4661
4693
|
case GUMBO_TAG_NOSCRIPT:
|
4662
|
-
|
4663
|
-
|
4694
|
+
if (options->parse_noscript_content_as_text)
|
4695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4664
4696
|
break;
|
4665
4697
|
|
4666
4698
|
case GUMBO_TAG_PLAINTEXT:
|
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4762
4794
|
adjusted_current_node &&
|
4763
4795
|
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4764
4796
|
);
|
4765
|
-
|
4797
|
+
// If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
|
4798
|
+
//
|
4799
|
+
// The parser is pretty fragile. Breaking out of the parsing loop in the middle of
|
4800
|
+
// the parse can leave the document in an inconsistent state.
|
4801
|
+
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4802
|
+
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4803
|
+
gumbo_debug("Tree depth limit exceeded.\n");
|
4804
|
+
token.type = GUMBO_TOKEN_EOF;
|
4805
|
+
} else {
|
4806
|
+
gumbo_lex(&parser, &token);
|
4807
|
+
}
|
4808
|
+
|
4766
4809
|
}
|
4767
4810
|
|
4768
4811
|
const char* token_type = "text";
|
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4786
4829
|
break;
|
4787
4830
|
}
|
4788
4831
|
gumbo_debug (
|
4789
|
-
"Handling %s token @%lu:%lu in
|
4832
|
+
"Handling %s token @%lu:%lu in insertion mode %u.\n",
|
4790
4833
|
(char*) token_type,
|
4791
4834
|
(unsigned long)token.position.line,
|
4792
4835
|
(unsigned long)token.position.column,
|
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
|
|
4830
4873
|
gumbo_free(token.v.end_tag.name);
|
4831
4874
|
token.v.end_tag.name = NULL;
|
4832
4875
|
}
|
4833
|
-
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4834
|
-
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4835
|
-
gumbo_debug("Tree depth limit exceeded.\n");
|
4836
|
-
break;
|
4837
|
-
}
|
4838
4876
|
}
|
4839
4877
|
|
4840
4878
|
|
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
340
340
|
|
341
341
|
// Sets the tag buffer original text and start point to the current iterator
|
342
342
|
// position. This is necessary because attribute names & values may have
|
343
|
-
// whitespace
|
343
|
+
// whitespace preceding them, and so we can't assume that the actual token
|
344
344
|
// starting point was the end of the last tag buffer usage.
|
345
345
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
346
346
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
|
569
569
|
}
|
570
570
|
|
571
571
|
// Appends a codepoint to the current tag buffer. If
|
572
|
-
//
|
572
|
+
// reinitialize_position_on_first is set, this also initializes the tag buffer
|
573
573
|
// start point; the only time you would *not* want to pass true for this
|
574
574
|
// parameter is if you want the original_text to include a character (like an
|
575
575
|
// opening quote) that doesn't appear in the value.
|
576
576
|
static void append_char_to_tag_buffer (
|
577
577
|
GumboParser* parser,
|
578
578
|
int codepoint,
|
579
|
-
bool
|
579
|
+
bool reinitialize_position_on_first
|
580
580
|
) {
|
581
581
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
582
|
-
if (buffer->length == 0 &&
|
582
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
583
583
|
reset_tag_buffer_start_point(parser);
|
584
584
|
}
|
585
585
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
|
|
589
589
|
static void append_string_to_tag_buffer (
|
590
590
|
GumboParser* parser,
|
591
591
|
GumboStringPiece* str,
|
592
|
-
bool
|
592
|
+
bool reinitialize_position_on_first
|
593
593
|
) {
|
594
594
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
595
|
-
if (buffer->length == 0 &&
|
595
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
596
596
|
reset_tag_buffer_start_point(parser);
|
597
597
|
}
|
598
598
|
gumbo_string_buffer_append_string(str, buffer);
|
@@ -18,7 +18,7 @@ module Nokogiri
|
|
18
18
|
#
|
19
19
|
module ClassResolver
|
20
20
|
# #related_class restricts matching namespaces to those matching this set.
|
21
|
-
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
|
21
|
+
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
|
22
22
|
|
23
23
|
# :call-seq:
|
24
24
|
# related_class(class_name) → Class
|
data/lib/nokogiri/css/node.rb
CHANGED
@@ -23,8 +23,12 @@ module Nokogiri
|
|
23
23
|
|
24
24
|
###
|
25
25
|
# Convert this CSS node to xpath with +prefix+ using +visitor+
|
26
|
-
def to_xpath(
|
27
|
-
prefix =
|
26
|
+
def to_xpath(visitor)
|
27
|
+
prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
|
28
|
+
"."
|
29
|
+
else
|
30
|
+
visitor.prefix
|
31
|
+
end
|
28
32
|
prefix + visitor.accept(self)
|
29
33
|
end
|
30
34
|
|
data/lib/nokogiri/css/parser.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
#
|
3
3
|
# DO NOT MODIFY!!!!
|
4
|
-
# This file is automatically generated by Racc 1.
|
5
|
-
# from Racc grammar file "".
|
4
|
+
# This file is automatically generated by Racc 1.8.0
|
5
|
+
# from Racc grammar file "parser.y".
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'racc/parser.rb'
|
@@ -291,6 +291,7 @@ Racc_arg = [
|
|
291
291
|
racc_shift_n,
|
292
292
|
racc_reduce_n,
|
293
293
|
racc_use_result_var ]
|
294
|
+
Ractor.make_shareable(Racc_arg) if defined?(Ractor)
|
294
295
|
|
295
296
|
Racc_token_to_s_table = [
|
296
297
|
"$end",
|
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
|
|
351
352
|
"negation",
|
352
353
|
"eql_incl_dash",
|
353
354
|
"negation_arg" ]
|
355
|
+
Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
|
354
356
|
|
355
357
|
Racc_debug_parser = false
|
356
358
|
|
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
|
|
468
470
|
end
|
469
471
|
|
470
472
|
def _reduce_24(val, _values, result)
|
471
|
-
result = Node.new(:ELEMENT_NAME, [
|
473
|
+
result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
|
472
474
|
result
|
473
475
|
end
|
474
476
|
|
475
477
|
def _reduce_25(val, _values, result)
|
476
|
-
name =
|
478
|
+
name = val[0]
|
477
479
|
result = Node.new(:ELEMENT_NAME, [name])
|
478
480
|
|
479
481
|
result
|
data/lib/nokogiri/css/parser.y
CHANGED
@@ -64,9 +64,9 @@ rule
|
|
64
64
|
;
|
65
65
|
|
66
66
|
namespaced_ident:
|
67
|
-
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [
|
67
|
+
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
|
68
68
|
| IDENT {
|
69
|
-
name =
|
69
|
+
name = val[0]
|
70
70
|
result = Node.new(:ELEMENT_NAME, [name])
|
71
71
|
}
|
72
72
|
;
|
@@ -5,62 +5,9 @@ require "thread"
|
|
5
5
|
module Nokogiri
|
6
6
|
module CSS
|
7
7
|
class Parser < Racc::Parser # :nodoc:
|
8
|
-
|
9
|
-
|
10
|
-
@cache = {}
|
11
|
-
@mutex = Mutex.new
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
|
15
|
-
def cache_on?
|
16
|
-
!Thread.current[CACHE_SWITCH_NAME]
|
17
|
-
end
|
18
|
-
|
19
|
-
# Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
|
20
|
-
def set_cache(value) # rubocop:disable Naming/AccessorMethodName
|
21
|
-
Thread.current[CACHE_SWITCH_NAME] = !value
|
22
|
-
end
|
23
|
-
|
24
|
-
# Get the css selector in +string+ from the cache
|
25
|
-
def [](string)
|
26
|
-
return unless cache_on?
|
27
|
-
|
28
|
-
@mutex.synchronize { @cache[string] }
|
29
|
-
end
|
30
|
-
|
31
|
-
# Set the css selector in +string+ in the cache to +value+
|
32
|
-
def []=(string, value)
|
33
|
-
return value unless cache_on?
|
34
|
-
|
35
|
-
@mutex.synchronize { @cache[string] = value }
|
36
|
-
end
|
37
|
-
|
38
|
-
# Clear the cache
|
39
|
-
def clear_cache(create_new_object = false)
|
40
|
-
@mutex.synchronize do
|
41
|
-
if create_new_object
|
42
|
-
@cache = {}
|
43
|
-
else
|
44
|
-
@cache.clear
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
# Execute +block+ without cache
|
50
|
-
def without_cache(&block)
|
51
|
-
original_cache_setting = cache_on?
|
52
|
-
set_cache(false)
|
53
|
-
yield
|
54
|
-
ensure
|
55
|
-
set_cache(original_cache_setting)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Create a new CSS parser with respect to +namespaces+
|
60
|
-
def initialize(namespaces = {})
|
8
|
+
def initialize
|
61
9
|
@tokenizer = Tokenizer.new
|
62
|
-
|
63
|
-
super()
|
10
|
+
super
|
64
11
|
end
|
65
12
|
|
66
13
|
def parse(string)
|
@@ -72,11 +19,10 @@ module Nokogiri
|
|
72
19
|
@tokenizer.next_token
|
73
20
|
end
|
74
21
|
|
75
|
-
# Get the xpath for +
|
76
|
-
def xpath_for(
|
77
|
-
|
78
|
-
|
79
|
-
ast.to_xpath(prefix, visitor)
|
22
|
+
# Get the xpath for +selector+ using +visitor+
|
23
|
+
def xpath_for(selector, visitor)
|
24
|
+
parse(selector).map do |ast|
|
25
|
+
ast.to_xpath(visitor)
|
80
26
|
end
|
81
27
|
end
|
82
28
|
|
@@ -85,12 +31,6 @@ module Nokogiri
|
|
85
31
|
after = value_stack.compact.last
|
86
32
|
raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
|
87
33
|
end
|
88
|
-
|
89
|
-
def cache_key(query, prefix, visitor)
|
90
|
-
if self.class.cache_on?
|
91
|
-
[query, prefix, @namespaces, visitor.config]
|
92
|
-
end
|
93
|
-
end
|
94
34
|
end
|
95
35
|
end
|
96
36
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module CSS
|
5
|
+
module SelectorCache # :nodoc:
|
6
|
+
@cache = {}
|
7
|
+
@mutex = Mutex.new
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Retrieve the cached XPath expressions for the key
|
11
|
+
def [](key)
|
12
|
+
@mutex.synchronize { @cache[key] }
|
13
|
+
end
|
14
|
+
|
15
|
+
# Insert the XPath expressions `value` at the cache key
|
16
|
+
def []=(key, value)
|
17
|
+
@mutex.synchronize { @cache[key] = value }
|
18
|
+
end
|
19
|
+
|
20
|
+
# Clear the cache
|
21
|
+
def clear_cache(create_new_object = false)
|
22
|
+
@mutex.synchronize do
|
23
|
+
if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
|
24
|
+
@cache = {}
|
25
|
+
else
|
26
|
+
@cache.clear
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Construct a unique cache key
|
32
|
+
def key(selector:, visitor:)
|
33
|
+
[selector, visitor.config]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -63,13 +63,13 @@ class Tokenizer
|
|
63
63
|
when (text = @ss.scan(/has\([\s]*/))
|
64
64
|
action { [:HAS, text] }
|
65
65
|
|
66
|
-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]
|
66
|
+
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
|
67
67
|
action { [:FUNCTION, text] }
|
68
68
|
|
69
|
-
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]
|
69
|
+
when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
|
70
70
|
action { [:IDENT, text] }
|
71
71
|
|
72
|
-
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]
|
72
|
+
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
|
73
73
|
action { [:HASH, text] }
|
74
74
|
|
75
75
|
when (text = @ss.scan(/[\s]*~=[\s]*/))
|
@@ -132,7 +132,7 @@ class Tokenizer
|
|
132
132
|
when (text = @ss.scan(/[\s]+/))
|
133
133
|
action { [:S, text] }
|
134
134
|
|
135
|
-
when (text = @ss.scan(/"([^\n\r\f"]
|
135
|
+
when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
|
136
136
|
action { [:STRING, text] }
|
137
137
|
|
138
138
|
when (text = @ss.scan(/./))
|
@@ -4,20 +4,21 @@ module CSS
|
|
4
4
|
class Tokenizer
|
5
5
|
|
6
6
|
macro
|
7
|
-
nl \n|\r\n|\r|\f
|
7
|
+
nl (\n|\r\n|\r|\f)
|
8
8
|
w [\s]*
|
9
9
|
nonascii [^\0-\177]
|
10
10
|
num -?([0-9]+|[0-9]*\.[0-9]+)
|
11
11
|
unicode \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
|
12
12
|
|
13
|
-
escape {unicode}|\\[^\n\r\f0-9A-Fa-f]
|
14
|
-
nmchar [_A-Za-z0-9-]|{nonascii}|{escape}
|
15
|
-
nmstart [_A-Za-z]|{nonascii}|{escape}
|
16
|
-
|
17
|
-
name
|
13
|
+
escape ({unicode}|\\[^\n\r\f0-9A-Fa-f])
|
14
|
+
nmchar ([_A-Za-z0-9-]|{nonascii}|{escape})
|
15
|
+
nmstart ([_A-Za-z]|{nonascii}|{escape})
|
16
|
+
name {nmstart}{nmchar}*
|
17
|
+
ident -?{name}
|
18
|
+
charref {nmchar}+
|
18
19
|
string1 "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
|
19
20
|
string2 '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
|
20
|
-
string {string1}|{string2}
|
21
|
+
string ({string1}|{string2})
|
21
22
|
|
22
23
|
rule
|
23
24
|
|
@@ -26,7 +27,7 @@ rule
|
|
26
27
|
has\({w} { [:HAS, text] }
|
27
28
|
{ident}\({w} { [:FUNCTION, text] }
|
28
29
|
{ident} { [:IDENT, text] }
|
29
|
-
\#{
|
30
|
+
\#{charref} { [:HASH, text] }
|
30
31
|
{w}~={w} { [:INCLUDES, text] }
|
31
32
|
{w}\|={w} { [:DASHMATCH, text] }
|
32
33
|
{w}\^={w} { [:PREFIXMATCH, text] }
|
@@ -44,6 +44,18 @@ module Nokogiri
|
|
44
44
|
VALUES = [XML, HTML4, HTML5]
|
45
45
|
end
|
46
46
|
|
47
|
+
# The visitor configuration set via the +builtins:+ keyword argument to XPathVisitor.new.
|
48
|
+
attr_reader :builtins
|
49
|
+
|
50
|
+
# The visitor configuration set via the +doctype:+ keyword argument to XPathVisitor.new.
|
51
|
+
attr_reader :doctype
|
52
|
+
|
53
|
+
# The visitor configuration set via the +prefix:+ keyword argument to XPathVisitor.new.
|
54
|
+
attr_reader :prefix
|
55
|
+
|
56
|
+
# The visitor configuration set via the +namespaces:+ keyword argument to XPathVisitor.new.
|
57
|
+
attr_reader :namespaces
|
58
|
+
|
47
59
|
# :call-seq:
|
48
60
|
# new() → XPathVisitor
|
49
61
|
# new(builtins:, doctype:) → XPathVisitor
|
@@ -54,7 +66,12 @@ module Nokogiri
|
|
54
66
|
#
|
55
67
|
# [Returns] XPathVisitor
|
56
68
|
#
|
57
|
-
def initialize(
|
69
|
+
def initialize(
|
70
|
+
builtins: BuiltinsConfig::NEVER,
|
71
|
+
doctype: DoctypeConfig::XML,
|
72
|
+
prefix: Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX,
|
73
|
+
namespaces: nil
|
74
|
+
)
|
58
75
|
unless BuiltinsConfig::VALUES.include?(builtins)
|
59
76
|
raise(ArgumentError, "Invalid values #{builtins.inspect} for builtins: keyword parameter")
|
60
77
|
end
|
@@ -64,6 +81,8 @@ module Nokogiri
|
|
64
81
|
|
65
82
|
@builtins = builtins
|
66
83
|
@doctype = doctype
|
84
|
+
@prefix = prefix
|
85
|
+
@namespaces = namespaces
|
67
86
|
end
|
68
87
|
|
69
88
|
# :call-seq: config() → Hash
|
@@ -72,7 +91,7 @@ module Nokogiri
|
|
72
91
|
# a Hash representing the configuration of the XPathVisitor, suitable for use as
|
73
92
|
# part of the CSS cache key.
|
74
93
|
def config
|
75
|
-
{ builtins: @builtins, doctype: @doctype }
|
94
|
+
{ builtins: @builtins, doctype: @doctype, prefix: @prefix, namespaces: @namespaces }
|
76
95
|
end
|
77
96
|
|
78
97
|
# :stopdoc:
|
@@ -128,6 +147,8 @@ module Nokogiri
|
|
128
147
|
is_direct = node.value[1].value[0].nil? # e.g. "has(> a)", "has(~ a)", "has(+ a)"
|
129
148
|
".#{"//" unless is_direct}#{node.value[1].accept(self)}"
|
130
149
|
else
|
150
|
+
validate_xpath_function_name(node.value.first)
|
151
|
+
|
131
152
|
# xpath function call, let's marshal those arguments
|
132
153
|
args = ["."]
|
133
154
|
args += node.value[1..-1].map do |n|
|
@@ -207,6 +228,7 @@ module Nokogiri
|
|
207
228
|
when "parent" then "node()"
|
208
229
|
when "root" then "not(parent::*)"
|
209
230
|
else
|
231
|
+
validate_xpath_function_name(node.value.first)
|
210
232
|
"nokogiri:#{node.value.first}(.)"
|
211
233
|
end
|
212
234
|
end
|
@@ -255,6 +277,15 @@ module Nokogiri
|
|
255
277
|
else
|
256
278
|
"*[local-name()='#{node.value.first}']"
|
257
279
|
end
|
280
|
+
elsif node.value.length == 2 # has a namespace prefix
|
281
|
+
if node.value.first.nil? # namespace prefix is empty
|
282
|
+
node.value.last
|
283
|
+
else
|
284
|
+
node.value.join(":")
|
285
|
+
end
|
286
|
+
elsif node.value.first != "*" && @namespaces&.key?("xmlns")
|
287
|
+
# apply the default namespace (if one is present) to a non-wildcard selector
|
288
|
+
"xmlns:#{node.value.first}"
|
258
289
|
else
|
259
290
|
node.value.first
|
260
291
|
end
|
@@ -270,11 +301,17 @@ module Nokogiri
|
|
270
301
|
|
271
302
|
private
|
272
303
|
|
304
|
+
def validate_xpath_function_name(name)
|
305
|
+
if name.start_with?("-")
|
306
|
+
raise Nokogiri::CSS::SyntaxError, "Invalid XPath function name '#{name}'"
|
307
|
+
end
|
308
|
+
end
|
309
|
+
|
273
310
|
def html5_element_name_needs_namespace_handling(node)
|
274
|
-
# if
|
275
|
-
node.value.
|
276
|
-
# if
|
277
|
-
|
311
|
+
# if there is already a namespace (i.e., it is a prefixed QName), use it as normal
|
312
|
+
node.value.length == 1 &&
|
313
|
+
# if this is the wildcard selector "*", use it as normal
|
314
|
+
node.value.first != "*"
|
278
315
|
end
|
279
316
|
|
280
317
|
def nth(node, options = {})
|