nokogiri 1.16.8 → 1.17.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +134 -103
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +61 -23
- data/gumbo-parser/src/tokenizer.c +6 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +11 -8
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/gumbo-parser/src/error.c
CHANGED
@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
|
|
46
46
|
args
|
47
47
|
);
|
48
48
|
va_end(args);
|
49
|
-
|
49
|
+
|
50
|
+
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
|
50
51
|
if (bytes_written == -1) {
|
51
52
|
// vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
|
52
53
|
// instead of returning the number of bytes that would've been written had
|
53
|
-
// there been enough. In this case, we
|
54
|
-
//
|
55
|
-
//
|
56
|
-
|
54
|
+
// there been enough. In this case, we can call vsnprintf() again but
|
55
|
+
// with a count of 0 to get the number of bytes written, not including
|
56
|
+
// the null terminator.
|
57
|
+
// https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
|
58
|
+
|
57
59
|
va_start(args, format);
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
bytes_written = vsnprintf (
|
61
|
+
NULL,
|
62
|
+
0,
|
61
63
|
format,
|
62
64
|
args
|
63
65
|
);
|
64
66
|
va_end(args);
|
65
|
-
return result == -1 ? 0 : result;
|
66
67
|
}
|
67
|
-
#
|
68
|
+
#endif
|
69
|
+
|
68
70
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
69
71
|
if (bytes_written == -1) {
|
70
72
|
return 0;
|
71
73
|
}
|
72
|
-
#endif
|
73
74
|
|
74
75
|
if (bytes_written >= remaining_capacity) {
|
75
|
-
|
76
|
+
// At least double the size of the buffer.
|
77
|
+
size_t new_capacity = output->capacity * 2;
|
78
|
+
if (new_capacity < output->length + bytes_written + 1) {
|
79
|
+
// The +1 is for the null terminator.
|
80
|
+
new_capacity = output->length + bytes_written + 1;
|
81
|
+
}
|
82
|
+
gumbo_string_buffer_reserve(new_capacity, output);
|
76
83
|
remaining_capacity = output->capacity - output->length;
|
77
84
|
va_start(args, format);
|
78
85
|
bytes_written = vsnprintf (
|
@@ -96,8 +103,14 @@ static void print_tag_stack (
|
|
96
103
|
if (i) {
|
97
104
|
print_message(output, ", ");
|
98
105
|
}
|
99
|
-
|
100
|
-
|
106
|
+
uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
|
107
|
+
const char* tag_name;
|
108
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
109
|
+
tag_name = error->tag_stack.data[i];
|
110
|
+
} else {
|
111
|
+
tag_name = gumbo_normalized_tagname((GumboTag)tag);
|
112
|
+
}
|
113
|
+
print_message(output, "%s", tag_name);
|
101
114
|
}
|
102
115
|
gumbo_string_buffer_append_codepoint('.', output);
|
103
116
|
}
|
@@ -326,41 +339,45 @@ static void handle_parser_error (
|
|
326
339
|
}
|
327
340
|
|
328
341
|
switch (error->input_type) {
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
print_tag_stack(error, output);
|
352
|
-
}
|
353
|
-
return;
|
354
|
-
case GUMBO_TOKEN_START_TAG:
|
355
|
-
print_message(output, "Start tag '%s' isn't allowed here.",
|
356
|
-
gumbo_normalized_tagname(error->input_tag));
|
357
|
-
print_tag_stack(error, output);
|
358
|
-
return;
|
359
|
-
case GUMBO_TOKEN_END_TAG:
|
360
|
-
print_message(output, "End tag '%s' isn't allowed here.",
|
361
|
-
gumbo_normalized_tagname(error->input_tag));
|
342
|
+
case GUMBO_TOKEN_DOCTYPE:
|
343
|
+
print_message(output, "This is not a legal doctype");
|
344
|
+
return;
|
345
|
+
case GUMBO_TOKEN_COMMENT:
|
346
|
+
// Should never happen; comments are always legal.
|
347
|
+
assert(0);
|
348
|
+
// But just in case...
|
349
|
+
print_message(output, "Comments aren't legal here");
|
350
|
+
return;
|
351
|
+
case GUMBO_TOKEN_CDATA:
|
352
|
+
case GUMBO_TOKEN_WHITESPACE:
|
353
|
+
case GUMBO_TOKEN_CHARACTER:
|
354
|
+
print_message(output, "Character tokens aren't legal here");
|
355
|
+
return;
|
356
|
+
case GUMBO_TOKEN_NULL:
|
357
|
+
print_message(output, "Null bytes are not allowed in HTML5");
|
358
|
+
return;
|
359
|
+
case GUMBO_TOKEN_EOF:
|
360
|
+
if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
|
361
|
+
print_message(output, "You must provide a doctype");
|
362
|
+
} else {
|
363
|
+
print_message(output, "Premature end of file.");
|
362
364
|
print_tag_stack(error, output);
|
363
|
-
|
365
|
+
}
|
366
|
+
return;
|
367
|
+
case GUMBO_TOKEN_START_TAG:
|
368
|
+
case GUMBO_TOKEN_END_TAG:
|
369
|
+
{
|
370
|
+
const char* tag_name;
|
371
|
+
const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
|
372
|
+
if (error->input_name) {
|
373
|
+
tag_name = error->input_name;
|
374
|
+
} else {
|
375
|
+
tag_name = gumbo_normalized_tagname(error->input_tag);
|
376
|
+
}
|
377
|
+
print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
|
378
|
+
print_tag_stack(error, output);
|
379
|
+
return;
|
380
|
+
}
|
364
381
|
}
|
365
382
|
}
|
366
383
|
|
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
|
|
613
630
|
|
614
631
|
void gumbo_error_destroy(GumboError* error) {
|
615
632
|
if (error->type == GUMBO_ERR_PARSER) {
|
633
|
+
// Free the tag name.
|
634
|
+
if (error->v.parser.input_name) {
|
635
|
+
gumbo_free(error->v.parser.input_name);
|
636
|
+
}
|
637
|
+
|
638
|
+
for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
|
639
|
+
intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
|
640
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
641
|
+
gumbo_free(error->v.parser.tag_stack.data[i]);
|
642
|
+
}
|
643
|
+
}
|
616
644
|
gumbo_vector_destroy(&error->v.parser.tag_stack);
|
617
645
|
}
|
618
646
|
gumbo_free(error);
|
data/gumbo-parser/src/error.h
CHANGED
@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
|
|
95
95
|
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
|
96
96
|
GumboTag input_tag;
|
97
97
|
|
98
|
+
// The HTML tag of the input token if it was nonstandard tag token. NULL otherwise.
|
99
|
+
char *input_name;
|
100
|
+
|
98
101
|
// The insertion mode that the parser was in at the time.
|
99
102
|
GumboInsertionMode parser_state;
|
100
103
|
|
101
104
|
// The tag stack at the point of the error. Note that this is an GumboVector
|
102
105
|
// of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
|
103
|
-
// get at the tag.
|
106
|
+
// get at the tag. For nonstandard tags, this is a pointer to an owned char *
|
107
|
+
// containing the tag name.
|
104
108
|
GumboVector /* GumboTag */ tag_stack;
|
105
109
|
} GumboParserError;
|
106
110
|
|
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
|
|
780
780
|
* Default: `false`.
|
781
781
|
*/
|
782
782
|
bool fragment_context_has_form_ancestor;
|
783
|
+
|
784
|
+
/**
|
785
|
+
* Parse `noscript` elements as if scripting was enabled. This causes the
|
786
|
+
* contents of the `noscript` element to be parsed as raw text, rather
|
787
|
+
* than as HTML elements.
|
788
|
+
*
|
789
|
+
* Default: `false`.
|
790
|
+
*/
|
791
|
+
bool parse_noscript_content_as_text;
|
783
792
|
} GumboOptions;
|
784
793
|
|
785
794
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
|
|
791
800
|
*/
|
792
801
|
typedef enum {
|
793
802
|
/**
|
794
|
-
* Indicates that parsing completed
|
803
|
+
* Indicates that parsing completed successfully. The resulting tree
|
795
804
|
* will be a complete document.
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_OK,
|
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
|
|
841
850
|
GumboVector /* GumboError */ errors;
|
842
851
|
|
843
852
|
/**
|
844
|
-
* True if the parser
|
853
|
+
* True if the parser encountered an error.
|
845
854
|
*
|
846
855
|
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
847
856
|
* option was set to 0.
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
|
|
56
56
|
.fragment_encoding = NULL,
|
57
57
|
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
|
58
58
|
.fragment_context_has_form_ancestor = false,
|
59
|
+
.parse_noscript_content_as_text = false,
|
59
60
|
};
|
60
61
|
|
61
62
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
|
|
749
750
|
GumboParserError* extra_data = &error->v.parser;
|
750
751
|
extra_data->input_type = token->type;
|
751
752
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
752
|
-
|
753
|
+
extra_data->input_name = NULL;
|
754
|
+
if (token->type == GUMBO_TOKEN_START_TAG)
|
755
|
+
{
|
753
756
|
extra_data->input_tag = token->v.start_tag.tag;
|
754
|
-
|
757
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
|
758
|
+
extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
|
759
|
+
}
|
760
|
+
}
|
761
|
+
else if (token->type == GUMBO_TOKEN_END_TAG)
|
762
|
+
{
|
755
763
|
extra_data->input_tag = token->v.end_tag.tag;
|
764
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
|
765
|
+
extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
|
766
|
+
}
|
756
767
|
}
|
757
768
|
const GumboParserState* state = parser->_parser_state;
|
758
769
|
extra_data->parser_state = state->_insertion_mode;
|
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
|
|
763
774
|
node->type == GUMBO_NODE_ELEMENT
|
764
775
|
|| node->type == GUMBO_NODE_TEMPLATE
|
765
776
|
);
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
777
|
+
void *tag;
|
778
|
+
if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
|
779
|
+
tag = gumbo_strdup(node->v.element.name);
|
780
|
+
} else {
|
781
|
+
tag = (void *)(uintptr_t)node->v.element.tag;
|
782
|
+
}
|
783
|
+
gumbo_vector_add(tag, &extra_data->tag_stack);
|
770
784
|
}
|
771
785
|
}
|
772
786
|
|
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
|
|
1187
1201
|
element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
|
1188
1202
|
insert_element(parser, element, false);
|
1189
1203
|
gumbo_debug (
|
1190
|
-
"Inserting
|
1204
|
+
"Inserting <%s> element (@%p) from tag type.\n",
|
1191
1205
|
gumbo_normalized_tagname(tag),
|
1192
1206
|
(void*)element
|
1193
1207
|
);
|
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
|
|
1204
1218
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1205
1219
|
GumboNode* element = create_element_from_token(token, tag_namespace);
|
1206
1220
|
insert_element(parser, element, false);
|
1221
|
+
gumbo_debug (
|
1222
|
+
"Inserting <%s> foreign element (@%p).\n",
|
1223
|
+
gumbo_normalized_tagname(element->v.element.tag),
|
1224
|
+
(void*)element
|
1225
|
+
);
|
1207
1226
|
if (
|
1208
1227
|
token_has_attribute(token, "xmlns")
|
1209
1228
|
&& !attribute_matches_case_sensitive (
|
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
|
|
2066
2085
|
|
2067
2086
|
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
2087
|
static void ignore_token(GumboParser* parser) {
|
2088
|
+
gumbo_debug("Ignoring token.\n");
|
2069
2089
|
GumboToken* token = parser->_parser_state->_current_token;
|
2070
2090
|
// Ownership of the token's internal buffers are normally transferred to the
|
2071
2091
|
// element, but if no element is emitted (as happens in non-verbatim-mode
|
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
|
2430
2450
|
|
2431
2451
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
2432
2452
|
static void finish_parsing(GumboParser* parser) {
|
2433
|
-
gumbo_debug("Finishing parsing");
|
2453
|
+
gumbo_debug("Finishing parsing\n");
|
2434
2454
|
maybe_flush_text_node_buffer(parser);
|
2435
2455
|
GumboParserState* state = parser->_parser_state;
|
2436
2456
|
for (
|
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2608
2628
|
}
|
2609
2629
|
if (
|
2610
2630
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2631
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
2611
2632
|
) {
|
2612
2633
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2613
2634
|
return;
|
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3313
3334
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3314
3335
|
return;
|
3315
3336
|
}
|
3316
|
-
if (
|
3337
|
+
if (
|
3338
|
+
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|
3339
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
3340
|
+
) {
|
3317
3341
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3318
3342
|
return;
|
3319
3343
|
}
|
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
|
4389
4413
|
|
4390
4414
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4391
4415
|
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4392
|
-
gumbo_debug("Handling foreign content");
|
4416
|
+
gumbo_debug("Handling foreign content.\n");
|
4393
4417
|
switch (token->type) {
|
4394
4418
|
case GUMBO_TOKEN_NULL:
|
4395
4419
|
parser_add_parse_error(parser, token);
|
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4507
4531
|
if (i == 0)
|
4508
4532
|
return;
|
4509
4533
|
// We can't call handle_token directly because the current node is still in
|
4510
|
-
// a
|
4534
|
+
// a foreign namespace, so it would re-enter this and result in infinite
|
4511
4535
|
// recursion.
|
4512
4536
|
handle_html_content(parser, token);
|
4513
4537
|
}
|
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
|
|
4627
4651
|
const char* fragment_encoding = options->fragment_encoding;
|
4628
4652
|
GumboQuirksModeEnum quirks = options->quirks_mode;
|
4629
4653
|
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
|
4630
|
-
|
4631
4654
|
GumboNode* root;
|
4632
|
-
|
4655
|
+
|
4656
|
+
// 1. [Create a new Document node, and mark it as being an HTML document.]
|
4657
|
+
// 2. [If the node document of the context element is in quirks mode, then
|
4658
|
+
// let the Document be in quirks mode. Otherwise, the node document of
|
4659
|
+
// the context element is in limited-quirks mode, then let the Document
|
4660
|
+
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
|
4661
|
+
// mode.]
|
4633
4662
|
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
|
4634
4663
|
|
4635
|
-
// 3.
|
4664
|
+
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
|
4665
|
+
// declarative shadow roots to true.]
|
4666
|
+
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
|
4667
|
+
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
|
4636
4668
|
parser->_parser_state->_fragment_ctx =
|
4637
4669
|
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
|
4638
4670
|
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
|
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
|
|
4659
4691
|
break;
|
4660
4692
|
|
4661
4693
|
case GUMBO_TAG_NOSCRIPT:
|
4662
|
-
|
4663
|
-
|
4694
|
+
if (options->parse_noscript_content_as_text)
|
4695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4664
4696
|
break;
|
4665
4697
|
|
4666
4698
|
case GUMBO_TAG_PLAINTEXT:
|
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4762
4794
|
adjusted_current_node &&
|
4763
4795
|
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4764
4796
|
);
|
4765
|
-
|
4797
|
+
// If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
|
4798
|
+
//
|
4799
|
+
// The parser is pretty fragile. Breaking out of the parsing loop in the middle of
|
4800
|
+
// the parse can leave the document in an inconsistent state.
|
4801
|
+
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4802
|
+
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4803
|
+
gumbo_debug("Tree depth limit exceeded.\n");
|
4804
|
+
token.type = GUMBO_TOKEN_EOF;
|
4805
|
+
} else {
|
4806
|
+
gumbo_lex(&parser, &token);
|
4807
|
+
}
|
4808
|
+
|
4766
4809
|
}
|
4767
4810
|
|
4768
4811
|
const char* token_type = "text";
|
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4786
4829
|
break;
|
4787
4830
|
}
|
4788
4831
|
gumbo_debug (
|
4789
|
-
"Handling %s token @%lu:%lu in
|
4832
|
+
"Handling %s token @%lu:%lu in insertion mode %u.\n",
|
4790
4833
|
(char*) token_type,
|
4791
4834
|
(unsigned long)token.position.line,
|
4792
4835
|
(unsigned long)token.position.column,
|
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
|
|
4830
4873
|
gumbo_free(token.v.end_tag.name);
|
4831
4874
|
token.v.end_tag.name = NULL;
|
4832
4875
|
}
|
4833
|
-
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4834
|
-
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4835
|
-
gumbo_debug("Tree depth limit exceeded.\n");
|
4836
|
-
break;
|
4837
|
-
}
|
4838
4876
|
}
|
4839
4877
|
|
4840
4878
|
|
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
340
340
|
|
341
341
|
// Sets the tag buffer original text and start point to the current iterator
|
342
342
|
// position. This is necessary because attribute names & values may have
|
343
|
-
// whitespace
|
343
|
+
// whitespace preceding them, and so we can't assume that the actual token
|
344
344
|
// starting point was the end of the last tag buffer usage.
|
345
345
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
346
346
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
|
569
569
|
}
|
570
570
|
|
571
571
|
// Appends a codepoint to the current tag buffer. If
|
572
|
-
//
|
572
|
+
// reinitialize_position_on_first is set, this also initializes the tag buffer
|
573
573
|
// start point; the only time you would *not* want to pass true for this
|
574
574
|
// parameter is if you want the original_text to include character (like an
|
575
575
|
// opening quote) that doesn't appear in the value.
|
576
576
|
static void append_char_to_tag_buffer (
|
577
577
|
GumboParser* parser,
|
578
578
|
int codepoint,
|
579
|
-
bool
|
579
|
+
bool reinitialize_position_on_first
|
580
580
|
) {
|
581
581
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
582
|
-
if (buffer->length == 0 &&
|
582
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
583
583
|
reset_tag_buffer_start_point(parser);
|
584
584
|
}
|
585
585
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
|
|
589
589
|
static void append_string_to_tag_buffer (
|
590
590
|
GumboParser* parser,
|
591
591
|
GumboStringPiece* str,
|
592
|
-
bool
|
592
|
+
bool reinitialize_position_on_first
|
593
593
|
) {
|
594
594
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
595
|
-
if (buffer->length == 0 &&
|
595
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
596
596
|
reset_tag_buffer_start_point(parser);
|
597
597
|
}
|
598
598
|
gumbo_string_buffer_append_string(str, buffer);
|
@@ -18,7 +18,7 @@ module Nokogiri
|
|
18
18
|
#
|
19
19
|
module ClassResolver
|
20
20
|
# #related_class restricts matching namespaces to those matching this set.
|
21
|
-
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
|
21
|
+
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
|
22
22
|
|
23
23
|
# :call-seq:
|
24
24
|
# related_class(class_name) → Class
|
data/lib/nokogiri/css/node.rb
CHANGED
@@ -23,8 +23,12 @@ module Nokogiri
|
|
23
23
|
|
24
24
|
###
|
25
25
|
# Convert this CSS node to xpath with +prefix+ using +visitor+
|
26
|
-
def to_xpath(
|
27
|
-
prefix =
|
26
|
+
def to_xpath(visitor)
|
27
|
+
prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
|
28
|
+
"."
|
29
|
+
else
|
30
|
+
visitor.prefix
|
31
|
+
end
|
28
32
|
prefix + visitor.accept(self)
|
29
33
|
end
|
30
34
|
|
data/lib/nokogiri/css/parser.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
#
|
3
3
|
# DO NOT MODIFY!!!!
|
4
|
-
# This file is automatically generated by Racc 1.
|
5
|
-
# from Racc grammar file "".
|
4
|
+
# This file is automatically generated by Racc 1.8.0
|
5
|
+
# from Racc grammar file "parser.y".
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'racc/parser.rb'
|
@@ -291,6 +291,7 @@ Racc_arg = [
|
|
291
291
|
racc_shift_n,
|
292
292
|
racc_reduce_n,
|
293
293
|
racc_use_result_var ]
|
294
|
+
Ractor.make_shareable(Racc_arg) if defined?(Ractor)
|
294
295
|
|
295
296
|
Racc_token_to_s_table = [
|
296
297
|
"$end",
|
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
|
|
351
352
|
"negation",
|
352
353
|
"eql_incl_dash",
|
353
354
|
"negation_arg" ]
|
355
|
+
Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
|
354
356
|
|
355
357
|
Racc_debug_parser = false
|
356
358
|
|
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
|
|
468
470
|
end
|
469
471
|
|
470
472
|
def _reduce_24(val, _values, result)
|
471
|
-
result = Node.new(:ELEMENT_NAME, [
|
473
|
+
result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
|
472
474
|
result
|
473
475
|
end
|
474
476
|
|
475
477
|
def _reduce_25(val, _values, result)
|
476
|
-
name =
|
478
|
+
name = val[0]
|
477
479
|
result = Node.new(:ELEMENT_NAME, [name])
|
478
480
|
|
479
481
|
result
|
data/lib/nokogiri/css/parser.y
CHANGED
@@ -64,9 +64,9 @@ rule
|
|
64
64
|
;
|
65
65
|
|
66
66
|
namespaced_ident:
|
67
|
-
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [
|
67
|
+
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
|
68
68
|
| IDENT {
|
69
|
-
name =
|
69
|
+
name = val[0]
|
70
70
|
result = Node.new(:ELEMENT_NAME, [name])
|
71
71
|
}
|
72
72
|
;
|
@@ -5,62 +5,9 @@ require "thread"
|
|
5
5
|
module Nokogiri
|
6
6
|
module CSS
|
7
7
|
class Parser < Racc::Parser # :nodoc:
|
8
|
-
|
9
|
-
|
10
|
-
@cache = {}
|
11
|
-
@mutex = Mutex.new
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
|
15
|
-
def cache_on?
|
16
|
-
!Thread.current[CACHE_SWITCH_NAME]
|
17
|
-
end
|
18
|
-
|
19
|
-
# Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
|
20
|
-
def set_cache(value) # rubocop:disable Naming/AccessorMethodName
|
21
|
-
Thread.current[CACHE_SWITCH_NAME] = !value
|
22
|
-
end
|
23
|
-
|
24
|
-
# Get the css selector in +string+ from the cache
|
25
|
-
def [](string)
|
26
|
-
return unless cache_on?
|
27
|
-
|
28
|
-
@mutex.synchronize { @cache[string] }
|
29
|
-
end
|
30
|
-
|
31
|
-
# Set the css selector in +string+ in the cache to +value+
|
32
|
-
def []=(string, value)
|
33
|
-
return value unless cache_on?
|
34
|
-
|
35
|
-
@mutex.synchronize { @cache[string] = value }
|
36
|
-
end
|
37
|
-
|
38
|
-
# Clear the cache
|
39
|
-
def clear_cache(create_new_object = false)
|
40
|
-
@mutex.synchronize do
|
41
|
-
if create_new_object
|
42
|
-
@cache = {}
|
43
|
-
else
|
44
|
-
@cache.clear
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
# Execute +block+ without cache
|
50
|
-
def without_cache(&block)
|
51
|
-
original_cache_setting = cache_on?
|
52
|
-
set_cache(false)
|
53
|
-
yield
|
54
|
-
ensure
|
55
|
-
set_cache(original_cache_setting)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Create a new CSS parser with respect to +namespaces+
|
60
|
-
def initialize(namespaces = {})
|
8
|
+
def initialize
|
61
9
|
@tokenizer = Tokenizer.new
|
62
|
-
|
63
|
-
super()
|
10
|
+
super
|
64
11
|
end
|
65
12
|
|
66
13
|
def parse(string)
|
@@ -72,11 +19,10 @@ module Nokogiri
|
|
72
19
|
@tokenizer.next_token
|
73
20
|
end
|
74
21
|
|
75
|
-
# Get the xpath for +
|
76
|
-
def xpath_for(
|
77
|
-
|
78
|
-
|
79
|
-
ast.to_xpath(prefix, visitor)
|
22
|
+
# Get the xpath for +selector+ using +visitor+
|
23
|
+
def xpath_for(selector, visitor)
|
24
|
+
parse(selector).map do |ast|
|
25
|
+
ast.to_xpath(visitor)
|
80
26
|
end
|
81
27
|
end
|
82
28
|
|
@@ -85,12 +31,6 @@ module Nokogiri
|
|
85
31
|
after = value_stack.compact.last
|
86
32
|
raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
|
87
33
|
end
|
88
|
-
|
89
|
-
def cache_key(query, prefix, visitor)
|
90
|
-
if self.class.cache_on?
|
91
|
-
[query, prefix, @namespaces, visitor.config]
|
92
|
-
end
|
93
|
-
end
|
94
34
|
end
|
95
35
|
end
|
96
36
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module CSS
|
5
|
+
module SelectorCache # :nodoc:
|
6
|
+
@cache = {}
|
7
|
+
@mutex = Mutex.new
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Retrieve the cached XPath expressions for the key
|
11
|
+
def [](key)
|
12
|
+
@mutex.synchronize { @cache[key] }
|
13
|
+
end
|
14
|
+
|
15
|
+
# Insert the XPath expressions `value` at the cache key
|
16
|
+
def []=(key, value)
|
17
|
+
@mutex.synchronize { @cache[key] = value }
|
18
|
+
end
|
19
|
+
|
20
|
+
# Clear the cache
|
21
|
+
def clear_cache(create_new_object = false)
|
22
|
+
@mutex.synchronize do
|
23
|
+
if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
|
24
|
+
@cache = {}
|
25
|
+
else
|
26
|
+
@cache.clear
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Construct a unique key cache key
|
32
|
+
def key(selector:, visitor:)
|
33
|
+
[selector, visitor.config]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|