nokogiri 1.16.7 → 1.17.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +11 -21
- data/README.md +4 -0
- data/dependencies.yml +6 -6
- data/ext/nokogiri/extconf.rb +191 -137
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +25 -24
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +18 -33
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +2 -10
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +163 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -6
- data/ext/nokogiri/xml_node.c +141 -104
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +54 -58
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +213 -131
- data/ext/nokogiri/xml_sax_push_parser.c +68 -49
- data/ext/nokogiri/xml_schema.c +50 -85
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +2 -4
- data/ext/nokogiri/xml_xpath_context.c +2 -2
- data/ext/nokogiri/xslt_stylesheet.c +8 -8
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +61 -23
- data/gumbo-parser/src/tokenizer.c +6 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +42 -6
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +44 -23
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +1 -1
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -72
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +70 -26
- data/lib/nokogiri/xml/document_fragment.rb +84 -13
- data/lib/nokogiri/xml/node.rb +82 -11
- data/lib/nokogiri/xml/node_set.rb +9 -7
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +46 -13
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +6 -8
- data/lib/nokogiri/xml/syntax_error.rb +22 -0
- data/lib/nokogiri/xml.rb +13 -24
- data/lib/nokogiri/xslt.rb +3 -9
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +10 -9
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.12.9.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
data/gumbo-parser/src/error.c
CHANGED
@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
|
|
46
46
|
args
|
47
47
|
);
|
48
48
|
va_end(args);
|
49
|
-
|
49
|
+
|
50
|
+
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
|
50
51
|
if (bytes_written == -1) {
|
51
52
|
// vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
|
52
53
|
// instead of returning the number of bytes that would've been written had
|
53
|
-
// there been enough. In this case, we
|
54
|
-
//
|
55
|
-
//
|
56
|
-
|
54
|
+
// there been enough. In this case, we can call vsnprintf() again but
|
55
|
+
// with a count of 0 to get the number of bytes written, not including
|
56
|
+
// the null terminator.
|
57
|
+
// https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
|
58
|
+
|
57
59
|
va_start(args, format);
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
bytes_written = vsnprintf (
|
61
|
+
NULL,
|
62
|
+
0,
|
61
63
|
format,
|
62
64
|
args
|
63
65
|
);
|
64
66
|
va_end(args);
|
65
|
-
return result == -1 ? 0 : result;
|
66
67
|
}
|
67
|
-
#
|
68
|
+
#endif
|
69
|
+
|
68
70
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
69
71
|
if (bytes_written == -1) {
|
70
72
|
return 0;
|
71
73
|
}
|
72
|
-
#endif
|
73
74
|
|
74
75
|
if (bytes_written >= remaining_capacity) {
|
75
|
-
|
76
|
+
// At least double the size of the buffer.
|
77
|
+
size_t new_capacity = output->capacity * 2;
|
78
|
+
if (new_capacity < output->length + bytes_written + 1) {
|
79
|
+
// The +1 is for the null terminator.
|
80
|
+
new_capacity = output->length + bytes_written + 1;
|
81
|
+
}
|
82
|
+
gumbo_string_buffer_reserve(new_capacity, output);
|
76
83
|
remaining_capacity = output->capacity - output->length;
|
77
84
|
va_start(args, format);
|
78
85
|
bytes_written = vsnprintf (
|
@@ -96,8 +103,14 @@ static void print_tag_stack (
|
|
96
103
|
if (i) {
|
97
104
|
print_message(output, ", ");
|
98
105
|
}
|
99
|
-
|
100
|
-
|
106
|
+
uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
|
107
|
+
const char* tag_name;
|
108
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
109
|
+
tag_name = error->tag_stack.data[i];
|
110
|
+
} else {
|
111
|
+
tag_name = gumbo_normalized_tagname((GumboTag)tag);
|
112
|
+
}
|
113
|
+
print_message(output, "%s", tag_name);
|
101
114
|
}
|
102
115
|
gumbo_string_buffer_append_codepoint('.', output);
|
103
116
|
}
|
@@ -326,41 +339,45 @@ static void handle_parser_error (
|
|
326
339
|
}
|
327
340
|
|
328
341
|
switch (error->input_type) {
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
print_tag_stack(error, output);
|
352
|
-
}
|
353
|
-
return;
|
354
|
-
case GUMBO_TOKEN_START_TAG:
|
355
|
-
print_message(output, "Start tag '%s' isn't allowed here.",
|
356
|
-
gumbo_normalized_tagname(error->input_tag));
|
357
|
-
print_tag_stack(error, output);
|
358
|
-
return;
|
359
|
-
case GUMBO_TOKEN_END_TAG:
|
360
|
-
print_message(output, "End tag '%s' isn't allowed here.",
|
361
|
-
gumbo_normalized_tagname(error->input_tag));
|
342
|
+
case GUMBO_TOKEN_DOCTYPE:
|
343
|
+
print_message(output, "This is not a legal doctype");
|
344
|
+
return;
|
345
|
+
case GUMBO_TOKEN_COMMENT:
|
346
|
+
// Should never happen; comments are always legal.
|
347
|
+
assert(0);
|
348
|
+
// But just in case...
|
349
|
+
print_message(output, "Comments aren't legal here");
|
350
|
+
return;
|
351
|
+
case GUMBO_TOKEN_CDATA:
|
352
|
+
case GUMBO_TOKEN_WHITESPACE:
|
353
|
+
case GUMBO_TOKEN_CHARACTER:
|
354
|
+
print_message(output, "Character tokens aren't legal here");
|
355
|
+
return;
|
356
|
+
case GUMBO_TOKEN_NULL:
|
357
|
+
print_message(output, "Null bytes are not allowed in HTML5");
|
358
|
+
return;
|
359
|
+
case GUMBO_TOKEN_EOF:
|
360
|
+
if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
|
361
|
+
print_message(output, "You must provide a doctype");
|
362
|
+
} else {
|
363
|
+
print_message(output, "Premature end of file.");
|
362
364
|
print_tag_stack(error, output);
|
363
|
-
|
365
|
+
}
|
366
|
+
return;
|
367
|
+
case GUMBO_TOKEN_START_TAG:
|
368
|
+
case GUMBO_TOKEN_END_TAG:
|
369
|
+
{
|
370
|
+
const char* tag_name;
|
371
|
+
const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
|
372
|
+
if (error->input_name) {
|
373
|
+
tag_name = error->input_name;
|
374
|
+
} else {
|
375
|
+
tag_name = gumbo_normalized_tagname(error->input_tag);
|
376
|
+
}
|
377
|
+
print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
|
378
|
+
print_tag_stack(error, output);
|
379
|
+
return;
|
380
|
+
}
|
364
381
|
}
|
365
382
|
}
|
366
383
|
|
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
|
|
613
630
|
|
614
631
|
void gumbo_error_destroy(GumboError* error) {
|
615
632
|
if (error->type == GUMBO_ERR_PARSER) {
|
633
|
+
// Free the tag name.
|
634
|
+
if (error->v.parser.input_name) {
|
635
|
+
gumbo_free(error->v.parser.input_name);
|
636
|
+
}
|
637
|
+
|
638
|
+
for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
|
639
|
+
intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
|
640
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
641
|
+
gumbo_free(error->v.parser.tag_stack.data[i]);
|
642
|
+
}
|
643
|
+
}
|
616
644
|
gumbo_vector_destroy(&error->v.parser.tag_stack);
|
617
645
|
}
|
618
646
|
gumbo_free(error);
|
data/gumbo-parser/src/error.h
CHANGED
@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
|
|
95
95
|
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
|
96
96
|
GumboTag input_tag;
|
97
97
|
|
98
|
+
// The tag name of the input token if it was a nonstandard tag token. NULL otherwise.
|
99
|
+
char *input_name;
|
100
|
+
|
98
101
|
// The insertion mode that the parser was in at the time.
|
99
102
|
GumboInsertionMode parser_state;
|
100
103
|
|
101
104
|
// The tag stack at the point of the error. Note that this is a GumboVector
|
102
105
|
// of GumboTags *stored by value* - cast the void* to a GumboTag directly to
|
103
|
-
// get at the tag.
|
106
|
+
// get at the tag. For nonstandard tags, this is a pointer to an owned char *
|
107
|
+
// containing the tag name.
|
104
108
|
GumboVector /* GumboTag */ tag_stack;
|
105
109
|
} GumboParserError;
|
106
110
|
|
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
|
|
780
780
|
* Default: `false`.
|
781
781
|
*/
|
782
782
|
bool fragment_context_has_form_ancestor;
|
783
|
+
|
784
|
+
/**
|
785
|
+
* Parse `noscript` elements as if scripting was enabled. This causes the
|
786
|
+
* contents of the `noscript` element to be parsed as raw text, rather
|
787
|
+
* than as HTML elements.
|
788
|
+
*
|
789
|
+
* Default: `false`.
|
790
|
+
*/
|
791
|
+
bool parse_noscript_content_as_text;
|
783
792
|
} GumboOptions;
|
784
793
|
|
785
794
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
|
|
791
800
|
*/
|
792
801
|
typedef enum {
|
793
802
|
/**
|
794
|
-
* Indicates that parsing completed
|
803
|
+
* Indicates that parsing completed successfully. The resulting tree
|
795
804
|
* will be a complete document.
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_OK,
|
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
|
|
841
850
|
GumboVector /* GumboError */ errors;
|
842
851
|
|
843
852
|
/**
|
844
|
-
* True if the parser
|
853
|
+
* True if the parser encountered an error.
|
845
854
|
*
|
846
855
|
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
847
856
|
* option was set to 0.
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
|
|
56
56
|
.fragment_encoding = NULL,
|
57
57
|
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
|
58
58
|
.fragment_context_has_form_ancestor = false,
|
59
|
+
.parse_noscript_content_as_text = false,
|
59
60
|
};
|
60
61
|
|
61
62
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
|
|
749
750
|
GumboParserError* extra_data = &error->v.parser;
|
750
751
|
extra_data->input_type = token->type;
|
751
752
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
752
|
-
|
753
|
+
extra_data->input_name = NULL;
|
754
|
+
if (token->type == GUMBO_TOKEN_START_TAG)
|
755
|
+
{
|
753
756
|
extra_data->input_tag = token->v.start_tag.tag;
|
754
|
-
|
757
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
|
758
|
+
extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
|
759
|
+
}
|
760
|
+
}
|
761
|
+
else if (token->type == GUMBO_TOKEN_END_TAG)
|
762
|
+
{
|
755
763
|
extra_data->input_tag = token->v.end_tag.tag;
|
764
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
|
765
|
+
extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
|
766
|
+
}
|
756
767
|
}
|
757
768
|
const GumboParserState* state = parser->_parser_state;
|
758
769
|
extra_data->parser_state = state->_insertion_mode;
|
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
|
|
763
774
|
node->type == GUMBO_NODE_ELEMENT
|
764
775
|
|| node->type == GUMBO_NODE_TEMPLATE
|
765
776
|
);
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
777
|
+
void *tag;
|
778
|
+
if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
|
779
|
+
tag = gumbo_strdup(node->v.element.name);
|
780
|
+
} else {
|
781
|
+
tag = (void *)(uintptr_t)node->v.element.tag;
|
782
|
+
}
|
783
|
+
gumbo_vector_add(tag, &extra_data->tag_stack);
|
770
784
|
}
|
771
785
|
}
|
772
786
|
|
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
|
|
1187
1201
|
element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
|
1188
1202
|
insert_element(parser, element, false);
|
1189
1203
|
gumbo_debug (
|
1190
|
-
"Inserting
|
1204
|
+
"Inserting <%s> element (@%p) from tag type.\n",
|
1191
1205
|
gumbo_normalized_tagname(tag),
|
1192
1206
|
(void*)element
|
1193
1207
|
);
|
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
|
|
1204
1218
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1205
1219
|
GumboNode* element = create_element_from_token(token, tag_namespace);
|
1206
1220
|
insert_element(parser, element, false);
|
1221
|
+
gumbo_debug (
|
1222
|
+
"Inserting <%s> foreign element (@%p).\n",
|
1223
|
+
gumbo_normalized_tagname(element->v.element.tag),
|
1224
|
+
(void*)element
|
1225
|
+
);
|
1207
1226
|
if (
|
1208
1227
|
token_has_attribute(token, "xmlns")
|
1209
1228
|
&& !attribute_matches_case_sensitive (
|
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
|
|
2066
2085
|
|
2067
2086
|
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
2087
|
static void ignore_token(GumboParser* parser) {
|
2088
|
+
gumbo_debug("Ignoring token.\n");
|
2069
2089
|
GumboToken* token = parser->_parser_state->_current_token;
|
2070
2090
|
// Ownership of the token's internal buffers is normally transferred to the
|
2071
2091
|
// element, but if no element is emitted (as happens in non-verbatim-mode
|
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
|
2430
2450
|
|
2431
2451
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
2432
2452
|
static void finish_parsing(GumboParser* parser) {
|
2433
|
-
gumbo_debug("Finishing parsing");
|
2453
|
+
gumbo_debug("Finishing parsing\n");
|
2434
2454
|
maybe_flush_text_node_buffer(parser);
|
2435
2455
|
GumboParserState* state = parser->_parser_state;
|
2436
2456
|
for (
|
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2608
2628
|
}
|
2609
2629
|
if (
|
2610
2630
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2631
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
2611
2632
|
) {
|
2612
2633
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2613
2634
|
return;
|
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3313
3334
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3314
3335
|
return;
|
3315
3336
|
}
|
3316
|
-
if (
|
3337
|
+
if (
|
3338
|
+
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|
3339
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
3340
|
+
) {
|
3317
3341
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3318
3342
|
return;
|
3319
3343
|
}
|
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
|
4389
4413
|
|
4390
4414
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4391
4415
|
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4392
|
-
gumbo_debug("Handling foreign content");
|
4416
|
+
gumbo_debug("Handling foreign content.\n");
|
4393
4417
|
switch (token->type) {
|
4394
4418
|
case GUMBO_TOKEN_NULL:
|
4395
4419
|
parser_add_parse_error(parser, token);
|
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4507
4531
|
if (i == 0)
|
4508
4532
|
return;
|
4509
4533
|
// We can't call handle_token directly because the current node is still in
|
4510
|
-
// a
|
4534
|
+
// a foreign namespace, so it would re-enter this and result in infinite
|
4511
4535
|
// recursion.
|
4512
4536
|
handle_html_content(parser, token);
|
4513
4537
|
}
|
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
|
|
4627
4651
|
const char* fragment_encoding = options->fragment_encoding;
|
4628
4652
|
GumboQuirksModeEnum quirks = options->quirks_mode;
|
4629
4653
|
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
|
4630
|
-
|
4631
4654
|
GumboNode* root;
|
4632
|
-
|
4655
|
+
|
4656
|
+
// 1. [Create a new Document node, and mark it as being an HTML document.]
|
4657
|
+
// 2. [If the node document of the context element is in quirks mode, then
|
4658
|
+
// let the Document be in quirks mode. Otherwise, the node document of
|
4659
|
+
// the context element is in limited-quirks mode, then let the Document
|
4660
|
+
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
|
4661
|
+
// mode.]
|
4633
4662
|
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
|
4634
4663
|
|
4635
|
-
// 3.
|
4664
|
+
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
|
4665
|
+
// declarative shadow roots to true.]
|
4666
|
+
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
|
4667
|
+
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
|
4636
4668
|
parser->_parser_state->_fragment_ctx =
|
4637
4669
|
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
|
4638
4670
|
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
|
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
|
|
4659
4691
|
break;
|
4660
4692
|
|
4661
4693
|
case GUMBO_TAG_NOSCRIPT:
|
4662
|
-
|
4663
|
-
|
4694
|
+
if (options->parse_noscript_content_as_text)
|
4695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4664
4696
|
break;
|
4665
4697
|
|
4666
4698
|
case GUMBO_TAG_PLAINTEXT:
|
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4762
4794
|
adjusted_current_node &&
|
4763
4795
|
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4764
4796
|
);
|
4765
|
-
|
4797
|
+
// If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
|
4798
|
+
//
|
4799
|
+
// The parser is pretty fragile. Breaking out of the parsing loop in the middle of
|
4800
|
+
// the parse can leave the document in an inconsistent state.
|
4801
|
+
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4802
|
+
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4803
|
+
gumbo_debug("Tree depth limit exceeded.\n");
|
4804
|
+
token.type = GUMBO_TOKEN_EOF;
|
4805
|
+
} else {
|
4806
|
+
gumbo_lex(&parser, &token);
|
4807
|
+
}
|
4808
|
+
|
4766
4809
|
}
|
4767
4810
|
|
4768
4811
|
const char* token_type = "text";
|
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4786
4829
|
break;
|
4787
4830
|
}
|
4788
4831
|
gumbo_debug (
|
4789
|
-
"Handling %s token @%lu:%lu in
|
4832
|
+
"Handling %s token @%lu:%lu in insertion mode %u.\n",
|
4790
4833
|
(char*) token_type,
|
4791
4834
|
(unsigned long)token.position.line,
|
4792
4835
|
(unsigned long)token.position.column,
|
@@ -4830,11 +4873,6 @@ GumboOutput* gumbo_parse_with_options (
|
|
4830
4873
|
gumbo_free(token.v.end_tag.name);
|
4831
4874
|
token.v.end_tag.name = NULL;
|
4832
4875
|
}
|
4833
|
-
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4834
|
-
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4835
|
-
gumbo_debug("Tree depth limit exceeded.\n");
|
4836
|
-
break;
|
4837
|
-
}
|
4838
4876
|
}
|
4839
4877
|
|
4840
4878
|
|
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
340
340
|
|
341
341
|
// Sets the tag buffer original text and start point to the current iterator
|
342
342
|
// position. This is necessary because attribute names & values may have
|
343
|
-
// whitespace
|
343
|
+
// whitespace preceding them, and so we can't assume that the actual token
|
344
344
|
// starting point was the end of the last tag buffer usage.
|
345
345
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
346
346
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -569,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
|
569
569
|
}
|
570
570
|
|
571
571
|
// Appends a codepoint to the current tag buffer. If
|
572
|
-
//
|
572
|
+
// reinitialize_position_on_first is set, this also initializes the tag buffer
|
573
573
|
// start point; the only time you would *not* want to pass true for this
|
574
574
|
// parameter is if you want the original_text to include a character (like an
|
575
575
|
// opening quote) that doesn't appear in the value.
|
576
576
|
static void append_char_to_tag_buffer (
|
577
577
|
GumboParser* parser,
|
578
578
|
int codepoint,
|
579
|
-
bool
|
579
|
+
bool reinitialize_position_on_first
|
580
580
|
) {
|
581
581
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
582
|
-
if (buffer->length == 0 &&
|
582
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
583
583
|
reset_tag_buffer_start_point(parser);
|
584
584
|
}
|
585
585
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
@@ -589,10 +589,10 @@ static void append_char_to_tag_buffer (
|
|
589
589
|
static void append_string_to_tag_buffer (
|
590
590
|
GumboParser* parser,
|
591
591
|
GumboStringPiece* str,
|
592
|
-
bool
|
592
|
+
bool reinitialize_position_on_first
|
593
593
|
) {
|
594
594
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
595
|
-
if (buffer->length == 0 &&
|
595
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
596
596
|
reset_tag_buffer_start_point(parser);
|
597
597
|
}
|
598
598
|
gumbo_string_buffer_append_string(str, buffer);
|
@@ -18,7 +18,7 @@ module Nokogiri
|
|
18
18
|
#
|
19
19
|
module ClassResolver
|
20
20
|
# #related_class restricts matching namespaces to those matching this set.
|
21
|
-
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
|
21
|
+
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
|
22
22
|
|
23
23
|
# :call-seq:
|
24
24
|
# related_class(class_name) → Class
|
data/lib/nokogiri/css/node.rb
CHANGED
@@ -23,8 +23,12 @@ module Nokogiri
|
|
23
23
|
|
24
24
|
###
|
25
25
|
# Convert this CSS node to xpath with +prefix+ using +visitor+
|
26
|
-
def to_xpath(
|
27
|
-
prefix =
|
26
|
+
def to_xpath(visitor)
|
27
|
+
prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
|
28
|
+
"."
|
29
|
+
else
|
30
|
+
visitor.prefix
|
31
|
+
end
|
28
32
|
prefix + visitor.accept(self)
|
29
33
|
end
|
30
34
|
|
data/lib/nokogiri/css/parser.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
#
|
3
3
|
# DO NOT MODIFY!!!!
|
4
|
-
# This file is automatically generated by Racc 1.
|
5
|
-
# from Racc grammar file "".
|
4
|
+
# This file is automatically generated by Racc 1.8.0
|
5
|
+
# from Racc grammar file "parser.y".
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'racc/parser.rb'
|
@@ -291,6 +291,7 @@ Racc_arg = [
|
|
291
291
|
racc_shift_n,
|
292
292
|
racc_reduce_n,
|
293
293
|
racc_use_result_var ]
|
294
|
+
Ractor.make_shareable(Racc_arg) if defined?(Ractor)
|
294
295
|
|
295
296
|
Racc_token_to_s_table = [
|
296
297
|
"$end",
|
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
|
|
351
352
|
"negation",
|
352
353
|
"eql_incl_dash",
|
353
354
|
"negation_arg" ]
|
355
|
+
Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
|
354
356
|
|
355
357
|
Racc_debug_parser = false
|
356
358
|
|
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
|
|
468
470
|
end
|
469
471
|
|
470
472
|
def _reduce_24(val, _values, result)
|
471
|
-
result = Node.new(:ELEMENT_NAME, [
|
473
|
+
result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
|
472
474
|
result
|
473
475
|
end
|
474
476
|
|
475
477
|
def _reduce_25(val, _values, result)
|
476
|
-
name =
|
478
|
+
name = val[0]
|
477
479
|
result = Node.new(:ELEMENT_NAME, [name])
|
478
480
|
|
479
481
|
result
|
data/lib/nokogiri/css/parser.y
CHANGED
@@ -64,9 +64,9 @@ rule
|
|
64
64
|
;
|
65
65
|
|
66
66
|
namespaced_ident:
|
67
|
-
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [
|
67
|
+
namespace '|' IDENT { result = Node.new(:ELEMENT_NAME, [val[0], val[2]]) }
|
68
68
|
| IDENT {
|
69
|
-
name =
|
69
|
+
name = val[0]
|
70
70
|
result = Node.new(:ELEMENT_NAME, [name])
|
71
71
|
}
|
72
72
|
;
|
@@ -5,62 +5,9 @@ require "thread"
|
|
5
5
|
module Nokogiri
|
6
6
|
module CSS
|
7
7
|
class Parser < Racc::Parser # :nodoc:
|
8
|
-
|
9
|
-
|
10
|
-
@cache = {}
|
11
|
-
@mutex = Mutex.new
|
12
|
-
|
13
|
-
class << self
|
14
|
-
# Return a thread-local boolean indicating whether the CSS-to-XPath cache is active. (Default is `true`.)
|
15
|
-
def cache_on?
|
16
|
-
!Thread.current[CACHE_SWITCH_NAME]
|
17
|
-
end
|
18
|
-
|
19
|
-
# Set a thread-local boolean to turn cacheing on and off. Truthy values turn the cache on, falsey values turn the cache off.
|
20
|
-
def set_cache(value) # rubocop:disable Naming/AccessorMethodName
|
21
|
-
Thread.current[CACHE_SWITCH_NAME] = !value
|
22
|
-
end
|
23
|
-
|
24
|
-
# Get the css selector in +string+ from the cache
|
25
|
-
def [](string)
|
26
|
-
return unless cache_on?
|
27
|
-
|
28
|
-
@mutex.synchronize { @cache[string] }
|
29
|
-
end
|
30
|
-
|
31
|
-
# Set the css selector in +string+ in the cache to +value+
|
32
|
-
def []=(string, value)
|
33
|
-
return value unless cache_on?
|
34
|
-
|
35
|
-
@mutex.synchronize { @cache[string] = value }
|
36
|
-
end
|
37
|
-
|
38
|
-
# Clear the cache
|
39
|
-
def clear_cache(create_new_object = false)
|
40
|
-
@mutex.synchronize do
|
41
|
-
if create_new_object
|
42
|
-
@cache = {}
|
43
|
-
else
|
44
|
-
@cache.clear
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
# Execute +block+ without cache
|
50
|
-
def without_cache(&block)
|
51
|
-
original_cache_setting = cache_on?
|
52
|
-
set_cache(false)
|
53
|
-
yield
|
54
|
-
ensure
|
55
|
-
set_cache(original_cache_setting)
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
# Create a new CSS parser with respect to +namespaces+
|
60
|
-
def initialize(namespaces = {})
|
8
|
+
def initialize
|
61
9
|
@tokenizer = Tokenizer.new
|
62
|
-
|
63
|
-
super()
|
10
|
+
super
|
64
11
|
end
|
65
12
|
|
66
13
|
def parse(string)
|
@@ -72,11 +19,10 @@ module Nokogiri
|
|
72
19
|
@tokenizer.next_token
|
73
20
|
end
|
74
21
|
|
75
|
-
# Get the xpath for +
|
76
|
-
def xpath_for(
|
77
|
-
|
78
|
-
|
79
|
-
ast.to_xpath(prefix, visitor)
|
22
|
+
# Get the xpath for +selector+ using +visitor+
|
23
|
+
def xpath_for(selector, visitor)
|
24
|
+
parse(selector).map do |ast|
|
25
|
+
ast.to_xpath(visitor)
|
80
26
|
end
|
81
27
|
end
|
82
28
|
|
@@ -85,12 +31,6 @@ module Nokogiri
|
|
85
31
|
after = value_stack.compact.last
|
86
32
|
raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
|
87
33
|
end
|
88
|
-
|
89
|
-
def cache_key(query, prefix, visitor)
|
90
|
-
if self.class.cache_on?
|
91
|
-
[query, prefix, @namespaces, visitor.config]
|
92
|
-
end
|
93
|
-
end
|
94
34
|
end
|
95
35
|
end
|
96
36
|
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module CSS
|
5
|
+
module SelectorCache # :nodoc:
|
6
|
+
@cache = {}
|
7
|
+
@mutex = Mutex.new
|
8
|
+
|
9
|
+
class << self
|
10
|
+
# Retrieve the cached XPath expressions for the key
|
11
|
+
def [](key)
|
12
|
+
@mutex.synchronize { @cache[key] }
|
13
|
+
end
|
14
|
+
|
15
|
+
# Insert the XPath expressions `value` at the cache key
|
16
|
+
def []=(key, value)
|
17
|
+
@mutex.synchronize { @cache[key] = value }
|
18
|
+
end
|
19
|
+
|
20
|
+
# Clear the cache
|
21
|
+
def clear_cache(create_new_object = false)
|
22
|
+
@mutex.synchronize do
|
23
|
+
if create_new_object # used in tests to avoid 'method redefined' warnings when injecting spies
|
24
|
+
@cache = {}
|
25
|
+
else
|
26
|
+
@cache.clear
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Construct a unique cache key
|
32
|
+
def key(selector:, visitor:)
|
33
|
+
[selector, visitor.config]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|