nokogiri 1.15.4 → 1.17.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +12 -19
- data/README.md +8 -1
- data/dependencies.yml +9 -8
- data/ext/nokogiri/extconf.rb +194 -141
- data/ext/nokogiri/gumbo.c +69 -53
- data/ext/nokogiri/html4_document.c +10 -4
- data/ext/nokogiri/html4_element_description.c +18 -18
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +48 -58
- data/ext/nokogiri/html4_sax_push_parser.c +26 -25
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +9 -2
- data/ext/nokogiri/nokogiri.h +25 -33
- data/ext/nokogiri/test_global_handlers.c +1 -1
- data/ext/nokogiri/xml_attr.c +1 -1
- data/ext/nokogiri/xml_cdata.c +3 -12
- data/ext/nokogiri/xml_comment.c +3 -8
- data/ext/nokogiri/xml_document.c +167 -156
- data/ext/nokogiri/xml_document_fragment.c +10 -25
- data/ext/nokogiri/xml_dtd.c +1 -1
- data/ext/nokogiri/xml_element_content.c +9 -9
- data/ext/nokogiri/xml_encoding_handler.c +4 -4
- data/ext/nokogiri/xml_namespace.c +6 -10
- data/ext/nokogiri/xml_node.c +142 -108
- data/ext/nokogiri/xml_node_set.c +46 -44
- data/ext/nokogiri/xml_reader.c +74 -100
- data/ext/nokogiri/xml_relax_ng.c +35 -56
- data/ext/nokogiri/xml_sax_parser.c +156 -88
- data/ext/nokogiri/xml_sax_parser_context.c +214 -128
- data/ext/nokogiri/xml_sax_push_parser.c +69 -50
- data/ext/nokogiri/xml_schema.c +51 -87
- data/ext/nokogiri/xml_syntax_error.c +19 -11
- data/ext/nokogiri/xml_text.c +3 -6
- data/ext/nokogiri/xml_xpath_context.c +4 -7
- data/ext/nokogiri/xslt_stylesheet.c +16 -11
- data/gumbo-parser/Makefile +18 -0
- data/gumbo-parser/src/error.c +76 -48
- data/gumbo-parser/src/error.h +5 -1
- data/gumbo-parser/src/nokogiri_gumbo.h +11 -2
- data/gumbo-parser/src/parser.c +64 -23
- data/gumbo-parser/src/tokenizer.c +7 -6
- data/lib/nokogiri/class_resolver.rb +1 -1
- data/lib/nokogiri/css/node.rb +6 -2
- data/lib/nokogiri/css/parser.rb +6 -4
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +6 -66
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/tokenizer.rb +4 -4
- data/lib/nokogiri/css/tokenizer.rex +9 -8
- data/lib/nokogiri/css/xpath_visitor.rb +43 -27
- data/lib/nokogiri/css.rb +86 -20
- data/lib/nokogiri/decorators/slop.rb +3 -5
- data/lib/nokogiri/encoding_handler.rb +2 -2
- data/lib/nokogiri/html4/document.rb +45 -24
- data/lib/nokogiri/html4/document_fragment.rb +124 -12
- data/lib/nokogiri/html4/encoding_reader.rb +2 -2
- data/lib/nokogiri/html4/sax/parser.rb +23 -38
- data/lib/nokogiri/html4/sax/parser_context.rb +4 -9
- data/lib/nokogiri/html4.rb +9 -14
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +61 -30
- data/lib/nokogiri/html5/document_fragment.rb +130 -20
- data/lib/nokogiri/html5/node.rb +4 -4
- data/lib/nokogiri/html5.rb +114 -138
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +6 -5
- data/lib/nokogiri/xml/attr.rb +2 -2
- data/lib/nokogiri/xml/builder.rb +8 -1
- data/lib/nokogiri/xml/document.rb +74 -31
- data/lib/nokogiri/xml/document_fragment.rb +86 -15
- data/lib/nokogiri/xml/namespace.rb +1 -2
- data/lib/nokogiri/xml/node.rb +113 -35
- data/lib/nokogiri/xml/node_set.rb +12 -10
- data/lib/nokogiri/xml/parse_options.rb +1 -1
- data/lib/nokogiri/xml/pp/node.rb +6 -1
- data/lib/nokogiri/xml/reader.rb +51 -17
- data/lib/nokogiri/xml/relax_ng.rb +57 -20
- data/lib/nokogiri/xml/sax/document.rb +174 -83
- data/lib/nokogiri/xml/sax/parser.rb +115 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +116 -8
- data/lib/nokogiri/xml/sax/push_parser.rb +3 -0
- data/lib/nokogiri/xml/sax.rb +48 -0
- data/lib/nokogiri/xml/schema.rb +112 -45
- data/lib/nokogiri/xml/searchable.rb +9 -11
- data/lib/nokogiri/xml/syntax_error.rb +23 -1
- data/lib/nokogiri/xml.rb +14 -25
- data/lib/nokogiri/xslt/stylesheet.rb +29 -7
- data/lib/nokogiri/xslt.rb +4 -10
- data/lib/nokogiri.rb +1 -1
- data/lib/xsd/xmlparser/nokogiri.rb +3 -4
- data/ports/archives/libxml2-2.13.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
- metadata +15 -14
- data/ext/nokogiri/libxml2_backwards_compat.c +0 -121
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +0 -25
- data/ports/archives/libxml2-2.11.5.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
#include <nokogiri.h>
|
2
2
|
|
3
|
-
VALUE cNokogiriXsltStylesheet
|
3
|
+
VALUE cNokogiriXsltStylesheet;
|
4
4
|
|
5
5
|
static void
|
6
6
|
mark(void *data)
|
@@ -18,8 +18,8 @@ dealloc(void *data)
|
|
18
18
|
ruby_xfree(wrapper);
|
19
19
|
}
|
20
20
|
|
21
|
-
static const rb_data_type_t
|
22
|
-
.wrap_struct_name = "
|
21
|
+
static const rb_data_type_t nokogiri_xslt_stylesheet_tuple_type = {
|
22
|
+
.wrap_struct_name = "nokogiriXsltStylesheetTuple",
|
23
23
|
.function = {
|
24
24
|
.dmark = mark,
|
25
25
|
.dfree = dealloc,
|
@@ -56,7 +56,7 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
|
|
56
56
|
self = TypedData_Make_Struct(
|
57
57
|
cNokogiriXsltStylesheet,
|
58
58
|
nokogiriXsltStylesheetTuple,
|
59
|
-
&
|
59
|
+
&nokogiri_xslt_stylesheet_tuple_type,
|
60
60
|
wrapper
|
61
61
|
);
|
62
62
|
|
@@ -71,7 +71,12 @@ Nokogiri_wrap_xslt_stylesheet(xsltStylesheetPtr ss)
|
|
71
71
|
* call-seq:
|
72
72
|
* parse_stylesheet_doc(document)
|
73
73
|
*
|
74
|
-
* Parse
|
74
|
+
* Parse an XSLT::Stylesheet from +document+.
|
75
|
+
*
|
76
|
+
* [Parameters]
|
77
|
+
* - +document+ (Nokogiri::XML::Document) the document to be parsed.
|
78
|
+
*
|
79
|
+
* [Returns] Nokogiri::XSLT::Stylesheet
|
75
80
|
*/
|
76
81
|
static VALUE
|
77
82
|
parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
|
@@ -104,7 +109,7 @@ parse_stylesheet_doc(VALUE klass, VALUE xmldocobj)
|
|
104
109
|
* call-seq:
|
105
110
|
* serialize(document)
|
106
111
|
*
|
107
|
-
* Serialize +document+ to an xml string.
|
112
|
+
* Serialize +document+ to an xml string, as specified by the +method+ parameter in the Stylesheet.
|
108
113
|
*/
|
109
114
|
static VALUE
|
110
115
|
rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
|
@@ -119,7 +124,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
|
|
119
124
|
TypedData_Get_Struct(
|
120
125
|
self,
|
121
126
|
nokogiriXsltStylesheetTuple,
|
122
|
-
&
|
127
|
+
&nokogiri_xslt_stylesheet_tuple_type,
|
123
128
|
wrapper
|
124
129
|
);
|
125
130
|
xsltSaveResultToString(&doc_ptr, &doc_len, xml, wrapper->ss);
|
@@ -133,7 +138,7 @@ rb_xslt_stylesheet_serialize(VALUE self, VALUE xmlobj)
|
|
133
138
|
* transform(document)
|
134
139
|
* transform(document, params = {})
|
135
140
|
*
|
136
|
-
*
|
141
|
+
* Transform an XML::Document as defined by an XSLT::Stylesheet.
|
137
142
|
*
|
138
143
|
* [Parameters]
|
139
144
|
* - +document+ (Nokogiri::XML::Document) the document to be transformed.
|
@@ -268,7 +273,7 @@ rb_xslt_stylesheet_transform(int argc, VALUE *argv, VALUE self)
|
|
268
273
|
Check_Type(rb_param, T_ARRAY);
|
269
274
|
|
270
275
|
c_document = noko_xml_document_unwrap(rb_document);
|
271
|
-
TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &
|
276
|
+
TypedData_Get_Struct(self, nokogiriXsltStylesheetTuple, &nokogiri_xslt_stylesheet_tuple_type, wrapper);
|
272
277
|
|
273
278
|
param_len = RARRAY_LEN(rb_param);
|
274
279
|
params = ruby_xcalloc((size_t)param_len + 1, sizeof(char *));
|
@@ -357,7 +362,7 @@ initFunc(xsltTransformContextPtr ctxt, const xmlChar *uri)
|
|
357
362
|
TypedData_Get_Struct(
|
358
363
|
(VALUE)ctxt->style->_private,
|
359
364
|
nokogiriXsltStylesheetTuple,
|
360
|
-
&
|
365
|
+
&nokogiri_xslt_stylesheet_tuple_type,
|
361
366
|
wrapper
|
362
367
|
);
|
363
368
|
inst = rb_class_new_instance(0, NULL, obj);
|
@@ -375,7 +380,7 @@ shutdownFunc(xsltTransformContextPtr ctxt,
|
|
375
380
|
TypedData_Get_Struct(
|
376
381
|
(VALUE)ctxt->style->_private,
|
377
382
|
nokogiriXsltStylesheetTuple,
|
378
|
-
&
|
383
|
+
&nokogiri_xslt_stylesheet_tuple_type,
|
379
384
|
wrapper
|
380
385
|
);
|
381
386
|
|
data/gumbo-parser/Makefile
CHANGED
@@ -13,6 +13,23 @@ LDFLAGS := -pthread
|
|
13
13
|
|
14
14
|
all: check
|
15
15
|
|
16
|
+
oss-fuzz:
|
17
|
+
./fuzzer/build-ossfuzz.sh
|
18
|
+
|
19
|
+
fuzzers: fuzzer-normal fuzzer-asan fuzzer-ubsan fuzzer-msan
|
20
|
+
|
21
|
+
fuzzer-normal:
|
22
|
+
./fuzzer/build.sh
|
23
|
+
|
24
|
+
fuzzer-asan:
|
25
|
+
SANITIZER=asan ./fuzzer/build.sh
|
26
|
+
|
27
|
+
fuzzer-ubsan:
|
28
|
+
SANITIZER=ubsan ./fuzzer/build.sh
|
29
|
+
|
30
|
+
fuzzer-msan:
|
31
|
+
SANITIZER=msan ./fuzzer/build.sh
|
32
|
+
|
16
33
|
# don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
|
17
34
|
# the generated files should be committed to SCM
|
18
35
|
ifneq ($(CI),true)
|
@@ -81,6 +98,7 @@ coverage:
|
|
81
98
|
|
82
99
|
clean:
|
83
100
|
$(RM) -r build
|
101
|
+
$(RM) -r fuzzer/build fuzzer/src-* fuzzer/gumbo_corpus
|
84
102
|
|
85
103
|
build/src/flags: | build/src
|
86
104
|
@echo 'old_CC := $(CC)' > $@
|
data/gumbo-parser/src/error.c
CHANGED
@@ -46,33 +46,40 @@ static int PRINTF(2) print_message (
|
|
46
46
|
args
|
47
47
|
);
|
48
48
|
va_end(args);
|
49
|
-
|
49
|
+
|
50
|
+
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(_RUBY_MSVCRT)
|
50
51
|
if (bytes_written == -1) {
|
51
52
|
// vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
|
52
53
|
// instead of returning the number of bytes that would've been written had
|
53
|
-
// there been enough. In this case, we
|
54
|
-
//
|
55
|
-
//
|
56
|
-
|
54
|
+
// there been enough. In this case, we can call vsnprintf() again but
|
55
|
+
// with a count of 0 to get the number of bytes written, not including
|
56
|
+
// the null terminator.
|
57
|
+
// https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/vsnprintf-vsnprintf-vsnprintf-l-vsnwprintf-vsnwprintf-l?view=msvc-140#behavior-summary
|
58
|
+
|
57
59
|
va_start(args, format);
|
58
|
-
|
59
|
-
|
60
|
-
|
60
|
+
bytes_written = vsnprintf (
|
61
|
+
NULL,
|
62
|
+
0,
|
61
63
|
format,
|
62
64
|
args
|
63
65
|
);
|
64
66
|
va_end(args);
|
65
|
-
return result == -1 ? 0 : result;
|
66
67
|
}
|
67
|
-
#
|
68
|
+
#endif
|
69
|
+
|
68
70
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
69
71
|
if (bytes_written == -1) {
|
70
72
|
return 0;
|
71
73
|
}
|
72
|
-
#endif
|
73
74
|
|
74
75
|
if (bytes_written >= remaining_capacity) {
|
75
|
-
|
76
|
+
// At least double the size of the buffer.
|
77
|
+
size_t new_capacity = output->capacity * 2;
|
78
|
+
if (new_capacity < output->length + bytes_written + 1) {
|
79
|
+
// The +1 is for the null terminator.
|
80
|
+
new_capacity = output->length + bytes_written + 1;
|
81
|
+
}
|
82
|
+
gumbo_string_buffer_reserve(new_capacity, output);
|
76
83
|
remaining_capacity = output->capacity - output->length;
|
77
84
|
va_start(args, format);
|
78
85
|
bytes_written = vsnprintf (
|
@@ -96,8 +103,14 @@ static void print_tag_stack (
|
|
96
103
|
if (i) {
|
97
104
|
print_message(output, ", ");
|
98
105
|
}
|
99
|
-
|
100
|
-
|
106
|
+
uintptr_t tag = (uintptr_t) error->tag_stack.data[i];
|
107
|
+
const char* tag_name;
|
108
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
109
|
+
tag_name = error->tag_stack.data[i];
|
110
|
+
} else {
|
111
|
+
tag_name = gumbo_normalized_tagname((GumboTag)tag);
|
112
|
+
}
|
113
|
+
print_message(output, "%s", tag_name);
|
101
114
|
}
|
102
115
|
gumbo_string_buffer_append_codepoint('.', output);
|
103
116
|
}
|
@@ -326,41 +339,45 @@ static void handle_parser_error (
|
|
326
339
|
}
|
327
340
|
|
328
341
|
switch (error->input_type) {
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
print_tag_stack(error, output);
|
352
|
-
}
|
353
|
-
return;
|
354
|
-
case GUMBO_TOKEN_START_TAG:
|
355
|
-
print_message(output, "Start tag '%s' isn't allowed here.",
|
356
|
-
gumbo_normalized_tagname(error->input_tag));
|
357
|
-
print_tag_stack(error, output);
|
358
|
-
return;
|
359
|
-
case GUMBO_TOKEN_END_TAG:
|
360
|
-
print_message(output, "End tag '%s' isn't allowed here.",
|
361
|
-
gumbo_normalized_tagname(error->input_tag));
|
342
|
+
case GUMBO_TOKEN_DOCTYPE:
|
343
|
+
print_message(output, "This is not a legal doctype");
|
344
|
+
return;
|
345
|
+
case GUMBO_TOKEN_COMMENT:
|
346
|
+
// Should never happen; comments are always legal.
|
347
|
+
assert(0);
|
348
|
+
// But just in case...
|
349
|
+
print_message(output, "Comments aren't legal here");
|
350
|
+
return;
|
351
|
+
case GUMBO_TOKEN_CDATA:
|
352
|
+
case GUMBO_TOKEN_WHITESPACE:
|
353
|
+
case GUMBO_TOKEN_CHARACTER:
|
354
|
+
print_message(output, "Character tokens aren't legal here");
|
355
|
+
return;
|
356
|
+
case GUMBO_TOKEN_NULL:
|
357
|
+
print_message(output, "Null bytes are not allowed in HTML5");
|
358
|
+
return;
|
359
|
+
case GUMBO_TOKEN_EOF:
|
360
|
+
if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
|
361
|
+
print_message(output, "You must provide a doctype");
|
362
|
+
} else {
|
363
|
+
print_message(output, "Premature end of file.");
|
362
364
|
print_tag_stack(error, output);
|
363
|
-
|
365
|
+
}
|
366
|
+
return;
|
367
|
+
case GUMBO_TOKEN_START_TAG:
|
368
|
+
case GUMBO_TOKEN_END_TAG:
|
369
|
+
{
|
370
|
+
const char* tag_name;
|
371
|
+
const char* which = error->input_type == GUMBO_TOKEN_START_TAG ? "Start" : "End";
|
372
|
+
if (error->input_name) {
|
373
|
+
tag_name = error->input_name;
|
374
|
+
} else {
|
375
|
+
tag_name = gumbo_normalized_tagname(error->input_tag);
|
376
|
+
}
|
377
|
+
print_message(output, "%s tag '%s' isn't allowed here.", which, tag_name);
|
378
|
+
print_tag_stack(error, output);
|
379
|
+
return;
|
380
|
+
}
|
364
381
|
}
|
365
382
|
}
|
366
383
|
|
@@ -613,6 +630,17 @@ void gumbo_print_caret_diagnostic (
|
|
613
630
|
|
614
631
|
void gumbo_error_destroy(GumboError* error) {
|
615
632
|
if (error->type == GUMBO_ERR_PARSER) {
|
633
|
+
// Free the tag name.
|
634
|
+
if (error->v.parser.input_name) {
|
635
|
+
gumbo_free(error->v.parser.input_name);
|
636
|
+
}
|
637
|
+
|
638
|
+
for (unsigned int i = 0; i < error->v.parser.tag_stack.length; ++i) {
|
639
|
+
intptr_t tag = (intptr_t) error->v.parser.tag_stack.data[i];
|
640
|
+
if (tag > GUMBO_TAG_UNKNOWN) {
|
641
|
+
gumbo_free(error->v.parser.tag_stack.data[i]);
|
642
|
+
}
|
643
|
+
}
|
616
644
|
gumbo_vector_destroy(&error->v.parser.tag_stack);
|
617
645
|
}
|
618
646
|
gumbo_free(error);
|
data/gumbo-parser/src/error.h
CHANGED
@@ -95,12 +95,16 @@ typedef struct GumboInternalParserError {
|
|
95
95
|
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
|
96
96
|
GumboTag input_tag;
|
97
97
|
|
98
|
+
// The HTML tag name of the input token if it was a nonstandard tag token. NULL otherwise.
|
99
|
+
char *input_name;
|
100
|
+
|
98
101
|
// The insertion mode that the parser was in at the time.
|
99
102
|
GumboInsertionMode parser_state;
|
100
103
|
|
101
104
|
// The tag stack at the point of the error. Note that this is a GumboVector
|
102
105
|
// of GumboTag's *stored by value* - cast the void* to a GumboTag directly to
|
103
|
-
// get at the tag.
|
106
|
+
// get at the tag. For nonstandard tags, this is a pointer to an owned char *
|
107
|
+
// containing the tag name.
|
104
108
|
GumboVector /* GumboTag */ tag_stack;
|
105
109
|
} GumboParserError;
|
106
110
|
|
@@ -780,6 +780,15 @@ typedef struct GumboInternalOptions {
|
|
780
780
|
* Default: `false`.
|
781
781
|
*/
|
782
782
|
bool fragment_context_has_form_ancestor;
|
783
|
+
|
784
|
+
/**
|
785
|
+
* Parse `noscript` elements as if scripting was enabled. This causes the
|
786
|
+
* contents of the `noscript` element to be parsed as raw text, rather
|
787
|
+
* than as HTML elements.
|
788
|
+
*
|
789
|
+
* Default: `false`.
|
790
|
+
*/
|
791
|
+
bool parse_noscript_content_as_text;
|
783
792
|
} GumboOptions;
|
784
793
|
|
785
794
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
@@ -791,7 +800,7 @@ extern const GumboOptions kGumboDefaultOptions;
|
|
791
800
|
*/
|
792
801
|
typedef enum {
|
793
802
|
/**
|
794
|
-
* Indicates that parsing completed
|
803
|
+
* Indicates that parsing completed successfully. The resulting tree
|
795
804
|
* will be a complete document.
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_OK,
|
@@ -841,7 +850,7 @@ typedef struct GumboInternalOutput {
|
|
841
850
|
GumboVector /* GumboError */ errors;
|
842
851
|
|
843
852
|
/**
|
844
|
-
* True if the parser
|
853
|
+
* True if the parser encountered an error.
|
845
854
|
*
|
846
855
|
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
847
856
|
* option was set to 0.
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -56,6 +56,7 @@ const GumboOptions kGumboDefaultOptions = {
|
|
56
56
|
.fragment_encoding = NULL,
|
57
57
|
.quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
|
58
58
|
.fragment_context_has_form_ancestor = false,
|
59
|
+
.parse_noscript_content_as_text = false,
|
59
60
|
};
|
60
61
|
|
61
62
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
@@ -749,10 +750,20 @@ static void parser_add_parse_error (
|
|
749
750
|
GumboParserError* extra_data = &error->v.parser;
|
750
751
|
extra_data->input_type = token->type;
|
751
752
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
752
|
-
|
753
|
+
extra_data->input_name = NULL;
|
754
|
+
if (token->type == GUMBO_TOKEN_START_TAG)
|
755
|
+
{
|
753
756
|
extra_data->input_tag = token->v.start_tag.tag;
|
754
|
-
|
757
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.start_tag.name) {
|
758
|
+
extra_data->input_name = gumbo_strdup(token->v.start_tag.name);
|
759
|
+
}
|
760
|
+
}
|
761
|
+
else if (token->type == GUMBO_TOKEN_END_TAG)
|
762
|
+
{
|
755
763
|
extra_data->input_tag = token->v.end_tag.tag;
|
764
|
+
if (extra_data->input_tag == GUMBO_TAG_UNKNOWN && token->v.end_tag.name) {
|
765
|
+
extra_data->input_name = gumbo_strdup(token->v.end_tag.name);
|
766
|
+
}
|
756
767
|
}
|
757
768
|
const GumboParserState* state = parser->_parser_state;
|
758
769
|
extra_data->parser_state = state->_insertion_mode;
|
@@ -763,10 +774,13 @@ static void parser_add_parse_error (
|
|
763
774
|
node->type == GUMBO_NODE_ELEMENT
|
764
775
|
|| node->type == GUMBO_NODE_TEMPLATE
|
765
776
|
);
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
777
|
+
void *tag;
|
778
|
+
if (node->v.element.tag == GUMBO_TAG_UNKNOWN && node->v.element.name) {
|
779
|
+
tag = gumbo_strdup(node->v.element.name);
|
780
|
+
} else {
|
781
|
+
tag = (void *)(uintptr_t)node->v.element.tag;
|
782
|
+
}
|
783
|
+
gumbo_vector_add(tag, &extra_data->tag_stack);
|
770
784
|
}
|
771
785
|
}
|
772
786
|
|
@@ -1187,7 +1201,7 @@ static GumboNode* insert_element_of_tag_type (
|
|
1187
1201
|
element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
|
1188
1202
|
insert_element(parser, element, false);
|
1189
1203
|
gumbo_debug (
|
1190
|
-
"Inserting
|
1204
|
+
"Inserting <%s> element (@%p) from tag type.\n",
|
1191
1205
|
gumbo_normalized_tagname(tag),
|
1192
1206
|
(void*)element
|
1193
1207
|
);
|
@@ -1204,6 +1218,11 @@ static GumboNode* insert_foreign_element (
|
|
1204
1218
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
1205
1219
|
GumboNode* element = create_element_from_token(token, tag_namespace);
|
1206
1220
|
insert_element(parser, element, false);
|
1221
|
+
gumbo_debug (
|
1222
|
+
"Inserting <%s> foreign element (@%p).\n",
|
1223
|
+
gumbo_normalized_tagname(element->v.element.tag),
|
1224
|
+
(void*)element
|
1225
|
+
);
|
1207
1226
|
if (
|
1208
1227
|
token_has_attribute(token, "xmlns")
|
1209
1228
|
&& !attribute_matches_case_sensitive (
|
@@ -2066,6 +2085,7 @@ static void remove_from_parent(GumboNode* node) {
|
|
2066
2085
|
|
2067
2086
|
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
2087
|
static void ignore_token(GumboParser* parser) {
|
2088
|
+
gumbo_debug("Ignoring token.\n");
|
2069
2089
|
GumboToken* token = parser->_parser_state->_current_token;
|
2070
2090
|
// Ownership of the token's internal buffers is normally transferred to the
|
2071
2091
|
// element, but if no element is emitted (as happens in non-verbatim-mode
|
@@ -2430,7 +2450,7 @@ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
|
2430
2450
|
|
2431
2451
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
2432
2452
|
static void finish_parsing(GumboParser* parser) {
|
2433
|
-
gumbo_debug("Finishing parsing");
|
2453
|
+
gumbo_debug("Finishing parsing\n");
|
2434
2454
|
maybe_flush_text_node_buffer(parser);
|
2435
2455
|
GumboParserState* state = parser->_parser_state;
|
2436
2456
|
for (
|
@@ -2608,6 +2628,7 @@ static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2608
2628
|
}
|
2609
2629
|
if (
|
2610
2630
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2631
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
2611
2632
|
) {
|
2612
2633
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2613
2634
|
return;
|
@@ -3313,7 +3334,10 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3313
3334
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3314
3335
|
return;
|
3315
3336
|
}
|
3316
|
-
if (
|
3337
|
+
if (
|
3338
|
+
tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)
|
3339
|
+
|| (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT) && parser->_options->parse_noscript_content_as_text)
|
3340
|
+
) {
|
3317
3341
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3318
3342
|
return;
|
3319
3343
|
}
|
@@ -4389,7 +4413,7 @@ static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
|
4389
4413
|
|
4390
4414
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4391
4415
|
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4392
|
-
gumbo_debug("Handling foreign content");
|
4416
|
+
gumbo_debug("Handling foreign content.\n");
|
4393
4417
|
switch (token->type) {
|
4394
4418
|
case GUMBO_TOKEN_NULL:
|
4395
4419
|
parser_add_parse_error(parser, token);
|
@@ -4507,7 +4531,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4507
4531
|
if (i == 0)
|
4508
4532
|
return;
|
4509
4533
|
// We can't call handle_token directly because the current node is still in
|
4510
|
-
// a
|
4534
|
+
// a foreign namespace, so it would re-enter this and result in infinite
|
4511
4535
|
// recursion.
|
4512
4536
|
handle_html_content(parser, token);
|
4513
4537
|
}
|
@@ -4627,12 +4651,20 @@ static void fragment_parser_init (
|
|
4627
4651
|
const char* fragment_encoding = options->fragment_encoding;
|
4628
4652
|
GumboQuirksModeEnum quirks = options->quirks_mode;
|
4629
4653
|
bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
|
4630
|
-
|
4631
4654
|
GumboNode* root;
|
4632
|
-
|
4655
|
+
|
4656
|
+
// 1. [Create a new Document node, and mark it as being an HTML document.]
|
4657
|
+
// 2. [If the node document of the context element is in quirks mode, then
|
4658
|
+
// let the Document be in quirks mode. Otherwise, the node document of
|
4659
|
+
// the context element is in limited-quirks mode, then let the Document
|
4660
|
+
// be in limited-quirks mode. Otherwise, leave the Document in no-quirks
|
4661
|
+
// mode.]
|
4633
4662
|
get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
|
4634
4663
|
|
4635
|
-
// 3.
|
4664
|
+
// 3. [If allowDeclarativeShadowRoots is true, then set the Document's allow
|
4665
|
+
// declarative shadow roots to true.]
|
4666
|
+
// 4. [Create a new HTML parser, and associate it with the just created Document node.]
|
4667
|
+
// 5. [Set the state of the HTML parser's tokenization stage as follows, switching on the context element:]
|
4636
4668
|
parser->_parser_state->_fragment_ctx =
|
4637
4669
|
create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
|
4638
4670
|
GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
|
@@ -4659,8 +4691,8 @@ static void fragment_parser_init (
|
|
4659
4691
|
break;
|
4660
4692
|
|
4661
4693
|
case GUMBO_TAG_NOSCRIPT:
|
4662
|
-
|
4663
|
-
|
4694
|
+
if (options->parse_noscript_content_as_text)
|
4695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4664
4696
|
break;
|
4665
4697
|
|
4666
4698
|
case GUMBO_TAG_PLAINTEXT:
|
@@ -4762,7 +4794,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4762
4794
|
adjusted_current_node &&
|
4763
4795
|
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4764
4796
|
);
|
4765
|
-
|
4797
|
+
// If the maximum tree depth has been exceeded, proceed as if EOF has been reached.
|
4798
|
+
//
|
4799
|
+
// The parser is pretty fragile. Breaking out of the parsing loop in the middle of
|
4800
|
+
// the parse can leave the document in an inconsistent state.
|
4801
|
+
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4802
|
+
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4803
|
+
gumbo_debug("Tree depth limit exceeded.\n");
|
4804
|
+
token.type = GUMBO_TOKEN_EOF;
|
4805
|
+
} else {
|
4806
|
+
gumbo_lex(&parser, &token);
|
4807
|
+
}
|
4808
|
+
|
4766
4809
|
}
|
4767
4810
|
|
4768
4811
|
const char* token_type = "text";
|
@@ -4786,7 +4829,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4786
4829
|
break;
|
4787
4830
|
}
|
4788
4831
|
gumbo_debug (
|
4789
|
-
"Handling %s token @%lu:%lu in
|
4832
|
+
"Handling %s token @%lu:%lu in insertion mode %u.\n",
|
4790
4833
|
(char*) token_type,
|
4791
4834
|
(unsigned long)token.position.line,
|
4792
4835
|
(unsigned long)token.position.column,
|
@@ -4826,14 +4869,12 @@ GumboOutput* gumbo_parse_with_options (
|
|
4826
4869
|
// to a token.
|
4827
4870
|
if (token.type == GUMBO_TOKEN_END_TAG &&
|
4828
4871
|
token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
4872
|
+
{
|
4829
4873
|
gumbo_free(token.v.end_tag.name);
|
4874
|
+
token.v.end_tag.name = NULL;
|
4875
|
+
}
|
4830
4876
|
}
|
4831
4877
|
|
4832
|
-
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
4833
|
-
parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
|
4834
|
-
gumbo_debug("Tree depth limit exceeded.\n");
|
4835
|
-
break;
|
4836
|
-
}
|
4837
4878
|
|
4838
4879
|
++loop_count;
|
4839
4880
|
assert(loop_count < 1000000000UL);
|
@@ -340,7 +340,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
340
340
|
|
341
341
|
// Sets the tag buffer original text and start point to the current iterator
|
342
342
|
// position. This is necessary because attribute names & values may have
|
343
|
-
// whitespace
|
343
|
+
// whitespace preceding them, and so we can't assume that the actual token
|
344
344
|
// starting point was the end of the last tag buffer usage.
|
345
345
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
346
346
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -506,6 +506,7 @@ static void abandon_current_tag(GumboParser* parser) {
|
|
506
506
|
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
507
507
|
gumbo_destroy_attribute(tag_state->_attributes.data[i]);
|
508
508
|
}
|
509
|
+
gumbo_free(tag_state->_name);
|
509
510
|
gumbo_free(tag_state->_attributes.data);
|
510
511
|
mark_tag_state_as_empty(tag_state);
|
511
512
|
gumbo_string_buffer_destroy(&tag_state->_buffer);
|
@@ -568,17 +569,17 @@ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
|
568
569
|
}
|
569
570
|
|
570
571
|
// Appends a codepoint to the current tag buffer. If
|
571
|
-
//
|
572
|
+
// reinitialize_position_on_first is set, this also initializes the tag buffer
|
572
573
|
// start point; the only time you would *not* want to pass true for this
|
573
574
|
// parameter is if you want the original_text to include a character (like an
|
574
575
|
// opening quote) that doesn't appear in the value.
|
575
576
|
static void append_char_to_tag_buffer (
|
576
577
|
GumboParser* parser,
|
577
578
|
int codepoint,
|
578
|
-
bool
|
579
|
+
bool reinitialize_position_on_first
|
579
580
|
) {
|
580
581
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
581
|
-
if (buffer->length == 0 &&
|
582
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
582
583
|
reset_tag_buffer_start_point(parser);
|
583
584
|
}
|
584
585
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
@@ -588,10 +589,10 @@ static void append_char_to_tag_buffer (
|
|
588
589
|
static void append_string_to_tag_buffer (
|
589
590
|
GumboParser* parser,
|
590
591
|
GumboStringPiece* str,
|
591
|
-
bool
|
592
|
+
bool reinitialize_position_on_first
|
592
593
|
) {
|
593
594
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
594
|
-
if (buffer->length == 0 &&
|
595
|
+
if (buffer->length == 0 && reinitialize_position_on_first) {
|
595
596
|
reset_tag_buffer_start_point(parser);
|
596
597
|
}
|
597
598
|
gumbo_string_buffer_append_string(str, buffer);
|
@@ -18,7 +18,7 @@ module Nokogiri
|
|
18
18
|
#
|
19
19
|
module ClassResolver
|
20
20
|
# #related_class restricts matching namespaces to those matching this set.
|
21
|
-
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML"])
|
21
|
+
VALID_NAMESPACES = Set.new(["HTML", "HTML4", "HTML5", "XML", "SAX"])
|
22
22
|
|
23
23
|
# :call-seq:
|
24
24
|
# related_class(class_name) → Class
|
data/lib/nokogiri/css/node.rb
CHANGED
@@ -23,8 +23,12 @@ module Nokogiri
|
|
23
23
|
|
24
24
|
###
|
25
25
|
# Convert this CSS node to xpath with +prefix+ using +visitor+
|
26
|
-
def to_xpath(
|
27
|
-
prefix =
|
26
|
+
def to_xpath(visitor)
|
27
|
+
prefix = if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
|
28
|
+
"."
|
29
|
+
else
|
30
|
+
visitor.prefix
|
31
|
+
end
|
28
32
|
prefix + visitor.accept(self)
|
29
33
|
end
|
30
34
|
|
data/lib/nokogiri/css/parser.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
#
|
3
3
|
# DO NOT MODIFY!!!!
|
4
|
-
# This file is automatically generated by Racc 1.
|
5
|
-
# from Racc grammar file "".
|
4
|
+
# This file is automatically generated by Racc 1.8.0
|
5
|
+
# from Racc grammar file "parser.y".
|
6
6
|
#
|
7
7
|
|
8
8
|
require 'racc/parser.rb'
|
@@ -291,6 +291,7 @@ Racc_arg = [
|
|
291
291
|
racc_shift_n,
|
292
292
|
racc_reduce_n,
|
293
293
|
racc_use_result_var ]
|
294
|
+
Ractor.make_shareable(Racc_arg) if defined?(Ractor)
|
294
295
|
|
295
296
|
Racc_token_to_s_table = [
|
296
297
|
"$end",
|
@@ -351,6 +352,7 @@ Racc_token_to_s_table = [
|
|
351
352
|
"negation",
|
352
353
|
"eql_incl_dash",
|
353
354
|
"negation_arg" ]
|
355
|
+
Ractor.make_shareable(Racc_token_to_s_table) if defined?(Ractor)
|
354
356
|
|
355
357
|
Racc_debug_parser = false
|
356
358
|
|
@@ -468,12 +470,12 @@ def _reduce_23(val, _values, result)
|
|
468
470
|
end
|
469
471
|
|
470
472
|
def _reduce_24(val, _values, result)
|
471
|
-
result = Node.new(:ELEMENT_NAME, [
|
473
|
+
result = Node.new(:ELEMENT_NAME, [val[0], val[2]])
|
472
474
|
result
|
473
475
|
end
|
474
476
|
|
475
477
|
def _reduce_25(val, _values, result)
|
476
|
-
name =
|
478
|
+
name = val[0]
|
477
479
|
result = Node.new(:ELEMENT_NAME, [name])
|
478
480
|
|
479
481
|
result
|