nokogumbo 2.0.0.pre.alpha → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
data/gumbo-parser/src/error.h
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
#include "insertion_mode.h"
|
8
8
|
#include "string_buffer.h"
|
9
9
|
#include "token_type.h"
|
10
|
+
#include "tokenizer_states.h"
|
10
11
|
|
11
12
|
#ifdef __cplusplus
|
12
13
|
extern "C" {
|
@@ -15,85 +16,66 @@ extern "C" {
|
|
15
16
|
struct GumboInternalParser;
|
16
17
|
|
17
18
|
typedef enum {
|
19
|
+
// Defined errors.
|
20
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
|
21
|
+
GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
|
22
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
|
23
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
|
24
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
25
|
+
GUMBO_ERR_CDATA_IN_HTML_CONTENT,
|
26
|
+
GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
|
27
|
+
GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
|
28
|
+
GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
|
29
|
+
GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
|
30
|
+
GUMBO_ERR_DUPLICATE_ATTRIBUTE,
|
31
|
+
GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
|
32
|
+
GUMBO_ERR_EOF_BEFORE_TAG_NAME,
|
33
|
+
GUMBO_ERR_EOF_IN_CDATA,
|
34
|
+
GUMBO_ERR_EOF_IN_COMMENT,
|
35
|
+
GUMBO_ERR_EOF_IN_DOCTYPE,
|
36
|
+
GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
|
37
|
+
GUMBO_ERR_EOF_IN_TAG,
|
38
|
+
GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
|
39
|
+
GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
|
40
|
+
GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
|
41
|
+
GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
|
42
|
+
GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
|
43
|
+
GUMBO_ERR_MISSING_DOCTYPE_NAME,
|
44
|
+
GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
|
45
|
+
GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
|
46
|
+
GUMBO_ERR_MISSING_END_TAG_NAME,
|
47
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
|
48
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
|
49
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
50
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
|
51
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
|
52
|
+
GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
|
53
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
|
54
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
|
55
|
+
GUMBO_ERR_NESTED_COMMENT,
|
56
|
+
GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
|
57
|
+
GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
|
58
|
+
GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
|
59
|
+
GUMBO_ERR_NULL_CHARACTER_REFERENCE,
|
60
|
+
GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
|
61
|
+
GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
|
62
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
|
63
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
|
64
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
|
65
|
+
GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
|
66
|
+
GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
|
67
|
+
GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
|
68
|
+
GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
|
69
|
+
GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
|
70
|
+
|
71
|
+
// Encoding errors.
|
18
72
|
GUMBO_ERR_UTF8_INVALID,
|
19
73
|
GUMBO_ERR_UTF8_TRUNCATED,
|
20
|
-
|
21
|
-
|
22
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
|
23
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
|
24
|
-
GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
|
25
|
-
GUMBO_ERR_NAMED_CHAR_REF_INVALID,
|
26
|
-
GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
|
27
|
-
GUMBO_ERR_TAG_EOF,
|
28
|
-
GUMBO_ERR_TAG_INVALID,
|
29
|
-
GUMBO_ERR_CLOSE_TAG_EMPTY,
|
30
|
-
GUMBO_ERR_CLOSE_TAG_EOF,
|
31
|
-
GUMBO_ERR_CLOSE_TAG_INVALID,
|
32
|
-
GUMBO_ERR_SCRIPT_EOF,
|
33
|
-
GUMBO_ERR_ATTR_NAME_EOF,
|
34
|
-
GUMBO_ERR_ATTR_NAME_INVALID,
|
35
|
-
GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
|
36
|
-
GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
|
37
|
-
GUMBO_ERR_ATTR_UNQUOTED_EOF,
|
38
|
-
GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
|
39
|
-
GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
|
40
|
-
GUMBO_ERR_ATTR_AFTER_EOF,
|
41
|
-
GUMBO_ERR_ATTR_AFTER_INVALID,
|
42
|
-
GUMBO_ERR_DUPLICATE_ATTR,
|
43
|
-
GUMBO_ERR_SOLIDUS_EOF,
|
44
|
-
GUMBO_ERR_SOLIDUS_INVALID,
|
45
|
-
GUMBO_ERR_DASHES_OR_DOCTYPE,
|
46
|
-
GUMBO_ERR_COMMENT_EOF,
|
47
|
-
GUMBO_ERR_COMMENT_INVALID,
|
48
|
-
GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
|
49
|
-
GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
|
50
|
-
GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
|
51
|
-
GUMBO_ERR_COMMENT_END_BANG_EOF,
|
52
|
-
GUMBO_ERR_DOCTYPE_EOF,
|
53
|
-
GUMBO_ERR_DOCTYPE_INVALID,
|
54
|
-
GUMBO_ERR_DOCTYPE_SPACE,
|
55
|
-
GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
|
56
|
-
GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
|
57
|
-
GUMBO_ERR_DOCTYPE_END,
|
74
|
+
|
75
|
+
// Generic parser error.
|
58
76
|
GUMBO_ERR_PARSER,
|
59
|
-
GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
|
60
|
-
GUMBO_ERR_SELF_CLOSING_END_TAG,
|
61
77
|
} GumboErrorType;
|
62
78
|
|
63
|
-
// Additional data for duplicated attributes.
|
64
|
-
typedef struct GumboInternalDuplicateAttrError {
|
65
|
-
// The name of the attribute. Owned by this struct.
|
66
|
-
const char* name;
|
67
|
-
|
68
|
-
// The (0-based) index within the attributes vector of the original
|
69
|
-
// occurrence.
|
70
|
-
unsigned int original_index;
|
71
|
-
|
72
|
-
// The (0-based) index where the new occurrence would be.
|
73
|
-
unsigned int new_index;
|
74
|
-
} GumboDuplicateAttrError;
|
75
|
-
|
76
|
-
// A simplified representation of the tokenizer state, designed to be more
|
77
|
-
// useful to clients of this library than the internal representation. This
|
78
|
-
// condenses the actual states used in the tokenizer state machine into a few
|
79
|
-
// values that will be familiar to users of HTML.
|
80
|
-
typedef enum {
|
81
|
-
GUMBO_ERR_TOKENIZER_DATA,
|
82
|
-
GUMBO_ERR_TOKENIZER_CHAR_REF,
|
83
|
-
GUMBO_ERR_TOKENIZER_RCDATA,
|
84
|
-
GUMBO_ERR_TOKENIZER_RAWTEXT,
|
85
|
-
GUMBO_ERR_TOKENIZER_PLAINTEXT,
|
86
|
-
GUMBO_ERR_TOKENIZER_SCRIPT,
|
87
|
-
GUMBO_ERR_TOKENIZER_TAG,
|
88
|
-
GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
|
89
|
-
GUMBO_ERR_TOKENIZER_ATTR_NAME,
|
90
|
-
GUMBO_ERR_TOKENIZER_ATTR_VALUE,
|
91
|
-
GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
|
92
|
-
GUMBO_ERR_TOKENIZER_COMMENT,
|
93
|
-
GUMBO_ERR_TOKENIZER_DOCTYPE,
|
94
|
-
GUMBO_ERR_TOKENIZER_CDATA,
|
95
|
-
} GumboTokenizerErrorState;
|
96
|
-
|
97
79
|
// Additional data for tokenizer errors.
|
98
80
|
// This records the current state and codepoint encountered - this is usually
|
99
81
|
// enough to reconstruct what went wrong and provide a friendly error message.
|
@@ -102,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
|
|
102
84
|
int codepoint;
|
103
85
|
|
104
86
|
// The state that the tokenizer was in at the time.
|
105
|
-
|
87
|
+
GumboTokenizerEnum state;
|
106
88
|
} GumboTokenizerError;
|
107
89
|
|
108
90
|
// Additional data for parse errors.
|
@@ -125,43 +107,25 @@ typedef struct GumboInternalParserError {
|
|
125
107
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
126
108
|
// the HTML. This contains an enumerated type flag, a source position, and then
|
127
109
|
// a union of fields containing data specific to the error.
|
128
|
-
|
110
|
+
struct GumboInternalError {
|
129
111
|
// The type of error.
|
130
112
|
GumboErrorType type;
|
131
113
|
|
132
114
|
// The position within the source file where the error occurred.
|
133
115
|
GumboSourcePosition position;
|
134
116
|
|
135
|
-
//
|
136
|
-
|
137
|
-
// character-based instead of byte-based offsets).
|
138
|
-
const char* original_text;
|
117
|
+
// The piece of text that caused the error.
|
118
|
+
GumboStringPiece original_text;
|
139
119
|
|
140
120
|
// Type-specific error information.
|
141
121
|
union {
|
142
|
-
// The code point we encountered, for:
|
143
|
-
// * GUMBO_ERR_UTF8_INVALID
|
144
|
-
// * GUMBO_ERR_UTF8_TRUNCATED
|
145
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
|
146
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
|
147
|
-
uint32_t codepoint;
|
148
|
-
|
149
122
|
// Tokenizer errors.
|
150
123
|
GumboTokenizerError tokenizer;
|
151
124
|
|
152
|
-
//
|
153
|
-
|
154
|
-
// * GUMBO_ERR_NAMED_CHAR_REF_INVALID
|
155
|
-
GumboStringPiece text;
|
156
|
-
|
157
|
-
// Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
|
158
|
-
GumboDuplicateAttrError duplicate_attr;
|
159
|
-
|
160
|
-
// Parser state, for GUMBO_ERR_PARSER and
|
161
|
-
// GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
|
162
|
-
struct GumboInternalParserError parser;
|
125
|
+
// Parser errors.
|
126
|
+
GumboParserError parser;
|
163
127
|
} v;
|
164
|
-
}
|
128
|
+
};
|
165
129
|
|
166
130
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
167
131
|
// that clients can fill out the rest of its fields. May return NULL if we're
|
@@ -177,32 +141,6 @@ void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
|
177
141
|
// Frees the memory used for a single GumboError.
|
178
142
|
void gumbo_error_destroy(GumboError* error);
|
179
143
|
|
180
|
-
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
181
|
-
// freshly-allocated buffer containing the error message text. The caller is
|
182
|
-
// responsible for freeing the buffer.
|
183
|
-
void gumbo_error_to_string (
|
184
|
-
const GumboError* error,
|
185
|
-
GumboStringBuffer* output
|
186
|
-
);
|
187
|
-
|
188
|
-
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
189
|
-
// with a freshly-allocated buffer containing the error message text. The
|
190
|
-
// caller is responsible for freeing the buffer.
|
191
|
-
void gumbo_caret_diagnostic_to_string (
|
192
|
-
const GumboError* error,
|
193
|
-
const char* source_text,
|
194
|
-
size_t source_length,
|
195
|
-
GumboStringBuffer* output
|
196
|
-
);
|
197
|
-
|
198
|
-
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
199
|
-
// of writing to a string.
|
200
|
-
void gumbo_print_caret_diagnostic (
|
201
|
-
const GumboError* error,
|
202
|
-
const char* source_text,
|
203
|
-
size_t source_length
|
204
|
-
);
|
205
|
-
|
206
144
|
#ifdef __cplusplus
|
207
145
|
}
|
208
146
|
#endif
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -817,10 +817,6 @@ typedef struct GumboInternalOutput {
|
|
817
817
|
|
818
818
|
/**
|
819
819
|
* A list of errors that occurred during the parse.
|
820
|
-
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
821
|
-
* fleshed out and may change in the future. For this reason, the GumboError
|
822
|
-
* header isn't part of the public API. Contact us if you need errors
|
823
|
-
* reported so we can work out something appropriate for your use-case.
|
824
820
|
*/
|
825
821
|
GumboVector /* GumboError */ errors;
|
826
822
|
|
@@ -866,6 +862,53 @@ const char* gumbo_status_to_string(GumboOutputStatus status);
|
|
866
862
|
/** Release the memory used for the parse tree and parse errors. */
|
867
863
|
void gumbo_destroy_output(GumboOutput* output);
|
868
864
|
|
865
|
+
/** Opaque GumboError type */
|
866
|
+
typedef struct GumboInternalError GumboError;
|
867
|
+
|
868
|
+
/**
|
869
|
+
* Returns the position of the error.
|
870
|
+
*/
|
871
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error);
|
872
|
+
|
873
|
+
/**
|
874
|
+
* Returns a constant string representation of the error's code. This is owned
|
875
|
+
* by the library and should not be freed by the caller.
|
876
|
+
*/
|
877
|
+
const char* gumbo_error_code(const GumboError* error);
|
878
|
+
|
879
|
+
/**
|
880
|
+
* Prints an error to a string. This stores a freshly-allocated buffer
|
881
|
+
* containing the error message text in output. The caller is responsible for
|
882
|
+
* freeing the buffer. The size of the error message is returned. The error
|
883
|
+
* message itself may not be NULL-terminated and may contain NULL bytes so the
|
884
|
+
* returned size must be used.
|
885
|
+
*/
|
886
|
+
size_t gumbo_error_to_string(const GumboError* error, char **output);
|
887
|
+
|
888
|
+
/**
|
889
|
+
* Prints a caret diagnostic to a string. This stores a freshly-allocated
|
890
|
+
* buffer containing the error message text in output. The caller is responsible for
|
891
|
+
* freeing the buffer. The size of the error message is returned. The error
|
892
|
+
* message itself may not be NULL-terminated and may contain NULL bytes so the
|
893
|
+
* returned size must be used.
|
894
|
+
*/
|
895
|
+
size_t gumbo_caret_diagnostic_to_string (
|
896
|
+
const GumboError* error,
|
897
|
+
const char* source_text,
|
898
|
+
size_t source_length,
|
899
|
+
char** output
|
900
|
+
);
|
901
|
+
|
902
|
+
/**
|
903
|
+
* Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
|
904
|
+
* instead of writing to a string.
|
905
|
+
*/
|
906
|
+
void gumbo_print_caret_diagnostic (
|
907
|
+
const GumboError* error,
|
908
|
+
const char* source_text,
|
909
|
+
size_t source_length
|
910
|
+
);
|
911
|
+
|
869
912
|
#ifdef __cplusplus
|
870
913
|
}
|
871
914
|
#endif
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -31,6 +31,7 @@
|
|
31
31
|
#include "replacement.h"
|
32
32
|
#include "tokenizer.h"
|
33
33
|
#include "tokenizer_states.h"
|
34
|
+
#include "token_buffer.h"
|
34
35
|
#include "utf8.h"
|
35
36
|
#include "util.h"
|
36
37
|
#include "vector.h"
|
@@ -42,7 +43,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
|
|
42
43
|
|
43
44
|
#define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
|
44
45
|
#define kGumboEmptySourcePosition (const GumboSourcePosition) \
|
45
|
-
|
46
|
+
GUMBO_EMPTY_SOURCE_POSITION_INIT
|
46
47
|
|
47
48
|
const GumboOptions kGumboDefaultOptions = {
|
48
49
|
.tab_stop = 8,
|
@@ -59,25 +60,6 @@ const GumboOptions kGumboDefaultOptions = {
|
|
59
60
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
60
61
|
#define TERMINATOR {.data = NULL, .length = 0}
|
61
62
|
|
62
|
-
static const GumboStringPiece kPublicIdHtml4_0 =
|
63
|
-
STRING("-//W3C//DTD HTML 4.0//EN");
|
64
|
-
static const GumboStringPiece kPublicIdHtml4_01 =
|
65
|
-
STRING("-//W3C//DTD HTML 4.01//EN");
|
66
|
-
static const GumboStringPiece kPublicIdXhtml1_0 =
|
67
|
-
STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
|
68
|
-
static const GumboStringPiece kPublicIdXhtml1_1 =
|
69
|
-
STRING("-//W3C//DTD XHTML 1.1//EN");
|
70
|
-
static const GumboStringPiece kSystemIdRecHtml4_0 =
|
71
|
-
STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
|
72
|
-
static const GumboStringPiece kSystemIdHtml4 =
|
73
|
-
STRING("http://www.w3.org/TR/html4/strict.dtd");
|
74
|
-
static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
|
75
|
-
STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
|
76
|
-
static const GumboStringPiece kSystemIdXhtml1_1 =
|
77
|
-
STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
|
78
|
-
static const GumboStringPiece kSystemIdLegacyCompat =
|
79
|
-
STRING("about:legacy-compat");
|
80
|
-
|
81
63
|
// The doctype arrays have an explicit terminator because we want to pass them
|
82
64
|
// to a helper function, and passing them as a pointer discards sizeof
|
83
65
|
// information. The SVG arrays are used only by one-off functions, and so loops
|
@@ -260,6 +242,9 @@ typedef struct GumboInternalParserState {
|
|
260
242
|
// The accumulated text node buffer state.
|
261
243
|
TextNodeBufferState _text_node;
|
262
244
|
|
245
|
+
// The accumulated character tokens in tables for error purposes.
|
246
|
+
GumboCharacterTokenBuffer _table_character_tokens;
|
247
|
+
|
263
248
|
// The current token.
|
264
249
|
GumboToken* _current_token;
|
265
250
|
|
@@ -365,6 +350,7 @@ static void parser_state_init(GumboParser* parser) {
|
|
365
350
|
parser_state->_foster_parent_insertions = false;
|
366
351
|
parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
|
367
352
|
gumbo_string_buffer_init(&parser_state->_text_node._buffer);
|
353
|
+
gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
|
368
354
|
gumbo_vector_init(10, &parser_state->_open_elements);
|
369
355
|
gumbo_vector_init(5, &parser_state->_active_formatting_elements);
|
370
356
|
gumbo_vector_init(5, &parser_state->_template_insertion_modes);
|
@@ -463,6 +449,7 @@ static void parser_state_destroy(GumboParser* parser) {
|
|
463
449
|
gumbo_vector_destroy(&state->_open_elements);
|
464
450
|
gumbo_vector_destroy(&state->_template_insertion_modes);
|
465
451
|
gumbo_string_buffer_destroy(&state->_text_node._buffer);
|
452
|
+
gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
|
466
453
|
gumbo_free(state);
|
467
454
|
}
|
468
455
|
|
@@ -573,11 +560,11 @@ static bool tag_in (
|
|
573
560
|
static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
|
574
561
|
if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
|
575
562
|
return token->v.start_tag.tag == tag;
|
576
|
-
}
|
563
|
+
}
|
564
|
+
if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
|
577
565
|
return token->v.end_tag.tag == tag;
|
578
|
-
} else {
|
579
|
-
return false;
|
580
566
|
}
|
567
|
+
return false;
|
581
568
|
}
|
582
569
|
|
583
570
|
static inline bool tagset_includes (
|
@@ -738,18 +725,18 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) {
|
|
738
725
|
assert(0);
|
739
726
|
}
|
740
727
|
|
741
|
-
static
|
728
|
+
static void parser_add_parse_error (
|
742
729
|
GumboParser* parser,
|
743
730
|
const GumboToken* token
|
744
731
|
) {
|
745
732
|
gumbo_debug("Adding parse error.\n");
|
746
733
|
GumboError* error = gumbo_add_error(parser);
|
747
734
|
if (!error) {
|
748
|
-
return
|
735
|
+
return;
|
749
736
|
}
|
750
737
|
error->type = GUMBO_ERR_PARSER;
|
751
738
|
error->position = token->position;
|
752
|
-
error->original_text = token->original_text
|
739
|
+
error->original_text = token->original_text;
|
753
740
|
GumboParserError* extra_data = &error->v.parser;
|
754
741
|
extra_data->input_type = token->type;
|
755
742
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
@@ -772,7 +759,6 @@ static GumboError* parser_add_parse_error (
|
|
772
759
|
&extra_data->tag_stack
|
773
760
|
);
|
774
761
|
}
|
775
|
-
return error;
|
776
762
|
}
|
777
763
|
|
778
764
|
// https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
|
@@ -1639,9 +1625,11 @@ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node)
|
|
1639
1625
|
const GumboNodeType type = current->type;
|
1640
1626
|
if (current == node) {
|
1641
1627
|
return true;
|
1642
|
-
}
|
1628
|
+
}
|
1629
|
+
if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
|
1643
1630
|
continue;
|
1644
|
-
}
|
1631
|
+
}
|
1632
|
+
if (node_tag_in_set(current, &tags)) {
|
1645
1633
|
return false;
|
1646
1634
|
}
|
1647
1635
|
}
|
@@ -1689,8 +1677,8 @@ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag t
|
|
1689
1677
|
// Pass GUMBO_TAG_LAST to not exclude any of them.
|
1690
1678
|
static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
|
1691
1679
|
static const TagSet tags = {
|
1692
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(
|
1693
|
-
TAG(P), TAG(
|
1680
|
+
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
|
1681
|
+
TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
|
1694
1682
|
};
|
1695
1683
|
while (
|
1696
1684
|
node_tag_in_set(get_current_node(parser), &tags)
|
@@ -1704,15 +1692,36 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
|
|
1704
1692
|
// https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
|
1705
1693
|
static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
|
1706
1694
|
static const TagSet tags = {
|
1707
|
-
TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(
|
1708
|
-
TAG(
|
1709
|
-
TAG(TD), TAG(TFOOT), TAG(TH), TAG(
|
1695
|
+
TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
|
1696
|
+
TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
|
1697
|
+
TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
1710
1698
|
};
|
1711
1699
|
while (node_tag_in_set(get_current_node(parser), &tags)) {
|
1712
1700
|
pop_current_node(parser);
|
1713
1701
|
}
|
1714
1702
|
}
|
1715
1703
|
|
1704
|
+
// This factors out the clauses in the "in body" insertion mode checking "if
|
1705
|
+
// there is a node in the stack of open elements that is not" one of a list of
|
1706
|
+
// elements in which case it's a parse error.
|
1707
|
+
// This is used in "an end-of-file token", "an end tag whose tag name is
|
1708
|
+
// 'body'", and "an end tag whose tag name is 'html'".
|
1709
|
+
static bool stack_contains_nonclosable_element (
|
1710
|
+
GumboParser* parser
|
1711
|
+
) {
|
1712
|
+
static const TagSet tags = {
|
1713
|
+
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
|
1714
|
+
TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
|
1715
|
+
TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
|
1716
|
+
};
|
1717
|
+
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
1718
|
+
for (size_t i = 0; i < open_elements->length; ++i) {
|
1719
|
+
if (!node_tag_in_set(open_elements->data[i], &tags))
|
1720
|
+
return true;
|
1721
|
+
}
|
1722
|
+
return false;
|
1723
|
+
}
|
1724
|
+
|
1716
1725
|
// This factors out the clauses relating to "act as if an end tag token with tag
|
1717
1726
|
// name "table" had been seen. Returns true if there's a table element in table
|
1718
1727
|
// scope which was successfully closed, false if not and the token should be
|
@@ -1756,13 +1765,15 @@ static bool close_table_cell (
|
|
1756
1765
|
// https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
|
1757
1766
|
// This holds the logic to determine whether we should close a <td> or a <th>.
|
1758
1767
|
static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
|
1768
|
+
GumboTag cell_tag;
|
1759
1769
|
if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
|
1760
1770
|
assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
|
1761
|
-
|
1771
|
+
cell_tag = GUMBO_TAG_TD;
|
1762
1772
|
} else {
|
1763
1773
|
assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
|
1764
|
-
|
1774
|
+
cell_tag = GUMBO_TAG_TH;
|
1765
1775
|
}
|
1776
|
+
return close_table_cell(parser, token, cell_tag);
|
1766
1777
|
}
|
1767
1778
|
|
1768
1779
|
// This factors out the "act as if an end tag of tag name 'select' had been
|
@@ -1862,13 +1873,13 @@ static bool maybe_implicitly_close_p_tag (
|
|
1862
1873
|
|
1863
1874
|
// Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
|
1864
1875
|
// tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
|
1865
|
-
static
|
1876
|
+
static bool maybe_implicitly_close_list_tag (
|
1866
1877
|
GumboParser* parser,
|
1867
1878
|
GumboToken* token,
|
1868
1879
|
bool is_li
|
1869
1880
|
) {
|
1870
1881
|
GumboParserState* state = parser->_parser_state;
|
1871
|
-
|
1882
|
+
set_frameset_not_ok(parser);
|
1872
1883
|
for (int i = state->_open_elements.length; --i >= 0;) {
|
1873
1884
|
const GumboNode* node = state->_open_elements.data[i];
|
1874
1885
|
bool is_list_tag = is_li
|
@@ -1876,21 +1887,21 @@ static void maybe_implicitly_close_list_tag (
|
|
1876
1887
|
: node_tag_in_set(node, &dd_dt_tags)
|
1877
1888
|
;
|
1878
1889
|
if (is_list_tag) {
|
1879
|
-
implicitly_close_tags (
|
1890
|
+
return implicitly_close_tags (
|
1880
1891
|
parser,
|
1881
1892
|
token,
|
1882
1893
|
node->v.element.tag_namespace,
|
1883
1894
|
node->v.element.tag
|
1884
1895
|
);
|
1885
|
-
return;
|
1886
1896
|
}
|
1887
1897
|
if (
|
1888
1898
|
is_special_node(node)
|
1889
1899
|
&& !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
|
1890
1900
|
) {
|
1891
|
-
return;
|
1901
|
+
return true;
|
1892
1902
|
}
|
1893
1903
|
}
|
1904
|
+
return true;
|
1894
1905
|
}
|
1895
1906
|
|
1896
1907
|
static void merge_attributes (
|
@@ -2009,36 +2020,17 @@ static void adjust_mathml_attributes(GumboToken* token) {
|
|
2009
2020
|
attr->name = gumbo_strdup("definitionURL");
|
2010
2021
|
}
|
2011
2022
|
|
2012
|
-
static bool doctype_matches (
|
2013
|
-
const GumboTokenDocType* doctype,
|
2014
|
-
const GumboStringPiece* public_id,
|
2015
|
-
const GumboStringPiece* system_id,
|
2016
|
-
bool allow_missing_system_id
|
2017
|
-
) {
|
2018
|
-
return
|
2019
|
-
!strcmp(doctype->public_identifier, public_id->data)
|
2020
|
-
&& (allow_missing_system_id || doctype->has_system_identifier)
|
2021
|
-
&& !strcmp(doctype->system_identifier, system_id->data);
|
2022
|
-
}
|
2023
|
-
|
2024
2023
|
static bool maybe_add_doctype_error (
|
2025
2024
|
GumboParser* parser,
|
2026
2025
|
const GumboToken* token
|
2027
2026
|
) {
|
2028
2027
|
const GumboTokenDocType* doctype = &token->v.doc_type;
|
2029
|
-
|
2030
|
-
|
2031
|
-
|
2032
|
-
|
2033
|
-
|
2034
|
-
|
2035
|
-
&kSystemIdRecHtml4_0, true) ||
|
2036
|
-
doctype_matches(doctype, &kPublicIdHtml4_01,
|
2037
|
-
&kSystemIdHtml4, true) ||
|
2038
|
-
doctype_matches(doctype, &kPublicIdXhtml1_0,
|
2039
|
-
&kSystemIdXhtmlStrict1_1, false) ||
|
2040
|
-
doctype_matches(doctype, &kPublicIdXhtml1_1,
|
2041
|
-
&kSystemIdXhtml1_1, false)))) {
|
2028
|
+
if (
|
2029
|
+
strcmp(doctype->name, "html")
|
2030
|
+
|| doctype->has_public_identifier
|
2031
|
+
|| (doctype->has_system_identifier
|
2032
|
+
&& strcmp(doctype->system_identifier, "about:legacy-compat"))
|
2033
|
+
) {
|
2042
2034
|
parser_add_parse_error(parser, token);
|
2043
2035
|
return false;
|
2044
2036
|
}
|
@@ -2069,6 +2061,8 @@ static void remove_from_parent(GumboNode* node) {
|
|
2069
2061
|
|
2070
2062
|
// https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
|
2071
2063
|
// Also described in the "in body" handling for end formatting tags.
|
2064
|
+
// Returns true if the algorithm handled the token and false to indicate that
|
2065
|
+
// it should be handled according to "any other end tag."
|
2072
2066
|
static bool adoption_agency_algorithm (
|
2073
2067
|
GumboParser* parser,
|
2074
2068
|
GumboToken* token,
|
@@ -2076,7 +2070,7 @@ static bool adoption_agency_algorithm (
|
|
2076
2070
|
) {
|
2077
2071
|
GumboParserState* state = parser->_parser_state;
|
2078
2072
|
gumbo_debug("Entering adoption agency algorithm.\n");
|
2079
|
-
// Step
|
2073
|
+
// Step 2.
|
2080
2074
|
GumboNode* current_node = get_current_node(parser);
|
2081
2075
|
if (
|
2082
2076
|
current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
|
@@ -2087,18 +2081,19 @@ static bool adoption_agency_algorithm (
|
|
2087
2081
|
)
|
2088
2082
|
) {
|
2089
2083
|
pop_current_node(parser);
|
2090
|
-
return
|
2084
|
+
return true;
|
2091
2085
|
}
|
2092
|
-
// Steps
|
2086
|
+
// Steps 3-5 & 21:
|
2093
2087
|
for (unsigned int i = 0; i < 8; ++i) {
|
2094
|
-
// Step
|
2088
|
+
// Step 6.
|
2095
2089
|
GumboNode* formatting_node = NULL;
|
2096
2090
|
int formatting_node_in_open_elements = -1;
|
2097
2091
|
for (int j = state->_active_formatting_elements.length; --j >= 0;) {
|
2098
2092
|
GumboNode* current_node = state->_active_formatting_elements.data[j];
|
2099
2093
|
if (current_node == &kActiveFormattingScopeMarker) {
|
2100
2094
|
gumbo_debug("Broke on scope marker; aborting.\n");
|
2101
|
-
// Last scope marker; abort the algorithm
|
2095
|
+
// Last scope marker; abort the algorithm and handle according to "any
|
2096
|
+
// other end tag."
|
2102
2097
|
return false;
|
2103
2098
|
}
|
2104
2099
|
if (node_html_tag_is(current_node, subject)) {
|
@@ -2124,7 +2119,7 @@ static bool adoption_agency_algorithm (
|
|
2124
2119
|
return false;
|
2125
2120
|
}
|
2126
2121
|
|
2127
|
-
// Step
|
2122
|
+
// Step 7
|
2128
2123
|
if (formatting_node_in_open_elements == -1) {
|
2129
2124
|
gumbo_debug("Formatting node not on stack of open elements.\n");
|
2130
2125
|
parser_add_parse_error(parser, token);
|
@@ -2132,17 +2127,17 @@ static bool adoption_agency_algorithm (
|
|
2132
2127
|
formatting_node,
|
2133
2128
|
&state->_active_formatting_elements
|
2134
2129
|
);
|
2135
|
-
return
|
2130
|
+
return true;
|
2136
2131
|
}
|
2137
2132
|
|
2138
|
-
// Step
|
2133
|
+
// Step 8
|
2139
2134
|
if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
|
2140
2135
|
parser_add_parse_error(parser, token);
|
2141
2136
|
gumbo_debug("Element not in scope.\n");
|
2142
|
-
return
|
2137
|
+
return true;
|
2143
2138
|
}
|
2144
2139
|
|
2145
|
-
// Step
|
2140
|
+
// Step 9
|
2146
2141
|
if (formatting_node != get_current_node(parser)) {
|
2147
2142
|
parser_add_parse_error(parser, token); // But continue onwards.
|
2148
2143
|
}
|
@@ -2150,7 +2145,7 @@ static bool adoption_agency_algorithm (
|
|
2150
2145
|
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
|
2151
2146
|
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
|
2152
2147
|
|
2153
|
-
// Step
|
2148
|
+
// Step 10
|
2154
2149
|
GumboNode* furthest_block = NULL;
|
2155
2150
|
for (
|
2156
2151
|
unsigned int j = formatting_node_in_open_elements;
|
@@ -2160,32 +2155,27 @@ static bool adoption_agency_algorithm (
|
|
2160
2155
|
assert(j > 0);
|
2161
2156
|
GumboNode* current = state->_open_elements.data[j];
|
2162
2157
|
if (is_special_node(current)) {
|
2163
|
-
// Step 9.
|
2164
2158
|
furthest_block = current;
|
2165
2159
|
break;
|
2166
2160
|
}
|
2167
2161
|
}
|
2162
|
+
// Step 11.
|
2168
2163
|
if (!furthest_block) {
|
2169
|
-
|
2170
|
-
|
2171
|
-
pop_current_node(parser);
|
2172
|
-
}
|
2173
|
-
// And the formatting element itself.
|
2174
|
-
pop_current_node(parser);
|
2164
|
+
while (pop_current_node(parser) != formatting_node)
|
2165
|
+
;
|
2175
2166
|
gumbo_vector_remove (
|
2176
2167
|
formatting_node,
|
2177
2168
|
&state->_active_formatting_elements
|
2178
2169
|
);
|
2179
|
-
return
|
2170
|
+
return true;
|
2180
2171
|
}
|
2181
2172
|
assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
|
2182
|
-
assert(furthest_block);
|
2183
2173
|
|
2184
|
-
// Step
|
2174
|
+
// Step 12.
|
2185
2175
|
// Elements may be moved and reparented by this algorithm, so
|
2186
2176
|
// common_ancestor is not necessarily the same as formatting_node->parent.
|
2187
2177
|
GumboNode* common_ancestor = state->_open_elements.data [
|
2188
|
-
|
2178
|
+
formatting_node_in_open_elements - 1
|
2189
2179
|
];
|
2190
2180
|
gumbo_debug (
|
2191
2181
|
"Common ancestor tag = %s, furthest block tag = %s.\n",
|
@@ -2193,24 +2183,24 @@ static bool adoption_agency_algorithm (
|
|
2193
2183
|
gumbo_normalized_tagname(furthest_block->v.element.tag)
|
2194
2184
|
);
|
2195
2185
|
|
2196
|
-
// Step
|
2186
|
+
// Step 13.
|
2197
2187
|
int bookmark = 1 + gumbo_vector_index_of (
|
2198
2188
|
&state->_active_formatting_elements,
|
2199
2189
|
formatting_node
|
2200
2190
|
);
|
2201
2191
|
gumbo_debug("Bookmark at %d.\n", bookmark);
|
2202
|
-
// Step
|
2192
|
+
// Step 14.
|
2203
2193
|
GumboNode* node = furthest_block;
|
2204
2194
|
GumboNode* last_node = furthest_block;
|
2205
2195
|
// Must be stored explicitly, in case node is removed from the stack of open
|
2206
|
-
// elements, to handle step
|
2196
|
+
// elements, to handle step 14.3.
|
2207
2197
|
int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
2208
2198
|
assert(saved_node_index > 0);
|
2209
|
-
// Step
|
2199
|
+
// Step 14.1.
|
2210
2200
|
for (int j = 0;;) {
|
2211
|
-
// Step
|
2201
|
+
// Step 14.2.
|
2212
2202
|
++j;
|
2213
|
-
// Step
|
2203
|
+
// Step 14.3.
|
2214
2204
|
int node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
2215
2205
|
gumbo_debug (
|
2216
2206
|
"Current index: %d, last index: %d.\n",
|
@@ -2225,16 +2215,16 @@ static bool adoption_agency_algorithm (
|
|
2225
2215
|
assert((unsigned int) node_index < state->_open_elements.capacity);
|
2226
2216
|
node = state->_open_elements.data[node_index];
|
2227
2217
|
assert(node->parent);
|
2218
|
+
// Step 14.4.
|
2228
2219
|
if (node == formatting_node) {
|
2229
|
-
// Step 13.4.
|
2230
2220
|
break;
|
2231
2221
|
}
|
2232
2222
|
int formatting_index = gumbo_vector_index_of (
|
2233
2223
|
&state->_active_formatting_elements,
|
2234
2224
|
node
|
2235
2225
|
);
|
2226
|
+
// Step 14.5.
|
2236
2227
|
if (j > 3 && formatting_index != -1) {
|
2237
|
-
// Step 13.5.
|
2238
2228
|
gumbo_debug("Removing formatting element at %d.\n", formatting_index);
|
2239
2229
|
gumbo_vector_remove_at (
|
2240
2230
|
formatting_index,
|
@@ -2249,11 +2239,11 @@ static bool adoption_agency_algorithm (
|
|
2249
2239
|
continue;
|
2250
2240
|
}
|
2251
2241
|
if (formatting_index == -1) {
|
2252
|
-
// Step
|
2242
|
+
// Step 14.6.
|
2253
2243
|
gumbo_vector_remove_at(node_index, &state->_open_elements);
|
2254
2244
|
continue;
|
2255
2245
|
}
|
2256
|
-
// Step
|
2246
|
+
// Step 14.7.
|
2257
2247
|
// "common ancestor as the intended parent" doesn't actually mean insert
|
2258
2248
|
// it into the common ancestor; that happens below.
|
2259
2249
|
node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
|
@@ -2261,21 +2251,21 @@ static bool adoption_agency_algorithm (
|
|
2261
2251
|
state->_active_formatting_elements.data[formatting_index] = node;
|
2262
2252
|
assert(node_index >= 0);
|
2263
2253
|
state->_open_elements.data[node_index] = node;
|
2264
|
-
// Step
|
2254
|
+
// Step 14.8.
|
2265
2255
|
if (last_node == furthest_block) {
|
2266
2256
|
bookmark = formatting_index + 1;
|
2267
2257
|
gumbo_debug("Bookmark moved to %d.\n", bookmark);
|
2268
2258
|
assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
|
2269
2259
|
}
|
2270
|
-
// Step
|
2260
|
+
// Step 14.9.
|
2271
2261
|
last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
|
2272
2262
|
remove_from_parent(last_node);
|
2273
2263
|
append_node(node, last_node);
|
2274
|
-
// Step
|
2264
|
+
// Step 14.10.
|
2275
2265
|
last_node = node;
|
2276
|
-
} // Step
|
2266
|
+
} // Step 14.11.
|
2277
2267
|
|
2278
|
-
// Step
|
2268
|
+
// Step 15.
|
2279
2269
|
gumbo_debug (
|
2280
2270
|
"Removing %s node from parent ",
|
2281
2271
|
gumbo_normalized_tagname(last_node->v.element.tag)
|
@@ -2292,14 +2282,14 @@ static bool adoption_agency_algorithm (
|
|
2292
2282
|
);
|
2293
2283
|
insert_node(last_node, location);
|
2294
2284
|
|
2295
|
-
// Step
|
2285
|
+
// Step 16.
|
2296
2286
|
GumboNode* new_formatting_node = clone_node (
|
2297
2287
|
formatting_node,
|
2298
2288
|
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
|
2299
2289
|
);
|
2300
2290
|
formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
|
2301
2291
|
|
2302
|
-
// Step
|
2292
|
+
// Step 17. Instead of appending nodes one-by-one, we swap the children
|
2303
2293
|
// vector of furthest_block with the empty children of new_formatting_node,
|
2304
2294
|
// reducing memory traffic and allocations. We still have to reset their
|
2305
2295
|
// parent pointers, though.
|
@@ -2313,10 +2303,10 @@ static bool adoption_agency_algorithm (
|
|
2313
2303
|
child->parent = new_formatting_node;
|
2314
2304
|
}
|
2315
2305
|
|
2316
|
-
// Step
|
2306
|
+
// Step 18.
|
2317
2307
|
append_node(furthest_block, new_formatting_node);
|
2318
2308
|
|
2319
|
-
// Step
|
2309
|
+
// Step 19.
|
2320
2310
|
// If the formatting node was before the bookmark, it may shift over all
|
2321
2311
|
// indices after it, so we need to explicitly find the index and possibly
|
2322
2312
|
// adjust the bookmark.
|
@@ -2344,7 +2334,7 @@ static bool adoption_agency_algorithm (
|
|
2344
2334
|
&state->_active_formatting_elements
|
2345
2335
|
);
|
2346
2336
|
|
2347
|
-
// Step
|
2337
|
+
// Step 20.
|
2348
2338
|
gumbo_vector_remove(formatting_node, &state->_open_elements);
|
2349
2339
|
int insert_at = 1 + gumbo_vector_index_of (
|
2350
2340
|
&state->_open_elements,
|
@@ -2357,7 +2347,7 @@ static bool adoption_agency_algorithm (
|
|
2357
2347
|
insert_at,
|
2358
2348
|
&state->_open_elements
|
2359
2349
|
);
|
2360
|
-
} // Step
|
2350
|
+
} // Step 21.
|
2361
2351
|
return true;
|
2362
2352
|
}
|
2363
2353
|
|
@@ -2406,10 +2396,12 @@ static bool handle_initial(GumboParser* parser, GumboToken* token) {
|
|
2406
2396
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2407
2397
|
ignore_token(parser);
|
2408
2398
|
return true;
|
2409
|
-
}
|
2399
|
+
}
|
2400
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2410
2401
|
append_comment_node(parser, get_document_node(parser), token);
|
2411
2402
|
return true;
|
2412
|
-
}
|
2403
|
+
}
|
2404
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2413
2405
|
document->has_doctype = true;
|
2414
2406
|
document->name = token->v.doc_type.name;
|
2415
2407
|
document->public_identifier = token->v.doc_type.public_identifier;
|
@@ -2431,95 +2423,108 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
|
|
2431
2423
|
parser_add_parse_error(parser, token);
|
2432
2424
|
ignore_token(parser);
|
2433
2425
|
return false;
|
2434
|
-
}
|
2426
|
+
}
|
2427
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2435
2428
|
append_comment_node(parser, get_document_node(parser), token);
|
2436
2429
|
return true;
|
2437
|
-
}
|
2430
|
+
}
|
2431
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2438
2432
|
ignore_token(parser);
|
2439
2433
|
return true;
|
2440
|
-
}
|
2434
|
+
}
|
2435
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2441
2436
|
GumboNode* html_node = insert_element_from_token(parser, token);
|
2442
2437
|
parser->_output->root = html_node;
|
2443
2438
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2444
2439
|
return true;
|
2445
|
-
}
|
2440
|
+
}
|
2441
|
+
if (
|
2446
2442
|
token->type == GUMBO_TOKEN_END_TAG
|
2447
2443
|
&& !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
|
2448
2444
|
) {
|
2449
2445
|
parser_add_parse_error(parser, token);
|
2450
2446
|
ignore_token(parser);
|
2451
2447
|
return false;
|
2452
|
-
} else {
|
2453
|
-
GumboNode* html_node = insert_element_of_tag_type (
|
2454
|
-
parser,
|
2455
|
-
GUMBO_TAG_HTML,
|
2456
|
-
GUMBO_INSERTION_IMPLIED
|
2457
|
-
);
|
2458
|
-
assert(html_node);
|
2459
|
-
parser->_output->root = html_node;
|
2460
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2461
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2462
|
-
return true;
|
2463
2448
|
}
|
2449
|
+
GumboNode* html_node = insert_element_of_tag_type (
|
2450
|
+
parser,
|
2451
|
+
GUMBO_TAG_HTML,
|
2452
|
+
GUMBO_INSERTION_IMPLIED
|
2453
|
+
);
|
2454
|
+
assert(html_node);
|
2455
|
+
parser->_output->root = html_node;
|
2456
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2457
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2458
|
+
return true;
|
2464
2459
|
}
|
2465
2460
|
|
2461
|
+
// Forward declarations because of mutual dependencies.
|
2462
|
+
static bool handle_token(GumboParser* parser, GumboToken* token);
|
2463
|
+
static bool handle_in_body(GumboParser* parser, GumboToken* token);
|
2464
|
+
static bool handle_in_template(GumboParser* parser, GumboToken* token);
|
2465
|
+
|
2466
2466
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
|
2467
2467
|
static bool handle_before_head(GumboParser* parser, GumboToken* token) {
|
2468
|
-
if (token->type ==
|
2469
|
-
parser_add_parse_error(parser, token);
|
2468
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2470
2469
|
ignore_token(parser);
|
2471
|
-
return
|
2472
|
-
}
|
2470
|
+
return true;
|
2471
|
+
}
|
2472
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2473
2473
|
append_comment_node(parser, get_current_node(parser), token);
|
2474
2474
|
return true;
|
2475
|
-
}
|
2475
|
+
}
|
2476
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2477
|
+
parser_add_parse_error(parser, token);
|
2476
2478
|
ignore_token(parser);
|
2477
|
-
return
|
2478
|
-
}
|
2479
|
+
return false;
|
2480
|
+
}
|
2481
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2482
|
+
return handle_in_body(parser, token);
|
2483
|
+
}
|
2484
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
|
2479
2485
|
GumboNode* node = insert_element_from_token(parser, token);
|
2480
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2481
2486
|
parser->_parser_state->_head_element = node;
|
2487
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2482
2488
|
return true;
|
2483
|
-
}
|
2489
|
+
}
|
2490
|
+
if (
|
2484
2491
|
token->type == GUMBO_TOKEN_END_TAG
|
2485
|
-
&& !tag_in(token,
|
2492
|
+
&& !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
|
2486
2493
|
) {
|
2487
2494
|
parser_add_parse_error(parser, token);
|
2488
2495
|
ignore_token(parser);
|
2489
2496
|
return false;
|
2490
|
-
} else {
|
2491
|
-
GumboNode* node = insert_element_of_tag_type (
|
2492
|
-
parser,
|
2493
|
-
GUMBO_TAG_HEAD,
|
2494
|
-
GUMBO_INSERTION_IMPLIED
|
2495
|
-
);
|
2496
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2497
|
-
parser->_parser_state->_head_element = node;
|
2498
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2499
|
-
return true;
|
2500
2497
|
}
|
2498
|
+
GumboNode* node = insert_element_of_tag_type (
|
2499
|
+
parser,
|
2500
|
+
GUMBO_TAG_HEAD,
|
2501
|
+
GUMBO_INSERTION_IMPLIED
|
2502
|
+
);
|
2503
|
+
parser->_parser_state->_head_element = node;
|
2504
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2505
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2506
|
+
return true;
|
2501
2507
|
}
|
2502
2508
|
|
2503
|
-
// Forward declarations because of mutual dependencies.
|
2504
|
-
static bool handle_token(GumboParser* parser, GumboToken* token);
|
2505
|
-
static bool handle_in_body(GumboParser* parser, GumboToken* token);
|
2506
|
-
static bool handle_in_template(GumboParser* parser, GumboToken* token);
|
2507
|
-
|
2508
2509
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
|
2509
2510
|
static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
2510
2511
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2511
2512
|
insert_text_token(parser, token);
|
2512
2513
|
return true;
|
2513
|
-
}
|
2514
|
+
}
|
2515
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2516
|
+
append_comment_node(parser, get_current_node(parser), token);
|
2517
|
+
return true;
|
2518
|
+
}
|
2519
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2514
2520
|
parser_add_parse_error(parser, token);
|
2515
2521
|
ignore_token(parser);
|
2516
2522
|
return false;
|
2517
|
-
}
|
2518
|
-
|
2519
|
-
return true;
|
2520
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2523
|
+
}
|
2524
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2521
2525
|
return handle_in_body(parser, token);
|
2522
|
-
}
|
2526
|
+
}
|
2527
|
+
if (
|
2523
2528
|
tag_in(token, kStartTag, &(const TagSet) {
|
2524
2529
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
|
2525
2530
|
})
|
@@ -2528,7 +2533,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2528
2533
|
pop_current_node(parser);
|
2529
2534
|
acknowledge_self_closing_tag(parser);
|
2530
2535
|
return true;
|
2531
|
-
}
|
2536
|
+
}
|
2537
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
|
2532
2538
|
insert_element_from_token(parser, token);
|
2533
2539
|
pop_current_node(parser);
|
2534
2540
|
acknowledge_self_closing_tag(parser);
|
@@ -2537,42 +2543,50 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2537
2543
|
// should specifically look for that string in the document and re-encode it
|
2538
2544
|
// before passing to Gumbo.
|
2539
2545
|
return true;
|
2540
|
-
}
|
2546
|
+
}
|
2547
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
|
2541
2548
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
2542
2549
|
return true;
|
2543
|
-
}
|
2550
|
+
}
|
2551
|
+
if (
|
2544
2552
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2545
2553
|
) {
|
2546
2554
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2547
2555
|
return true;
|
2548
|
-
}
|
2556
|
+
}
|
2557
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
|
2549
2558
|
insert_element_from_token(parser, token);
|
2550
2559
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
|
2551
2560
|
return true;
|
2552
|
-
}
|
2553
|
-
|
2561
|
+
}
|
2562
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
|
2563
|
+
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
|
2554
2564
|
return true;
|
2555
|
-
}
|
2565
|
+
}
|
2566
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
|
2556
2567
|
GumboNode* head = pop_current_node(parser);
|
2557
2568
|
UNUSED_IF_NDEBUG(head);
|
2558
2569
|
assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
|
2559
2570
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2560
2571
|
return true;
|
2561
|
-
}
|
2572
|
+
}
|
2573
|
+
if (
|
2562
2574
|
tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
|
2563
2575
|
) {
|
2564
2576
|
pop_current_node(parser);
|
2565
2577
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2566
2578
|
parser->_parser_state->_reprocess_current_token = true;
|
2567
2579
|
return true;
|
2568
|
-
}
|
2580
|
+
}
|
2581
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
|
2569
2582
|
insert_element_from_token(parser, token);
|
2570
2583
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
2571
|
-
parser
|
2584
|
+
set_frameset_not_ok(parser);
|
2572
2585
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2573
2586
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2574
2587
|
return true;
|
2575
|
-
}
|
2588
|
+
}
|
2589
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2576
2590
|
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2577
2591
|
parser_add_parse_error(parser, token);
|
2578
2592
|
ignore_token(parser);
|
@@ -2590,19 +2604,18 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2590
2604
|
pop_template_insertion_mode(parser);
|
2591
2605
|
reset_insertion_mode_appropriately(parser);
|
2592
2606
|
return success;
|
2593
|
-
}
|
2607
|
+
}
|
2608
|
+
if (
|
2594
2609
|
tag_is(token, kStartTag, GUMBO_TAG_HEAD)
|
2595
2610
|
|| (token->type == GUMBO_TOKEN_END_TAG)
|
2596
2611
|
) {
|
2597
2612
|
parser_add_parse_error(parser, token);
|
2598
2613
|
ignore_token(parser);
|
2599
2614
|
return false;
|
2600
|
-
} else {
|
2601
|
-
pop_current_node(parser);
|
2602
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2603
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2604
|
-
return true;
|
2605
2615
|
}
|
2616
|
+
pop_current_node(parser);
|
2617
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2618
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2606
2619
|
return true;
|
2607
2620
|
}
|
2608
2621
|
|
@@ -2611,15 +2624,18 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2611
2624
|
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2612
2625
|
parser_add_parse_error(parser, token);
|
2613
2626
|
return false;
|
2614
|
-
}
|
2627
|
+
}
|
2628
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2615
2629
|
return handle_in_body(parser, token);
|
2616
|
-
}
|
2630
|
+
}
|
2631
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
|
2617
2632
|
const GumboNode* node = pop_current_node(parser);
|
2618
2633
|
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2619
2634
|
UNUSED_IF_NDEBUG(node);
|
2620
2635
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2621
2636
|
return true;
|
2622
|
-
}
|
2637
|
+
}
|
2638
|
+
if (
|
2623
2639
|
token->type == GUMBO_TOKEN_WHITESPACE
|
2624
2640
|
|| token->type == GUMBO_TOKEN_COMMENT
|
2625
2641
|
|| tag_in (token, kStartTag, &(const TagSet) {
|
@@ -2628,7 +2644,8 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2628
2644
|
})
|
2629
2645
|
) {
|
2630
2646
|
return handle_in_head(parser, token);
|
2631
|
-
}
|
2647
|
+
}
|
2648
|
+
if (
|
2632
2649
|
tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
|
2633
2650
|
|| (
|
2634
2651
|
token->type == GUMBO_TOKEN_END_TAG
|
@@ -2638,15 +2655,14 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2638
2655
|
parser_add_parse_error(parser, token);
|
2639
2656
|
ignore_token(parser);
|
2640
2657
|
return false;
|
2641
|
-
} else {
|
2642
|
-
parser_add_parse_error(parser, token);
|
2643
|
-
const GumboNode* node = pop_current_node(parser);
|
2644
|
-
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2645
|
-
UNUSED_IF_NDEBUG(node);
|
2646
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2647
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2648
|
-
return false;
|
2649
2658
|
}
|
2659
|
+
parser_add_parse_error(parser, token);
|
2660
|
+
const GumboNode* node = pop_current_node(parser);
|
2661
|
+
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2662
|
+
UNUSED_IF_NDEBUG(node);
|
2663
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2664
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2665
|
+
return false;
|
2650
2666
|
}
|
2651
2667
|
|
2652
2668
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
|
@@ -2655,25 +2671,31 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2655
2671
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2656
2672
|
insert_text_token(parser, token);
|
2657
2673
|
return true;
|
2658
|
-
}
|
2674
|
+
}
|
2675
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2676
|
+
append_comment_node(parser, get_current_node(parser), token);
|
2677
|
+
return true;
|
2678
|
+
}
|
2679
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2659
2680
|
parser_add_parse_error(parser, token);
|
2660
2681
|
ignore_token(parser);
|
2661
2682
|
return false;
|
2662
|
-
}
|
2663
|
-
|
2664
|
-
return true;
|
2665
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2683
|
+
}
|
2684
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2666
2685
|
return handle_in_body(parser, token);
|
2667
|
-
}
|
2686
|
+
}
|
2687
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
|
2668
2688
|
insert_element_from_token(parser, token);
|
2669
|
-
|
2689
|
+
set_frameset_not_ok(parser);
|
2670
2690
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2671
2691
|
return true;
|
2672
|
-
}
|
2692
|
+
}
|
2693
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2673
2694
|
insert_element_from_token(parser, token);
|
2674
2695
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
|
2675
2696
|
return true;
|
2676
|
-
}
|
2697
|
+
}
|
2698
|
+
if (
|
2677
2699
|
tag_in(token, kStartTag, &(const TagSet) {
|
2678
2700
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
|
2679
2701
|
TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
|
@@ -2685,12 +2707,14 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2685
2707
|
// pending character tokens that should be attached to the root.
|
2686
2708
|
maybe_flush_text_node_buffer(parser);
|
2687
2709
|
gumbo_vector_add(state->_head_element, &state->_open_elements);
|
2688
|
-
|
2710
|
+
handle_in_head(parser, token);
|
2689
2711
|
gumbo_vector_remove(state->_head_element, &state->_open_elements);
|
2690
|
-
return
|
2691
|
-
}
|
2712
|
+
return false;
|
2713
|
+
}
|
2714
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2692
2715
|
return handle_in_head(parser, token);
|
2693
|
-
}
|
2716
|
+
}
|
2717
|
+
if (
|
2694
2718
|
tag_is(token, kStartTag, GUMBO_TAG_HEAD)
|
2695
2719
|
|| (
|
2696
2720
|
token->type == GUMBO_TOKEN_END_TAG
|
@@ -2700,12 +2724,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2700
2724
|
parser_add_parse_error(parser, token);
|
2701
2725
|
ignore_token(parser);
|
2702
2726
|
return false;
|
2703
|
-
} else {
|
2704
|
-
insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
|
2705
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2706
|
-
state->_reprocess_current_token = true;
|
2707
|
-
return true;
|
2708
2727
|
}
|
2728
|
+
insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
|
2729
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2730
|
+
state->_reprocess_current_token = true;
|
2731
|
+
return true;
|
2709
2732
|
}
|
2710
2733
|
|
2711
2734
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
|
@@ -2716,11 +2739,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2716
2739
|
parser_add_parse_error(parser, token);
|
2717
2740
|
ignore_token(parser);
|
2718
2741
|
return false;
|
2719
|
-
}
|
2742
|
+
}
|
2743
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2720
2744
|
reconstruct_active_formatting_elements(parser);
|
2721
2745
|
insert_text_token(parser, token);
|
2722
2746
|
return true;
|
2723
|
-
}
|
2747
|
+
}
|
2748
|
+
if (
|
2724
2749
|
token->type == GUMBO_TOKEN_CHARACTER
|
2725
2750
|
|| token->type == GUMBO_TOKEN_CDATA
|
2726
2751
|
) {
|
@@ -2728,14 +2753,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2728
2753
|
insert_text_token(parser, token);
|
2729
2754
|
set_frameset_not_ok(parser);
|
2730
2755
|
return true;
|
2731
|
-
}
|
2756
|
+
}
|
2757
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2732
2758
|
append_comment_node(parser, get_current_node(parser), token);
|
2733
2759
|
return true;
|
2734
|
-
}
|
2760
|
+
}
|
2761
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2735
2762
|
parser_add_parse_error(parser, token);
|
2736
2763
|
ignore_token(parser);
|
2737
2764
|
return false;
|
2738
|
-
}
|
2765
|
+
}
|
2766
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2739
2767
|
parser_add_parse_error(parser, token);
|
2740
2768
|
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2741
2769
|
ignore_token(parser);
|
@@ -2745,7 +2773,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2745
2773
|
assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
|
2746
2774
|
merge_attributes(token, parser->_output->root);
|
2747
2775
|
return false;
|
2748
|
-
}
|
2776
|
+
}
|
2777
|
+
if (
|
2749
2778
|
tag_in(token, kStartTag, &(const TagSet) {
|
2750
2779
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
|
2751
2780
|
TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
|
@@ -2754,7 +2783,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2754
2783
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
2755
2784
|
) {
|
2756
2785
|
return handle_in_head(parser, token);
|
2757
|
-
}
|
2786
|
+
}
|
2787
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
|
2758
2788
|
parser_add_parse_error(parser, token);
|
2759
2789
|
if (
|
2760
2790
|
state->_open_elements.length < 2
|
@@ -2762,12 +2792,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2762
2792
|
|| has_open_element(parser, GUMBO_TAG_TEMPLATE)
|
2763
2793
|
) {
|
2764
2794
|
ignore_token(parser);
|
2765
|
-
|
2795
|
+
} else {
|
2796
|
+
set_frameset_not_ok(parser);
|
2797
|
+
merge_attributes(token, state->_open_elements.data[1]);
|
2766
2798
|
}
|
2767
|
-
state->_frameset_ok = false;
|
2768
|
-
merge_attributes(token, state->_open_elements.data[1]);
|
2769
2799
|
return false;
|
2770
|
-
}
|
2800
|
+
}
|
2801
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2771
2802
|
parser_add_parse_error(parser, token);
|
2772
2803
|
if (
|
2773
2804
|
state->_open_elements.length < 2
|
@@ -2808,64 +2839,64 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2808
2839
|
insert_element_from_token(parser, token);
|
2809
2840
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
|
2810
2841
|
return true;
|
2811
|
-
}
|
2812
|
-
|
2813
|
-
if (
|
2814
|
-
!node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
|
2815
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT),
|
2816
|
-
TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
|
2817
|
-
})
|
2818
|
-
) {
|
2819
|
-
parser_add_parse_error(parser, token);
|
2820
|
-
}
|
2821
|
-
}
|
2842
|
+
}
|
2843
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
2822
2844
|
if (get_current_template_insertion_mode(parser) !=
|
2823
2845
|
GUMBO_INSERTION_MODE_INITIAL) {
|
2824
2846
|
return handle_in_template(parser, token);
|
2825
2847
|
}
|
2848
|
+
if (stack_contains_nonclosable_element(parser)) {
|
2849
|
+
parser_add_parse_error(parser, token);
|
2850
|
+
return false;
|
2851
|
+
}
|
2826
2852
|
return true;
|
2827
|
-
}
|
2853
|
+
}
|
2854
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
|
2828
2855
|
if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
|
2829
2856
|
parser_add_parse_error(parser, token);
|
2830
2857
|
ignore_token(parser);
|
2831
2858
|
return false;
|
2832
2859
|
}
|
2833
2860
|
bool success = true;
|
2834
|
-
|
2835
|
-
|
2836
|
-
|
2837
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
|
2838
|
-
TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
|
2839
|
-
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
|
2840
|
-
})
|
2841
|
-
) {
|
2842
|
-
parser_add_parse_error(parser, token);
|
2843
|
-
success = false;
|
2844
|
-
break;
|
2845
|
-
}
|
2861
|
+
if (stack_contains_nonclosable_element(parser)) {
|
2862
|
+
parser_add_parse_error(parser, token);
|
2863
|
+
success = false;
|
2846
2864
|
}
|
2865
|
+
GumboNode* body = state->_open_elements.data[1];
|
2866
|
+
assert(node_html_tag_is(body, GUMBO_TAG_BODY));
|
2867
|
+
record_end_of_element(state->_current_token, &body->v.element);
|
2847
2868
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
|
2852
|
-
|
2853
|
-
|
2869
|
+
return success;
|
2870
|
+
}
|
2871
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
2872
|
+
if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
|
2873
|
+
parser_add_parse_error(parser, token);
|
2874
|
+
ignore_token(parser);
|
2875
|
+
return false;
|
2854
2876
|
}
|
2877
|
+
bool success = true;
|
2878
|
+
if (stack_contains_nonclosable_element(parser)) {
|
2879
|
+
parser_add_parse_error(parser, token);
|
2880
|
+
success = false;
|
2881
|
+
}
|
2882
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
|
2883
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2855
2884
|
return success;
|
2856
|
-
}
|
2885
|
+
}
|
2886
|
+
if (
|
2857
2887
|
tag_in(token, kStartTag, &(const TagSet) {
|
2858
2888
|
TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
|
2859
2889
|
TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
|
2860
2890
|
TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
|
2861
|
-
TAG(
|
2891
|
+
TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
|
2862
2892
|
TAG(SUMMARY), TAG(UL)
|
2863
2893
|
})
|
2864
2894
|
) {
|
2865
2895
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2866
2896
|
insert_element_from_token(parser, token);
|
2867
2897
|
return result;
|
2868
|
-
}
|
2898
|
+
}
|
2899
|
+
if (tag_in(token, kStartTag, &heading_tags)) {
|
2869
2900
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2870
2901
|
if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
|
2871
2902
|
parser_add_parse_error(parser, token);
|
@@ -2874,13 +2905,15 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2874
2905
|
}
|
2875
2906
|
insert_element_from_token(parser, token);
|
2876
2907
|
return result;
|
2877
|
-
}
|
2908
|
+
}
|
2909
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
|
2878
2910
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2879
2911
|
insert_element_from_token(parser, token);
|
2880
2912
|
state->_ignore_next_linefeed = true;
|
2881
|
-
|
2913
|
+
set_frameset_not_ok(parser);
|
2882
2914
|
return result;
|
2883
|
-
}
|
2915
|
+
}
|
2916
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
2884
2917
|
if (
|
2885
2918
|
state->_form_element != NULL
|
2886
2919
|
&& !has_open_element(parser, GUMBO_TAG_TEMPLATE)
|
@@ -2896,38 +2929,42 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2896
2929
|
state->_form_element = form_element;
|
2897
2930
|
}
|
2898
2931
|
return result;
|
2899
|
-
}
|
2900
|
-
|
2901
|
-
bool result =
|
2932
|
+
}
|
2933
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
|
2934
|
+
bool result = maybe_implicitly_close_list_tag(parser, token, true);
|
2935
|
+
result = maybe_implicitly_close_p_tag(parser, token) && result;
|
2902
2936
|
insert_element_from_token(parser, token);
|
2903
2937
|
return result;
|
2904
|
-
}
|
2905
|
-
|
2906
|
-
bool result =
|
2938
|
+
}
|
2939
|
+
if (tag_in(token, kStartTag, &dd_dt_tags)) {
|
2940
|
+
bool result = maybe_implicitly_close_list_tag(parser, token, false);
|
2941
|
+
result = maybe_implicitly_close_p_tag(parser, token) && result;
|
2907
2942
|
insert_element_from_token(parser, token);
|
2908
2943
|
return result;
|
2909
|
-
}
|
2944
|
+
}
|
2945
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
|
2910
2946
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2911
2947
|
insert_element_from_token(parser, token);
|
2912
2948
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
|
2913
2949
|
return result;
|
2914
|
-
}
|
2950
|
+
}
|
2951
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
|
2952
|
+
bool success = true;
|
2915
2953
|
if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
|
2916
2954
|
parser_add_parse_error(parser, token);
|
2917
|
-
|
2918
|
-
|
2919
|
-
|
2920
|
-
|
2921
|
-
|
2922
|
-
|
2923
|
-
state->_reprocess_current_token = true;
|
2924
|
-
return false;
|
2955
|
+
success = false;
|
2956
|
+
// We don't want to use implicitly_close_tags here because it may add an
|
2957
|
+
// error and we've already added the only error the standard specifies.
|
2958
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2959
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
|
2960
|
+
;
|
2925
2961
|
}
|
2926
2962
|
reconstruct_active_formatting_elements(parser);
|
2927
2963
|
insert_element_from_token(parser, token);
|
2928
|
-
|
2929
|
-
return
|
2930
|
-
}
|
2964
|
+
set_frameset_not_ok(parser);
|
2965
|
+
return success;
|
2966
|
+
}
|
2967
|
+
if (
|
2931
2968
|
tag_in(token, kEndTag, &(const TagSet) {
|
2932
2969
|
TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
|
2933
2970
|
TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
|
@@ -2942,14 +2979,14 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2942
2979
|
ignore_token(parser);
|
2943
2980
|
return false;
|
2944
2981
|
}
|
2945
|
-
implicitly_close_tags (
|
2982
|
+
return implicitly_close_tags (
|
2946
2983
|
parser,
|
2947
2984
|
token,
|
2948
2985
|
GUMBO_NAMESPACE_HTML,
|
2949
2986
|
token->v.end_tag.tag
|
2950
2987
|
);
|
2951
|
-
|
2952
|
-
|
2988
|
+
}
|
2989
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
|
2953
2990
|
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2954
2991
|
if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
|
2955
2992
|
parser_add_parse_error(parser, token);
|
@@ -2960,7 +2997,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2960
2997
|
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2961
2998
|
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
|
2962
2999
|
parser_add_parse_error(parser, token);
|
2963
|
-
|
3000
|
+
success = false;
|
2964
3001
|
}
|
2965
3002
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
|
2966
3003
|
;
|
@@ -2992,7 +3029,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2992
3029
|
gumbo_vector_remove_at(index, open_elements);
|
2993
3030
|
return result;
|
2994
3031
|
}
|
2995
|
-
}
|
3032
|
+
}
|
3033
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
|
3034
|
+
bool success = true;
|
2996
3035
|
if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
|
2997
3036
|
parser_add_parse_error(parser, token);
|
2998
3037
|
// reconstruct_active_formatting_elements(parser);
|
@@ -3001,16 +3040,16 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3001
3040
|
GUMBO_TAG_P,
|
3002
3041
|
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3003
3042
|
);
|
3004
|
-
|
3005
|
-
return false;
|
3043
|
+
success = false;
|
3006
3044
|
}
|
3007
3045
|
return implicitly_close_tags (
|
3008
3046
|
parser,
|
3009
3047
|
token,
|
3010
3048
|
GUMBO_NAMESPACE_HTML,
|
3011
3049
|
GUMBO_TAG_P
|
3012
|
-
);
|
3013
|
-
}
|
3050
|
+
) && success;
|
3051
|
+
}
|
3052
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
|
3014
3053
|
if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
|
3015
3054
|
parser_add_parse_error(parser, token);
|
3016
3055
|
ignore_token(parser);
|
@@ -3022,8 +3061,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3022
3061
|
GUMBO_NAMESPACE_HTML,
|
3023
3062
|
GUMBO_TAG_LI
|
3024
3063
|
);
|
3025
|
-
}
|
3026
|
-
|
3064
|
+
}
|
3065
|
+
if (tag_in(token, kEndTag, &dd_dt_tags)) {
|
3027
3066
|
GumboTag token_tag = token->v.end_tag.tag;
|
3028
3067
|
if (!has_an_element_in_scope(parser, token_tag)) {
|
3029
3068
|
parser_add_parse_error(parser, token);
|
@@ -3036,7 +3075,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3036
3075
|
GUMBO_NAMESPACE_HTML,
|
3037
3076
|
token_tag
|
3038
3077
|
);
|
3039
|
-
}
|
3078
|
+
}
|
3079
|
+
if (tag_in(token, kEndTag, &heading_tags)) {
|
3040
3080
|
if (
|
3041
3081
|
!has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
|
3042
3082
|
GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
|
@@ -3047,30 +3087,31 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3047
3087
|
parser_add_parse_error(parser, token);
|
3048
3088
|
ignore_token(parser);
|
3049
3089
|
return false;
|
3050
|
-
} else {
|
3051
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3052
|
-
const GumboNode* current_node = get_current_node(parser);
|
3053
|
-
bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
|
3054
|
-
if (!success) {
|
3055
|
-
// There're children of the heading currently open; close them below and
|
3056
|
-
// record a parse error.
|
3057
|
-
// TODO(jdtang): Add a way to distinguish this error case from the one
|
3058
|
-
// above.
|
3059
|
-
parser_add_parse_error(parser, token);
|
3060
|
-
}
|
3061
|
-
do {
|
3062
|
-
current_node = pop_current_node(parser);
|
3063
|
-
} while (!node_tag_in_set(current_node, &heading_tags));
|
3064
|
-
return success;
|
3065
3090
|
}
|
3066
|
-
|
3091
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3092
|
+
const GumboNode* current_node = get_current_node(parser);
|
3093
|
+
bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
|
3094
|
+
if (!success) {
|
3095
|
+
// There're children of the heading currently open; close them below and
|
3096
|
+
// record a parse error.
|
3097
|
+
// TODO(jdtang): Add a way to distinguish this error case from the one
|
3098
|
+
// above.
|
3099
|
+
parser_add_parse_error(parser, token);
|
3100
|
+
}
|
3101
|
+
do {
|
3102
|
+
current_node = pop_current_node(parser);
|
3103
|
+
} while (!node_tag_in_set(current_node, &heading_tags));
|
3104
|
+
return success;
|
3105
|
+
}
|
3106
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
|
3067
3107
|
bool success = true;
|
3068
3108
|
int last_a;
|
3069
3109
|
int has_matching_a = find_last_anchor_index(parser, &last_a);
|
3070
3110
|
if (has_matching_a) {
|
3071
3111
|
assert(has_matching_a == 1);
|
3072
3112
|
parser_add_parse_error(parser, token);
|
3073
|
-
adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
|
3113
|
+
bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
|
3114
|
+
assert(handled);
|
3074
3115
|
// The adoption agency algorithm usually removes all instances of <a>
|
3075
3116
|
// from the list of active formatting elements, but in case it doesn't,
|
3076
3117
|
// we're supposed to do this. (The conditions where it might not are
|
@@ -3087,7 +3128,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3087
3128
|
reconstruct_active_formatting_elements(parser);
|
3088
3129
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
3089
3130
|
return success;
|
3090
|
-
}
|
3131
|
+
}
|
3132
|
+
if (
|
3091
3133
|
tag_in(token, kStartTag, &(const TagSet) {
|
3092
3134
|
TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
|
3093
3135
|
TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
|
@@ -3096,27 +3138,33 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3096
3138
|
reconstruct_active_formatting_elements(parser);
|
3097
3139
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
3098
3140
|
return true;
|
3099
|
-
}
|
3141
|
+
}
|
3142
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
|
3100
3143
|
bool result = true;
|
3101
3144
|
reconstruct_active_formatting_elements(parser);
|
3102
3145
|
if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
|
3103
3146
|
result = false;
|
3104
3147
|
parser_add_parse_error(parser, token);
|
3105
|
-
adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
|
3148
|
+
bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
|
3149
|
+
assert(handled);
|
3106
3150
|
reconstruct_active_formatting_elements(parser);
|
3107
3151
|
}
|
3108
3152
|
insert_element_from_token(parser, token);
|
3109
3153
|
add_formatting_element(parser, get_current_node(parser));
|
3110
3154
|
return result;
|
3111
|
-
}
|
3155
|
+
}
|
3156
|
+
if (
|
3112
3157
|
tag_in(token, kEndTag, &(const TagSet) {
|
3113
3158
|
TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
|
3114
3159
|
TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
|
3115
3160
|
TAG(U)
|
3116
3161
|
})
|
3117
3162
|
) {
|
3118
|
-
|
3119
|
-
|
3163
|
+
if (!adoption_agency_algorithm(parser, token, token->v.end_tag.tag))
|
3164
|
+
goto any_other_end_tag;
|
3165
|
+
return true;
|
3166
|
+
}
|
3167
|
+
if (
|
3120
3168
|
tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
|
3121
3169
|
) {
|
3122
3170
|
reconstruct_active_formatting_elements(parser);
|
@@ -3124,19 +3172,21 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3124
3172
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3125
3173
|
set_frameset_not_ok(parser);
|
3126
3174
|
return true;
|
3127
|
-
}
|
3175
|
+
}
|
3176
|
+
if (
|
3128
3177
|
tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
|
3129
3178
|
) {
|
3130
3179
|
GumboTag token_tag = token->v.end_tag.tag;
|
3131
|
-
if (!
|
3180
|
+
if (!has_an_element_in_scope(parser, token_tag)) {
|
3132
3181
|
parser_add_parse_error(parser, token);
|
3133
3182
|
ignore_token(parser);
|
3134
3183
|
return false;
|
3135
3184
|
}
|
3136
|
-
implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
|
3185
|
+
bool success = implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
|
3137
3186
|
clear_active_formatting_elements(parser);
|
3138
|
-
return
|
3139
|
-
}
|
3187
|
+
return success;
|
3188
|
+
}
|
3189
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
|
3140
3190
|
if (
|
3141
3191
|
get_document_node(parser)->v.document.doc_type_quirks_mode
|
3142
3192
|
!= GUMBO_DOCTYPE_QUIRKS
|
@@ -3147,74 +3197,88 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3147
3197
|
set_frameset_not_ok(parser);
|
3148
3198
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3149
3199
|
return true;
|
3150
|
-
}
|
3200
|
+
}
|
3201
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
|
3202
|
+
parser_add_parse_error(parser, token);
|
3203
|
+
reconstruct_active_formatting_elements(parser);
|
3204
|
+
insert_element_of_tag_type (
|
3205
|
+
parser,
|
3206
|
+
GUMBO_TAG_BR,
|
3207
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3208
|
+
);
|
3209
|
+
pop_current_node(parser);
|
3210
|
+
acknowledge_self_closing_tag(parser);
|
3211
|
+
set_frameset_not_ok(parser);
|
3212
|
+
return false;
|
3213
|
+
}
|
3214
|
+
if (
|
3151
3215
|
tag_in(token, kStartTag, &(const TagSet) {
|
3152
3216
|
TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
|
3153
3217
|
TAG(WBR)
|
3154
3218
|
})
|
3155
3219
|
) {
|
3156
|
-
bool
|
3157
|
-
if (
|
3158
|
-
success = false;
|
3220
|
+
bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
|
3221
|
+
if (is_image) {
|
3159
3222
|
parser_add_parse_error(parser, token);
|
3160
3223
|
token->v.start_tag.tag = GUMBO_TAG_IMG;
|
3161
3224
|
}
|
3162
3225
|
reconstruct_active_formatting_elements(parser);
|
3163
3226
|
GumboNode* node = insert_element_from_token(parser, token);
|
3164
|
-
if (
|
3165
|
-
success = false;
|
3166
|
-
parser_add_parse_error(parser, token);
|
3167
|
-
node->v.element.tag = GUMBO_TAG_IMG;
|
3227
|
+
if (is_image)
|
3168
3228
|
node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
|
3169
|
-
}
|
3170
3229
|
pop_current_node(parser);
|
3171
3230
|
acknowledge_self_closing_tag(parser);
|
3172
3231
|
set_frameset_not_ok(parser);
|
3173
|
-
return
|
3174
|
-
}
|
3175
|
-
|
3176
|
-
// Must be before the element is inserted, as that takes ownership of the
|
3177
|
-
// token's attribute vector.
|
3178
|
-
set_frameset_not_ok(parser);
|
3179
|
-
}
|
3232
|
+
return !is_image;
|
3233
|
+
}
|
3234
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
|
3180
3235
|
reconstruct_active_formatting_elements(parser);
|
3181
|
-
insert_element_from_token(parser, token);
|
3236
|
+
GumboNode *input = insert_element_from_token(parser, token);
|
3182
3237
|
pop_current_node(parser);
|
3183
3238
|
acknowledge_self_closing_tag(parser);
|
3239
|
+
if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
|
3240
|
+
set_frameset_not_ok(parser);
|
3184
3241
|
return true;
|
3185
|
-
}
|
3242
|
+
}
|
3243
|
+
if (
|
3186
3244
|
tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
|
3187
3245
|
) {
|
3188
3246
|
insert_element_from_token(parser, token);
|
3189
3247
|
pop_current_node(parser);
|
3190
3248
|
acknowledge_self_closing_tag(parser);
|
3191
3249
|
return true;
|
3192
|
-
}
|
3250
|
+
}
|
3251
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
|
3193
3252
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
3194
3253
|
insert_element_from_token(parser, token);
|
3195
3254
|
pop_current_node(parser);
|
3196
3255
|
acknowledge_self_closing_tag(parser);
|
3197
3256
|
set_frameset_not_ok(parser);
|
3198
3257
|
return result;
|
3199
|
-
}
|
3258
|
+
}
|
3259
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
|
3200
3260
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
3201
3261
|
parser->_parser_state->_ignore_next_linefeed = true;
|
3202
3262
|
set_frameset_not_ok(parser);
|
3203
3263
|
return true;
|
3204
|
-
}
|
3264
|
+
}
|
3265
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
|
3205
3266
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
3206
3267
|
reconstruct_active_formatting_elements(parser);
|
3207
3268
|
set_frameset_not_ok(parser);
|
3208
3269
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3209
3270
|
return result;
|
3210
|
-
}
|
3271
|
+
}
|
3272
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
|
3211
3273
|
set_frameset_not_ok(parser);
|
3212
3274
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3213
3275
|
return true;
|
3214
|
-
}
|
3276
|
+
}
|
3277
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
|
3215
3278
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3216
3279
|
return true;
|
3217
|
-
}
|
3280
|
+
}
|
3281
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
|
3218
3282
|
reconstruct_active_formatting_elements(parser);
|
3219
3283
|
insert_element_from_token(parser, token);
|
3220
3284
|
set_frameset_not_ok(parser);
|
@@ -3231,8 +3295,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3231
3295
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
|
3232
3296
|
}
|
3233
3297
|
return true;
|
3234
|
-
}
|
3235
|
-
|
3298
|
+
}
|
3299
|
+
if (
|
3300
|
+
tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
|
3236
3301
|
) {
|
3237
3302
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3238
3303
|
pop_current_node(parser);
|
@@ -3240,40 +3305,34 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3240
3305
|
reconstruct_active_formatting_elements(parser);
|
3241
3306
|
insert_element_from_token(parser, token);
|
3242
3307
|
return true;
|
3243
|
-
}
|
3244
|
-
|
3245
|
-
) {
|
3308
|
+
}
|
3309
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
|
3246
3310
|
bool success = true;
|
3247
|
-
GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)})
|
3248
|
-
? GUMBO_TAG_RTC
|
3249
|
-
: GUMBO_TAG_LAST
|
3250
|
-
;
|
3251
3311
|
if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
|
3252
|
-
generate_implied_end_tags(parser,
|
3312
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3313
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
|
3314
|
+
parser_add_parse_error(parser, token);
|
3315
|
+
success = false;
|
3316
|
+
}
|
3253
3317
|
}
|
3254
|
-
|
3255
|
-
|
3256
|
-
|
3257
|
-
|
3258
|
-
|
3259
|
-
|
3260
|
-
|
3261
|
-
|
3262
|
-
|
3318
|
+
insert_element_from_token(parser, token);
|
3319
|
+
return success;
|
3320
|
+
}
|
3321
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
|
3322
|
+
bool success = true;
|
3323
|
+
if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
|
3324
|
+
generate_implied_end_tags(parser, GUMBO_TAG_RTC);
|
3325
|
+
GumboNode* current = get_current_node(parser);
|
3326
|
+
if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
|
3327
|
+
!node_html_tag_is(current, GUMBO_TAG_RTC)) {
|
3328
|
+
parser_add_parse_error(parser, token);
|
3329
|
+
success = false;
|
3330
|
+
}
|
3263
3331
|
}
|
3264
3332
|
insert_element_from_token(parser, token);
|
3265
3333
|
return success;
|
3266
|
-
}
|
3267
|
-
|
3268
|
-
reconstruct_active_formatting_elements(parser);
|
3269
|
-
insert_element_of_tag_type (
|
3270
|
-
parser,
|
3271
|
-
GUMBO_TAG_BR,
|
3272
|
-
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3273
|
-
);
|
3274
|
-
pop_current_node(parser);
|
3275
|
-
return false;
|
3276
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
|
3334
|
+
}
|
3335
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
|
3277
3336
|
reconstruct_active_formatting_elements(parser);
|
3278
3337
|
adjust_mathml_attributes(token);
|
3279
3338
|
adjust_foreign_attributes(token);
|
@@ -3283,7 +3342,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3283
3342
|
acknowledge_self_closing_tag(parser);
|
3284
3343
|
}
|
3285
3344
|
return true;
|
3286
|
-
}
|
3345
|
+
}
|
3346
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
|
3287
3347
|
reconstruct_active_formatting_elements(parser);
|
3288
3348
|
adjust_svg_attributes(token);
|
3289
3349
|
adjust_foreign_attributes(token);
|
@@ -3293,7 +3353,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3293
3353
|
acknowledge_self_closing_tag(parser);
|
3294
3354
|
}
|
3295
3355
|
return true;
|
3296
|
-
}
|
3356
|
+
}
|
3357
|
+
if (
|
3297
3358
|
tag_in(token, kStartTag, &(const TagSet) {
|
3298
3359
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
|
3299
3360
|
TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3302,48 +3363,49 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3302
3363
|
parser_add_parse_error(parser, token);
|
3303
3364
|
ignore_token(parser);
|
3304
3365
|
return false;
|
3305
|
-
}
|
3366
|
+
}
|
3367
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
3306
3368
|
reconstruct_active_formatting_elements(parser);
|
3307
3369
|
insert_element_from_token(parser, token);
|
3308
3370
|
return true;
|
3309
|
-
}
|
3310
|
-
|
3311
|
-
|
3312
|
-
|
3313
|
-
|
3314
|
-
|
3315
|
-
|
3316
|
-
|
3317
|
-
|
3318
|
-
|
3319
|
-
|
3320
|
-
|
3321
|
-
|
3322
|
-
|
3323
|
-
|
3324
|
-
|
3325
|
-
|
3326
|
-
|
3327
|
-
|
3328
|
-
|
3329
|
-
|
3330
|
-
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
while (node != pop_current_node(parser))
|
3335
|
-
; // Pop everything.
|
3336
|
-
return true;
|
3337
|
-
} else if (is_special_node(node)) {
|
3371
|
+
}
|
3372
|
+
any_other_end_tag:
|
3373
|
+
assert(token->type == GUMBO_TOKEN_END_TAG);
|
3374
|
+
GumboTag end_tag = token->v.end_tag.tag;
|
3375
|
+
const char *end_tagname = token->v.end_tag.name;
|
3376
|
+
assert(state->_open_elements.length > 0);
|
3377
|
+
assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
|
3378
|
+
// Walk up the stack of open elements until we find one that either:
|
3379
|
+
// a) Matches the tag name we saw
|
3380
|
+
// b) Is in the "special" category.
|
3381
|
+
// If we see a), implicitly close everything up to and including it. If we
|
3382
|
+
// see b), then record a parse error, don't close anything (except the
|
3383
|
+
// implied end tags) and ignore the end tag token.
|
3384
|
+
for (int i = state->_open_elements.length; --i >= 0;) {
|
3385
|
+
const GumboNode* node = state->_open_elements.data[i];
|
3386
|
+
if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
|
3387
|
+
generate_implied_end_tags(parser, end_tag);
|
3388
|
+
// TODO(jdtang): Do I need to add a parse error here? The condition in
|
3389
|
+
// the spec seems like it's the inverse of the loop condition above, and
|
3390
|
+
// so would never fire.
|
3391
|
+
// sfc: Yes, an error is needed here.
|
3392
|
+
// <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
|
3393
|
+
// foo is the "current node" but sarcasm is node.
|
3394
|
+
// XXX: Write a test for this.
|
3395
|
+
if (node != get_current_node(parser))
|
3338
3396
|
parser_add_parse_error(parser, token);
|
3339
|
-
|
3340
|
-
|
3341
|
-
|
3397
|
+
while (node != pop_current_node(parser))
|
3398
|
+
; // Pop everything.
|
3399
|
+
return true;
|
3400
|
+
} else if (is_special_node(node)) {
|
3401
|
+
parser_add_parse_error(parser, token);
|
3402
|
+
ignore_token(parser);
|
3403
|
+
return false;
|
3342
3404
|
}
|
3343
|
-
// <html> is in the special category, so we should never get here.
|
3344
|
-
assert(0);
|
3345
|
-
return false;
|
3346
3405
|
}
|
3406
|
+
// <html> is in the special category, so we should never get here.
|
3407
|
+
assert(0);
|
3408
|
+
return false;
|
3347
3409
|
}
|
3348
3410
|
|
3349
3411
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
|
@@ -3353,30 +3415,36 @@ static bool handle_text(GumboParser* parser, GumboToken* token) {
|
|
3353
3415
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3354
3416
|
) {
|
3355
3417
|
insert_text_token(parser, token);
|
3356
|
-
|
3357
|
-
// We provide only bare-bones script handling that doesn't involve any of
|
3358
|
-
// the parser-pause/already-started/script-nesting flags or re-entrant
|
3359
|
-
// invocations of the tokenizer. Because the intended usage of this library
|
3360
|
-
// is mostly for templating, refactoring, and static-analysis libraries, we
|
3361
|
-
// provide the script body as a text-node child of the <script> element.
|
3362
|
-
// This behavior doesn't support document.write of partial HTML elements,
|
3363
|
-
// but should be adequate for almost all other scripting support.
|
3364
|
-
if (token->type == GUMBO_TOKEN_EOF) {
|
3365
|
-
parser_add_parse_error(parser, token);
|
3366
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3367
|
-
}
|
3368
|
-
pop_current_node(parser);
|
3369
|
-
set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
|
3418
|
+
return true;
|
3370
3419
|
}
|
3371
|
-
|
3420
|
+
// We provide only bare-bones script handling that doesn't involve any of
|
3421
|
+
// the parser-pause/already-started/script-nesting flags or re-entrant
|
3422
|
+
// invocations of the tokenizer. Because the intended usage of this library
|
3423
|
+
// is mostly for templating, refactoring, and static-analysis libraries, we
|
3424
|
+
// provide the script body as a text-node child of the <script> element.
|
3425
|
+
// This behavior doesn't support document.write of partial HTML elements,
|
3426
|
+
// but should be adequate for almost all other scripting support.
|
3427
|
+
bool success = true;
|
3428
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3429
|
+
parser_add_parse_error(parser, token);
|
3430
|
+
success = false;
|
3431
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3432
|
+
}
|
3433
|
+
pop_current_node(parser);
|
3434
|
+
set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
|
3435
|
+
return success;
|
3372
3436
|
}
|
3373
3437
|
|
3374
3438
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
|
3375
3439
|
static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
3376
3440
|
GumboParserState* state = parser->_parser_state;
|
3377
3441
|
if (
|
3378
|
-
token->type == GUMBO_TOKEN_CHARACTER
|
3379
|
-
|
3442
|
+
(token->type == GUMBO_TOKEN_CHARACTER
|
3443
|
+
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3444
|
+
|| token->type == GUMBO_TOKEN_NULL)
|
3445
|
+
&& node_tag_in_set(get_current_node(parser), &(const TagSet) {
|
3446
|
+
TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3447
|
+
})
|
3380
3448
|
) {
|
3381
3449
|
// The "pending table character tokens" list described in the spec is
|
3382
3450
|
// nothing more than the TextNodeBufferState. We accumulate text tokens as
|
@@ -3384,71 +3452,87 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3384
3452
|
// we set _foster_parent_insertions if there're non-whitespace characters in
|
3385
3453
|
// the buffer.
|
3386
3454
|
assert(state->_text_node._buffer.length == 0);
|
3455
|
+
assert(state->_table_character_tokens.length == 0);
|
3387
3456
|
state->_original_insertion_mode = state->_insertion_mode;
|
3388
3457
|
state->_reprocess_current_token = true;
|
3389
3458
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
|
3390
3459
|
return true;
|
3391
|
-
}
|
3460
|
+
}
|
3461
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3462
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3463
|
+
return true;
|
3464
|
+
}
|
3465
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3392
3466
|
parser_add_parse_error(parser, token);
|
3393
3467
|
ignore_token(parser);
|
3394
3468
|
return false;
|
3395
|
-
}
|
3396
|
-
|
3397
|
-
return true;
|
3398
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
|
3469
|
+
}
|
3470
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
|
3399
3471
|
clear_stack_to_table_context(parser);
|
3400
3472
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3401
3473
|
insert_element_from_token(parser, token);
|
3402
3474
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
|
3403
3475
|
return true;
|
3404
|
-
}
|
3476
|
+
}
|
3477
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
|
3405
3478
|
clear_stack_to_table_context(parser);
|
3406
3479
|
insert_element_from_token(parser, token);
|
3407
3480
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3408
3481
|
return true;
|
3409
|
-
}
|
3482
|
+
}
|
3483
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3410
3484
|
clear_stack_to_table_context(parser);
|
3411
3485
|
insert_element_of_tag_type (
|
3412
3486
|
parser,
|
3413
3487
|
GUMBO_TAG_COLGROUP,
|
3414
3488
|
GUMBO_INSERTION_IMPLIED
|
3415
3489
|
);
|
3416
|
-
|
3490
|
+
state->_reprocess_current_token = true;
|
3417
3491
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3418
3492
|
return true;
|
3419
|
-
}
|
3493
|
+
}
|
3494
|
+
if (
|
3420
3495
|
tag_in(token, kStartTag, &(const TagSet) {
|
3421
|
-
TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3496
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3422
3497
|
})
|
3423
3498
|
) {
|
3424
3499
|
clear_stack_to_table_context(parser);
|
3500
|
+
insert_element_from_token(parser, token);
|
3425
3501
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3426
|
-
if (tag_in(token, kStartTag, &(const TagSet){TAG(TD), TAG(TH), TAG(TR)})) {
|
3427
|
-
insert_element_of_tag_type (
|
3428
|
-
parser,
|
3429
|
-
GUMBO_TAG_TBODY,
|
3430
|
-
GUMBO_INSERTION_IMPLIED
|
3431
|
-
);
|
3432
|
-
state->_reprocess_current_token = true;
|
3433
|
-
} else {
|
3434
|
-
insert_element_from_token(parser, token);
|
3435
|
-
}
|
3436
3502
|
return true;
|
3437
|
-
}
|
3503
|
+
}
|
3504
|
+
if (
|
3505
|
+
tag_in(token, kStartTag, &(const TagSet) {
|
3506
|
+
TAG(TD), TAG(TH), TAG(TR)
|
3507
|
+
})
|
3508
|
+
) {
|
3509
|
+
clear_stack_to_table_context(parser);
|
3510
|
+
insert_element_of_tag_type (
|
3511
|
+
parser,
|
3512
|
+
GUMBO_TAG_TBODY,
|
3513
|
+
GUMBO_INSERTION_IMPLIED
|
3514
|
+
);
|
3515
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3516
|
+
state->_reprocess_current_token = true;
|
3517
|
+
return true;
|
3518
|
+
}
|
3519
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
|
3438
3520
|
parser_add_parse_error(parser, token);
|
3439
3521
|
if (close_table(parser)) {
|
3440
|
-
|
3522
|
+
state->_reprocess_current_token = true;
|
3441
3523
|
} else {
|
3442
3524
|
ignore_token(parser);
|
3443
3525
|
}
|
3444
3526
|
return false;
|
3445
|
-
}
|
3527
|
+
}
|
3528
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
|
3446
3529
|
if (!close_table(parser)) {
|
3447
3530
|
parser_add_parse_error(parser, token);
|
3448
3531
|
return false;
|
3449
3532
|
}
|
3450
3533
|
return true;
|
3451
|
-
}
|
3534
|
+
}
|
3535
|
+
if (
|
3452
3536
|
tag_in(token, kEndTag, &(const TagSet) {
|
3453
3537
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
|
3454
3538
|
TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3457,20 +3541,24 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3457
3541
|
parser_add_parse_error(parser, token);
|
3458
3542
|
ignore_token(parser);
|
3459
3543
|
return false;
|
3460
|
-
}
|
3544
|
+
}
|
3545
|
+
if (
|
3461
3546
|
tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
|
3462
3547
|
|| (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
|
3463
3548
|
) {
|
3464
3549
|
return handle_in_head(parser, token);
|
3465
|
-
}
|
3550
|
+
}
|
3551
|
+
if (
|
3466
3552
|
tag_is(token, kStartTag, GUMBO_TAG_INPUT)
|
3467
3553
|
&& attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
|
3468
3554
|
) {
|
3469
3555
|
parser_add_parse_error(parser, token);
|
3470
3556
|
insert_element_from_token(parser, token);
|
3471
3557
|
pop_current_node(parser);
|
3558
|
+
acknowledge_self_closing_tag(parser);
|
3472
3559
|
return false;
|
3473
|
-
}
|
3560
|
+
}
|
3561
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
3474
3562
|
parser_add_parse_error(parser, token);
|
3475
3563
|
if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
3476
3564
|
ignore_token(parser);
|
@@ -3479,15 +3567,16 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3479
3567
|
state->_form_element = insert_element_from_token(parser, token);
|
3480
3568
|
pop_current_node(parser);
|
3481
3569
|
return false;
|
3482
|
-
}
|
3570
|
+
}
|
3571
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3483
3572
|
return handle_in_body(parser, token);
|
3484
|
-
} else {
|
3485
|
-
parser_add_parse_error(parser, token);
|
3486
|
-
state->_foster_parent_insertions = true;
|
3487
|
-
bool result = handle_in_body(parser, token);
|
3488
|
-
state->_foster_parent_insertions = false;
|
3489
|
-
return result;
|
3490
3573
|
}
|
3574
|
+
// foster-parenting-start-tag or foster-parenting-end-tag error
|
3575
|
+
parser_add_parse_error(parser, token);
|
3576
|
+
state->_foster_parent_insertions = true;
|
3577
|
+
bool result = handle_in_body(parser, token);
|
3578
|
+
state->_foster_parent_insertions = false;
|
3579
|
+
return result;
|
3491
3580
|
}
|
3492
3581
|
|
3493
3582
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
|
@@ -3496,40 +3585,38 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
|
|
3496
3585
|
parser_add_parse_error(parser, token);
|
3497
3586
|
ignore_token(parser);
|
3498
3587
|
return false;
|
3499
|
-
}
|
3500
|
-
|
3501
|
-
|
3502
|
-
|
3588
|
+
}
|
3589
|
+
GumboParserState* state = parser->_parser_state;
|
3590
|
+
// Non-whitespace tokens will cause parse errors later.
|
3591
|
+
// It's not entirely clear from the spec how this is supposed to work.
|
3592
|
+
// https://github.com/whatwg/html/issues/4046
|
3593
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE
|
3594
|
+
|| token->type == GUMBO_TOKEN_CHARACTER) {
|
3503
3595
|
insert_text_token(parser, token);
|
3596
|
+
gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
|
3504
3597
|
return true;
|
3505
|
-
}
|
3506
|
-
|
3507
|
-
|
3508
|
-
|
3509
|
-
//
|
3510
|
-
//
|
3511
|
-
// the flag, so this loop is still valid.
|
3598
|
+
}
|
3599
|
+
|
3600
|
+
GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
|
3601
|
+
if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
|
3602
|
+
// Each character in buffer is an error. Unfortunately, that means we need
|
3603
|
+
// to emit a bunch of errors at the appropriate locations.
|
3512
3604
|
for (size_t i = 0, n = buffer->length; i < n; ++i) {
|
3513
|
-
|
3514
|
-
|
3515
|
-
|
3516
|
-
|
3517
|
-
case '\r':
|
3518
|
-
case ' ':
|
3519
|
-
continue;
|
3520
|
-
default:
|
3521
|
-
state->_foster_parent_insertions = true;
|
3522
|
-
reconstruct_active_formatting_elements(parser);
|
3523
|
-
goto loopbreak;
|
3524
|
-
}
|
3605
|
+
GumboToken tok;
|
3606
|
+
gumbo_character_token_buffer_get(buffer, i, &tok);
|
3607
|
+
// foster-parenting-character error
|
3608
|
+
parser_add_parse_error(parser, &tok);
|
3525
3609
|
}
|
3526
|
-
|
3527
|
-
|
3528
|
-
|
3529
|
-
state->_reprocess_current_token = true;
|
3530
|
-
state->_insertion_mode = state->_original_insertion_mode;
|
3531
|
-
return true;
|
3610
|
+
state->_foster_parent_insertions = true;
|
3611
|
+
set_frameset_not_ok(parser);
|
3612
|
+
reconstruct_active_formatting_elements(parser);
|
3532
3613
|
}
|
3614
|
+
maybe_flush_text_node_buffer(parser);
|
3615
|
+
gumbo_character_token_buffer_clear(buffer);
|
3616
|
+
state->_foster_parent_insertions = false;
|
3617
|
+
state->_reprocess_current_token = true;
|
3618
|
+
state->_insertion_mode = state->_original_insertion_mode;
|
3619
|
+
return true;
|
3533
3620
|
}
|
3534
3621
|
|
3535
3622
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
|
@@ -3539,19 +3626,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
|
3539
3626
|
parser_add_parse_error(parser, token);
|
3540
3627
|
ignore_token(parser);
|
3541
3628
|
return false;
|
3542
|
-
} else {
|
3543
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3544
|
-
bool result = true;
|
3545
|
-
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
|
3546
|
-
parser_add_parse_error(parser, token);
|
3547
|
-
}
|
3548
|
-
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3549
|
-
;
|
3550
|
-
clear_active_formatting_elements(parser);
|
3551
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3552
|
-
return result;
|
3553
3629
|
}
|
3554
|
-
|
3630
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3631
|
+
bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
|
3632
|
+
if (!result)
|
3633
|
+
parser_add_parse_error(parser, token);
|
3634
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3635
|
+
;
|
3636
|
+
clear_active_formatting_elements(parser);
|
3637
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3638
|
+
return result;
|
3639
|
+
}
|
3640
|
+
if (
|
3555
3641
|
tag_in(token, kStartTag, &(const TagSet) {
|
3556
3642
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
|
3557
3643
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3563,13 +3649,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
|
3563
3649
|
ignore_token(parser);
|
3564
3650
|
return false;
|
3565
3651
|
}
|
3652
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3653
|
+
bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
|
3654
|
+
if (!result)
|
3655
|
+
parser_add_parse_error(parser, token);
|
3566
3656
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3567
3657
|
;
|
3568
3658
|
clear_active_formatting_elements(parser);
|
3569
3659
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3570
3660
|
parser->_parser_state->_reprocess_current_token = true;
|
3571
|
-
return
|
3572
|
-
}
|
3661
|
+
return result;
|
3662
|
+
}
|
3663
|
+
if (
|
3573
3664
|
tag_in(token, kEndTag, &(const TagSet) {
|
3574
3665
|
TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
|
3575
3666
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3578,9 +3669,8 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
|
3578
3669
|
parser_add_parse_error(parser, token);
|
3579
3670
|
ignore_token(parser);
|
3580
3671
|
return false;
|
3581
|
-
} else {
|
3582
|
-
return handle_in_body(parser, token);
|
3583
3672
|
}
|
3673
|
+
return handle_in_body(parser, token);
|
3584
3674
|
}
|
3585
3675
|
|
3586
3676
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
|
@@ -3588,21 +3678,26 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
|
|
3588
3678
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
3589
3679
|
insert_text_token(parser, token);
|
3590
3680
|
return true;
|
3591
|
-
}
|
3681
|
+
}
|
3682
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3683
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3684
|
+
return true;
|
3685
|
+
}
|
3686
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3592
3687
|
parser_add_parse_error(parser, token);
|
3593
3688
|
ignore_token(parser);
|
3594
3689
|
return false;
|
3595
|
-
}
|
3596
|
-
|
3597
|
-
return true;
|
3598
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3690
|
+
}
|
3691
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3599
3692
|
return handle_in_body(parser, token);
|
3600
|
-
}
|
3693
|
+
}
|
3694
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3601
3695
|
insert_element_from_token(parser, token);
|
3602
3696
|
pop_current_node(parser);
|
3603
3697
|
acknowledge_self_closing_tag(parser);
|
3604
3698
|
return true;
|
3605
|
-
}
|
3699
|
+
}
|
3700
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
|
3606
3701
|
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3607
3702
|
parser_add_parse_error(parser, token);
|
3608
3703
|
ignore_token(parser);
|
@@ -3611,28 +3706,30 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
|
|
3611
3706
|
pop_current_node(parser);
|
3612
3707
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3613
3708
|
return false;
|
3614
|
-
}
|
3709
|
+
}
|
3710
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
|
3615
3711
|
parser_add_parse_error(parser, token);
|
3616
3712
|
ignore_token(parser);
|
3617
3713
|
return false;
|
3618
|
-
}
|
3714
|
+
}
|
3715
|
+
if (
|
3619
3716
|
tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
|
3620
3717
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3621
3718
|
) {
|
3622
3719
|
return handle_in_head(parser, token);
|
3623
|
-
}
|
3720
|
+
}
|
3721
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3624
3722
|
return handle_in_body(parser, token);
|
3625
|
-
} else {
|
3626
|
-
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3627
|
-
parser_add_parse_error(parser, token);
|
3628
|
-
ignore_token(parser);
|
3629
|
-
return false;
|
3630
|
-
}
|
3631
|
-
pop_current_node(parser);
|
3632
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3633
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3634
|
-
return true;
|
3635
3723
|
}
|
3724
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3725
|
+
parser_add_parse_error(parser, token);
|
3726
|
+
ignore_token(parser);
|
3727
|
+
return false;
|
3728
|
+
}
|
3729
|
+
pop_current_node(parser);
|
3730
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3731
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3732
|
+
return true;
|
3636
3733
|
}
|
3637
3734
|
|
3638
3735
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
|
@@ -3642,14 +3739,16 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3642
3739
|
insert_element_from_token(parser, token);
|
3643
3740
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3644
3741
|
return true;
|
3645
|
-
}
|
3742
|
+
}
|
3743
|
+
if (tag_in(token, kStartTag, &td_th_tags)) {
|
3646
3744
|
parser_add_parse_error(parser, token);
|
3647
3745
|
clear_stack_to_table_body_context(parser);
|
3648
3746
|
insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
|
3649
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3650
3747
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3748
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3651
3749
|
return false;
|
3652
|
-
}
|
3750
|
+
}
|
3751
|
+
if (
|
3653
3752
|
tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
|
3654
3753
|
) {
|
3655
3754
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
@@ -3661,7 +3760,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3661
3760
|
pop_current_node(parser);
|
3662
3761
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3663
3762
|
return true;
|
3664
|
-
}
|
3763
|
+
}
|
3764
|
+
if (
|
3665
3765
|
tag_in(token, kStartTag, &(const TagSet) {
|
3666
3766
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
|
3667
3767
|
TAG(THEAD)
|
@@ -3684,18 +3784,18 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3684
3784
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3685
3785
|
parser->_parser_state->_reprocess_current_token = true;
|
3686
3786
|
return true;
|
3687
|
-
}
|
3787
|
+
}
|
3788
|
+
if (
|
3688
3789
|
tag_in(token, kEndTag, &(const TagSet) {
|
3689
|
-
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(
|
3690
|
-
TAG(
|
3790
|
+
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
|
3791
|
+
TAG(TH), TAG(TR)
|
3691
3792
|
})
|
3692
3793
|
) {
|
3693
3794
|
parser_add_parse_error(parser, token);
|
3694
3795
|
ignore_token(parser);
|
3695
3796
|
return false;
|
3696
|
-
} else {
|
3697
|
-
return handle_in_table(parser, token);
|
3698
3797
|
}
|
3798
|
+
return handle_in_table(parser, token);
|
3699
3799
|
}
|
3700
3800
|
|
3701
3801
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
|
@@ -3706,18 +3806,19 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3706
3806
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
|
3707
3807
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3708
3808
|
return true;
|
3709
|
-
}
|
3809
|
+
}
|
3810
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
|
3710
3811
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
|
3711
3812
|
parser_add_parse_error(parser, token);
|
3712
3813
|
ignore_token(parser);
|
3713
3814
|
return false;
|
3714
|
-
} else {
|
3715
|
-
clear_stack_to_table_row_context(parser);
|
3716
|
-
pop_current_node(parser);
|
3717
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3718
|
-
return true;
|
3719
3815
|
}
|
3720
|
-
|
3816
|
+
clear_stack_to_table_row_context(parser);
|
3817
|
+
pop_current_node(parser);
|
3818
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3819
|
+
return true;
|
3820
|
+
}
|
3821
|
+
if (
|
3721
3822
|
tag_in(token, kStartTag, &(const TagSet) {
|
3722
3823
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
|
3723
3824
|
TAG(THEAD), TAG(TR)
|
@@ -3728,31 +3829,32 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3728
3829
|
parser_add_parse_error(parser, token);
|
3729
3830
|
ignore_token(parser);
|
3730
3831
|
return false;
|
3731
|
-
} else {
|
3732
|
-
clear_stack_to_table_row_context(parser);
|
3733
|
-
pop_current_node(parser);
|
3734
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3735
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3736
|
-
return true;
|
3737
3832
|
}
|
3738
|
-
|
3833
|
+
clear_stack_to_table_row_context(parser);
|
3834
|
+
pop_current_node(parser);
|
3835
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3836
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3837
|
+
return true;
|
3838
|
+
}
|
3839
|
+
if (
|
3739
3840
|
tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
|
3740
3841
|
) {
|
3741
|
-
if (
|
3742
|
-
!has_an_element_in_table_scope(parser, token->v.end_tag.tag)
|
3743
|
-
|| !has_an_element_in_table_scope(parser, GUMBO_TAG_TR)
|
3744
|
-
) {
|
3842
|
+
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3745
3843
|
parser_add_parse_error(parser, token);
|
3746
3844
|
ignore_token(parser);
|
3747
3845
|
return false;
|
3748
|
-
}
|
3749
|
-
|
3750
|
-
|
3751
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3752
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3846
|
+
}
|
3847
|
+
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
|
3848
|
+
ignore_token(parser);
|
3753
3849
|
return true;
|
3754
3850
|
}
|
3755
|
-
|
3851
|
+
clear_stack_to_table_row_context(parser);
|
3852
|
+
pop_current_node(parser);
|
3853
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3854
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3855
|
+
return true;
|
3856
|
+
}
|
3857
|
+
if (
|
3756
3858
|
tag_in(token, kEndTag, &(const TagSet) {
|
3757
3859
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
|
3758
3860
|
TAG(TD), TAG(TH)
|
@@ -3761,9 +3863,8 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3761
3863
|
parser_add_parse_error(parser, token);
|
3762
3864
|
ignore_token(parser);
|
3763
3865
|
return false;
|
3764
|
-
} else {
|
3765
|
-
return handle_in_table(parser, token);
|
3766
3866
|
}
|
3867
|
+
return handle_in_table(parser, token);
|
3767
3868
|
}
|
3768
3869
|
|
3769
3870
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
|
@@ -3776,7 +3877,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3776
3877
|
return false;
|
3777
3878
|
}
|
3778
3879
|
return close_table_cell(parser, token, token_tag);
|
3779
|
-
}
|
3880
|
+
}
|
3881
|
+
if (
|
3780
3882
|
tag_in(token, kStartTag, &(const TagSet) {
|
3781
3883
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
|
3782
3884
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3794,7 +3896,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3794
3896
|
}
|
3795
3897
|
parser->_parser_state->_reprocess_current_token = true;
|
3796
3898
|
return close_current_cell(parser, token);
|
3797
|
-
}
|
3899
|
+
}
|
3900
|
+
if (
|
3798
3901
|
tag_in(token, kEndTag, &(const TagSet) {
|
3799
3902
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
|
3800
3903
|
})
|
@@ -3802,7 +3905,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3802
3905
|
parser_add_parse_error(parser, token);
|
3803
3906
|
ignore_token(parser);
|
3804
3907
|
return false;
|
3805
|
-
}
|
3908
|
+
}
|
3909
|
+
if (
|
3806
3910
|
tag_in(token, kEndTag, &(const TagSet) {
|
3807
3911
|
TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3808
3912
|
})
|
@@ -3814,9 +3918,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3814
3918
|
}
|
3815
3919
|
parser->_parser_state->_reprocess_current_token = true;
|
3816
3920
|
return close_current_cell(parser, token);
|
3817
|
-
} else {
|
3818
|
-
return handle_in_body(parser, token);
|
3819
3921
|
}
|
3922
|
+
return handle_in_body(parser, token);
|
3820
3923
|
}
|
3821
3924
|
|
3822
3925
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
|
@@ -3825,28 +3928,34 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3825
3928
|
parser_add_parse_error(parser, token);
|
3826
3929
|
ignore_token(parser);
|
3827
3930
|
return false;
|
3828
|
-
}
|
3931
|
+
}
|
3932
|
+
if (
|
3829
3933
|
token->type == GUMBO_TOKEN_CHARACTER
|
3830
3934
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3831
3935
|
) {
|
3832
3936
|
insert_text_token(parser, token);
|
3833
3937
|
return true;
|
3834
|
-
}
|
3938
|
+
}
|
3939
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3940
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3941
|
+
return true;
|
3942
|
+
}
|
3943
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3835
3944
|
parser_add_parse_error(parser, token);
|
3836
3945
|
ignore_token(parser);
|
3837
3946
|
return false;
|
3838
|
-
}
|
3839
|
-
|
3840
|
-
return true;
|
3841
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3947
|
+
}
|
3948
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3842
3949
|
return handle_in_body(parser, token);
|
3843
|
-
}
|
3950
|
+
}
|
3951
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
|
3844
3952
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3845
3953
|
pop_current_node(parser);
|
3846
3954
|
}
|
3847
3955
|
insert_element_from_token(parser, token);
|
3848
3956
|
return true;
|
3849
|
-
}
|
3957
|
+
}
|
3958
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
|
3850
3959
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3851
3960
|
pop_current_node(parser);
|
3852
3961
|
}
|
@@ -3855,7 +3964,8 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3855
3964
|
}
|
3856
3965
|
insert_element_from_token(parser, token);
|
3857
3966
|
return true;
|
3858
|
-
}
|
3967
|
+
}
|
3968
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
|
3859
3969
|
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
3860
3970
|
if (
|
3861
3971
|
node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
|
@@ -3869,21 +3979,21 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3869
3979
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
|
3870
3980
|
pop_current_node(parser);
|
3871
3981
|
return true;
|
3872
|
-
} else {
|
3873
|
-
parser_add_parse_error(parser, token);
|
3874
|
-
ignore_token(parser);
|
3875
|
-
return false;
|
3876
3982
|
}
|
3877
|
-
|
3983
|
+
parser_add_parse_error(parser, token);
|
3984
|
+
ignore_token(parser);
|
3985
|
+
return false;
|
3986
|
+
}
|
3987
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
|
3878
3988
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3879
3989
|
pop_current_node(parser);
|
3880
3990
|
return true;
|
3881
|
-
} else {
|
3882
|
-
parser_add_parse_error(parser, token);
|
3883
|
-
ignore_token(parser);
|
3884
|
-
return false;
|
3885
3991
|
}
|
3886
|
-
|
3992
|
+
parser_add_parse_error(parser, token);
|
3993
|
+
ignore_token(parser);
|
3994
|
+
return false;
|
3995
|
+
}
|
3996
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
|
3887
3997
|
if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
|
3888
3998
|
parser_add_parse_error(parser, token);
|
3889
3999
|
ignore_token(parser);
|
@@ -3891,14 +4001,16 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3891
4001
|
}
|
3892
4002
|
close_current_select(parser);
|
3893
4003
|
return true;
|
3894
|
-
}
|
4004
|
+
}
|
4005
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
|
3895
4006
|
parser_add_parse_error(parser, token);
|
3896
4007
|
ignore_token(parser);
|
3897
4008
|
if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
|
3898
4009
|
close_current_select(parser);
|
3899
4010
|
}
|
3900
4011
|
return false;
|
3901
|
-
}
|
4012
|
+
}
|
4013
|
+
if (
|
3902
4014
|
tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
|
3903
4015
|
) {
|
3904
4016
|
parser_add_parse_error(parser, token);
|
@@ -3909,18 +4021,18 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3909
4021
|
parser->_parser_state->_reprocess_current_token = true;
|
3910
4022
|
}
|
3911
4023
|
return false;
|
3912
|
-
}
|
4024
|
+
}
|
4025
|
+
if (
|
3913
4026
|
tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
|
3914
4027
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3915
4028
|
) {
|
3916
4029
|
return handle_in_head(parser, token);
|
3917
|
-
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3918
|
-
return handle_in_body(parser, token);
|
3919
|
-
} else {
|
3920
|
-
parser_add_parse_error(parser, token);
|
3921
|
-
ignore_token(parser);
|
3922
|
-
return false;
|
3923
4030
|
}
|
4031
|
+
if (token->type == GUMBO_TOKEN_EOF)
|
4032
|
+
return handle_in_body(parser, token);
|
4033
|
+
parser_add_parse_error(parser, token);
|
4034
|
+
ignore_token(parser);
|
4035
|
+
return false;
|
3924
4036
|
}
|
3925
4037
|
|
3926
4038
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
|
@@ -3934,22 +4046,18 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
|
3934
4046
|
close_current_select(parser);
|
3935
4047
|
parser->_parser_state->_reprocess_current_token = true;
|
3936
4048
|
return false;
|
3937
|
-
}
|
4049
|
+
}
|
4050
|
+
if (tag_in(token, kEndTag, &tags)) {
|
3938
4051
|
parser_add_parse_error(parser, token);
|
3939
4052
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3940
4053
|
ignore_token(parser);
|
3941
4054
|
return false;
|
3942
|
-
} else {
|
3943
|
-
close_current_select(parser);
|
3944
|
-
// close_current_select already does the
|
3945
|
-
// reset_insertion_mode_appropriately
|
3946
|
-
// reset_insertion_mode_appropriately(parser);
|
3947
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3948
|
-
return false;
|
3949
4055
|
}
|
3950
|
-
|
3951
|
-
|
4056
|
+
close_current_select(parser);
|
4057
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4058
|
+
return false;
|
3952
4059
|
}
|
4060
|
+
return handle_in_select(parser, token);
|
3953
4061
|
}
|
3954
4062
|
|
3955
4063
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
|
@@ -3973,7 +4081,8 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
3973
4081
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3974
4082
|
) {
|
3975
4083
|
return handle_in_head(parser, token);
|
3976
|
-
}
|
4084
|
+
}
|
4085
|
+
if (
|
3977
4086
|
tag_in(token, kStartTag, &(const TagSet) {
|
3978
4087
|
TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3979
4088
|
})
|
@@ -3983,35 +4092,41 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
3983
4092
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3984
4093
|
state->_reprocess_current_token = true;
|
3985
4094
|
return true;
|
3986
|
-
}
|
4095
|
+
}
|
4096
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3987
4097
|
pop_template_insertion_mode(parser);
|
3988
4098
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3989
4099
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3990
4100
|
state->_reprocess_current_token = true;
|
3991
4101
|
return true;
|
3992
|
-
}
|
4102
|
+
}
|
4103
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
|
3993
4104
|
pop_template_insertion_mode(parser);
|
3994
4105
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3995
4106
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3996
4107
|
state->_reprocess_current_token = true;
|
3997
4108
|
return true;
|
3998
|
-
}
|
4109
|
+
}
|
4110
|
+
if (tag_in(token, kStartTag, &td_th_tags)) {
|
3999
4111
|
pop_template_insertion_mode(parser);
|
4000
4112
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
4001
4113
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
4002
4114
|
state->_reprocess_current_token = true;
|
4003
4115
|
return true;
|
4004
|
-
}
|
4116
|
+
}
|
4117
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
4005
4118
|
pop_template_insertion_mode(parser);
|
4006
4119
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4007
4120
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4008
4121
|
state->_reprocess_current_token = true;
|
4009
4122
|
return true;
|
4010
|
-
}
|
4123
|
+
}
|
4124
|
+
if (token->type == GUMBO_TOKEN_END_TAG) {
|
4011
4125
|
parser_add_parse_error(parser, token);
|
4012
4126
|
ignore_token(parser);
|
4013
4127
|
return false;
|
4014
|
-
}
|
4128
|
+
}
|
4129
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4015
4130
|
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
4016
4131
|
// Stop parsing.
|
4017
4132
|
return true;
|
@@ -4024,10 +4139,9 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
4024
4139
|
reset_insertion_mode_appropriately(parser);
|
4025
4140
|
state->_reprocess_current_token = true;
|
4026
4141
|
return false;
|
4027
|
-
} else {
|
4028
|
-
assert(0);
|
4029
|
-
return false;
|
4030
4142
|
}
|
4143
|
+
assert(0 && "unreachable");
|
4144
|
+
return false;
|
4031
4145
|
}
|
4032
4146
|
|
4033
4147
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
|
@@ -4037,16 +4151,22 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
|
4037
4151
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4038
4152
|
) {
|
4039
4153
|
return handle_in_body(parser, token);
|
4040
|
-
}
|
4154
|
+
}
|
4155
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4041
4156
|
GumboNode* html_node = parser->_output->root;
|
4042
4157
|
assert(html_node != NULL);
|
4043
4158
|
append_comment_node(parser, html_node, token);
|
4044
4159
|
return true;
|
4045
|
-
}
|
4160
|
+
}
|
4161
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4046
4162
|
parser_add_parse_error(parser, token);
|
4047
4163
|
ignore_token(parser);
|
4048
4164
|
return false;
|
4049
|
-
}
|
4165
|
+
}
|
4166
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4167
|
+
return handle_in_body(parser, token);
|
4168
|
+
}
|
4169
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
4050
4170
|
/* fragment case: ignore the closing HTML token */
|
4051
4171
|
if (is_fragment_parser(parser)) {
|
4052
4172
|
parser_add_parse_error(parser, token);
|
@@ -4061,14 +4181,14 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
|
4061
4181
|
&html->v.element
|
4062
4182
|
);
|
4063
4183
|
return true;
|
4064
|
-
}
|
4184
|
+
}
|
4185
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4065
4186
|
return true;
|
4066
|
-
} else {
|
4067
|
-
parser_add_parse_error(parser, token);
|
4068
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4069
|
-
parser->_parser_state->_reprocess_current_token = true;
|
4070
|
-
return false;
|
4071
4187
|
}
|
4188
|
+
parser_add_parse_error(parser, token);
|
4189
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4190
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4191
|
+
return false;
|
4072
4192
|
}
|
4073
4193
|
|
4074
4194
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
|
@@ -4076,19 +4196,24 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
|
4076
4196
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
4077
4197
|
insert_text_token(parser, token);
|
4078
4198
|
return true;
|
4079
|
-
}
|
4199
|
+
}
|
4200
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4080
4201
|
append_comment_node(parser, get_current_node(parser), token);
|
4081
4202
|
return true;
|
4082
|
-
}
|
4203
|
+
}
|
4204
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4083
4205
|
parser_add_parse_error(parser, token);
|
4084
4206
|
ignore_token(parser);
|
4085
4207
|
return false;
|
4086
|
-
}
|
4208
|
+
}
|
4209
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4087
4210
|
return handle_in_body(parser, token);
|
4088
|
-
}
|
4211
|
+
}
|
4212
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
4089
4213
|
insert_element_from_token(parser, token);
|
4090
4214
|
return true;
|
4091
|
-
}
|
4215
|
+
}
|
4216
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
|
4092
4217
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
|
4093
4218
|
parser_add_parse_error(parser, token);
|
4094
4219
|
ignore_token(parser);
|
@@ -4102,24 +4227,26 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
|
4102
4227
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
|
4103
4228
|
}
|
4104
4229
|
return true;
|
4105
|
-
}
|
4230
|
+
}
|
4231
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
|
4106
4232
|
insert_element_from_token(parser, token);
|
4107
4233
|
pop_current_node(parser);
|
4108
4234
|
acknowledge_self_closing_tag(parser);
|
4109
4235
|
return true;
|
4110
|
-
}
|
4236
|
+
}
|
4237
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4111
4238
|
return handle_in_head(parser, token);
|
4112
|
-
}
|
4239
|
+
}
|
4240
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4113
4241
|
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
|
4114
4242
|
parser_add_parse_error(parser, token);
|
4115
4243
|
return false;
|
4116
4244
|
}
|
4117
4245
|
return true;
|
4118
|
-
} else {
|
4119
|
-
parser_add_parse_error(parser, token);
|
4120
|
-
ignore_token(parser);
|
4121
|
-
return false;
|
4122
4246
|
}
|
4247
|
+
parser_add_parse_error(parser, token);
|
4248
|
+
ignore_token(parser);
|
4249
|
+
return false;
|
4123
4250
|
}
|
4124
4251
|
|
4125
4252
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
|
@@ -4127,16 +4254,20 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
|
|
4127
4254
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
4128
4255
|
insert_text_token(parser, token);
|
4129
4256
|
return true;
|
4130
|
-
}
|
4257
|
+
}
|
4258
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4131
4259
|
append_comment_node(parser, get_current_node(parser), token);
|
4132
4260
|
return true;
|
4133
|
-
}
|
4261
|
+
}
|
4262
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4134
4263
|
parser_add_parse_error(parser, token);
|
4135
4264
|
ignore_token(parser);
|
4136
4265
|
return false;
|
4137
|
-
}
|
4266
|
+
}
|
4267
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4138
4268
|
return handle_in_body(parser, token);
|
4139
|
-
}
|
4269
|
+
}
|
4270
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
4140
4271
|
GumboNode* html = parser->_parser_state->_open_elements.data[0];
|
4141
4272
|
assert(node_html_tag_is(html, GUMBO_TAG_HTML));
|
4142
4273
|
record_end_of_element (
|
@@ -4145,15 +4276,16 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
|
|
4145
4276
|
);
|
4146
4277
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
|
4147
4278
|
return true;
|
4148
|
-
}
|
4279
|
+
}
|
4280
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4149
4281
|
return handle_in_head(parser, token);
|
4150
|
-
}
|
4282
|
+
}
|
4283
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4151
4284
|
return true;
|
4152
|
-
} else {
|
4153
|
-
parser_add_parse_error(parser, token);
|
4154
|
-
ignore_token(parser);
|
4155
|
-
return false;
|
4156
4285
|
}
|
4286
|
+
parser_add_parse_error(parser, token);
|
4287
|
+
ignore_token(parser);
|
4288
|
+
return false;
|
4157
4289
|
}
|
4158
4290
|
|
4159
4291
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
|
@@ -4161,20 +4293,21 @@ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
|
|
4161
4293
|
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4162
4294
|
append_comment_node(parser, get_document_node(parser), token);
|
4163
4295
|
return true;
|
4164
|
-
}
|
4296
|
+
}
|
4297
|
+
if (
|
4165
4298
|
token->type == GUMBO_TOKEN_DOCTYPE
|
4166
4299
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
4167
4300
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4168
4301
|
) {
|
4169
4302
|
return handle_in_body(parser, token);
|
4170
|
-
}
|
4303
|
+
}
|
4304
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4171
4305
|
return true;
|
4172
|
-
} else {
|
4173
|
-
parser_add_parse_error(parser, token);
|
4174
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4175
|
-
parser->_parser_state->_reprocess_current_token = true;
|
4176
|
-
return false;
|
4177
4306
|
}
|
4307
|
+
parser_add_parse_error(parser, token);
|
4308
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4309
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4310
|
+
return false;
|
4178
4311
|
}
|
4179
4312
|
|
4180
4313
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
|
@@ -4185,21 +4318,23 @@ static bool handle_after_after_frameset (
|
|
4185
4318
|
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4186
4319
|
append_comment_node(parser, get_document_node(parser), token);
|
4187
4320
|
return true;
|
4188
|
-
}
|
4321
|
+
}
|
4322
|
+
if (
|
4189
4323
|
token->type == GUMBO_TOKEN_DOCTYPE
|
4190
4324
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
4191
4325
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4192
4326
|
) {
|
4193
4327
|
return handle_in_body(parser, token);
|
4194
|
-
}
|
4328
|
+
}
|
4329
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4195
4330
|
return true;
|
4196
|
-
}
|
4331
|
+
}
|
4332
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4197
4333
|
return handle_in_head(parser, token);
|
4198
|
-
} else {
|
4199
|
-
parser_add_parse_error(parser, token);
|
4200
|
-
ignore_token(parser);
|
4201
|
-
return false;
|
4202
4334
|
}
|
4335
|
+
parser_add_parse_error(parser, token);
|
4336
|
+
ignore_token(parser);
|
4337
|
+
return false;
|
4203
4338
|
}
|
4204
4339
|
|
4205
4340
|
// Function pointers for each insertion mode.
|
@@ -4306,8 +4441,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4306
4441
|
parser->_parser_state->_reprocess_current_token = true;
|
4307
4442
|
return false;
|
4308
4443
|
}
|
4309
|
-
|
4310
|
-
assert(token->type == GUMBO_TOKEN_START_TAG);
|
4444
|
+
// This is a start tag so the next if's then branch will be taken.
|
4311
4445
|
}
|
4312
4446
|
|
4313
4447
|
if (token->type == GUMBO_TOKEN_START_TAG) {
|
@@ -4329,49 +4463,48 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4329
4463
|
return true;
|
4330
4464
|
// </script> tags are handled like any other end tag, putting the script's
|
4331
4465
|
// text into a text node child and closing the current node.
|
4332
|
-
}
|
4333
|
-
|
4334
|
-
|
4335
|
-
|
4336
|
-
|
4337
|
-
|
4466
|
+
}
|
4467
|
+
assert(token->type == GUMBO_TOKEN_END_TAG);
|
4468
|
+
GumboNode* node = get_current_node(parser);
|
4469
|
+
GumboTag tag = token->v.end_tag.tag;
|
4470
|
+
const char* name = token->v.end_tag.name;
|
4471
|
+
assert(node != NULL);
|
4338
4472
|
|
4339
|
-
|
4340
|
-
|
4341
|
-
|
4342
|
-
|
4343
|
-
|
4344
|
-
|
4345
|
-
|
4346
|
-
|
4347
|
-
|
4348
|
-
|
4349
|
-
|
4350
|
-
|
4351
|
-
|
4352
|
-
|
4353
|
-
|
4354
|
-
|
4355
|
-
|
4356
|
-
|
4357
|
-
}
|
4358
|
-
return is_success;
|
4359
|
-
}
|
4360
|
-
--i;
|
4361
|
-
node = parser->_parser_state->_open_elements.data[i];
|
4362
|
-
if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
|
4363
|
-
// The loop continues only in foreign namespaces.
|
4364
|
-
break;
|
4473
|
+
bool is_success = true;
|
4474
|
+
if (!node_tagname_is(node, tag, name)) {
|
4475
|
+
parser_add_parse_error(parser, token);
|
4476
|
+
is_success = false;
|
4477
|
+
}
|
4478
|
+
int i = parser->_parser_state->_open_elements.length;
|
4479
|
+
for (--i; i > 0;) {
|
4480
|
+
// Here we move up the stack until we find an HTML element (in which
|
4481
|
+
// case we do nothing) or we find the element that we're about to
|
4482
|
+
// close (in which case we pop everything we've seen until that
|
4483
|
+
// point.)
|
4484
|
+
gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
|
4485
|
+
if (node_tagname_is(node, tag, name)) {
|
4486
|
+
gumbo_debug("Matches.\n");
|
4487
|
+
while (node != pop_current_node(parser)) {
|
4488
|
+
// Pop all the nodes below the current one. Node is guaranteed to
|
4489
|
+
// be an element on the stack of open elements (set below), so
|
4490
|
+
// this loop is guaranteed to terminate.
|
4365
4491
|
}
|
4366
|
-
}
|
4367
|
-
assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
|
4368
|
-
if (i == 0)
|
4369
4492
|
return is_success;
|
4370
|
-
|
4371
|
-
|
4372
|
-
|
4373
|
-
|
4493
|
+
}
|
4494
|
+
--i;
|
4495
|
+
node = parser->_parser_state->_open_elements.data[i];
|
4496
|
+
if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
|
4497
|
+
// The loop continues only in foreign namespaces.
|
4498
|
+
break;
|
4499
|
+
}
|
4374
4500
|
}
|
4501
|
+
assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
|
4502
|
+
if (i == 0)
|
4503
|
+
return is_success;
|
4504
|
+
// We can't call handle_token directly because the current node is still in
|
4505
|
+
a foreign namespace, so it would re-enter this and result in infinite
|
4506
|
+
// recursion.
|
4507
|
+
return handle_html_content(parser, token) && is_success;
|
4375
4508
|
}
|
4376
4509
|
|
4377
4510
|
// https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
|
@@ -4517,7 +4650,7 @@ static void fragment_parser_init (
|
|
4517
4650
|
break;
|
4518
4651
|
|
4519
4652
|
case GUMBO_TAG_SCRIPT:
|
4520
|
-
gumbo_tokenizer_set_state(parser,
|
4653
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
4521
4654
|
break;
|
4522
4655
|
|
4523
4656
|
case GUMBO_TAG_NOSCRIPT:
|
@@ -4554,7 +4687,7 @@ static void fragment_parser_init (
|
|
4554
4687
|
// 11.
|
4555
4688
|
if (ctx_has_form_ancestor
|
4556
4689
|
|| (ctx_tag == GUMBO_TAG_FORM
|
4557
|
-
|
4690
|
+
&& fragment_namespace == GUMBO_NAMESPACE_HTML)) {
|
4558
4691
|
static const GumboNode form_ancestor = {
|
4559
4692
|
.type = GUMBO_NODE_ELEMENT,
|
4560
4693
|
.parent = NULL,
|
@@ -4619,11 +4752,11 @@ GumboOutput* gumbo_parse_with_options (
|
|
4619
4752
|
if (state->_reprocess_current_token) {
|
4620
4753
|
state->_reprocess_current_token = false;
|
4621
4754
|
} else {
|
4622
|
-
GumboNode*
|
4623
|
-
|
4755
|
+
GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
|
4756
|
+
gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
4624
4757
|
&parser,
|
4625
|
-
|
4626
|
-
|
4758
|
+
adjusted_current_node &&
|
4759
|
+
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4627
4760
|
);
|
4628
4761
|
has_error = !gumbo_lex(&parser, &token) || has_error;
|
4629
4762
|
}
|
@@ -4649,10 +4782,10 @@ GumboOutput* gumbo_parse_with_options (
|
|
4649
4782
|
break;
|
4650
4783
|
}
|
4651
4784
|
gumbo_debug (
|
4652
|
-
"Handling %s token @%
|
4785
|
+
"Handling %s token @%lu:%lu in state %u.\n",
|
4653
4786
|
(char*) token_type,
|
4654
|
-
token.position.line,
|
4655
|
-
token.position.column,
|
4787
|
+
(unsigned long)token.position.line,
|
4788
|
+
(unsigned long)token.position.column,
|
4656
4789
|
state->_insertion_mode
|
4657
4790
|
);
|
4658
4791
|
|
@@ -4671,19 +4804,26 @@ GumboOutput* gumbo_parse_with_options (
|
|
4671
4804
|
);
|
4672
4805
|
|
4673
4806
|
if (!state->_reprocess_current_token) {
|
4807
|
+
// If we're done with the token, check for unacknowledged self-closing
|
4808
|
+
// flags on start tags.
|
4674
4809
|
if (token.type == GUMBO_TOKEN_START_TAG &&
|
4675
4810
|
token.v.start_tag.is_self_closing &&
|
4676
4811
|
!state->_self_closing_flag_acknowledged) {
|
4677
|
-
|
4678
|
-
|
4679
|
-
|
4812
|
+
has_error = true;
|
4813
|
+
GumboError* error = gumbo_add_error(&parser);
|
4814
|
+
if (error) {
|
4815
|
+
// This is essentially a tokenizer error that's only caught during
|
4816
|
+
// tree construction.
|
4817
|
+
error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
|
4818
|
+
error->original_text = token.original_text;
|
4819
|
+
error->position = token.position;
|
4820
|
+
}
|
4680
4821
|
}
|
4822
|
+
// Make sure we free the end tag's name since it doesn't get transferred
|
4823
|
+
// to a token.
|
4681
4824
|
if (token.type == GUMBO_TOKEN_END_TAG &&
|
4682
|
-
token.v.end_tag.
|
4683
|
-
|
4684
|
-
if (error)
|
4685
|
-
error->type = GUMBO_ERR_SELF_CLOSING_END_TAG;
|
4686
|
-
}
|
4825
|
+
token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
4826
|
+
gumbo_free(token.v.end_tag.name);
|
4687
4827
|
}
|
4688
4828
|
|
4689
4829
|
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|