nokogumbo 2.0.0.pre.alpha → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +101 -14
- data/ext/nokogumbo/extconf.rb +7 -2
- data/ext/nokogumbo/nokogumbo.c +630 -235
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +391 -126
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +74 -4
- data/gumbo-parser/src/parser.c +1161 -1025
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1440 -1278
- data/gumbo-parser/src/tokenizer.h +7 -18
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +17 -59
- data/gumbo-parser/src/utf8.h +52 -16
- data/lib/nokogumbo.rb +3 -1
- data/lib/nokogumbo/html5.rb +17 -15
- data/lib/nokogumbo/html5/document.rb +19 -3
- data/lib/nokogumbo/html5/document_fragment.rb +36 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +20 -14
- data/CHANGELOG.md +0 -56
data/gumbo-parser/src/error.h
CHANGED
@@ -7,6 +7,7 @@
|
|
7
7
|
#include "insertion_mode.h"
|
8
8
|
#include "string_buffer.h"
|
9
9
|
#include "token_type.h"
|
10
|
+
#include "tokenizer_states.h"
|
10
11
|
|
11
12
|
#ifdef __cplusplus
|
12
13
|
extern "C" {
|
@@ -15,85 +16,66 @@ extern "C" {
|
|
15
16
|
struct GumboInternalParser;
|
16
17
|
|
17
18
|
typedef enum {
|
19
|
+
// Defined errors.
|
20
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
|
21
|
+
GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
|
22
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
|
23
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
|
24
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
25
|
+
GUMBO_ERR_CDATA_IN_HTML_CONTENT,
|
26
|
+
GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
|
27
|
+
GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
|
28
|
+
GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
|
29
|
+
GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
|
30
|
+
GUMBO_ERR_DUPLICATE_ATTRIBUTE,
|
31
|
+
GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
|
32
|
+
GUMBO_ERR_EOF_BEFORE_TAG_NAME,
|
33
|
+
GUMBO_ERR_EOF_IN_CDATA,
|
34
|
+
GUMBO_ERR_EOF_IN_COMMENT,
|
35
|
+
GUMBO_ERR_EOF_IN_DOCTYPE,
|
36
|
+
GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
|
37
|
+
GUMBO_ERR_EOF_IN_TAG,
|
38
|
+
GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
|
39
|
+
GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
|
40
|
+
GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
|
41
|
+
GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
|
42
|
+
GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
|
43
|
+
GUMBO_ERR_MISSING_DOCTYPE_NAME,
|
44
|
+
GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
|
45
|
+
GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
|
46
|
+
GUMBO_ERR_MISSING_END_TAG_NAME,
|
47
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
|
48
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
|
49
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
50
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
|
51
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
|
52
|
+
GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
|
53
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
|
54
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
|
55
|
+
GUMBO_ERR_NESTED_COMMENT,
|
56
|
+
GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
|
57
|
+
GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
|
58
|
+
GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
|
59
|
+
GUMBO_ERR_NULL_CHARACTER_REFERENCE,
|
60
|
+
GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
|
61
|
+
GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
|
62
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
|
63
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
|
64
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
|
65
|
+
GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
|
66
|
+
GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
|
67
|
+
GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
|
68
|
+
GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
|
69
|
+
GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
|
70
|
+
|
71
|
+
// Encoding errors.
|
18
72
|
GUMBO_ERR_UTF8_INVALID,
|
19
73
|
GUMBO_ERR_UTF8_TRUNCATED,
|
20
|
-
|
21
|
-
|
22
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
|
23
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
|
24
|
-
GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
|
25
|
-
GUMBO_ERR_NAMED_CHAR_REF_INVALID,
|
26
|
-
GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
|
27
|
-
GUMBO_ERR_TAG_EOF,
|
28
|
-
GUMBO_ERR_TAG_INVALID,
|
29
|
-
GUMBO_ERR_CLOSE_TAG_EMPTY,
|
30
|
-
GUMBO_ERR_CLOSE_TAG_EOF,
|
31
|
-
GUMBO_ERR_CLOSE_TAG_INVALID,
|
32
|
-
GUMBO_ERR_SCRIPT_EOF,
|
33
|
-
GUMBO_ERR_ATTR_NAME_EOF,
|
34
|
-
GUMBO_ERR_ATTR_NAME_INVALID,
|
35
|
-
GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
|
36
|
-
GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
|
37
|
-
GUMBO_ERR_ATTR_UNQUOTED_EOF,
|
38
|
-
GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
|
39
|
-
GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
|
40
|
-
GUMBO_ERR_ATTR_AFTER_EOF,
|
41
|
-
GUMBO_ERR_ATTR_AFTER_INVALID,
|
42
|
-
GUMBO_ERR_DUPLICATE_ATTR,
|
43
|
-
GUMBO_ERR_SOLIDUS_EOF,
|
44
|
-
GUMBO_ERR_SOLIDUS_INVALID,
|
45
|
-
GUMBO_ERR_DASHES_OR_DOCTYPE,
|
46
|
-
GUMBO_ERR_COMMENT_EOF,
|
47
|
-
GUMBO_ERR_COMMENT_INVALID,
|
48
|
-
GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
|
49
|
-
GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
|
50
|
-
GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
|
51
|
-
GUMBO_ERR_COMMENT_END_BANG_EOF,
|
52
|
-
GUMBO_ERR_DOCTYPE_EOF,
|
53
|
-
GUMBO_ERR_DOCTYPE_INVALID,
|
54
|
-
GUMBO_ERR_DOCTYPE_SPACE,
|
55
|
-
GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
|
56
|
-
GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
|
57
|
-
GUMBO_ERR_DOCTYPE_END,
|
74
|
+
|
75
|
+
// Generic parser error.
|
58
76
|
GUMBO_ERR_PARSER,
|
59
|
-
GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
|
60
|
-
GUMBO_ERR_SELF_CLOSING_END_TAG,
|
61
77
|
} GumboErrorType;
|
62
78
|
|
63
|
-
// Additional data for duplicated attributes.
|
64
|
-
typedef struct GumboInternalDuplicateAttrError {
|
65
|
-
// The name of the attribute. Owned by this struct.
|
66
|
-
const char* name;
|
67
|
-
|
68
|
-
// The (0-based) index within the attributes vector of the original
|
69
|
-
// occurrence.
|
70
|
-
unsigned int original_index;
|
71
|
-
|
72
|
-
// The (0-based) index where the new occurrence would be.
|
73
|
-
unsigned int new_index;
|
74
|
-
} GumboDuplicateAttrError;
|
75
|
-
|
76
|
-
// A simplified representation of the tokenizer state, designed to be more
|
77
|
-
// useful to clients of this library than the internal representation. This
|
78
|
-
// condenses the actual states used in the tokenizer state machine into a few
|
79
|
-
// values that will be familiar to users of HTML.
|
80
|
-
typedef enum {
|
81
|
-
GUMBO_ERR_TOKENIZER_DATA,
|
82
|
-
GUMBO_ERR_TOKENIZER_CHAR_REF,
|
83
|
-
GUMBO_ERR_TOKENIZER_RCDATA,
|
84
|
-
GUMBO_ERR_TOKENIZER_RAWTEXT,
|
85
|
-
GUMBO_ERR_TOKENIZER_PLAINTEXT,
|
86
|
-
GUMBO_ERR_TOKENIZER_SCRIPT,
|
87
|
-
GUMBO_ERR_TOKENIZER_TAG,
|
88
|
-
GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
|
89
|
-
GUMBO_ERR_TOKENIZER_ATTR_NAME,
|
90
|
-
GUMBO_ERR_TOKENIZER_ATTR_VALUE,
|
91
|
-
GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
|
92
|
-
GUMBO_ERR_TOKENIZER_COMMENT,
|
93
|
-
GUMBO_ERR_TOKENIZER_DOCTYPE,
|
94
|
-
GUMBO_ERR_TOKENIZER_CDATA,
|
95
|
-
} GumboTokenizerErrorState;
|
96
|
-
|
97
79
|
// Additional data for tokenizer errors.
|
98
80
|
// This records the current state and codepoint encountered - this is usually
|
99
81
|
// enough to reconstruct what went wrong and provide a friendly error message.
|
@@ -102,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
|
|
102
84
|
int codepoint;
|
103
85
|
|
104
86
|
// The state that the tokenizer was in at the time.
|
105
|
-
|
87
|
+
GumboTokenizerEnum state;
|
106
88
|
} GumboTokenizerError;
|
107
89
|
|
108
90
|
// Additional data for parse errors.
|
@@ -125,43 +107,25 @@ typedef struct GumboInternalParserError {
|
|
125
107
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
126
108
|
// the HTML. This contains an enumerated type flag, a source position, and then
|
127
109
|
// a union of fields containing data specific to the error.
|
128
|
-
|
110
|
+
struct GumboInternalError {
|
129
111
|
// The type of error.
|
130
112
|
GumboErrorType type;
|
131
113
|
|
132
114
|
// The position within the source file where the error occurred.
|
133
115
|
GumboSourcePosition position;
|
134
116
|
|
135
|
-
//
|
136
|
-
|
137
|
-
// character-based instead of byte-based offsets).
|
138
|
-
const char* original_text;
|
117
|
+
// The piece of text that caused the error.
|
118
|
+
GumboStringPiece original_text;
|
139
119
|
|
140
120
|
// Type-specific error information.
|
141
121
|
union {
|
142
|
-
// The code point we encountered, for:
|
143
|
-
// * GUMBO_ERR_UTF8_INVALID
|
144
|
-
// * GUMBO_ERR_UTF8_TRUNCATED
|
145
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
|
146
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
|
147
|
-
uint32_t codepoint;
|
148
|
-
|
149
122
|
// Tokenizer errors.
|
150
123
|
GumboTokenizerError tokenizer;
|
151
124
|
|
152
|
-
//
|
153
|
-
|
154
|
-
// * GUMBO_ERR_NAMED_CHAR_REF_INVALID
|
155
|
-
GumboStringPiece text;
|
156
|
-
|
157
|
-
// Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
|
158
|
-
GumboDuplicateAttrError duplicate_attr;
|
159
|
-
|
160
|
-
// Parser state, for GUMBO_ERR_PARSER and
|
161
|
-
// GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
|
162
|
-
struct GumboInternalParserError parser;
|
125
|
+
// Parser errors.
|
126
|
+
GumboParserError parser;
|
163
127
|
} v;
|
164
|
-
}
|
128
|
+
};
|
165
129
|
|
166
130
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
167
131
|
// that clients can fill out the rest of its fields. May return NULL if we're
|
@@ -177,32 +141,6 @@ void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
|
177
141
|
// Frees the memory used for a single GumboError.
|
178
142
|
void gumbo_error_destroy(GumboError* error);
|
179
143
|
|
180
|
-
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
181
|
-
// freshly-allocated buffer containing the error message text. The caller is
|
182
|
-
// responsible for freeing the buffer.
|
183
|
-
void gumbo_error_to_string (
|
184
|
-
const GumboError* error,
|
185
|
-
GumboStringBuffer* output
|
186
|
-
);
|
187
|
-
|
188
|
-
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
189
|
-
// with a freshly-allocated buffer containing the error message text. The
|
190
|
-
// caller is responsible for freeing the buffer.
|
191
|
-
void gumbo_caret_diagnostic_to_string (
|
192
|
-
const GumboError* error,
|
193
|
-
const char* source_text,
|
194
|
-
size_t source_length,
|
195
|
-
GumboStringBuffer* output
|
196
|
-
);
|
197
|
-
|
198
|
-
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
199
|
-
// of writing to a string.
|
200
|
-
void gumbo_print_caret_diagnostic (
|
201
|
-
const GumboError* error,
|
202
|
-
const char* source_text,
|
203
|
-
size_t source_length
|
204
|
-
);
|
205
|
-
|
206
144
|
#ifdef __cplusplus
|
207
145
|
}
|
208
146
|
#endif
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
|
|
706
706
|
*/
|
707
707
|
bool stop_on_first_error;
|
708
708
|
|
709
|
+
/**
|
710
|
+
* Maximum allowed number of attributes per element. If this limit is
|
711
|
+
* exceeded, the parser will return early with a partial document and
|
712
|
+
* the returned `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
714
|
+
* Default: `400`.
|
715
|
+
*/
|
716
|
+
int max_attributes;
|
717
|
+
|
709
718
|
/**
|
710
719
|
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
711
720
|
* the parser will return early with a partial document and the returned
|
@@ -796,6 +805,16 @@ typedef enum {
|
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_TREE_TOO_DEEP,
|
798
807
|
|
808
|
+
/**
|
809
|
+
* Indicates that the maximum number of attributes per element
|
810
|
+
* (`GumboOptions::max_attributes`) was reached during parsing. The
|
811
|
+
* resulting tree will be a partial document, with no further nodes
|
812
|
+
* created after the point where the limit was reached. The partial
|
813
|
+
* document may be useful for constructing an error message but
|
814
|
+
* typically shouldn't be used for other purposes.
|
815
|
+
*/
|
816
|
+
GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
|
817
|
+
|
799
818
|
// Currently unused
|
800
819
|
GUMBO_STATUS_OUT_OF_MEMORY,
|
801
820
|
} GumboOutputStatus;
|
@@ -817,13 +836,17 @@ typedef struct GumboInternalOutput {
|
|
817
836
|
|
818
837
|
/**
|
819
838
|
* A list of errors that occurred during the parse.
|
820
|
-
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
821
|
-
* fleshed out and may change in the future. For this reason, the GumboError
|
822
|
-
* header isn't part of the public API. Contact us if you need errors
|
823
|
-
* reported so we can work out something appropriate for your use-case.
|
824
839
|
*/
|
825
840
|
GumboVector /* GumboError */ errors;
|
826
841
|
|
842
|
+
/**
|
843
|
+
* True if the parser encountered an error.
|
844
|
+
*
|
845
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
846
|
+
* option was set to 0.
|
847
|
+
*/
|
848
|
+
bool document_error;
|
849
|
+
|
827
850
|
/**
|
828
851
|
* A status code indicating whether parsing finished successfully or was
|
829
852
|
* stopped mid-document due to exceptional circumstances.
|
@@ -866,6 +889,53 @@ const char* gumbo_status_to_string(GumboOutputStatus status);
|
|
866
889
|
/** Release the memory used for the parse tree and parse errors. */
|
867
890
|
void gumbo_destroy_output(GumboOutput* output);
|
868
891
|
|
892
|
+
/** Opaque GumboError type */
|
893
|
+
typedef struct GumboInternalError GumboError;
|
894
|
+
|
895
|
+
/**
|
896
|
+
* Returns the position of the error.
|
897
|
+
*/
|
898
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error);
|
899
|
+
|
900
|
+
/**
|
901
|
+
* Returns a constant string representation of the error's code. This is owned
|
902
|
+
* by the library and should not be freed by the caller.
|
903
|
+
*/
|
904
|
+
const char* gumbo_error_code(const GumboError* error);
|
905
|
+
|
906
|
+
/**
|
907
|
+
* Prints an error to a string. This stores a freshly-allocated buffer
|
908
|
+
* containing the error message text in output. The caller is responsible for
|
909
|
+
* freeing the buffer. The size of the error message is returned. The error
|
910
|
+
* message itself may not be NULL-terminated and may contain NULL bytes so the
|
911
|
+
* returned size must be used.
|
912
|
+
*/
|
913
|
+
size_t gumbo_error_to_string(const GumboError* error, char **output);
|
914
|
+
|
915
|
+
/**
|
916
|
+
* Prints a caret diagnostic to a string. This stores a freshly-allocated
|
917
|
+
* buffer containing the error message text in output. The caller is responsible for
|
918
|
+
* freeing the buffer. The size of the error message is returned. The error
|
919
|
+
* message itself may not be NULL-terminated and may contain NULL bytes so the
|
920
|
+
* returned size must be used.
|
921
|
+
*/
|
922
|
+
size_t gumbo_caret_diagnostic_to_string (
|
923
|
+
const GumboError* error,
|
924
|
+
const char* source_text,
|
925
|
+
size_t source_length,
|
926
|
+
char** output
|
927
|
+
);
|
928
|
+
|
929
|
+
/**
|
930
|
+
* Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
|
931
|
+
* instead of writing to a string.
|
932
|
+
*/
|
933
|
+
void gumbo_print_caret_diagnostic (
|
934
|
+
const GumboError* error,
|
935
|
+
const char* source_text,
|
936
|
+
size_t source_length
|
937
|
+
);
|
938
|
+
|
869
939
|
#ifdef __cplusplus
|
870
940
|
}
|
871
941
|
#endif
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -31,6 +31,7 @@
|
|
31
31
|
#include "replacement.h"
|
32
32
|
#include "tokenizer.h"
|
33
33
|
#include "tokenizer_states.h"
|
34
|
+
#include "token_buffer.h"
|
34
35
|
#include "utf8.h"
|
35
36
|
#include "util.h"
|
36
37
|
#include "vector.h"
|
@@ -42,11 +43,12 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
|
|
42
43
|
|
43
44
|
#define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
|
44
45
|
#define kGumboEmptySourcePosition (const GumboSourcePosition) \
|
45
|
-
|
46
|
+
GUMBO_EMPTY_SOURCE_POSITION_INIT
|
46
47
|
|
47
48
|
const GumboOptions kGumboDefaultOptions = {
|
48
49
|
.tab_stop = 8,
|
49
50
|
.stop_on_first_error = false,
|
51
|
+
.max_attributes = 400,
|
50
52
|
.max_tree_depth = 400,
|
51
53
|
.max_errors = -1,
|
52
54
|
.fragment_context = NULL,
|
@@ -59,25 +61,6 @@ const GumboOptions kGumboDefaultOptions = {
|
|
59
61
|
#define STRING(s) {.data = s, .length = sizeof(s) - 1}
|
60
62
|
#define TERMINATOR {.data = NULL, .length = 0}
|
61
63
|
|
62
|
-
static const GumboStringPiece kPublicIdHtml4_0 =
|
63
|
-
STRING("-//W3C//DTD HTML 4.0//EN");
|
64
|
-
static const GumboStringPiece kPublicIdHtml4_01 =
|
65
|
-
STRING("-//W3C//DTD HTML 4.01//EN");
|
66
|
-
static const GumboStringPiece kPublicIdXhtml1_0 =
|
67
|
-
STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
|
68
|
-
static const GumboStringPiece kPublicIdXhtml1_1 =
|
69
|
-
STRING("-//W3C//DTD XHTML 1.1//EN");
|
70
|
-
static const GumboStringPiece kSystemIdRecHtml4_0 =
|
71
|
-
STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
|
72
|
-
static const GumboStringPiece kSystemIdHtml4 =
|
73
|
-
STRING("http://www.w3.org/TR/html4/strict.dtd");
|
74
|
-
static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
|
75
|
-
STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
|
76
|
-
static const GumboStringPiece kSystemIdXhtml1_1 =
|
77
|
-
STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
|
78
|
-
static const GumboStringPiece kSystemIdLegacyCompat =
|
79
|
-
STRING("about:legacy-compat");
|
80
|
-
|
81
64
|
// The doctype arrays have an explicit terminator because we want to pass them
|
82
65
|
// to a helper function, and passing them as a pointer discards sizeof
|
83
66
|
// information. The SVG arrays are used only by one-off functions, and so loops
|
@@ -260,6 +243,9 @@ typedef struct GumboInternalParserState {
|
|
260
243
|
// The accumulated text node buffer state.
|
261
244
|
TextNodeBufferState _text_node;
|
262
245
|
|
246
|
+
// The accumulated character tokens in tables for error purposes.
|
247
|
+
GumboCharacterTokenBuffer _table_character_tokens;
|
248
|
+
|
263
249
|
// The current token.
|
264
250
|
GumboToken* _current_token;
|
265
251
|
|
@@ -351,6 +337,7 @@ static void output_init(GumboParser* parser) {
|
|
351
337
|
GumboOutput* output = gumbo_alloc(sizeof(GumboOutput));
|
352
338
|
output->root = NULL;
|
353
339
|
output->document = new_document_node();
|
340
|
+
output->document_error = false;
|
354
341
|
output->status = GUMBO_STATUS_OK;
|
355
342
|
parser->_output = output;
|
356
343
|
gumbo_init_errors(parser);
|
@@ -365,6 +352,7 @@ static void parser_state_init(GumboParser* parser) {
|
|
365
352
|
parser_state->_foster_parent_insertions = false;
|
366
353
|
parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
|
367
354
|
gumbo_string_buffer_init(&parser_state->_text_node._buffer);
|
355
|
+
gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
|
368
356
|
gumbo_vector_init(10, &parser_state->_open_elements);
|
369
357
|
gumbo_vector_init(5, &parser_state->_active_formatting_elements);
|
370
358
|
gumbo_vector_init(5, &parser_state->_template_insertion_modes);
|
@@ -463,6 +451,7 @@ static void parser_state_destroy(GumboParser* parser) {
|
|
463
451
|
gumbo_vector_destroy(&state->_open_elements);
|
464
452
|
gumbo_vector_destroy(&state->_template_insertion_modes);
|
465
453
|
gumbo_string_buffer_destroy(&state->_text_node._buffer);
|
454
|
+
gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
|
466
455
|
gumbo_free(state);
|
467
456
|
}
|
468
457
|
|
@@ -573,11 +562,11 @@ static bool tag_in (
|
|
573
562
|
static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
|
574
563
|
if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
|
575
564
|
return token->v.start_tag.tag == tag;
|
576
|
-
}
|
565
|
+
}
|
566
|
+
if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
|
577
567
|
return token->v.end_tag.tag == tag;
|
578
|
-
} else {
|
579
|
-
return false;
|
580
568
|
}
|
569
|
+
return false;
|
581
570
|
}
|
582
571
|
|
583
572
|
static inline bool tagset_includes (
|
@@ -621,6 +610,14 @@ static bool node_qualified_tagname_is (
|
|
621
610
|
return !gumbo_ascii_strcasecmp(element_name, name);
|
622
611
|
}
|
623
612
|
|
613
|
+
static bool node_html_tagname_is (
|
614
|
+
const GumboNode* node,
|
615
|
+
GumboTag tag,
|
616
|
+
const char *name
|
617
|
+
) {
|
618
|
+
return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name);
|
619
|
+
}
|
620
|
+
|
624
621
|
static bool node_tagname_is (
|
625
622
|
const GumboNode* node,
|
626
623
|
GumboTag tag,
|
@@ -646,7 +643,6 @@ static bool node_qualified_tag_is (
|
|
646
643
|
|
647
644
|
// Like node_tag_in, but for the single-tag case in the HTML namespace
|
648
645
|
static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
|
649
|
-
assert(tag != GUMBO_TAG_UNKNOWN);
|
650
646
|
return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
|
651
647
|
}
|
652
648
|
|
@@ -738,18 +734,18 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) {
|
|
738
734
|
assert(0);
|
739
735
|
}
|
740
736
|
|
741
|
-
static
|
737
|
+
static void parser_add_parse_error (
|
742
738
|
GumboParser* parser,
|
743
739
|
const GumboToken* token
|
744
740
|
) {
|
745
741
|
gumbo_debug("Adding parse error.\n");
|
746
742
|
GumboError* error = gumbo_add_error(parser);
|
747
743
|
if (!error) {
|
748
|
-
return
|
744
|
+
return;
|
749
745
|
}
|
750
746
|
error->type = GUMBO_ERR_PARSER;
|
751
747
|
error->position = token->position;
|
752
|
-
error->original_text = token->original_text
|
748
|
+
error->original_text = token->original_text;
|
753
749
|
GumboParserError* extra_data = &error->v.parser;
|
754
750
|
extra_data->input_type = token->type;
|
755
751
|
extra_data->input_tag = GUMBO_TAG_UNKNOWN;
|
@@ -772,7 +768,6 @@ static GumboError* parser_add_parse_error (
|
|
772
768
|
&extra_data->tag_stack
|
773
769
|
);
|
774
770
|
}
|
775
|
-
return error;
|
776
771
|
}
|
777
772
|
|
778
773
|
// https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
|
@@ -1639,9 +1634,11 @@ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node)
|
|
1639
1634
|
const GumboNodeType type = current->type;
|
1640
1635
|
if (current == node) {
|
1641
1636
|
return true;
|
1642
|
-
}
|
1637
|
+
}
|
1638
|
+
if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
|
1643
1639
|
continue;
|
1644
|
-
}
|
1640
|
+
}
|
1641
|
+
if (node_tag_in_set(current, &tags)) {
|
1645
1642
|
return false;
|
1646
1643
|
}
|
1647
1644
|
}
|
@@ -1687,14 +1684,18 @@ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag t
|
|
1687
1684
|
// https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags
|
1688
1685
|
// "exception" is the "element to exclude from the process" listed in the spec.
|
1689
1686
|
// Pass GUMBO_TAG_LAST to not exclude any of them.
|
1690
|
-
static void generate_implied_end_tags(
|
1687
|
+
static void generate_implied_end_tags (
|
1688
|
+
GumboParser* parser,
|
1689
|
+
GumboTag exception,
|
1690
|
+
const char* exception_name
|
1691
|
+
) {
|
1691
1692
|
static const TagSet tags = {
|
1692
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(
|
1693
|
-
TAG(P), TAG(
|
1693
|
+
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
|
1694
|
+
TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
|
1694
1695
|
};
|
1695
1696
|
while (
|
1696
1697
|
node_tag_in_set(get_current_node(parser), &tags)
|
1697
|
-
&& !
|
1698
|
+
&& !node_html_tagname_is(get_current_node(parser), exception, exception_name)
|
1698
1699
|
) {
|
1699
1700
|
pop_current_node(parser);
|
1700
1701
|
}
|
@@ -1704,15 +1705,36 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
|
|
1704
1705
|
// https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
|
1705
1706
|
static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
|
1706
1707
|
static const TagSet tags = {
|
1707
|
-
TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(
|
1708
|
-
TAG(
|
1709
|
-
TAG(TD), TAG(TFOOT), TAG(TH), TAG(
|
1708
|
+
TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
|
1709
|
+
TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
|
1710
|
+
TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
1710
1711
|
};
|
1711
1712
|
while (node_tag_in_set(get_current_node(parser), &tags)) {
|
1712
1713
|
pop_current_node(parser);
|
1713
1714
|
}
|
1714
1715
|
}
|
1715
1716
|
|
1717
|
+
// This factors out the clauses in the "in body" insertion mode checking "if
|
1718
|
+
// there is a node in the stack of open elements that is not" one of a list of
|
1719
|
+
// elements in which case it's a parse error.
|
1720
|
+
// This is used in "an end-of-file token", "an end tag whose tag name is
|
1721
|
+
// 'body'", and "an end tag whose tag name is 'html'".
|
1722
|
+
static bool stack_contains_nonclosable_element (
|
1723
|
+
GumboParser* parser
|
1724
|
+
) {
|
1725
|
+
static const TagSet tags = {
|
1726
|
+
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
|
1727
|
+
TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
|
1728
|
+
TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
|
1729
|
+
};
|
1730
|
+
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
1731
|
+
for (size_t i = 0; i < open_elements->length; ++i) {
|
1732
|
+
if (!node_tag_in_set(open_elements->data[i], &tags))
|
1733
|
+
return true;
|
1734
|
+
}
|
1735
|
+
return false;
|
1736
|
+
}
|
1737
|
+
|
1716
1738
|
// This factors out the clauses relating to "act as if an end tag token with tag
|
1717
1739
|
// name "table" had been seen. Returns true if there's a table element in table
|
1718
1740
|
// scope which was successfully closed, false if not and the token should be
|
@@ -1732,37 +1754,35 @@ static bool close_table(GumboParser* parser) {
|
|
1732
1754
|
|
1733
1755
|
// This factors out the clauses relating to "act as if an end tag token with tag
|
1734
1756
|
// name `cell_tag` had been seen".
|
1735
|
-
static
|
1757
|
+
static void close_table_cell (
|
1736
1758
|
GumboParser* parser,
|
1737
1759
|
const GumboToken* token,
|
1738
1760
|
GumboTag cell_tag
|
1739
1761
|
) {
|
1740
|
-
|
1741
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
1762
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
1742
1763
|
const GumboNode* node = get_current_node(parser);
|
1743
|
-
if (!node_html_tag_is(node, cell_tag))
|
1764
|
+
if (!node_html_tag_is(node, cell_tag))
|
1744
1765
|
parser_add_parse_error(parser, token);
|
1745
|
-
result = false;
|
1746
|
-
}
|
1747
1766
|
do {
|
1748
1767
|
node = pop_current_node(parser);
|
1749
1768
|
} while (!node_html_tag_is(node, cell_tag));
|
1750
1769
|
|
1751
1770
|
clear_active_formatting_elements(parser);
|
1752
1771
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
1753
|
-
return result;
|
1754
1772
|
}
|
1755
1773
|
|
1756
1774
|
// https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
|
1757
1775
|
// This holds the logic to determine whether we should close a <td> or a <th>.
|
1758
|
-
static
|
1776
|
+
static void close_current_cell(GumboParser* parser, const GumboToken* token) {
|
1777
|
+
GumboTag cell_tag;
|
1759
1778
|
if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
|
1760
1779
|
assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
|
1761
|
-
|
1780
|
+
cell_tag = GUMBO_TAG_TD;
|
1762
1781
|
} else {
|
1763
1782
|
assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
|
1764
|
-
|
1783
|
+
cell_tag = GUMBO_TAG_TH;
|
1765
1784
|
}
|
1785
|
+
close_table_cell(parser, token, cell_tag);
|
1766
1786
|
}
|
1767
1787
|
|
1768
1788
|
// This factors out the "act as if an end tag of tag name 'select' had been
|
@@ -1819,14 +1839,14 @@ static bool is_special_node(const GumboNode* node) {
|
|
1819
1839
|
// specified qualified name. If the elements closed are in the set handled by
|
1820
1840
|
// generate_implied_end_tags, this is normal operation and this function returns
|
1821
1841
|
// true. Otherwise, a parse error is recorded and this function returns false.
|
1822
|
-
static
|
1842
|
+
static void implicitly_close_tags (
|
1823
1843
|
GumboParser* parser,
|
1824
1844
|
GumboToken* token,
|
1825
1845
|
GumboNamespaceEnum target_ns,
|
1826
1846
|
GumboTag target
|
1827
1847
|
) {
|
1828
|
-
|
1829
|
-
generate_implied_end_tags(parser, target);
|
1848
|
+
assert(target != GUMBO_TAG_UNKNOWN);
|
1849
|
+
generate_implied_end_tags(parser, target, NULL);
|
1830
1850
|
if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
|
1831
1851
|
parser_add_parse_error(parser, token);
|
1832
1852
|
while (
|
@@ -1834,30 +1854,27 @@ static bool implicitly_close_tags (
|
|
1834
1854
|
) {
|
1835
1855
|
pop_current_node(parser);
|
1836
1856
|
}
|
1837
|
-
result = false;
|
1838
1857
|
}
|
1839
1858
|
assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
|
1840
1859
|
pop_current_node(parser);
|
1841
|
-
return result;
|
1842
1860
|
}
|
1843
1861
|
|
1844
1862
|
// If the stack of open elements has a <p> tag in button scope, this acts as if
|
1845
1863
|
// a </p> tag was encountered, implicitly closing tags. Returns false if a
|
1846
1864
|
// parse error occurs. This is a convenience function because this particular
|
1847
1865
|
// clause appears several times in the spec.
|
1848
|
-
static
|
1866
|
+
static void maybe_implicitly_close_p_tag (
|
1849
1867
|
GumboParser* parser,
|
1850
1868
|
GumboToken* token
|
1851
1869
|
) {
|
1852
1870
|
if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
|
1853
|
-
|
1871
|
+
implicitly_close_tags (
|
1854
1872
|
parser,
|
1855
1873
|
token,
|
1856
1874
|
GUMBO_NAMESPACE_HTML,
|
1857
1875
|
GUMBO_TAG_P
|
1858
1876
|
);
|
1859
1877
|
}
|
1860
|
-
return true;
|
1861
1878
|
}
|
1862
1879
|
|
1863
1880
|
// Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
|
@@ -1868,7 +1885,7 @@ static void maybe_implicitly_close_list_tag (
|
|
1868
1885
|
bool is_li
|
1869
1886
|
) {
|
1870
1887
|
GumboParserState* state = parser->_parser_state;
|
1871
|
-
|
1888
|
+
set_frameset_not_ok(parser);
|
1872
1889
|
for (int i = state->_open_elements.length; --i >= 0;) {
|
1873
1890
|
const GumboNode* node = state->_open_elements.data[i];
|
1874
1891
|
bool is_list_tag = is_li
|
@@ -1884,6 +1901,7 @@ static void maybe_implicitly_close_list_tag (
|
|
1884
1901
|
);
|
1885
1902
|
return;
|
1886
1903
|
}
|
1904
|
+
|
1887
1905
|
if (
|
1888
1906
|
is_special_node(node)
|
1889
1907
|
&& !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
|
@@ -2009,40 +2027,19 @@ static void adjust_mathml_attributes(GumboToken* token) {
|
|
2009
2027
|
attr->name = gumbo_strdup("definitionURL");
|
2010
2028
|
}
|
2011
2029
|
|
2012
|
-
static
|
2013
|
-
const GumboTokenDocType* doctype,
|
2014
|
-
const GumboStringPiece* public_id,
|
2015
|
-
const GumboStringPiece* system_id,
|
2016
|
-
bool allow_missing_system_id
|
2017
|
-
) {
|
2018
|
-
return
|
2019
|
-
!strcmp(doctype->public_identifier, public_id->data)
|
2020
|
-
&& (allow_missing_system_id || doctype->has_system_identifier)
|
2021
|
-
&& !strcmp(doctype->system_identifier, system_id->data);
|
2022
|
-
}
|
2023
|
-
|
2024
|
-
static bool maybe_add_doctype_error (
|
2030
|
+
static void maybe_add_doctype_error (
|
2025
2031
|
GumboParser* parser,
|
2026
2032
|
const GumboToken* token
|
2027
2033
|
) {
|
2028
2034
|
const GumboTokenDocType* doctype = &token->v.doc_type;
|
2029
|
-
|
2030
|
-
|
2031
|
-
|
2032
|
-
|
2033
|
-
|
2034
|
-
|
2035
|
-
&kSystemIdRecHtml4_0, true) ||
|
2036
|
-
doctype_matches(doctype, &kPublicIdHtml4_01,
|
2037
|
-
&kSystemIdHtml4, true) ||
|
2038
|
-
doctype_matches(doctype, &kPublicIdXhtml1_0,
|
2039
|
-
&kSystemIdXhtmlStrict1_1, false) ||
|
2040
|
-
doctype_matches(doctype, &kPublicIdXhtml1_1,
|
2041
|
-
&kSystemIdXhtml1_1, false)))) {
|
2035
|
+
if (
|
2036
|
+
strcmp(doctype->name, "html")
|
2037
|
+
|| doctype->has_public_identifier
|
2038
|
+
|| (doctype->has_system_identifier
|
2039
|
+
&& strcmp(doctype->system_identifier, "about:legacy-compat"))
|
2040
|
+
) {
|
2042
2041
|
parser_add_parse_error(parser, token);
|
2043
|
-
return false;
|
2044
2042
|
}
|
2045
|
-
return true;
|
2046
2043
|
}
|
2047
2044
|
|
2048
2045
|
static void remove_from_parent(GumboNode* node) {
|
@@ -2067,39 +2064,115 @@ static void remove_from_parent(GumboNode* node) {
|
|
2067
2064
|
}
|
2068
2065
|
}
|
2069
2066
|
|
2067
|
+
// This is here to clean up memory when the spec says "Ignore current token."
|
2068
|
+
static void ignore_token(GumboParser* parser) {
|
2069
|
+
GumboToken* token = parser->_parser_state->_current_token;
|
2070
|
+
// Ownership of the token's internal buffers is normally transferred to the
|
2071
|
+
// element, but if no element is emitted (as happens in non-verbatim-mode
|
2072
|
+
// when a token is ignored), we need to free it here to prevent a memory
|
2073
|
+
// leak.
|
2074
|
+
gumbo_token_destroy(token);
|
2075
|
+
#ifndef NDEBUG
|
2076
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
2077
|
+
// Mark this sentinel so the assertion in the main loop knows it's been
|
2078
|
+
// destroyed.
|
2079
|
+
token->v.start_tag.attributes = kGumboEmptyVector;
|
2080
|
+
token->v.start_tag.name = NULL;
|
2081
|
+
}
|
2082
|
+
#endif
|
2083
|
+
}
|
2084
|
+
|
2085
|
+
// The token is usually an end tag; however, the adoption agency algorithm may
|
2086
|
+
// invoke this for an 'a' or 'nobr' start tag.
|
2087
|
+
// Any parse error is recorded with parser_add_parse_error(); nothing is returned.
|
2088
|
+
static void in_body_any_other_end_tag(GumboParser* parser, GumboToken* token)
|
2089
|
+
{
|
2090
|
+
GumboParserState* state = parser->_parser_state;
|
2091
|
+
GumboTag tag;
|
2092
|
+
const char* tagname;
|
2093
|
+
|
2094
|
+
if (token->type == GUMBO_TOKEN_END_TAG) {
|
2095
|
+
tag = token->v.end_tag.tag;
|
2096
|
+
tagname = token->v.end_tag.name;
|
2097
|
+
} else {
|
2098
|
+
assert(token->type == GUMBO_TOKEN_START_TAG);
|
2099
|
+
tag = token->v.start_tag.tag;
|
2100
|
+
tagname = token->v.start_tag.name;
|
2101
|
+
}
|
2102
|
+
|
2103
|
+
assert(state->_open_elements.length > 0);
|
2104
|
+
assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
|
2105
|
+
// Walk up the stack of open elements until we find one that either:
|
2106
|
+
// a) Matches the tag name we saw
|
2107
|
+
// b) Is in the "special" category.
|
2108
|
+
// If we see a), implicitly close everything up to and including it. If we
|
2109
|
+
// see b), then record a parse error, don't close anything (except the
|
2110
|
+
// implied end tags) and ignore the end tag token.
|
2111
|
+
for (int i = state->_open_elements.length; --i >= 0;) {
|
2112
|
+
const GumboNode* node = state->_open_elements.data[i];
|
2113
|
+
if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, tagname)) {
|
2114
|
+
generate_implied_end_tags(parser, tag, tagname);
|
2115
|
+
// <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example of an error.
|
2116
|
+
// foo is the "current node" but sarcasm is node.
|
2117
|
+
// XXX: Write a test for this.
|
2118
|
+
if (node != get_current_node(parser)) {
|
2119
|
+
parser_add_parse_error(parser, token);
|
2120
|
+
}
|
2121
|
+
while (node != pop_current_node(parser))
|
2122
|
+
; // Pop everything.
|
2123
|
+
return;
|
2124
|
+
} else if (is_special_node(node)) {
|
2125
|
+
parser_add_parse_error(parser, token);
|
2126
|
+
ignore_token(parser);
|
2127
|
+
return;
|
2128
|
+
}
|
2129
|
+
}
|
2130
|
+
// <html> is in the special category, so we should never get here.
|
2131
|
+
assert(0 && "unreachable");
|
2132
|
+
}
|
2133
|
+
|
2070
2134
|
// https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
|
2071
2135
|
// Also described in the "in body" handling for end formatting tags.
|
2072
|
-
|
2073
|
-
|
2074
|
-
|
2075
|
-
GumboTag subject
|
2076
|
-
) {
|
2136
|
+
// Any parse error is recorded with parser_add_parse_error(); nothing is returned.
|
2137
|
+
static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
|
2138
|
+
{
|
2077
2139
|
GumboParserState* state = parser->_parser_state;
|
2078
2140
|
gumbo_debug("Entering adoption agency algorithm.\n");
|
2079
2141
|
// Step 1.
|
2142
|
+
GumboTag subject;
|
2143
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
2144
|
+
subject = token->v.start_tag.tag;
|
2145
|
+
} else {
|
2146
|
+
assert(token->type == GUMBO_TOKEN_END_TAG);
|
2147
|
+
subject = token->v.end_tag.tag;
|
2148
|
+
}
|
2149
|
+
assert(subject != GUMBO_TAG_UNKNOWN);
|
2150
|
+
|
2151
|
+
// Step 2.
|
2080
2152
|
GumboNode* current_node = get_current_node(parser);
|
2081
2153
|
if (
|
2082
|
-
current_node
|
2083
|
-
&& current_node->v.element.tag == subject
|
2154
|
+
node_html_tag_is(current_node, subject)
|
2084
2155
|
&& -1 == gumbo_vector_index_of (
|
2085
2156
|
&state->_active_formatting_elements,
|
2086
2157
|
current_node
|
2087
2158
|
)
|
2088
2159
|
) {
|
2089
2160
|
pop_current_node(parser);
|
2090
|
-
return
|
2161
|
+
return;
|
2091
2162
|
}
|
2092
|
-
|
2163
|
+
|
2164
|
+
// Steps 3-5 & 21:
|
2093
2165
|
for (unsigned int i = 0; i < 8; ++i) {
|
2094
|
-
// Step
|
2166
|
+
// Step 6.
|
2095
2167
|
GumboNode* formatting_node = NULL;
|
2096
2168
|
int formatting_node_in_open_elements = -1;
|
2097
2169
|
for (int j = state->_active_formatting_elements.length; --j >= 0;) {
|
2098
2170
|
GumboNode* current_node = state->_active_formatting_elements.data[j];
|
2099
2171
|
if (current_node == &kActiveFormattingScopeMarker) {
|
2100
2172
|
gumbo_debug("Broke on scope marker; aborting.\n");
|
2101
|
-
// Last scope marker; abort the algorithm
|
2102
|
-
|
2173
|
+
// Last scope marker; abort the algorithm and handle according to "any
|
2174
|
+
// other end tag" (below).
|
2175
|
+
break;
|
2103
2176
|
}
|
2104
2177
|
if (node_html_tag_is(current_node, subject)) {
|
2105
2178
|
// Found it.
|
@@ -2121,10 +2194,11 @@ static bool adoption_agency_algorithm (
|
|
2121
2194
|
// "any other end tag" clause (which may potentially add a parse error,
|
2122
2195
|
// but not always).
|
2123
2196
|
gumbo_debug("No active formatting elements; aborting.\n");
|
2124
|
-
|
2197
|
+
in_body_any_other_end_tag(parser, token);
|
2198
|
+
return;
|
2125
2199
|
}
|
2126
2200
|
|
2127
|
-
// Step
|
2201
|
+
// Step 7
|
2128
2202
|
if (formatting_node_in_open_elements == -1) {
|
2129
2203
|
gumbo_debug("Formatting node not on stack of open elements.\n");
|
2130
2204
|
parser_add_parse_error(parser, token);
|
@@ -2132,25 +2206,24 @@ static bool adoption_agency_algorithm (
|
|
2132
2206
|
formatting_node,
|
2133
2207
|
&state->_active_formatting_elements
|
2134
2208
|
);
|
2135
|
-
return
|
2209
|
+
return;
|
2136
2210
|
}
|
2137
2211
|
|
2138
|
-
// Step
|
2212
|
+
// Step 8
|
2139
2213
|
if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
|
2140
2214
|
parser_add_parse_error(parser, token);
|
2141
2215
|
gumbo_debug("Element not in scope.\n");
|
2142
|
-
return
|
2216
|
+
return;
|
2143
2217
|
}
|
2144
2218
|
|
2145
|
-
// Step
|
2146
|
-
if (formatting_node != get_current_node(parser))
|
2219
|
+
// Step 9
|
2220
|
+
if (formatting_node != get_current_node(parser))
|
2147
2221
|
parser_add_parse_error(parser, token); // But continue onwards.
|
2148
|
-
}
|
2149
2222
|
assert(formatting_node);
|
2150
2223
|
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
|
2151
2224
|
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
|
2152
2225
|
|
2153
|
-
// Step
|
2226
|
+
// Step 10
|
2154
2227
|
GumboNode* furthest_block = NULL;
|
2155
2228
|
for (
|
2156
2229
|
unsigned int j = formatting_node_in_open_elements;
|
@@ -2160,32 +2233,27 @@ static bool adoption_agency_algorithm (
|
|
2160
2233
|
assert(j > 0);
|
2161
2234
|
GumboNode* current = state->_open_elements.data[j];
|
2162
2235
|
if (is_special_node(current)) {
|
2163
|
-
// Step 9.
|
2164
2236
|
furthest_block = current;
|
2165
2237
|
break;
|
2166
2238
|
}
|
2167
2239
|
}
|
2240
|
+
// Step 11.
|
2168
2241
|
if (!furthest_block) {
|
2169
|
-
|
2170
|
-
|
2171
|
-
pop_current_node(parser);
|
2172
|
-
}
|
2173
|
-
// And the formatting element itself.
|
2174
|
-
pop_current_node(parser);
|
2242
|
+
while (pop_current_node(parser) != formatting_node)
|
2243
|
+
;
|
2175
2244
|
gumbo_vector_remove (
|
2176
2245
|
formatting_node,
|
2177
2246
|
&state->_active_formatting_elements
|
2178
2247
|
);
|
2179
|
-
return
|
2248
|
+
return;
|
2180
2249
|
}
|
2181
2250
|
assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
|
2182
|
-
assert(furthest_block);
|
2183
2251
|
|
2184
|
-
// Step
|
2252
|
+
// Step 12.
|
2185
2253
|
// Elements may be moved and reparented by this algorithm, so
|
2186
2254
|
// common_ancestor is not necessarily the same as formatting_node->parent.
|
2187
2255
|
GumboNode* common_ancestor = state->_open_elements.data [
|
2188
|
-
|
2256
|
+
formatting_node_in_open_elements - 1
|
2189
2257
|
];
|
2190
2258
|
gumbo_debug (
|
2191
2259
|
"Common ancestor tag = %s, furthest block tag = %s.\n",
|
@@ -2193,24 +2261,24 @@ static bool adoption_agency_algorithm (
|
|
2193
2261
|
gumbo_normalized_tagname(furthest_block->v.element.tag)
|
2194
2262
|
);
|
2195
2263
|
|
2196
|
-
// Step
|
2264
|
+
// Step 13.
|
2197
2265
|
int bookmark = 1 + gumbo_vector_index_of (
|
2198
2266
|
&state->_active_formatting_elements,
|
2199
2267
|
formatting_node
|
2200
2268
|
);
|
2201
2269
|
gumbo_debug("Bookmark at %d.\n", bookmark);
|
2202
|
-
// Step
|
2270
|
+
// Step 14.
|
2203
2271
|
GumboNode* node = furthest_block;
|
2204
2272
|
GumboNode* last_node = furthest_block;
|
2205
2273
|
// Must be stored explicitly, in case node is removed from the stack of open
|
2206
|
-
// elements, to handle step
|
2274
|
+
// elements, to handle step 14.3.
|
2207
2275
|
int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
2208
2276
|
assert(saved_node_index > 0);
|
2209
|
-
// Step
|
2277
|
+
// Step 14.1.
|
2210
2278
|
for (int j = 0;;) {
|
2211
|
-
// Step
|
2279
|
+
// Step 14.2.
|
2212
2280
|
++j;
|
2213
|
-
// Step
|
2281
|
+
// Step 14.3.
|
2214
2282
|
int node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
2215
2283
|
gumbo_debug (
|
2216
2284
|
"Current index: %d, last index: %d.\n",
|
@@ -2225,16 +2293,16 @@ static bool adoption_agency_algorithm (
|
|
2225
2293
|
assert((unsigned int) node_index < state->_open_elements.capacity);
|
2226
2294
|
node = state->_open_elements.data[node_index];
|
2227
2295
|
assert(node->parent);
|
2296
|
+
// Step 14.4.
|
2228
2297
|
if (node == formatting_node) {
|
2229
|
-
// Step 13.4.
|
2230
2298
|
break;
|
2231
2299
|
}
|
2232
2300
|
int formatting_index = gumbo_vector_index_of (
|
2233
2301
|
&state->_active_formatting_elements,
|
2234
2302
|
node
|
2235
2303
|
);
|
2304
|
+
// Step 14.5.
|
2236
2305
|
if (j > 3 && formatting_index != -1) {
|
2237
|
-
// Step 13.5.
|
2238
2306
|
gumbo_debug("Removing formatting element at %d.\n", formatting_index);
|
2239
2307
|
gumbo_vector_remove_at (
|
2240
2308
|
formatting_index,
|
@@ -2249,11 +2317,11 @@ static bool adoption_agency_algorithm (
|
|
2249
2317
|
continue;
|
2250
2318
|
}
|
2251
2319
|
if (formatting_index == -1) {
|
2252
|
-
// Step
|
2320
|
+
// Step 14.6.
|
2253
2321
|
gumbo_vector_remove_at(node_index, &state->_open_elements);
|
2254
2322
|
continue;
|
2255
2323
|
}
|
2256
|
-
// Step
|
2324
|
+
// Step 14.7.
|
2257
2325
|
// "common ancestor as the intended parent" doesn't actually mean insert
|
2258
2326
|
// it into the common ancestor; that happens below.
|
2259
2327
|
node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
|
@@ -2261,21 +2329,21 @@ static bool adoption_agency_algorithm (
|
|
2261
2329
|
state->_active_formatting_elements.data[formatting_index] = node;
|
2262
2330
|
assert(node_index >= 0);
|
2263
2331
|
state->_open_elements.data[node_index] = node;
|
2264
|
-
// Step
|
2332
|
+
// Step 14.8.
|
2265
2333
|
if (last_node == furthest_block) {
|
2266
2334
|
bookmark = formatting_index + 1;
|
2267
2335
|
gumbo_debug("Bookmark moved to %d.\n", bookmark);
|
2268
2336
|
assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
|
2269
2337
|
}
|
2270
|
-
// Step
|
2338
|
+
// Step 14.9.
|
2271
2339
|
last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
|
2272
2340
|
remove_from_parent(last_node);
|
2273
2341
|
append_node(node, last_node);
|
2274
|
-
// Step
|
2342
|
+
// Step 14.10.
|
2275
2343
|
last_node = node;
|
2276
|
-
} // Step
|
2344
|
+
} // Step 14.11.
|
2277
2345
|
|
2278
|
-
// Step
|
2346
|
+
// Step 15.
|
2279
2347
|
gumbo_debug (
|
2280
2348
|
"Removing %s node from parent ",
|
2281
2349
|
gumbo_normalized_tagname(last_node->v.element.tag)
|
@@ -2292,14 +2360,14 @@ static bool adoption_agency_algorithm (
|
|
2292
2360
|
);
|
2293
2361
|
insert_node(last_node, location);
|
2294
2362
|
|
2295
|
-
// Step
|
2363
|
+
// Step 16.
|
2296
2364
|
GumboNode* new_formatting_node = clone_node (
|
2297
2365
|
formatting_node,
|
2298
2366
|
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
|
2299
2367
|
);
|
2300
2368
|
formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
|
2301
2369
|
|
2302
|
-
// Step
|
2370
|
+
// Step 17. Instead of appending nodes one-by-one, we swap the children
|
2303
2371
|
// vector of furthest_block with the empty children of new_formatting_node,
|
2304
2372
|
// reducing memory traffic and allocations. We still have to reset their
|
2305
2373
|
// parent pointers, though.
|
@@ -2313,10 +2381,10 @@ static bool adoption_agency_algorithm (
|
|
2313
2381
|
child->parent = new_formatting_node;
|
2314
2382
|
}
|
2315
2383
|
|
2316
|
-
// Step
|
2384
|
+
// Step 18.
|
2317
2385
|
append_node(furthest_block, new_formatting_node);
|
2318
2386
|
|
2319
|
-
// Step
|
2387
|
+
// Step 19.
|
2320
2388
|
// If the formatting node was before the bookmark, it may shift over all
|
2321
2389
|
// indices after it, so we need to explicitly find the index and possibly
|
2322
2390
|
// adjust the bookmark.
|
@@ -2344,7 +2412,7 @@ static bool adoption_agency_algorithm (
|
|
2344
2412
|
&state->_active_formatting_elements
|
2345
2413
|
);
|
2346
2414
|
|
2347
|
-
// Step
|
2415
|
+
// Step 20.
|
2348
2416
|
gumbo_vector_remove(formatting_node, &state->_open_elements);
|
2349
2417
|
int insert_at = 1 + gumbo_vector_index_of (
|
2350
2418
|
&state->_open_elements,
|
@@ -2357,26 +2425,7 @@ static bool adoption_agency_algorithm (
|
|
2357
2425
|
insert_at,
|
2358
2426
|
&state->_open_elements
|
2359
2427
|
);
|
2360
|
-
} // Step
|
2361
|
-
return true;
|
2362
|
-
}
|
2363
|
-
|
2364
|
-
// This is here to clean up memory when the spec says "Ignore current token."
|
2365
|
-
static void ignore_token(GumboParser* parser) {
|
2366
|
-
GumboToken* token = parser->_parser_state->_current_token;
|
2367
|
-
// Ownership of the token's internal buffers are normally transferred to the
|
2368
|
-
// element, but if no element is emitted (as happens in non-verbatim-mode
|
2369
|
-
// when a token is ignored), we need to free it here to prevent a memory
|
2370
|
-
// leak.
|
2371
|
-
gumbo_token_destroy(token);
|
2372
|
-
#ifndef NDEBUG
|
2373
|
-
if (token->type == GUMBO_TOKEN_START_TAG) {
|
2374
|
-
// Mark this sentinel so the assertion in the main loop knows it's been
|
2375
|
-
// destroyed.
|
2376
|
-
token->v.start_tag.attributes = kGumboEmptyVector;
|
2377
|
-
token->v.start_tag.name = NULL;
|
2378
|
-
}
|
2379
|
-
#endif
|
2428
|
+
} // Step 21.
|
2380
2429
|
}
|
2381
2430
|
|
2382
2431
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-end
|
@@ -2401,125 +2450,139 @@ static void finish_parsing(GumboParser* parser) {
|
|
2401
2450
|
; // Pop them all.
|
2402
2451
|
}
|
2403
2452
|
|
2404
|
-
static
|
2453
|
+
static void handle_initial(GumboParser* parser, GumboToken* token) {
|
2405
2454
|
GumboDocument* document = &get_document_node(parser)->v.document;
|
2406
2455
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2407
2456
|
ignore_token(parser);
|
2408
|
-
return
|
2409
|
-
}
|
2457
|
+
return;
|
2458
|
+
}
|
2459
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2410
2460
|
append_comment_node(parser, get_document_node(parser), token);
|
2411
|
-
return
|
2412
|
-
}
|
2461
|
+
return;
|
2462
|
+
}
|
2463
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2413
2464
|
document->has_doctype = true;
|
2414
2465
|
document->name = token->v.doc_type.name;
|
2415
2466
|
document->public_identifier = token->v.doc_type.public_identifier;
|
2416
2467
|
document->system_identifier = token->v.doc_type.system_identifier;
|
2417
2468
|
document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
|
2418
2469
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
|
2419
|
-
|
2470
|
+
maybe_add_doctype_error(parser, token);
|
2471
|
+
return;
|
2420
2472
|
}
|
2421
2473
|
parser_add_parse_error(parser, token);
|
2422
2474
|
document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
|
2423
2475
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
|
2424
2476
|
parser->_parser_state->_reprocess_current_token = true;
|
2425
|
-
return true;
|
2426
2477
|
}
|
2427
2478
|
|
2428
2479
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
|
2429
|
-
static
|
2480
|
+
static void handle_before_html(GumboParser* parser, GumboToken* token) {
|
2430
2481
|
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2431
2482
|
parser_add_parse_error(parser, token);
|
2432
2483
|
ignore_token(parser);
|
2433
|
-
return
|
2434
|
-
}
|
2484
|
+
return;
|
2485
|
+
}
|
2486
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2435
2487
|
append_comment_node(parser, get_document_node(parser), token);
|
2436
|
-
return
|
2437
|
-
}
|
2488
|
+
return;
|
2489
|
+
}
|
2490
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2438
2491
|
ignore_token(parser);
|
2439
|
-
return
|
2440
|
-
}
|
2492
|
+
return;
|
2493
|
+
}
|
2494
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2441
2495
|
GumboNode* html_node = insert_element_from_token(parser, token);
|
2442
2496
|
parser->_output->root = html_node;
|
2443
2497
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2444
|
-
return
|
2445
|
-
}
|
2498
|
+
return;
|
2499
|
+
}
|
2500
|
+
if (
|
2446
2501
|
token->type == GUMBO_TOKEN_END_TAG
|
2447
2502
|
&& !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
|
2448
2503
|
) {
|
2449
2504
|
parser_add_parse_error(parser, token);
|
2450
2505
|
ignore_token(parser);
|
2451
|
-
return
|
2452
|
-
} else {
|
2453
|
-
GumboNode* html_node = insert_element_of_tag_type (
|
2454
|
-
parser,
|
2455
|
-
GUMBO_TAG_HTML,
|
2456
|
-
GUMBO_INSERTION_IMPLIED
|
2457
|
-
);
|
2458
|
-
assert(html_node);
|
2459
|
-
parser->_output->root = html_node;
|
2460
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2461
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2462
|
-
return true;
|
2506
|
+
return;
|
2463
2507
|
}
|
2508
|
+
GumboNode* html_node = insert_element_of_tag_type (
|
2509
|
+
parser,
|
2510
|
+
GUMBO_TAG_HTML,
|
2511
|
+
GUMBO_INSERTION_IMPLIED
|
2512
|
+
);
|
2513
|
+
assert(html_node);
|
2514
|
+
parser->_output->root = html_node;
|
2515
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2516
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2464
2517
|
}
|
2465
2518
|
|
2519
|
+
// Forward declarations because of mutual dependencies.
|
2520
|
+
static void handle_token(GumboParser* parser, GumboToken* token);
|
2521
|
+
static void handle_in_body(GumboParser* parser, GumboToken* token);
|
2522
|
+
static void handle_in_template(GumboParser* parser, GumboToken* token);
|
2523
|
+
|
2466
2524
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
|
2467
|
-
static
|
2468
|
-
if (token->type ==
|
2469
|
-
parser_add_parse_error(parser, token);
|
2525
|
+
static void handle_before_head(GumboParser* parser, GumboToken* token) {
|
2526
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2470
2527
|
ignore_token(parser);
|
2471
|
-
return
|
2472
|
-
}
|
2528
|
+
return;
|
2529
|
+
}
|
2530
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2473
2531
|
append_comment_node(parser, get_current_node(parser), token);
|
2474
|
-
return
|
2475
|
-
}
|
2532
|
+
return;
|
2533
|
+
}
|
2534
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2535
|
+
parser_add_parse_error(parser, token);
|
2476
2536
|
ignore_token(parser);
|
2477
|
-
return
|
2478
|
-
}
|
2537
|
+
return;
|
2538
|
+
}
|
2539
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2540
|
+
handle_in_body(parser, token);
|
2541
|
+
return;
|
2542
|
+
}
|
2543
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
|
2479
2544
|
GumboNode* node = insert_element_from_token(parser, token);
|
2480
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2481
2545
|
parser->_parser_state->_head_element = node;
|
2482
|
-
|
2483
|
-
|
2546
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2547
|
+
return;
|
2548
|
+
}
|
2549
|
+
if (
|
2484
2550
|
token->type == GUMBO_TOKEN_END_TAG
|
2485
|
-
&& !tag_in(token,
|
2551
|
+
&& !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
|
2486
2552
|
) {
|
2487
2553
|
parser_add_parse_error(parser, token);
|
2488
2554
|
ignore_token(parser);
|
2489
|
-
return
|
2490
|
-
} else {
|
2491
|
-
GumboNode* node = insert_element_of_tag_type (
|
2492
|
-
parser,
|
2493
|
-
GUMBO_TAG_HEAD,
|
2494
|
-
GUMBO_INSERTION_IMPLIED
|
2495
|
-
);
|
2496
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2497
|
-
parser->_parser_state->_head_element = node;
|
2498
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2499
|
-
return true;
|
2555
|
+
return;
|
2500
2556
|
}
|
2557
|
+
GumboNode* node = insert_element_of_tag_type (
|
2558
|
+
parser,
|
2559
|
+
GUMBO_TAG_HEAD,
|
2560
|
+
GUMBO_INSERTION_IMPLIED
|
2561
|
+
);
|
2562
|
+
parser->_parser_state->_head_element = node;
|
2563
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2564
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2501
2565
|
}
|
2502
2566
|
|
2503
|
-
// Forward declarations because of mutual dependencies.
|
2504
|
-
static bool handle_token(GumboParser* parser, GumboToken* token);
|
2505
|
-
static bool handle_in_body(GumboParser* parser, GumboToken* token);
|
2506
|
-
static bool handle_in_template(GumboParser* parser, GumboToken* token);
|
2507
|
-
|
2508
2567
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
|
2509
|
-
static
|
2568
|
+
static void handle_in_head(GumboParser* parser, GumboToken* token) {
|
2510
2569
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2511
2570
|
insert_text_token(parser, token);
|
2512
|
-
return
|
2513
|
-
}
|
2571
|
+
return;
|
2572
|
+
}
|
2573
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2574
|
+
append_comment_node(parser, get_current_node(parser), token);
|
2575
|
+
return;
|
2576
|
+
}
|
2577
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2514
2578
|
parser_add_parse_error(parser, token);
|
2515
2579
|
ignore_token(parser);
|
2516
|
-
return
|
2517
|
-
}
|
2518
|
-
|
2519
|
-
return true;
|
2520
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2580
|
+
return;
|
2581
|
+
}
|
2582
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2521
2583
|
return handle_in_body(parser, token);
|
2522
|
-
}
|
2584
|
+
}
|
2585
|
+
if (
|
2523
2586
|
tag_in(token, kStartTag, &(const TagSet) {
|
2524
2587
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
|
2525
2588
|
})
|
@@ -2527,8 +2590,9 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2527
2590
|
insert_element_from_token(parser, token);
|
2528
2591
|
pop_current_node(parser);
|
2529
2592
|
acknowledge_self_closing_tag(parser);
|
2530
|
-
return
|
2531
|
-
}
|
2593
|
+
return;
|
2594
|
+
}
|
2595
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
|
2532
2596
|
insert_element_from_token(parser, token);
|
2533
2597
|
pop_current_node(parser);
|
2534
2598
|
acknowledge_self_closing_tag(parser);
|
@@ -2536,90 +2600,98 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2536
2600
|
// spec doesn't apply. If clients want to handle meta-tag re-encoding, they
|
2537
2601
|
// should specifically look for that string in the document and re-encode it
|
2538
2602
|
// before passing to Gumbo.
|
2539
|
-
return
|
2540
|
-
}
|
2603
|
+
return;
|
2604
|
+
}
|
2605
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
|
2541
2606
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
2542
|
-
return
|
2543
|
-
}
|
2607
|
+
return;
|
2608
|
+
}
|
2609
|
+
if (
|
2544
2610
|
tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
|
2545
2611
|
) {
|
2546
2612
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2547
|
-
return
|
2548
|
-
}
|
2613
|
+
return;
|
2614
|
+
}
|
2615
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
|
2549
2616
|
insert_element_from_token(parser, token);
|
2550
2617
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
|
2551
|
-
return
|
2552
|
-
}
|
2553
|
-
|
2554
|
-
|
2555
|
-
|
2618
|
+
return;
|
2619
|
+
}
|
2620
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
|
2621
|
+
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
|
2622
|
+
return;
|
2623
|
+
}
|
2624
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
|
2556
2625
|
GumboNode* head = pop_current_node(parser);
|
2557
2626
|
UNUSED_IF_NDEBUG(head);
|
2558
2627
|
assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
|
2559
2628
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2560
|
-
return
|
2561
|
-
}
|
2629
|
+
return;
|
2630
|
+
}
|
2631
|
+
if (
|
2562
2632
|
tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
|
2563
2633
|
) {
|
2564
2634
|
pop_current_node(parser);
|
2565
2635
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2566
2636
|
parser->_parser_state->_reprocess_current_token = true;
|
2567
|
-
return
|
2568
|
-
}
|
2637
|
+
return;
|
2638
|
+
}
|
2639
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
|
2569
2640
|
insert_element_from_token(parser, token);
|
2570
2641
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
2571
|
-
parser
|
2642
|
+
set_frameset_not_ok(parser);
|
2572
2643
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2573
2644
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2574
|
-
return
|
2575
|
-
}
|
2645
|
+
return;
|
2646
|
+
}
|
2647
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2576
2648
|
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2577
2649
|
parser_add_parse_error(parser, token);
|
2578
2650
|
ignore_token(parser);
|
2579
|
-
return
|
2651
|
+
return;
|
2580
2652
|
}
|
2581
2653
|
generate_all_implied_end_tags_thoroughly(parser);
|
2582
|
-
|
2583
|
-
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
|
2654
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE))
|
2584
2655
|
parser_add_parse_error(parser, token);
|
2585
|
-
success = false;
|
2586
|
-
}
|
2587
2656
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
|
2588
2657
|
;
|
2589
2658
|
clear_active_formatting_elements(parser);
|
2590
2659
|
pop_template_insertion_mode(parser);
|
2591
2660
|
reset_insertion_mode_appropriately(parser);
|
2592
|
-
return
|
2593
|
-
}
|
2661
|
+
return;
|
2662
|
+
}
|
2663
|
+
if (
|
2594
2664
|
tag_is(token, kStartTag, GUMBO_TAG_HEAD)
|
2595
2665
|
|| (token->type == GUMBO_TOKEN_END_TAG)
|
2596
2666
|
) {
|
2597
2667
|
parser_add_parse_error(parser, token);
|
2598
2668
|
ignore_token(parser);
|
2599
|
-
return
|
2600
|
-
} else {
|
2601
|
-
pop_current_node(parser);
|
2602
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2603
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2604
|
-
return true;
|
2669
|
+
return;
|
2605
2670
|
}
|
2606
|
-
|
2671
|
+
pop_current_node(parser);
|
2672
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2673
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2674
|
+
return;
|
2607
2675
|
}
|
2608
2676
|
|
2609
2677
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
|
2610
|
-
static
|
2678
|
+
static void handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
2611
2679
|
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2612
2680
|
parser_add_parse_error(parser, token);
|
2613
|
-
return
|
2614
|
-
}
|
2615
|
-
|
2616
|
-
|
2681
|
+
return;
|
2682
|
+
}
|
2683
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2684
|
+
handle_in_body(parser, token);
|
2685
|
+
return;
|
2686
|
+
}
|
2687
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
|
2617
2688
|
const GumboNode* node = pop_current_node(parser);
|
2618
2689
|
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2619
2690
|
UNUSED_IF_NDEBUG(node);
|
2620
2691
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2621
|
-
return
|
2622
|
-
}
|
2692
|
+
return;
|
2693
|
+
}
|
2694
|
+
if (
|
2623
2695
|
token->type == GUMBO_TOKEN_WHITESPACE
|
2624
2696
|
|| token->type == GUMBO_TOKEN_COMMENT
|
2625
2697
|
|| tag_in (token, kStartTag, &(const TagSet) {
|
@@ -2627,8 +2699,10 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2627
2699
|
TAG(META), TAG(NOFRAMES), TAG(STYLE)
|
2628
2700
|
})
|
2629
2701
|
) {
|
2630
|
-
|
2631
|
-
|
2702
|
+
handle_in_head(parser, token);
|
2703
|
+
return;
|
2704
|
+
}
|
2705
|
+
if (
|
2632
2706
|
tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
|
2633
2707
|
|| (
|
2634
2708
|
token->type == GUMBO_TOKEN_END_TAG
|
@@ -2637,43 +2711,48 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2637
2711
|
) {
|
2638
2712
|
parser_add_parse_error(parser, token);
|
2639
2713
|
ignore_token(parser);
|
2640
|
-
return
|
2641
|
-
} else {
|
2642
|
-
parser_add_parse_error(parser, token);
|
2643
|
-
const GumboNode* node = pop_current_node(parser);
|
2644
|
-
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2645
|
-
UNUSED_IF_NDEBUG(node);
|
2646
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2647
|
-
parser->_parser_state->_reprocess_current_token = true;
|
2648
|
-
return false;
|
2714
|
+
return;
|
2649
2715
|
}
|
2716
|
+
parser_add_parse_error(parser, token);
|
2717
|
+
const GumboNode* node = pop_current_node(parser);
|
2718
|
+
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2719
|
+
UNUSED_IF_NDEBUG(node);
|
2720
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2721
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2650
2722
|
}
|
2651
2723
|
|
2652
2724
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
|
2653
|
-
static
|
2725
|
+
static void handle_after_head(GumboParser* parser, GumboToken* token) {
|
2654
2726
|
GumboParserState* state = parser->_parser_state;
|
2655
2727
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2656
2728
|
insert_text_token(parser, token);
|
2657
|
-
return
|
2658
|
-
}
|
2729
|
+
return;
|
2730
|
+
}
|
2731
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2732
|
+
append_comment_node(parser, get_current_node(parser), token);
|
2733
|
+
return;
|
2734
|
+
}
|
2735
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2659
2736
|
parser_add_parse_error(parser, token);
|
2660
2737
|
ignore_token(parser);
|
2661
|
-
return
|
2662
|
-
}
|
2663
|
-
|
2664
|
-
|
2665
|
-
|
2666
|
-
|
2667
|
-
|
2738
|
+
return;
|
2739
|
+
}
|
2740
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2741
|
+
handle_in_body(parser, token);
|
2742
|
+
return;
|
2743
|
+
}
|
2744
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
|
2668
2745
|
insert_element_from_token(parser, token);
|
2669
|
-
|
2746
|
+
set_frameset_not_ok(parser);
|
2670
2747
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2671
|
-
return
|
2672
|
-
}
|
2748
|
+
return;
|
2749
|
+
}
|
2750
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2673
2751
|
insert_element_from_token(parser, token);
|
2674
2752
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
|
2675
|
-
return
|
2676
|
-
}
|
2753
|
+
return;
|
2754
|
+
}
|
2755
|
+
if (
|
2677
2756
|
tag_in(token, kStartTag, &(const TagSet) {
|
2678
2757
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
|
2679
2758
|
TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
|
@@ -2685,12 +2764,15 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2685
2764
|
// pending character tokens that should be attached to the root.
|
2686
2765
|
maybe_flush_text_node_buffer(parser);
|
2687
2766
|
gumbo_vector_add(state->_head_element, &state->_open_elements);
|
2688
|
-
|
2767
|
+
handle_in_head(parser, token);
|
2689
2768
|
gumbo_vector_remove(state->_head_element, &state->_open_elements);
|
2690
|
-
return
|
2691
|
-
}
|
2692
|
-
|
2693
|
-
|
2769
|
+
return;
|
2770
|
+
}
|
2771
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2772
|
+
handle_in_head(parser, token);
|
2773
|
+
return;
|
2774
|
+
}
|
2775
|
+
if (
|
2694
2776
|
tag_is(token, kStartTag, GUMBO_TAG_HEAD)
|
2695
2777
|
|| (
|
2696
2778
|
token->type == GUMBO_TOKEN_END_TAG
|
@@ -2699,53 +2781,57 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2699
2781
|
) {
|
2700
2782
|
parser_add_parse_error(parser, token);
|
2701
2783
|
ignore_token(parser);
|
2702
|
-
return
|
2703
|
-
} else {
|
2704
|
-
insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
|
2705
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2706
|
-
state->_reprocess_current_token = true;
|
2707
|
-
return true;
|
2784
|
+
return;
|
2708
2785
|
}
|
2786
|
+
insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
|
2787
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
2788
|
+
state->_reprocess_current_token = true;
|
2709
2789
|
}
|
2710
2790
|
|
2711
2791
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
|
2712
|
-
static
|
2792
|
+
static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
2713
2793
|
GumboParserState* state = parser->_parser_state;
|
2714
2794
|
assert(state->_open_elements.length > 0);
|
2715
2795
|
if (token->type == GUMBO_TOKEN_NULL) {
|
2716
2796
|
parser_add_parse_error(parser, token);
|
2717
2797
|
ignore_token(parser);
|
2718
|
-
return
|
2719
|
-
}
|
2798
|
+
return;
|
2799
|
+
}
|
2800
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
2720
2801
|
reconstruct_active_formatting_elements(parser);
|
2721
2802
|
insert_text_token(parser, token);
|
2722
|
-
return
|
2723
|
-
}
|
2803
|
+
return;
|
2804
|
+
}
|
2805
|
+
if (
|
2724
2806
|
token->type == GUMBO_TOKEN_CHARACTER
|
2725
2807
|
|| token->type == GUMBO_TOKEN_CDATA
|
2726
2808
|
) {
|
2727
2809
|
reconstruct_active_formatting_elements(parser);
|
2728
2810
|
insert_text_token(parser, token);
|
2729
2811
|
set_frameset_not_ok(parser);
|
2730
|
-
return
|
2731
|
-
}
|
2812
|
+
return;
|
2813
|
+
}
|
2814
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
2732
2815
|
append_comment_node(parser, get_current_node(parser), token);
|
2733
|
-
return
|
2734
|
-
}
|
2816
|
+
return;
|
2817
|
+
}
|
2818
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
2735
2819
|
parser_add_parse_error(parser, token);
|
2736
2820
|
ignore_token(parser);
|
2737
|
-
return
|
2738
|
-
}
|
2821
|
+
return;
|
2822
|
+
}
|
2823
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2739
2824
|
parser_add_parse_error(parser, token);
|
2740
2825
|
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2741
2826
|
ignore_token(parser);
|
2742
|
-
return
|
2827
|
+
return;
|
2743
2828
|
}
|
2744
2829
|
assert(parser->_output->root != NULL);
|
2745
2830
|
assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
|
2746
2831
|
merge_attributes(token, parser->_output->root);
|
2747
|
-
return
|
2748
|
-
}
|
2832
|
+
return;
|
2833
|
+
}
|
2834
|
+
if (
|
2749
2835
|
tag_in(token, kStartTag, &(const TagSet) {
|
2750
2836
|
TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
|
2751
2837
|
TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
|
@@ -2753,8 +2839,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2753
2839
|
})
|
2754
2840
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
2755
2841
|
) {
|
2756
|
-
|
2757
|
-
|
2842
|
+
handle_in_head(parser, token);
|
2843
|
+
return;
|
2844
|
+
}
|
2845
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
|
2758
2846
|
parser_add_parse_error(parser, token);
|
2759
2847
|
if (
|
2760
2848
|
state->_open_elements.length < 2
|
@@ -2762,12 +2850,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2762
2850
|
|| has_open_element(parser, GUMBO_TAG_TEMPLATE)
|
2763
2851
|
) {
|
2764
2852
|
ignore_token(parser);
|
2765
|
-
|
2853
|
+
} else {
|
2854
|
+
set_frameset_not_ok(parser);
|
2855
|
+
merge_attributes(token, state->_open_elements.data[1]);
|
2766
2856
|
}
|
2767
|
-
|
2768
|
-
|
2769
|
-
|
2770
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2857
|
+
return;
|
2858
|
+
}
|
2859
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2771
2860
|
parser_add_parse_error(parser, token);
|
2772
2861
|
if (
|
2773
2862
|
state->_open_elements.length < 2
|
@@ -2775,7 +2864,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2775
2864
|
|| !state->_frameset_ok
|
2776
2865
|
) {
|
2777
2866
|
ignore_token(parser);
|
2778
|
-
return
|
2867
|
+
return;
|
2779
2868
|
}
|
2780
2869
|
// Save the body node for later removal.
|
2781
2870
|
GumboNode* body_node = state->_open_elements.data[1];
|
@@ -2807,80 +2896,74 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2807
2896
|
// Insert the <frameset>, and switch the insertion mode.
|
2808
2897
|
insert_element_from_token(parser, token);
|
2809
2898
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
|
2810
|
-
return
|
2811
|
-
}
|
2812
|
-
|
2813
|
-
if (
|
2814
|
-
!node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
|
2815
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT),
|
2816
|
-
TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
|
2817
|
-
})
|
2818
|
-
) {
|
2819
|
-
parser_add_parse_error(parser, token);
|
2820
|
-
}
|
2821
|
-
}
|
2899
|
+
return;
|
2900
|
+
}
|
2901
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
2822
2902
|
if (get_current_template_insertion_mode(parser) !=
|
2823
2903
|
GUMBO_INSERTION_MODE_INITIAL) {
|
2824
|
-
|
2904
|
+
handle_in_template(parser, token);
|
2905
|
+
return;
|
2825
2906
|
}
|
2826
|
-
|
2827
|
-
|
2907
|
+
if (stack_contains_nonclosable_element(parser))
|
2908
|
+
parser_add_parse_error(parser, token);
|
2909
|
+
return;
|
2910
|
+
}
|
2911
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
|
2828
2912
|
if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
|
2829
2913
|
parser_add_parse_error(parser, token);
|
2830
2914
|
ignore_token(parser);
|
2831
|
-
return
|
2832
|
-
}
|
2833
|
-
bool success = true;
|
2834
|
-
for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
|
2835
|
-
if (
|
2836
|
-
!node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
|
2837
|
-
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
|
2838
|
-
TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
|
2839
|
-
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
|
2840
|
-
})
|
2841
|
-
) {
|
2842
|
-
parser_add_parse_error(parser, token);
|
2843
|
-
success = false;
|
2844
|
-
break;
|
2845
|
-
}
|
2915
|
+
return;
|
2846
2916
|
}
|
2917
|
+
if (stack_contains_nonclosable_element(parser))
|
2918
|
+
parser_add_parse_error(parser, token);
|
2919
|
+
GumboNode* body = state->_open_elements.data[1];
|
2920
|
+
assert(node_html_tag_is(body, GUMBO_TAG_BODY));
|
2921
|
+
record_end_of_element(state->_current_token, &body->v.element);
|
2847
2922
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
|
2852
|
-
|
2853
|
-
|
2923
|
+
return;
|
2924
|
+
}
|
2925
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
2926
|
+
if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
|
2927
|
+
parser_add_parse_error(parser, token);
|
2928
|
+
ignore_token(parser);
|
2929
|
+
return;
|
2854
2930
|
}
|
2855
|
-
|
2856
|
-
|
2931
|
+
if (stack_contains_nonclosable_element(parser))
|
2932
|
+
parser_add_parse_error(parser, token);
|
2933
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
|
2934
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2935
|
+
return;
|
2936
|
+
}
|
2937
|
+
if (
|
2857
2938
|
tag_in(token, kStartTag, &(const TagSet) {
|
2858
2939
|
TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
|
2859
2940
|
TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
|
2860
2941
|
TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
|
2861
|
-
TAG(
|
2942
|
+
TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
|
2862
2943
|
TAG(SUMMARY), TAG(UL)
|
2863
2944
|
})
|
2864
2945
|
) {
|
2865
|
-
|
2946
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2866
2947
|
insert_element_from_token(parser, token);
|
2867
|
-
return
|
2868
|
-
}
|
2869
|
-
|
2948
|
+
return;
|
2949
|
+
}
|
2950
|
+
if (tag_in(token, kStartTag, &heading_tags)) {
|
2951
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2870
2952
|
if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
|
2871
2953
|
parser_add_parse_error(parser, token);
|
2872
2954
|
pop_current_node(parser);
|
2873
|
-
result = false;
|
2874
2955
|
}
|
2875
2956
|
insert_element_from_token(parser, token);
|
2876
|
-
return
|
2877
|
-
}
|
2878
|
-
|
2957
|
+
return;
|
2958
|
+
}
|
2959
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
|
2960
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2879
2961
|
insert_element_from_token(parser, token);
|
2880
2962
|
state->_ignore_next_linefeed = true;
|
2881
|
-
|
2882
|
-
return
|
2883
|
-
}
|
2963
|
+
set_frameset_not_ok(parser);
|
2964
|
+
return;
|
2965
|
+
}
|
2966
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
2884
2967
|
if (
|
2885
2968
|
state->_form_element != NULL
|
2886
2969
|
&& !has_open_element(parser, GUMBO_TAG_TEMPLATE)
|
@@ -2888,46 +2971,48 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2888
2971
|
gumbo_debug("Ignoring nested form.\n");
|
2889
2972
|
parser_add_parse_error(parser, token);
|
2890
2973
|
ignore_token(parser);
|
2891
|
-
return
|
2974
|
+
return;
|
2892
2975
|
}
|
2893
|
-
|
2976
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2894
2977
|
GumboNode* form_element = insert_element_from_token(parser, token);
|
2895
2978
|
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2896
2979
|
state->_form_element = form_element;
|
2897
2980
|
}
|
2898
|
-
return
|
2899
|
-
}
|
2981
|
+
return;
|
2982
|
+
}
|
2983
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
|
2900
2984
|
maybe_implicitly_close_list_tag(parser, token, true);
|
2901
|
-
|
2985
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2902
2986
|
insert_element_from_token(parser, token);
|
2903
|
-
return
|
2904
|
-
}
|
2987
|
+
return;
|
2988
|
+
}
|
2989
|
+
if (tag_in(token, kStartTag, &dd_dt_tags)) {
|
2905
2990
|
maybe_implicitly_close_list_tag(parser, token, false);
|
2906
|
-
|
2991
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2907
2992
|
insert_element_from_token(parser, token);
|
2908
|
-
return
|
2909
|
-
}
|
2910
|
-
|
2993
|
+
return;
|
2994
|
+
}
|
2995
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
|
2996
|
+
maybe_implicitly_close_p_tag(parser, token);
|
2911
2997
|
insert_element_from_token(parser, token);
|
2912
2998
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
|
2913
|
-
return
|
2914
|
-
}
|
2999
|
+
return;
|
3000
|
+
}
|
3001
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
|
2915
3002
|
if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
|
2916
3003
|
parser_add_parse_error(parser, token);
|
2917
|
-
implicitly_close_tags
|
2918
|
-
|
2919
|
-
|
2920
|
-
|
2921
|
-
|
2922
|
-
);
|
2923
|
-
state->_reprocess_current_token = true;
|
2924
|
-
return false;
|
3004
|
+
// We don't want to use implicitly_close_tags here because it may add an
|
3005
|
+
// error and we've already added the only error the standard specifies.
|
3006
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3007
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
|
3008
|
+
;
|
2925
3009
|
}
|
2926
3010
|
reconstruct_active_formatting_elements(parser);
|
2927
3011
|
insert_element_from_token(parser, token);
|
2928
|
-
|
2929
|
-
return
|
2930
|
-
}
|
3012
|
+
set_frameset_not_ok(parser);
|
3013
|
+
return;
|
3014
|
+
}
|
3015
|
+
if (
|
2931
3016
|
tag_in(token, kEndTag, &(const TagSet) {
|
2932
3017
|
TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
|
2933
3018
|
TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
|
@@ -2940,33 +3025,29 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2940
3025
|
if (!has_an_element_in_scope(parser, tag)) {
|
2941
3026
|
parser_add_parse_error(parser, token);
|
2942
3027
|
ignore_token(parser);
|
2943
|
-
return
|
3028
|
+
return;
|
2944
3029
|
}
|
2945
|
-
implicitly_close_tags (
|
3030
|
+
return implicitly_close_tags (
|
2946
3031
|
parser,
|
2947
3032
|
token,
|
2948
3033
|
GUMBO_NAMESPACE_HTML,
|
2949
3034
|
token->v.end_tag.tag
|
2950
3035
|
);
|
2951
|
-
|
2952
|
-
|
3036
|
+
}
|
3037
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
|
2953
3038
|
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2954
3039
|
if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
|
2955
3040
|
parser_add_parse_error(parser, token);
|
2956
3041
|
ignore_token(parser);
|
2957
|
-
return
|
3042
|
+
return;
|
2958
3043
|
}
|
2959
|
-
|
2960
|
-
|
2961
|
-
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
|
3044
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3045
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM))
|
2962
3046
|
parser_add_parse_error(parser, token);
|
2963
|
-
return false;
|
2964
|
-
}
|
2965
3047
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
|
2966
3048
|
;
|
2967
|
-
return
|
3049
|
+
return;
|
2968
3050
|
} else {
|
2969
|
-
bool result = true;
|
2970
3051
|
GumboNode* node = state->_form_element;
|
2971
3052
|
assert(!node || node->type == GUMBO_NODE_ELEMENT);
|
2972
3053
|
state->_form_element = NULL;
|
@@ -2974,25 +3055,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2974
3055
|
gumbo_debug("Closing an unopened form.\n");
|
2975
3056
|
parser_add_parse_error(parser, token);
|
2976
3057
|
ignore_token(parser);
|
2977
|
-
return
|
3058
|
+
return;
|
2978
3059
|
}
|
2979
3060
|
// This differs from implicitly_close_tags because we remove *only* the
|
2980
3061
|
// <form> element; other nodes are left in scope.
|
2981
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2982
|
-
if (get_current_node(parser) != node)
|
3062
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3063
|
+
if (get_current_node(parser) != node)
|
2983
3064
|
parser_add_parse_error(parser, token);
|
2984
|
-
|
2985
|
-
} else {
|
3065
|
+
else
|
2986
3066
|
record_end_of_element(token, &node->v.element);
|
2987
|
-
}
|
2988
3067
|
|
2989
3068
|
GumboVector* open_elements = &state->_open_elements;
|
2990
3069
|
int index = gumbo_vector_index_of(open_elements, node);
|
2991
3070
|
assert(index >= 0);
|
2992
3071
|
gumbo_vector_remove_at(index, open_elements);
|
2993
|
-
return
|
3072
|
+
return;
|
2994
3073
|
}
|
2995
|
-
}
|
3074
|
+
}
|
3075
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
|
2996
3076
|
if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
|
2997
3077
|
parser_add_parse_error(parser, token);
|
2998
3078
|
// reconstruct_active_formatting_elements(parser);
|
@@ -3001,42 +3081,45 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3001
3081
|
GUMBO_TAG_P,
|
3002
3082
|
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3003
3083
|
);
|
3004
|
-
state->_reprocess_current_token = true;
|
3005
|
-
return false;
|
3006
3084
|
}
|
3007
|
-
|
3085
|
+
implicitly_close_tags (
|
3008
3086
|
parser,
|
3009
3087
|
token,
|
3010
3088
|
GUMBO_NAMESPACE_HTML,
|
3011
3089
|
GUMBO_TAG_P
|
3012
3090
|
);
|
3013
|
-
|
3091
|
+
return;
|
3092
|
+
}
|
3093
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
|
3014
3094
|
if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
|
3015
3095
|
parser_add_parse_error(parser, token);
|
3016
3096
|
ignore_token(parser);
|
3017
|
-
return
|
3097
|
+
return;
|
3018
3098
|
}
|
3019
|
-
|
3099
|
+
implicitly_close_tags (
|
3020
3100
|
parser,
|
3021
3101
|
token,
|
3022
3102
|
GUMBO_NAMESPACE_HTML,
|
3023
3103
|
GUMBO_TAG_LI
|
3024
3104
|
);
|
3025
|
-
|
3026
|
-
|
3105
|
+
return;
|
3106
|
+
}
|
3107
|
+
if (tag_in(token, kEndTag, &dd_dt_tags)) {
|
3027
3108
|
GumboTag token_tag = token->v.end_tag.tag;
|
3028
3109
|
if (!has_an_element_in_scope(parser, token_tag)) {
|
3029
3110
|
parser_add_parse_error(parser, token);
|
3030
3111
|
ignore_token(parser);
|
3031
|
-
return
|
3112
|
+
return;
|
3032
3113
|
}
|
3033
|
-
|
3114
|
+
implicitly_close_tags (
|
3034
3115
|
parser,
|
3035
3116
|
token,
|
3036
3117
|
GUMBO_NAMESPACE_HTML,
|
3037
3118
|
token_tag
|
3038
3119
|
);
|
3039
|
-
|
3120
|
+
return;
|
3121
|
+
}
|
3122
|
+
if (tag_in(token, kEndTag, &heading_tags)) {
|
3040
3123
|
if (
|
3041
3124
|
!has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
|
3042
3125
|
GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
|
@@ -3046,31 +3129,29 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3046
3129
|
// No heading open; ignore the token entirely.
|
3047
3130
|
parser_add_parse_error(parser, token);
|
3048
3131
|
ignore_token(parser);
|
3049
|
-
return
|
3050
|
-
}
|
3051
|
-
|
3052
|
-
|
3053
|
-
|
3054
|
-
|
3055
|
-
|
3056
|
-
|
3057
|
-
|
3058
|
-
|
3059
|
-
parser_add_parse_error(parser, token);
|
3060
|
-
}
|
3061
|
-
do {
|
3062
|
-
current_node = pop_current_node(parser);
|
3063
|
-
} while (!node_tag_in_set(current_node, &heading_tags));
|
3064
|
-
return success;
|
3132
|
+
return;
|
3133
|
+
}
|
3134
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3135
|
+
const GumboNode* current_node = get_current_node(parser);
|
3136
|
+
if (!node_html_tag_is(current_node, token->v.end_tag.tag)) {
|
3137
|
+
// There're children of the heading currently open; close them below and
|
3138
|
+
// record a parse error.
|
3139
|
+
// TODO(jdtang): Add a way to distinguish this error case from the one
|
3140
|
+
// above.
|
3141
|
+
parser_add_parse_error(parser, token);
|
3065
3142
|
}
|
3066
|
-
|
3067
|
-
|
3143
|
+
do {
|
3144
|
+
current_node = pop_current_node(parser);
|
3145
|
+
} while (!node_tag_in_set(current_node, &heading_tags));
|
3146
|
+
return;
|
3147
|
+
}
|
3148
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
|
3068
3149
|
int last_a;
|
3069
3150
|
int has_matching_a = find_last_anchor_index(parser, &last_a);
|
3070
3151
|
if (has_matching_a) {
|
3071
3152
|
assert(has_matching_a == 1);
|
3072
3153
|
parser_add_parse_error(parser, token);
|
3073
|
-
adoption_agency_algorithm(parser, token
|
3154
|
+
(void)adoption_agency_algorithm(parser, token);
|
3074
3155
|
// The adoption agency algorithm usually removes all instances of <a>
|
3075
3156
|
// from the list of active formatting elements, but in case it doesn't,
|
3076
3157
|
// we're supposed to do this. (The conditions where it might not are
|
@@ -3082,12 +3163,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3082
3163
|
);
|
3083
3164
|
gumbo_vector_remove(last_element, &state->_open_elements);
|
3084
3165
|
}
|
3085
|
-
success = false;
|
3086
3166
|
}
|
3087
3167
|
reconstruct_active_formatting_elements(parser);
|
3088
3168
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
3089
|
-
return
|
3090
|
-
}
|
3169
|
+
return;
|
3170
|
+
}
|
3171
|
+
if (
|
3091
3172
|
tag_in(token, kStartTag, &(const TagSet) {
|
3092
3173
|
TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
|
3093
3174
|
TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
|
@@ -3095,48 +3176,52 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3095
3176
|
) {
|
3096
3177
|
reconstruct_active_formatting_elements(parser);
|
3097
3178
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
3098
|
-
return
|
3099
|
-
}
|
3100
|
-
|
3179
|
+
return;
|
3180
|
+
}
|
3181
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
|
3101
3182
|
reconstruct_active_formatting_elements(parser);
|
3102
3183
|
if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
|
3103
|
-
result = false;
|
3104
3184
|
parser_add_parse_error(parser, token);
|
3105
|
-
adoption_agency_algorithm(parser, token
|
3185
|
+
adoption_agency_algorithm(parser, token);
|
3106
3186
|
reconstruct_active_formatting_elements(parser);
|
3107
3187
|
}
|
3108
3188
|
insert_element_from_token(parser, token);
|
3109
3189
|
add_formatting_element(parser, get_current_node(parser));
|
3110
|
-
return
|
3111
|
-
}
|
3190
|
+
return;
|
3191
|
+
}
|
3192
|
+
if (
|
3112
3193
|
tag_in(token, kEndTag, &(const TagSet) {
|
3113
3194
|
TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
|
3114
3195
|
TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
|
3115
3196
|
TAG(U)
|
3116
3197
|
})
|
3117
3198
|
) {
|
3118
|
-
|
3119
|
-
|
3199
|
+
adoption_agency_algorithm(parser, token);
|
3200
|
+
return;
|
3201
|
+
}
|
3202
|
+
if (
|
3120
3203
|
tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
|
3121
3204
|
) {
|
3122
3205
|
reconstruct_active_formatting_elements(parser);
|
3123
3206
|
insert_element_from_token(parser, token);
|
3124
3207
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3125
3208
|
set_frameset_not_ok(parser);
|
3126
|
-
return
|
3127
|
-
}
|
3209
|
+
return;
|
3210
|
+
}
|
3211
|
+
if (
|
3128
3212
|
tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
|
3129
3213
|
) {
|
3130
3214
|
GumboTag token_tag = token->v.end_tag.tag;
|
3131
|
-
if (!
|
3215
|
+
if (!has_an_element_in_scope(parser, token_tag)) {
|
3132
3216
|
parser_add_parse_error(parser, token);
|
3133
3217
|
ignore_token(parser);
|
3134
|
-
return
|
3218
|
+
return;
|
3135
3219
|
}
|
3136
3220
|
implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
|
3137
3221
|
clear_active_formatting_elements(parser);
|
3138
|
-
return
|
3139
|
-
}
|
3222
|
+
return;
|
3223
|
+
}
|
3224
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
|
3140
3225
|
if (
|
3141
3226
|
get_document_node(parser)->v.document.doc_type_quirks_mode
|
3142
3227
|
!= GUMBO_DOCTYPE_QUIRKS
|
@@ -3146,75 +3231,89 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3146
3231
|
insert_element_from_token(parser, token);
|
3147
3232
|
set_frameset_not_ok(parser);
|
3148
3233
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3149
|
-
return
|
3150
|
-
}
|
3151
|
-
|
3234
|
+
return;
|
3235
|
+
}
|
3236
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
|
3237
|
+
parser_add_parse_error(parser, token);
|
3238
|
+
reconstruct_active_formatting_elements(parser);
|
3239
|
+
insert_element_of_tag_type (
|
3240
|
+
parser,
|
3241
|
+
GUMBO_TAG_BR,
|
3242
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3243
|
+
);
|
3244
|
+
pop_current_node(parser);
|
3245
|
+
acknowledge_self_closing_tag(parser);
|
3246
|
+
set_frameset_not_ok(parser);
|
3247
|
+
return;
|
3248
|
+
}
|
3249
|
+
if (
|
3250
|
+
tag_in(token, kStartTag, &(const TagSet) {
|
3152
3251
|
TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
|
3153
3252
|
TAG(WBR)
|
3154
3253
|
})
|
3155
3254
|
) {
|
3156
|
-
bool
|
3157
|
-
if (
|
3158
|
-
success = false;
|
3255
|
+
bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
|
3256
|
+
if (is_image) {
|
3159
3257
|
parser_add_parse_error(parser, token);
|
3160
3258
|
token->v.start_tag.tag = GUMBO_TAG_IMG;
|
3161
3259
|
}
|
3162
3260
|
reconstruct_active_formatting_elements(parser);
|
3163
3261
|
GumboNode* node = insert_element_from_token(parser, token);
|
3164
|
-
if (
|
3165
|
-
success = false;
|
3166
|
-
parser_add_parse_error(parser, token);
|
3167
|
-
node->v.element.tag = GUMBO_TAG_IMG;
|
3262
|
+
if (is_image)
|
3168
3263
|
node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
|
3169
|
-
}
|
3170
3264
|
pop_current_node(parser);
|
3171
3265
|
acknowledge_self_closing_tag(parser);
|
3172
3266
|
set_frameset_not_ok(parser);
|
3173
|
-
return
|
3174
|
-
}
|
3175
|
-
|
3176
|
-
// Must be before the element is inserted, as that takes ownership of the
|
3177
|
-
// token's attribute vector.
|
3178
|
-
set_frameset_not_ok(parser);
|
3179
|
-
}
|
3267
|
+
return;
|
3268
|
+
}
|
3269
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
|
3180
3270
|
reconstruct_active_formatting_elements(parser);
|
3181
|
-
insert_element_from_token(parser, token);
|
3271
|
+
GumboNode *input = insert_element_from_token(parser, token);
|
3182
3272
|
pop_current_node(parser);
|
3183
3273
|
acknowledge_self_closing_tag(parser);
|
3184
|
-
|
3185
|
-
|
3274
|
+
if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
|
3275
|
+
set_frameset_not_ok(parser);
|
3276
|
+
return;
|
3277
|
+
}
|
3278
|
+
if (
|
3186
3279
|
tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
|
3187
3280
|
) {
|
3188
3281
|
insert_element_from_token(parser, token);
|
3189
3282
|
pop_current_node(parser);
|
3190
3283
|
acknowledge_self_closing_tag(parser);
|
3191
|
-
return
|
3192
|
-
}
|
3193
|
-
|
3284
|
+
return;
|
3285
|
+
}
|
3286
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
|
3287
|
+
maybe_implicitly_close_p_tag(parser, token);
|
3194
3288
|
insert_element_from_token(parser, token);
|
3195
3289
|
pop_current_node(parser);
|
3196
3290
|
acknowledge_self_closing_tag(parser);
|
3197
3291
|
set_frameset_not_ok(parser);
|
3198
|
-
return
|
3199
|
-
}
|
3292
|
+
return;
|
3293
|
+
}
|
3294
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
|
3200
3295
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
3201
3296
|
parser->_parser_state->_ignore_next_linefeed = true;
|
3202
3297
|
set_frameset_not_ok(parser);
|
3203
|
-
return
|
3204
|
-
}
|
3205
|
-
|
3298
|
+
return;
|
3299
|
+
}
|
3300
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
|
3301
|
+
maybe_implicitly_close_p_tag(parser, token);
|
3206
3302
|
reconstruct_active_formatting_elements(parser);
|
3207
3303
|
set_frameset_not_ok(parser);
|
3208
3304
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3209
|
-
return
|
3210
|
-
}
|
3305
|
+
return;
|
3306
|
+
}
|
3307
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
|
3211
3308
|
set_frameset_not_ok(parser);
|
3212
3309
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3213
|
-
return
|
3214
|
-
}
|
3310
|
+
return;
|
3311
|
+
}
|
3312
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
|
3215
3313
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
3216
|
-
return
|
3217
|
-
}
|
3314
|
+
return;
|
3315
|
+
}
|
3316
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
|
3218
3317
|
reconstruct_active_formatting_elements(parser);
|
3219
3318
|
insert_element_from_token(parser, token);
|
3220
3319
|
set_frameset_not_ok(parser);
|
@@ -3230,50 +3329,40 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3230
3329
|
} else {
|
3231
3330
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
|
3232
3331
|
}
|
3233
|
-
return
|
3234
|
-
}
|
3235
|
-
|
3332
|
+
return;
|
3333
|
+
}
|
3334
|
+
if (
|
3335
|
+
tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
|
3236
3336
|
) {
|
3237
3337
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3238
3338
|
pop_current_node(parser);
|
3239
3339
|
}
|
3240
3340
|
reconstruct_active_formatting_elements(parser);
|
3241
3341
|
insert_element_from_token(parser, token);
|
3242
|
-
return
|
3243
|
-
}
|
3244
|
-
|
3245
|
-
) {
|
3246
|
-
bool success = true;
|
3247
|
-
GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)})
|
3248
|
-
? GUMBO_TAG_RTC
|
3249
|
-
: GUMBO_TAG_LAST
|
3250
|
-
;
|
3342
|
+
return;
|
3343
|
+
}
|
3344
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
|
3251
3345
|
if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
|
3252
|
-
generate_implied_end_tags(parser,
|
3346
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3347
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY))
|
3348
|
+
parser_add_parse_error(parser, token);
|
3253
3349
|
}
|
3254
|
-
|
3255
|
-
|
3256
|
-
|
3257
|
-
|
3258
|
-
|
3259
|
-
)
|
3260
|
-
|
3261
|
-
|
3262
|
-
|
3350
|
+
insert_element_from_token(parser, token);
|
3351
|
+
return;
|
3352
|
+
}
|
3353
|
+
if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
|
3354
|
+
if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
|
3355
|
+
generate_implied_end_tags(parser, GUMBO_TAG_RTC, NULL);
|
3356
|
+
GumboNode* current = get_current_node(parser);
|
3357
|
+
if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
|
3358
|
+
!node_html_tag_is(current, GUMBO_TAG_RTC)) {
|
3359
|
+
parser_add_parse_error(parser, token);
|
3360
|
+
}
|
3263
3361
|
}
|
3264
3362
|
insert_element_from_token(parser, token);
|
3265
|
-
return
|
3266
|
-
}
|
3267
|
-
|
3268
|
-
reconstruct_active_formatting_elements(parser);
|
3269
|
-
insert_element_of_tag_type (
|
3270
|
-
parser,
|
3271
|
-
GUMBO_TAG_BR,
|
3272
|
-
GUMBO_INSERTION_CONVERTED_FROM_END_TAG
|
3273
|
-
);
|
3274
|
-
pop_current_node(parser);
|
3275
|
-
return false;
|
3276
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
|
3363
|
+
return;
|
3364
|
+
}
|
3365
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
|
3277
3366
|
reconstruct_active_formatting_elements(parser);
|
3278
3367
|
adjust_mathml_attributes(token);
|
3279
3368
|
adjust_foreign_attributes(token);
|
@@ -3282,8 +3371,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3282
3371
|
pop_current_node(parser);
|
3283
3372
|
acknowledge_self_closing_tag(parser);
|
3284
3373
|
}
|
3285
|
-
return
|
3286
|
-
}
|
3374
|
+
return;
|
3375
|
+
}
|
3376
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
|
3287
3377
|
reconstruct_active_formatting_elements(parser);
|
3288
3378
|
adjust_svg_attributes(token);
|
3289
3379
|
adjust_foreign_attributes(token);
|
@@ -3292,8 +3382,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3292
3382
|
pop_current_node(parser);
|
3293
3383
|
acknowledge_self_closing_tag(parser);
|
3294
3384
|
}
|
3295
|
-
return
|
3296
|
-
}
|
3385
|
+
return;
|
3386
|
+
}
|
3387
|
+
if (
|
3297
3388
|
tag_in(token, kStartTag, &(const TagSet) {
|
3298
3389
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
|
3299
3390
|
TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3301,82 +3392,50 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3301
3392
|
) {
|
3302
3393
|
parser_add_parse_error(parser, token);
|
3303
3394
|
ignore_token(parser);
|
3304
|
-
return
|
3305
|
-
}
|
3395
|
+
return;
|
3396
|
+
}
|
3397
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
3306
3398
|
reconstruct_active_formatting_elements(parser);
|
3307
3399
|
insert_element_from_token(parser, token);
|
3308
|
-
return
|
3309
|
-
} else {
|
3310
|
-
assert(token->type == GUMBO_TOKEN_END_TAG);
|
3311
|
-
GumboTag end_tag = token->v.end_tag.tag;
|
3312
|
-
const char *end_tagname = token->v.end_tag.name;
|
3313
|
-
assert(state->_open_elements.length > 0);
|
3314
|
-
assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
|
3315
|
-
// Walk up the stack of open elements until we find one that either:
|
3316
|
-
// a) Matches the tag name we saw
|
3317
|
-
// b) Is in the "special" category.
|
3318
|
-
// If we see a), implicitly close everything up to and including it. If we
|
3319
|
-
// see b), then record a parse error, don't close anything (except the
|
3320
|
-
// implied end tags) and ignore the end tag token.
|
3321
|
-
for (int i = state->_open_elements.length; --i >= 0;) {
|
3322
|
-
const GumboNode* node = state->_open_elements.data[i];
|
3323
|
-
if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
|
3324
|
-
generate_implied_end_tags(parser, end_tag);
|
3325
|
-
// TODO(jdtang): Do I need to add a parse error here? The condition in
|
3326
|
-
// the spec seems like it's the inverse of the loop condition above, and
|
3327
|
-
// so would never fire.
|
3328
|
-
// sfc: Yes, an error is needed here.
|
3329
|
-
// <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
|
3330
|
-
// foo is the "current node" but sarcasm is node.
|
3331
|
-
// XXX: Write a test for this.
|
3332
|
-
if (node != get_current_node(parser))
|
3333
|
-
parser_add_parse_error(parser, token);
|
3334
|
-
while (node != pop_current_node(parser))
|
3335
|
-
; // Pop everything.
|
3336
|
-
return true;
|
3337
|
-
} else if (is_special_node(node)) {
|
3338
|
-
parser_add_parse_error(parser, token);
|
3339
|
-
ignore_token(parser);
|
3340
|
-
return false;
|
3341
|
-
}
|
3342
|
-
}
|
3343
|
-
// <html> is in the special category, so we should never get here.
|
3344
|
-
assert(0);
|
3345
|
-
return false;
|
3400
|
+
return;
|
3346
3401
|
}
|
3402
|
+
in_body_any_other_end_tag(parser, token);
|
3347
3403
|
}
|
3348
3404
|
|
3349
3405
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
|
3350
|
-
static
|
3406
|
+
static void handle_text(GumboParser* parser, GumboToken* token) {
|
3351
3407
|
if (
|
3352
3408
|
token->type == GUMBO_TOKEN_CHARACTER
|
3353
3409
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3354
3410
|
) {
|
3355
3411
|
insert_text_token(parser, token);
|
3356
|
-
|
3357
|
-
// We provide only bare-bones script handling that doesn't involve any of
|
3358
|
-
// the parser-pause/already-started/script-nesting flags or re-entrant
|
3359
|
-
// invocations of the tokenizer. Because the intended usage of this library
|
3360
|
-
// is mostly for templating, refactoring, and static-analysis libraries, we
|
3361
|
-
// provide the script body as a text-node child of the <script> element.
|
3362
|
-
// This behavior doesn't support document.write of partial HTML elements,
|
3363
|
-
// but should be adequate for almost all other scripting support.
|
3364
|
-
if (token->type == GUMBO_TOKEN_EOF) {
|
3365
|
-
parser_add_parse_error(parser, token);
|
3366
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3367
|
-
}
|
3368
|
-
pop_current_node(parser);
|
3369
|
-
set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
|
3412
|
+
return;
|
3370
3413
|
}
|
3371
|
-
|
3414
|
+
// We provide only bare-bones script handling that doesn't involve any of
|
3415
|
+
// the parser-pause/already-started/script-nesting flags or re-entrant
|
3416
|
+
// invocations of the tokenizer. Because the intended usage of this library
|
3417
|
+
// is mostly for templating, refactoring, and static-analysis libraries, we
|
3418
|
+
// provide the script body as a text-node child of the <script> element.
|
3419
|
+
// This behavior doesn't support document.write of partial HTML elements,
|
3420
|
+
// but should be adequate for almost all other scripting support.
|
3421
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3422
|
+
parser_add_parse_error(parser, token);
|
3423
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3424
|
+
}
|
3425
|
+
pop_current_node(parser);
|
3426
|
+
set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
|
3372
3427
|
}
|
3373
3428
|
|
3374
3429
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
|
3375
|
-
static
|
3430
|
+
static void handle_in_table(GumboParser* parser, GumboToken* token) {
|
3376
3431
|
GumboParserState* state = parser->_parser_state;
|
3377
3432
|
if (
|
3378
|
-
token->type == GUMBO_TOKEN_CHARACTER
|
3379
|
-
|
3433
|
+
(token->type == GUMBO_TOKEN_CHARACTER
|
3434
|
+
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3435
|
+
|| token->type == GUMBO_TOKEN_NULL)
|
3436
|
+
&& node_tag_in_set(get_current_node(parser), &(const TagSet) {
|
3437
|
+
TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3438
|
+
})
|
3380
3439
|
) {
|
3381
3440
|
// The "pending table character tokens" list described in the spec is
|
3382
3441
|
// nothing more than the TextNodeBufferState. We accumulate text tokens as
|
@@ -3384,71 +3443,87 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3384
3443
|
// we set _foster_parent_insertions if there're non-whitespace characters in
|
3385
3444
|
// the buffer.
|
3386
3445
|
assert(state->_text_node._buffer.length == 0);
|
3446
|
+
assert(state->_table_character_tokens.length == 0);
|
3387
3447
|
state->_original_insertion_mode = state->_insertion_mode;
|
3388
3448
|
state->_reprocess_current_token = true;
|
3389
3449
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
|
3390
|
-
return
|
3391
|
-
}
|
3450
|
+
return;
|
3451
|
+
}
|
3452
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3453
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3454
|
+
return;
|
3455
|
+
}
|
3456
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3392
3457
|
parser_add_parse_error(parser, token);
|
3393
3458
|
ignore_token(parser);
|
3394
|
-
return
|
3395
|
-
}
|
3396
|
-
|
3397
|
-
return true;
|
3398
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
|
3459
|
+
return;
|
3460
|
+
}
|
3461
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
|
3399
3462
|
clear_stack_to_table_context(parser);
|
3400
3463
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3401
3464
|
insert_element_from_token(parser, token);
|
3402
3465
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
|
3403
|
-
return
|
3404
|
-
}
|
3466
|
+
return;
|
3467
|
+
}
|
3468
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
|
3405
3469
|
clear_stack_to_table_context(parser);
|
3406
3470
|
insert_element_from_token(parser, token);
|
3407
3471
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3408
|
-
return
|
3409
|
-
}
|
3472
|
+
return;
|
3473
|
+
}
|
3474
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3410
3475
|
clear_stack_to_table_context(parser);
|
3411
3476
|
insert_element_of_tag_type (
|
3412
3477
|
parser,
|
3413
3478
|
GUMBO_TAG_COLGROUP,
|
3414
3479
|
GUMBO_INSERTION_IMPLIED
|
3415
3480
|
);
|
3416
|
-
|
3481
|
+
state->_reprocess_current_token = true;
|
3417
3482
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3418
|
-
return
|
3419
|
-
}
|
3483
|
+
return;
|
3484
|
+
}
|
3485
|
+
if (
|
3420
3486
|
tag_in(token, kStartTag, &(const TagSet) {
|
3421
|
-
TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3487
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3422
3488
|
})
|
3423
3489
|
) {
|
3424
3490
|
clear_stack_to_table_context(parser);
|
3491
|
+
insert_element_from_token(parser, token);
|
3425
3492
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3426
|
-
|
3427
|
-
|
3428
|
-
|
3429
|
-
|
3430
|
-
|
3431
|
-
|
3432
|
-
|
3433
|
-
|
3434
|
-
|
3435
|
-
|
3436
|
-
|
3437
|
-
|
3493
|
+
return;
|
3494
|
+
}
|
3495
|
+
if (
|
3496
|
+
tag_in(token, kStartTag, &(const TagSet) {
|
3497
|
+
TAG(TD), TAG(TH), TAG(TR)
|
3498
|
+
})
|
3499
|
+
) {
|
3500
|
+
clear_stack_to_table_context(parser);
|
3501
|
+
insert_element_of_tag_type (
|
3502
|
+
parser,
|
3503
|
+
GUMBO_TAG_TBODY,
|
3504
|
+
GUMBO_INSERTION_IMPLIED
|
3505
|
+
);
|
3506
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3507
|
+
state->_reprocess_current_token = true;
|
3508
|
+
return;
|
3509
|
+
}
|
3510
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
|
3438
3511
|
parser_add_parse_error(parser, token);
|
3439
3512
|
if (close_table(parser)) {
|
3440
|
-
|
3513
|
+
state->_reprocess_current_token = true;
|
3441
3514
|
} else {
|
3442
3515
|
ignore_token(parser);
|
3443
3516
|
}
|
3444
|
-
return
|
3445
|
-
}
|
3517
|
+
return;
|
3518
|
+
}
|
3519
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
|
3446
3520
|
if (!close_table(parser)) {
|
3447
3521
|
parser_add_parse_error(parser, token);
|
3448
|
-
return
|
3522
|
+
return;
|
3449
3523
|
}
|
3450
|
-
return
|
3451
|
-
}
|
3524
|
+
return;
|
3525
|
+
}
|
3526
|
+
if (
|
3452
3527
|
tag_in(token, kEndTag, &(const TagSet) {
|
3453
3528
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
|
3454
3529
|
TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3456,102 +3531,103 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3456
3531
|
) {
|
3457
3532
|
parser_add_parse_error(parser, token);
|
3458
3533
|
ignore_token(parser);
|
3459
|
-
return
|
3460
|
-
}
|
3534
|
+
return;
|
3535
|
+
}
|
3536
|
+
if (
|
3461
3537
|
tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
|
3462
3538
|
|| (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
|
3463
3539
|
) {
|
3464
|
-
|
3465
|
-
|
3540
|
+
handle_in_head(parser, token);
|
3541
|
+
return;
|
3542
|
+
}
|
3543
|
+
if (
|
3466
3544
|
tag_is(token, kStartTag, GUMBO_TAG_INPUT)
|
3467
3545
|
&& attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
|
3468
3546
|
) {
|
3469
3547
|
parser_add_parse_error(parser, token);
|
3470
3548
|
insert_element_from_token(parser, token);
|
3471
3549
|
pop_current_node(parser);
|
3472
|
-
|
3473
|
-
|
3550
|
+
acknowledge_self_closing_tag(parser);
|
3551
|
+
return;
|
3552
|
+
}
|
3553
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
3474
3554
|
parser_add_parse_error(parser, token);
|
3475
3555
|
if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
3476
3556
|
ignore_token(parser);
|
3477
|
-
return
|
3557
|
+
return;
|
3478
3558
|
}
|
3479
3559
|
state->_form_element = insert_element_from_token(parser, token);
|
3480
3560
|
pop_current_node(parser);
|
3481
|
-
return
|
3482
|
-
}
|
3483
|
-
|
3484
|
-
|
3485
|
-
|
3486
|
-
state->_foster_parent_insertions = true;
|
3487
|
-
bool result = handle_in_body(parser, token);
|
3488
|
-
state->_foster_parent_insertions = false;
|
3489
|
-
return result;
|
3561
|
+
return;
|
3562
|
+
}
|
3563
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3564
|
+
handle_in_body(parser, token);
|
3565
|
+
return;
|
3490
3566
|
}
|
3567
|
+
// foster-parenting-start-tag or foster-parenting-end-tag error
|
3568
|
+
parser_add_parse_error(parser, token);
|
3569
|
+
state->_foster_parent_insertions = true;
|
3570
|
+
handle_in_body(parser, token);
|
3571
|
+
state->_foster_parent_insertions = false;
|
3491
3572
|
}
|
3492
3573
|
|
3493
3574
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
|
3494
|
-
static
|
3575
|
+
static void handle_in_table_text(GumboParser* parser, GumboToken* token) {
|
3495
3576
|
if (token->type == GUMBO_TOKEN_NULL) {
|
3496
3577
|
parser_add_parse_error(parser, token);
|
3497
3578
|
ignore_token(parser);
|
3498
|
-
return
|
3499
|
-
}
|
3500
|
-
|
3501
|
-
|
3502
|
-
|
3579
|
+
return;
|
3580
|
+
}
|
3581
|
+
GumboParserState* state = parser->_parser_state;
|
3582
|
+
// Non-whitespace tokens will cause parse errors later.
|
3583
|
+
// It's not entirely clear from the spec how this is supposed to work.
|
3584
|
+
// https://github.com/whatwg/html/issues/4046
|
3585
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE
|
3586
|
+
|| token->type == GUMBO_TOKEN_CHARACTER) {
|
3503
3587
|
insert_text_token(parser, token);
|
3504
|
-
|
3505
|
-
|
3506
|
-
|
3507
|
-
|
3508
|
-
|
3509
|
-
|
3510
|
-
//
|
3511
|
-
//
|
3588
|
+
gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
|
3589
|
+
return;
|
3590
|
+
}
|
3591
|
+
|
3592
|
+
GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
|
3593
|
+
if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
|
3594
|
+
// Each character in buffer is an error. Unfortunately, that means we need
|
3595
|
+
// to emit a bunch of errors at the appropriate locations.
|
3512
3596
|
for (size_t i = 0, n = buffer->length; i < n; ++i) {
|
3513
|
-
|
3514
|
-
|
3515
|
-
|
3516
|
-
|
3517
|
-
case '\r':
|
3518
|
-
case ' ':
|
3519
|
-
continue;
|
3520
|
-
default:
|
3521
|
-
state->_foster_parent_insertions = true;
|
3522
|
-
reconstruct_active_formatting_elements(parser);
|
3523
|
-
goto loopbreak;
|
3524
|
-
}
|
3597
|
+
GumboToken tok;
|
3598
|
+
gumbo_character_token_buffer_get(buffer, i, &tok);
|
3599
|
+
// foster-parenting-character error
|
3600
|
+
parser_add_parse_error(parser, &tok);
|
3525
3601
|
}
|
3526
|
-
|
3527
|
-
|
3528
|
-
|
3529
|
-
state->_reprocess_current_token = true;
|
3530
|
-
state->_insertion_mode = state->_original_insertion_mode;
|
3531
|
-
return true;
|
3602
|
+
state->_foster_parent_insertions = true;
|
3603
|
+
set_frameset_not_ok(parser);
|
3604
|
+
reconstruct_active_formatting_elements(parser);
|
3532
3605
|
}
|
3606
|
+
maybe_flush_text_node_buffer(parser);
|
3607
|
+
gumbo_character_token_buffer_clear(buffer);
|
3608
|
+
state->_foster_parent_insertions = false;
|
3609
|
+
state->_reprocess_current_token = true;
|
3610
|
+
state->_insertion_mode = state->_original_insertion_mode;
|
3533
3611
|
}
|
3534
3612
|
|
3535
3613
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
|
3536
|
-
static
|
3614
|
+
static void handle_in_caption(GumboParser* parser, GumboToken* token) {
|
3537
3615
|
if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
|
3538
3616
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
|
3539
3617
|
parser_add_parse_error(parser, token);
|
3540
3618
|
ignore_token(parser);
|
3541
|
-
return
|
3542
|
-
} else {
|
3543
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3544
|
-
bool result = true;
|
3545
|
-
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
|
3546
|
-
parser_add_parse_error(parser, token);
|
3547
|
-
}
|
3548
|
-
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3549
|
-
;
|
3550
|
-
clear_active_formatting_elements(parser);
|
3551
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3552
|
-
return result;
|
3619
|
+
return;
|
3553
3620
|
}
|
3554
|
-
|
3621
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3622
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
|
3623
|
+
parser_add_parse_error(parser, token);
|
3624
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3625
|
+
;
|
3626
|
+
clear_active_formatting_elements(parser);
|
3627
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3628
|
+
return;
|
3629
|
+
}
|
3630
|
+
if (
|
3555
3631
|
tag_in(token, kStartTag, &(const TagSet) {
|
3556
3632
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
|
3557
3633
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3561,15 +3637,19 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
|
3561
3637
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
|
3562
3638
|
parser_add_parse_error(parser, token);
|
3563
3639
|
ignore_token(parser);
|
3564
|
-
return
|
3640
|
+
return;
|
3565
3641
|
}
|
3642
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
3643
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
|
3644
|
+
parser_add_parse_error(parser, token);
|
3566
3645
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
|
3567
3646
|
;
|
3568
3647
|
clear_active_formatting_elements(parser);
|
3569
3648
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3570
3649
|
parser->_parser_state->_reprocess_current_token = true;
|
3571
|
-
return
|
3572
|
-
}
|
3650
|
+
return;
|
3651
|
+
}
|
3652
|
+
if (
|
3573
3653
|
tag_in(token, kEndTag, &(const TagSet) {
|
3574
3654
|
TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
|
3575
3655
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3577,91 +3657,102 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
|
3577
3657
|
) {
|
3578
3658
|
parser_add_parse_error(parser, token);
|
3579
3659
|
ignore_token(parser);
|
3580
|
-
return
|
3581
|
-
} else {
|
3582
|
-
return handle_in_body(parser, token);
|
3660
|
+
return;
|
3583
3661
|
}
|
3662
|
+
handle_in_body(parser, token);
|
3584
3663
|
}
|
3585
3664
|
|
3586
3665
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
|
3587
|
-
static
|
3666
|
+
static void handle_in_column_group(GumboParser* parser, GumboToken* token) {
|
3588
3667
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
3589
3668
|
insert_text_token(parser, token);
|
3590
|
-
return
|
3591
|
-
}
|
3669
|
+
return;
|
3670
|
+
}
|
3671
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3672
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3673
|
+
return;
|
3674
|
+
}
|
3675
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3592
3676
|
parser_add_parse_error(parser, token);
|
3593
3677
|
ignore_token(parser);
|
3594
|
-
return
|
3595
|
-
}
|
3596
|
-
|
3597
|
-
|
3598
|
-
|
3599
|
-
|
3600
|
-
|
3678
|
+
return;
|
3679
|
+
}
|
3680
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3681
|
+
handle_in_body(parser, token);
|
3682
|
+
return;
|
3683
|
+
}
|
3684
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3601
3685
|
insert_element_from_token(parser, token);
|
3602
3686
|
pop_current_node(parser);
|
3603
3687
|
acknowledge_self_closing_tag(parser);
|
3604
|
-
return
|
3605
|
-
}
|
3688
|
+
return;
|
3689
|
+
}
|
3690
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
|
3606
3691
|
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3607
3692
|
parser_add_parse_error(parser, token);
|
3608
3693
|
ignore_token(parser);
|
3609
|
-
return
|
3694
|
+
return;
|
3610
3695
|
}
|
3611
3696
|
pop_current_node(parser);
|
3612
3697
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3613
|
-
return
|
3614
|
-
}
|
3698
|
+
return;
|
3699
|
+
}
|
3700
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
|
3615
3701
|
parser_add_parse_error(parser, token);
|
3616
3702
|
ignore_token(parser);
|
3617
|
-
return
|
3618
|
-
}
|
3703
|
+
return;
|
3704
|
+
}
|
3705
|
+
if (
|
3619
3706
|
tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
|
3620
3707
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3621
3708
|
) {
|
3622
|
-
|
3623
|
-
|
3624
|
-
|
3625
|
-
|
3626
|
-
|
3627
|
-
|
3628
|
-
ignore_token(parser);
|
3629
|
-
return false;
|
3630
|
-
}
|
3631
|
-
pop_current_node(parser);
|
3632
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3633
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3634
|
-
return true;
|
3709
|
+
handle_in_head(parser, token);
|
3710
|
+
return;
|
3711
|
+
}
|
3712
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
3713
|
+
handle_in_body(parser, token);
|
3714
|
+
return;
|
3635
3715
|
}
|
3716
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3717
|
+
parser_add_parse_error(parser, token);
|
3718
|
+
ignore_token(parser);
|
3719
|
+
return;
|
3720
|
+
}
|
3721
|
+
pop_current_node(parser);
|
3722
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3723
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3636
3724
|
}
|
3637
3725
|
|
3638
3726
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
|
3639
|
-
static
|
3727
|
+
static void handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
3640
3728
|
if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
|
3641
3729
|
clear_stack_to_table_body_context(parser);
|
3642
3730
|
insert_element_from_token(parser, token);
|
3643
3731
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3644
|
-
return
|
3645
|
-
}
|
3732
|
+
return;
|
3733
|
+
}
|
3734
|
+
if (tag_in(token, kStartTag, &td_th_tags)) {
|
3646
3735
|
parser_add_parse_error(parser, token);
|
3647
3736
|
clear_stack_to_table_body_context(parser);
|
3648
3737
|
insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
|
3649
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3650
3738
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3651
|
-
|
3652
|
-
|
3739
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3740
|
+
return;
|
3741
|
+
}
|
3742
|
+
if (
|
3653
3743
|
tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
|
3654
3744
|
) {
|
3655
3745
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3656
3746
|
parser_add_parse_error(parser, token);
|
3657
3747
|
ignore_token(parser);
|
3658
|
-
return
|
3748
|
+
return;
|
3659
3749
|
}
|
3660
3750
|
clear_stack_to_table_body_context(parser);
|
3661
3751
|
pop_current_node(parser);
|
3662
3752
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3663
|
-
return
|
3664
|
-
}
|
3753
|
+
return;
|
3754
|
+
}
|
3755
|
+
if (
|
3665
3756
|
tag_in(token, kStartTag, &(const TagSet) {
|
3666
3757
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
|
3667
3758
|
TAG(THEAD)
|
@@ -3677,47 +3768,48 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3677
3768
|
) {
|
3678
3769
|
parser_add_parse_error(parser, token);
|
3679
3770
|
ignore_token(parser);
|
3680
|
-
return
|
3771
|
+
return;
|
3681
3772
|
}
|
3682
3773
|
clear_stack_to_table_body_context(parser);
|
3683
3774
|
pop_current_node(parser);
|
3684
3775
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3685
3776
|
parser->_parser_state->_reprocess_current_token = true;
|
3686
|
-
return
|
3687
|
-
}
|
3777
|
+
return;
|
3778
|
+
}
|
3779
|
+
if (
|
3688
3780
|
tag_in(token, kEndTag, &(const TagSet) {
|
3689
|
-
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(
|
3690
|
-
TAG(
|
3781
|
+
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
|
3782
|
+
TAG(TH), TAG(TR)
|
3691
3783
|
})
|
3692
3784
|
) {
|
3693
3785
|
parser_add_parse_error(parser, token);
|
3694
3786
|
ignore_token(parser);
|
3695
|
-
return
|
3696
|
-
} else {
|
3697
|
-
return handle_in_table(parser, token);
|
3787
|
+
return;
|
3698
3788
|
}
|
3789
|
+
handle_in_table(parser, token);
|
3699
3790
|
}
|
3700
3791
|
|
3701
3792
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
|
3702
|
-
static
|
3793
|
+
static void handle_in_row(GumboParser* parser, GumboToken* token) {
|
3703
3794
|
if (tag_in(token, kStartTag, &td_th_tags)) {
|
3704
3795
|
clear_stack_to_table_row_context(parser);
|
3705
3796
|
insert_element_from_token(parser, token);
|
3706
3797
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
|
3707
3798
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3708
|
-
return
|
3709
|
-
}
|
3799
|
+
return;
|
3800
|
+
}
|
3801
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
|
3710
3802
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
|
3711
3803
|
parser_add_parse_error(parser, token);
|
3712
3804
|
ignore_token(parser);
|
3713
|
-
return
|
3714
|
-
} else {
|
3715
|
-
clear_stack_to_table_row_context(parser);
|
3716
|
-
pop_current_node(parser);
|
3717
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3718
|
-
return true;
|
3805
|
+
return;
|
3719
3806
|
}
|
3720
|
-
|
3807
|
+
clear_stack_to_table_row_context(parser);
|
3808
|
+
pop_current_node(parser);
|
3809
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3810
|
+
return;
|
3811
|
+
}
|
3812
|
+
if (
|
3721
3813
|
tag_in(token, kStartTag, &(const TagSet) {
|
3722
3814
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
|
3723
3815
|
TAG(THEAD), TAG(TR)
|
@@ -3727,32 +3819,33 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3727
3819
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
|
3728
3820
|
parser_add_parse_error(parser, token);
|
3729
3821
|
ignore_token(parser);
|
3730
|
-
return
|
3731
|
-
} else {
|
3732
|
-
clear_stack_to_table_row_context(parser);
|
3733
|
-
pop_current_node(parser);
|
3734
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3735
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3736
|
-
return true;
|
3822
|
+
return;
|
3737
3823
|
}
|
3738
|
-
|
3824
|
+
clear_stack_to_table_row_context(parser);
|
3825
|
+
pop_current_node(parser);
|
3826
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3827
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3828
|
+
return;
|
3829
|
+
}
|
3830
|
+
if (
|
3739
3831
|
tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
|
3740
3832
|
) {
|
3741
|
-
if (
|
3742
|
-
!has_an_element_in_table_scope(parser, token->v.end_tag.tag)
|
3743
|
-
|| !has_an_element_in_table_scope(parser, GUMBO_TAG_TR)
|
3744
|
-
) {
|
3833
|
+
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3745
3834
|
parser_add_parse_error(parser, token);
|
3746
3835
|
ignore_token(parser);
|
3747
|
-
return
|
3748
|
-
}
|
3749
|
-
|
3750
|
-
|
3751
|
-
|
3752
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3753
|
-
return true;
|
3836
|
+
return;
|
3837
|
+
}
|
3838
|
+
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
|
3839
|
+
ignore_token(parser);
|
3840
|
+
return;
|
3754
3841
|
}
|
3755
|
-
|
3842
|
+
clear_stack_to_table_row_context(parser);
|
3843
|
+
pop_current_node(parser);
|
3844
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3845
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3846
|
+
return;
|
3847
|
+
}
|
3848
|
+
if (
|
3756
3849
|
tag_in(token, kEndTag, &(const TagSet) {
|
3757
3850
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
|
3758
3851
|
TAG(TD), TAG(TH)
|
@@ -3760,23 +3853,24 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3760
3853
|
) {
|
3761
3854
|
parser_add_parse_error(parser, token);
|
3762
3855
|
ignore_token(parser);
|
3763
|
-
return
|
3764
|
-
} else {
|
3765
|
-
return handle_in_table(parser, token);
|
3856
|
+
return;
|
3766
3857
|
}
|
3858
|
+
handle_in_table(parser, token);
|
3767
3859
|
}
|
3768
3860
|
|
3769
3861
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
|
3770
|
-
static
|
3862
|
+
static void handle_in_cell(GumboParser* parser, GumboToken* token) {
|
3771
3863
|
if (tag_in(token, kEndTag, &td_th_tags)) {
|
3772
3864
|
GumboTag token_tag = token->v.end_tag.tag;
|
3773
3865
|
if (!has_an_element_in_table_scope(parser, token_tag)) {
|
3774
3866
|
parser_add_parse_error(parser, token);
|
3775
3867
|
ignore_token(parser);
|
3776
|
-
return
|
3868
|
+
return;
|
3777
3869
|
}
|
3778
|
-
|
3779
|
-
|
3870
|
+
close_table_cell(parser, token, token_tag);
|
3871
|
+
return;
|
3872
|
+
}
|
3873
|
+
if (
|
3780
3874
|
tag_in(token, kStartTag, &(const TagSet) {
|
3781
3875
|
TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
|
3782
3876
|
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
|
@@ -3790,19 +3884,22 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3790
3884
|
gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
|
3791
3885
|
parser_add_parse_error(parser, token);
|
3792
3886
|
ignore_token(parser);
|
3793
|
-
return
|
3887
|
+
return;
|
3794
3888
|
}
|
3795
3889
|
parser->_parser_state->_reprocess_current_token = true;
|
3796
|
-
|
3797
|
-
|
3890
|
+
close_current_cell(parser, token);
|
3891
|
+
return;
|
3892
|
+
}
|
3893
|
+
if (
|
3798
3894
|
tag_in(token, kEndTag, &(const TagSet) {
|
3799
3895
|
TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
|
3800
3896
|
})
|
3801
3897
|
) {
|
3802
3898
|
parser_add_parse_error(parser, token);
|
3803
3899
|
ignore_token(parser);
|
3804
|
-
return
|
3805
|
-
}
|
3900
|
+
return;
|
3901
|
+
}
|
3902
|
+
if (
|
3806
3903
|
tag_in(token, kEndTag, &(const TagSet) {
|
3807
3904
|
TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3808
3905
|
})
|
@@ -3810,43 +3907,50 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3810
3907
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3811
3908
|
parser_add_parse_error(parser, token);
|
3812
3909
|
ignore_token(parser);
|
3813
|
-
return
|
3910
|
+
return;
|
3814
3911
|
}
|
3815
3912
|
parser->_parser_state->_reprocess_current_token = true;
|
3816
|
-
|
3817
|
-
|
3818
|
-
return handle_in_body(parser, token);
|
3913
|
+
close_current_cell(parser, token);
|
3914
|
+
return;
|
3819
3915
|
}
|
3916
|
+
handle_in_body(parser, token);
|
3820
3917
|
}
|
3821
3918
|
|
3822
3919
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
|
3823
|
-
static
|
3920
|
+
static void handle_in_select(GumboParser* parser, GumboToken* token) {
|
3824
3921
|
if (token->type == GUMBO_TOKEN_NULL) {
|
3825
3922
|
parser_add_parse_error(parser, token);
|
3826
3923
|
ignore_token(parser);
|
3827
|
-
return
|
3828
|
-
}
|
3924
|
+
return;
|
3925
|
+
}
|
3926
|
+
if (
|
3829
3927
|
token->type == GUMBO_TOKEN_CHARACTER
|
3830
3928
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3831
3929
|
) {
|
3832
3930
|
insert_text_token(parser, token);
|
3833
|
-
return
|
3834
|
-
}
|
3931
|
+
return;
|
3932
|
+
}
|
3933
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
3934
|
+
append_comment_node(parser, get_current_node(parser), token);
|
3935
|
+
return;
|
3936
|
+
}
|
3937
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
3835
3938
|
parser_add_parse_error(parser, token);
|
3836
3939
|
ignore_token(parser);
|
3837
|
-
return
|
3838
|
-
}
|
3839
|
-
|
3840
|
-
|
3841
|
-
|
3842
|
-
|
3843
|
-
|
3940
|
+
return;
|
3941
|
+
}
|
3942
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3943
|
+
handle_in_body(parser, token);
|
3944
|
+
return;
|
3945
|
+
}
|
3946
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
|
3844
3947
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3845
3948
|
pop_current_node(parser);
|
3846
3949
|
}
|
3847
3950
|
insert_element_from_token(parser, token);
|
3848
|
-
return
|
3849
|
-
}
|
3951
|
+
return;
|
3952
|
+
}
|
3953
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
|
3850
3954
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3851
3955
|
pop_current_node(parser);
|
3852
3956
|
}
|
@@ -3854,8 +3958,9 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3854
3958
|
pop_current_node(parser);
|
3855
3959
|
}
|
3856
3960
|
insert_element_from_token(parser, token);
|
3857
|
-
return
|
3858
|
-
}
|
3961
|
+
return;
|
3962
|
+
}
|
3963
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
|
3859
3964
|
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
3860
3965
|
if (
|
3861
3966
|
node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
|
@@ -3868,37 +3973,39 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3868
3973
|
}
|
3869
3974
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
|
3870
3975
|
pop_current_node(parser);
|
3871
|
-
return
|
3872
|
-
} else {
|
3873
|
-
parser_add_parse_error(parser, token);
|
3874
|
-
ignore_token(parser);
|
3875
|
-
return false;
|
3976
|
+
return;
|
3876
3977
|
}
|
3877
|
-
|
3978
|
+
parser_add_parse_error(parser, token);
|
3979
|
+
ignore_token(parser);
|
3980
|
+
return;
|
3981
|
+
}
|
3982
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
|
3878
3983
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3879
3984
|
pop_current_node(parser);
|
3880
|
-
return
|
3881
|
-
} else {
|
3882
|
-
parser_add_parse_error(parser, token);
|
3883
|
-
ignore_token(parser);
|
3884
|
-
return false;
|
3985
|
+
return;
|
3885
3986
|
}
|
3886
|
-
|
3987
|
+
parser_add_parse_error(parser, token);
|
3988
|
+
ignore_token(parser);
|
3989
|
+
return;
|
3990
|
+
}
|
3991
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
|
3887
3992
|
if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
|
3888
3993
|
parser_add_parse_error(parser, token);
|
3889
3994
|
ignore_token(parser);
|
3890
|
-
return
|
3995
|
+
return;
|
3891
3996
|
}
|
3892
3997
|
close_current_select(parser);
|
3893
|
-
return
|
3894
|
-
}
|
3998
|
+
return;
|
3999
|
+
}
|
4000
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
|
3895
4001
|
parser_add_parse_error(parser, token);
|
3896
4002
|
ignore_token(parser);
|
3897
4003
|
if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
|
3898
4004
|
close_current_select(parser);
|
3899
4005
|
}
|
3900
|
-
return
|
3901
|
-
}
|
4006
|
+
return;
|
4007
|
+
}
|
4008
|
+
if (
|
3902
4009
|
tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
|
3903
4010
|
) {
|
3904
4011
|
parser_add_parse_error(parser, token);
|
@@ -3908,23 +4015,25 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3908
4015
|
close_current_select(parser);
|
3909
4016
|
parser->_parser_state->_reprocess_current_token = true;
|
3910
4017
|
}
|
3911
|
-
return
|
3912
|
-
}
|
4018
|
+
return;
|
4019
|
+
}
|
4020
|
+
if (
|
3913
4021
|
tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
|
3914
4022
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3915
4023
|
) {
|
3916
|
-
|
3917
|
-
|
3918
|
-
return handle_in_body(parser, token);
|
3919
|
-
} else {
|
3920
|
-
parser_add_parse_error(parser, token);
|
3921
|
-
ignore_token(parser);
|
3922
|
-
return false;
|
4024
|
+
handle_in_head(parser, token);
|
4025
|
+
return;
|
3923
4026
|
}
|
4027
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4028
|
+
handle_in_body(parser, token);
|
4029
|
+
return;
|
4030
|
+
}
|
4031
|
+
parser_add_parse_error(parser, token);
|
4032
|
+
ignore_token(parser);
|
3924
4033
|
}
|
3925
4034
|
|
3926
4035
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
|
3927
|
-
static
|
4036
|
+
static void handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
3928
4037
|
static const TagSet tags = {
|
3929
4038
|
TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
|
3930
4039
|
TAG(TR), TAG(TD), TAG(TH)
|
@@ -3933,27 +4042,23 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
|
3933
4042
|
parser_add_parse_error(parser, token);
|
3934
4043
|
close_current_select(parser);
|
3935
4044
|
parser->_parser_state->_reprocess_current_token = true;
|
3936
|
-
return
|
3937
|
-
}
|
4045
|
+
return;
|
4046
|
+
}
|
4047
|
+
if (tag_in(token, kEndTag, &tags)) {
|
3938
4048
|
parser_add_parse_error(parser, token);
|
3939
4049
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
|
3940
4050
|
ignore_token(parser);
|
3941
|
-
return
|
3942
|
-
} else {
|
3943
|
-
close_current_select(parser);
|
3944
|
-
// close_current_select already does the
|
3945
|
-
// reset_insertion_mode_appropriately
|
3946
|
-
// reset_insertion_mode_appropriately(parser);
|
3947
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3948
|
-
return false;
|
4051
|
+
return;
|
3949
4052
|
}
|
3950
|
-
|
3951
|
-
|
4053
|
+
close_current_select(parser);
|
4054
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4055
|
+
return;
|
3952
4056
|
}
|
4057
|
+
handle_in_select(parser, token);
|
3953
4058
|
}
|
3954
4059
|
|
3955
4060
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
|
3956
|
-
static
|
4061
|
+
static void handle_in_template(GumboParser* parser, GumboToken* token) {
|
3957
4062
|
GumboParserState* state = parser->_parser_state;
|
3958
4063
|
switch (token->type) {
|
3959
4064
|
case GUMBO_TOKEN_WHITESPACE:
|
@@ -3961,7 +4066,8 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
3961
4066
|
case GUMBO_TOKEN_COMMENT:
|
3962
4067
|
case GUMBO_TOKEN_NULL:
|
3963
4068
|
case GUMBO_TOKEN_DOCTYPE:
|
3964
|
-
|
4069
|
+
handle_in_body(parser, token);
|
4070
|
+
return;
|
3965
4071
|
default:
|
3966
4072
|
break;
|
3967
4073
|
}
|
@@ -3972,8 +4078,10 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
3972
4078
|
})
|
3973
4079
|
|| tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
|
3974
4080
|
) {
|
3975
|
-
|
3976
|
-
|
4081
|
+
handle_in_head(parser, token);
|
4082
|
+
return;
|
4083
|
+
}
|
4084
|
+
if (
|
3977
4085
|
tag_in(token, kStartTag, &(const TagSet) {
|
3978
4086
|
TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
|
3979
4087
|
})
|
@@ -3982,39 +4090,45 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
3982
4090
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3983
4091
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3984
4092
|
state->_reprocess_current_token = true;
|
3985
|
-
return
|
3986
|
-
}
|
4093
|
+
return;
|
4094
|
+
}
|
4095
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3987
4096
|
pop_template_insertion_mode(parser);
|
3988
4097
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3989
4098
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3990
4099
|
state->_reprocess_current_token = true;
|
3991
|
-
return
|
3992
|
-
}
|
4100
|
+
return;
|
4101
|
+
}
|
4102
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
|
3993
4103
|
pop_template_insertion_mode(parser);
|
3994
4104
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3995
4105
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3996
4106
|
state->_reprocess_current_token = true;
|
3997
|
-
return
|
3998
|
-
}
|
4107
|
+
return;
|
4108
|
+
}
|
4109
|
+
if (tag_in(token, kStartTag, &td_th_tags)) {
|
3999
4110
|
pop_template_insertion_mode(parser);
|
4000
4111
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
4001
4112
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
4002
4113
|
state->_reprocess_current_token = true;
|
4003
|
-
return
|
4004
|
-
}
|
4114
|
+
return;
|
4115
|
+
}
|
4116
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
4005
4117
|
pop_template_insertion_mode(parser);
|
4006
4118
|
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4007
4119
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4008
4120
|
state->_reprocess_current_token = true;
|
4009
|
-
return
|
4010
|
-
}
|
4121
|
+
return;
|
4122
|
+
}
|
4123
|
+
if (token->type == GUMBO_TOKEN_END_TAG) {
|
4011
4124
|
parser_add_parse_error(parser, token);
|
4012
4125
|
ignore_token(parser);
|
4013
|
-
return
|
4014
|
-
}
|
4126
|
+
return;
|
4127
|
+
}
|
4128
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4015
4129
|
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
4016
4130
|
// Stop parsing.
|
4017
|
-
return
|
4131
|
+
return;
|
4018
4132
|
}
|
4019
4133
|
parser_add_parse_error(parser, token);
|
4020
4134
|
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
|
@@ -4023,35 +4137,41 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
|
4023
4137
|
pop_template_insertion_mode(parser);
|
4024
4138
|
reset_insertion_mode_appropriately(parser);
|
4025
4139
|
state->_reprocess_current_token = true;
|
4026
|
-
return
|
4027
|
-
} else {
|
4028
|
-
assert(0);
|
4029
|
-
return false;
|
4140
|
+
return;
|
4030
4141
|
}
|
4142
|
+
assert(0 && "unreachable");
|
4031
4143
|
}
|
4032
4144
|
|
4033
4145
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
|
4034
|
-
static
|
4146
|
+
static void handle_after_body(GumboParser* parser, GumboToken* token) {
|
4035
4147
|
if (
|
4036
4148
|
token->type == GUMBO_TOKEN_WHITESPACE
|
4037
4149
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4038
4150
|
) {
|
4039
|
-
|
4040
|
-
|
4151
|
+
handle_in_body(parser, token);
|
4152
|
+
return;
|
4153
|
+
}
|
4154
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4041
4155
|
GumboNode* html_node = parser->_output->root;
|
4042
4156
|
assert(html_node != NULL);
|
4043
4157
|
append_comment_node(parser, html_node, token);
|
4044
|
-
return
|
4045
|
-
}
|
4158
|
+
return;
|
4159
|
+
}
|
4160
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4046
4161
|
parser_add_parse_error(parser, token);
|
4047
4162
|
ignore_token(parser);
|
4048
|
-
return
|
4049
|
-
}
|
4163
|
+
return;
|
4164
|
+
}
|
4165
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4166
|
+
handle_in_body(parser, token);
|
4167
|
+
return;
|
4168
|
+
}
|
4169
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
4050
4170
|
/* fragment case: ignore the closing HTML token */
|
4051
4171
|
if (is_fragment_parser(parser)) {
|
4052
4172
|
parser_add_parse_error(parser, token);
|
4053
4173
|
ignore_token(parser);
|
4054
|
-
return
|
4174
|
+
return;
|
4055
4175
|
}
|
4056
4176
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
|
4057
4177
|
GumboNode* html = parser->_parser_state->_open_elements.data[0];
|
@@ -4060,39 +4180,44 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
|
4060
4180
|
parser->_parser_state->_current_token,
|
4061
4181
|
&html->v.element
|
4062
4182
|
);
|
4063
|
-
return
|
4064
|
-
}
|
4065
|
-
|
4066
|
-
|
4067
|
-
parser_add_parse_error(parser, token);
|
4068
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4069
|
-
parser->_parser_state->_reprocess_current_token = true;
|
4070
|
-
return false;
|
4183
|
+
return;
|
4184
|
+
}
|
4185
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4186
|
+
return;
|
4071
4187
|
}
|
4188
|
+
parser_add_parse_error(parser, token);
|
4189
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4190
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4072
4191
|
}
|
4073
4192
|
|
4074
4193
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
|
4075
|
-
static
|
4194
|
+
static void handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
4076
4195
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
4077
4196
|
insert_text_token(parser, token);
|
4078
|
-
return
|
4079
|
-
}
|
4197
|
+
return;
|
4198
|
+
}
|
4199
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4080
4200
|
append_comment_node(parser, get_current_node(parser), token);
|
4081
|
-
return
|
4082
|
-
}
|
4201
|
+
return;
|
4202
|
+
}
|
4203
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4083
4204
|
parser_add_parse_error(parser, token);
|
4084
4205
|
ignore_token(parser);
|
4085
|
-
return
|
4086
|
-
}
|
4087
|
-
|
4088
|
-
|
4206
|
+
return;
|
4207
|
+
}
|
4208
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4209
|
+
handle_in_body(parser, token);
|
4210
|
+
return;
|
4211
|
+
}
|
4212
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
4089
4213
|
insert_element_from_token(parser, token);
|
4090
|
-
return
|
4091
|
-
}
|
4214
|
+
return;
|
4215
|
+
}
|
4216
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
|
4092
4217
|
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
|
4093
4218
|
parser_add_parse_error(parser, token);
|
4094
4219
|
ignore_token(parser);
|
4095
|
-
return
|
4220
|
+
return;
|
4096
4221
|
}
|
4097
4222
|
pop_current_node(parser);
|
4098
4223
|
if (
|
@@ -4101,42 +4226,47 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
|
4101
4226
|
) {
|
4102
4227
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
|
4103
4228
|
}
|
4104
|
-
return
|
4105
|
-
}
|
4229
|
+
return;
|
4230
|
+
}
|
4231
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
|
4106
4232
|
insert_element_from_token(parser, token);
|
4107
4233
|
pop_current_node(parser);
|
4108
4234
|
acknowledge_self_closing_tag(parser);
|
4109
|
-
return
|
4110
|
-
}
|
4111
|
-
|
4112
|
-
|
4113
|
-
|
4235
|
+
return;
|
4236
|
+
}
|
4237
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4238
|
+
handle_in_head(parser, token);
|
4239
|
+
return;
|
4240
|
+
}
|
4241
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4242
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML))
|
4114
4243
|
parser_add_parse_error(parser, token);
|
4115
|
-
|
4116
|
-
}
|
4117
|
-
return true;
|
4118
|
-
} else {
|
4119
|
-
parser_add_parse_error(parser, token);
|
4120
|
-
ignore_token(parser);
|
4121
|
-
return false;
|
4244
|
+
return;
|
4122
4245
|
}
|
4246
|
+
parser_add_parse_error(parser, token);
|
4247
|
+
ignore_token(parser);
|
4123
4248
|
}
|
4124
4249
|
|
4125
4250
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
|
4126
|
-
static
|
4251
|
+
static void handle_after_frameset(GumboParser* parser, GumboToken* token) {
|
4127
4252
|
if (token->type == GUMBO_TOKEN_WHITESPACE) {
|
4128
4253
|
insert_text_token(parser, token);
|
4129
|
-
return
|
4130
|
-
}
|
4254
|
+
return;
|
4255
|
+
}
|
4256
|
+
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4131
4257
|
append_comment_node(parser, get_current_node(parser), token);
|
4132
|
-
return
|
4133
|
-
}
|
4258
|
+
return;
|
4259
|
+
}
|
4260
|
+
if (token->type == GUMBO_TOKEN_DOCTYPE) {
|
4134
4261
|
parser_add_parse_error(parser, token);
|
4135
4262
|
ignore_token(parser);
|
4136
|
-
return
|
4137
|
-
}
|
4138
|
-
|
4139
|
-
|
4263
|
+
return;
|
4264
|
+
}
|
4265
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
4266
|
+
handle_in_body(parser, token);
|
4267
|
+
return;
|
4268
|
+
}
|
4269
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
4140
4270
|
GumboNode* html = parser->_parser_state->_open_elements.data[0];
|
4141
4271
|
assert(node_html_tag_is(html, GUMBO_TAG_HTML));
|
4142
4272
|
record_end_of_element (
|
@@ -4144,67 +4274,71 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
|
|
4144
4274
|
&html->v.element
|
4145
4275
|
);
|
4146
4276
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
|
4147
|
-
return
|
4148
|
-
}
|
4277
|
+
return;
|
4278
|
+
}
|
4279
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4149
4280
|
return handle_in_head(parser, token);
|
4150
|
-
} else if (token->type == GUMBO_TOKEN_EOF) {
|
4151
|
-
return true;
|
4152
|
-
} else {
|
4153
|
-
parser_add_parse_error(parser, token);
|
4154
|
-
ignore_token(parser);
|
4155
|
-
return false;
|
4156
4281
|
}
|
4282
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4283
|
+
return;
|
4284
|
+
}
|
4285
|
+
parser_add_parse_error(parser, token);
|
4286
|
+
ignore_token(parser);
|
4157
4287
|
}
|
4158
4288
|
|
4159
4289
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
|
4160
|
-
static
|
4290
|
+
static void handle_after_after_body(GumboParser* parser, GumboToken* token) {
|
4161
4291
|
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4162
4292
|
append_comment_node(parser, get_document_node(parser), token);
|
4163
|
-
return
|
4164
|
-
}
|
4293
|
+
return;
|
4294
|
+
}
|
4295
|
+
if (
|
4165
4296
|
token->type == GUMBO_TOKEN_DOCTYPE
|
4166
4297
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
4167
4298
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4168
4299
|
) {
|
4169
|
-
|
4170
|
-
|
4171
|
-
return true;
|
4172
|
-
} else {
|
4173
|
-
parser_add_parse_error(parser, token);
|
4174
|
-
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4175
|
-
parser->_parser_state->_reprocess_current_token = true;
|
4176
|
-
return false;
|
4300
|
+
handle_in_body(parser, token);
|
4301
|
+
return;
|
4177
4302
|
}
|
4303
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4304
|
+
return;
|
4305
|
+
}
|
4306
|
+
parser_add_parse_error(parser, token);
|
4307
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
4308
|
+
parser->_parser_state->_reprocess_current_token = true;
|
4178
4309
|
}
|
4179
4310
|
|
4180
4311
|
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
|
4181
|
-
static
|
4312
|
+
static void handle_after_after_frameset (
|
4182
4313
|
GumboParser* parser,
|
4183
4314
|
GumboToken* token
|
4184
4315
|
) {
|
4185
4316
|
if (token->type == GUMBO_TOKEN_COMMENT) {
|
4186
4317
|
append_comment_node(parser, get_document_node(parser), token);
|
4187
|
-
return
|
4188
|
-
}
|
4318
|
+
return;
|
4319
|
+
}
|
4320
|
+
if (
|
4189
4321
|
token->type == GUMBO_TOKEN_DOCTYPE
|
4190
4322
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
4191
4323
|
|| tag_is(token, kStartTag, GUMBO_TAG_HTML)
|
4192
4324
|
) {
|
4193
|
-
|
4194
|
-
|
4195
|
-
return true;
|
4196
|
-
} else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4197
|
-
return handle_in_head(parser, token);
|
4198
|
-
} else {
|
4199
|
-
parser_add_parse_error(parser, token);
|
4200
|
-
ignore_token(parser);
|
4201
|
-
return false;
|
4325
|
+
handle_in_body(parser, token);
|
4326
|
+
return;
|
4202
4327
|
}
|
4328
|
+
if (token->type == GUMBO_TOKEN_EOF) {
|
4329
|
+
return;
|
4330
|
+
}
|
4331
|
+
if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
4332
|
+
handle_in_head(parser, token);
|
4333
|
+
return;
|
4334
|
+
}
|
4335
|
+
parser_add_parse_error(parser, token);
|
4336
|
+
ignore_token(parser);
|
4203
4337
|
}
|
4204
4338
|
|
4205
4339
|
// Function pointers for each insertion mode.
|
4206
4340
|
// Keep in sync with insertion_mode.h.
|
4207
|
-
typedef
|
4341
|
+
typedef void (*TokenHandler)(GumboParser* parser, GumboToken* token);
|
4208
4342
|
static const TokenHandler kTokenHandlers[] = {
|
4209
4343
|
handle_initial,
|
4210
4344
|
handle_before_html,
|
@@ -4231,36 +4365,36 @@ static const TokenHandler kTokenHandlers[] = {
|
|
4231
4365
|
handle_after_after_frameset
|
4232
4366
|
};
|
4233
4367
|
|
4234
|
-
static
|
4368
|
+
static void handle_html_content(GumboParser* parser, GumboToken* token) {
|
4235
4369
|
const GumboInsertionMode mode = parser->_parser_state->_insertion_mode;
|
4236
4370
|
const TokenHandler handler = kTokenHandlers[mode];
|
4237
|
-
|
4371
|
+
handler(parser, token);
|
4238
4372
|
}
|
4239
4373
|
|
4240
4374
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
4241
|
-
static
|
4375
|
+
static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
4242
4376
|
gumbo_debug("Handling foreign content");
|
4243
4377
|
switch (token->type) {
|
4244
4378
|
case GUMBO_TOKEN_NULL:
|
4245
4379
|
parser_add_parse_error(parser, token);
|
4246
4380
|
token->v.character = kUtf8ReplacementChar;
|
4247
4381
|
insert_text_token(parser, token);
|
4248
|
-
return
|
4382
|
+
return;
|
4249
4383
|
case GUMBO_TOKEN_WHITESPACE:
|
4250
4384
|
insert_text_token(parser, token);
|
4251
|
-
return
|
4385
|
+
return;
|
4252
4386
|
case GUMBO_TOKEN_CDATA:
|
4253
4387
|
case GUMBO_TOKEN_CHARACTER:
|
4254
4388
|
insert_text_token(parser, token);
|
4255
4389
|
set_frameset_not_ok(parser);
|
4256
|
-
return
|
4390
|
+
return;
|
4257
4391
|
case GUMBO_TOKEN_COMMENT:
|
4258
4392
|
append_comment_node(parser, get_current_node(parser), token);
|
4259
|
-
return
|
4393
|
+
return;
|
4260
4394
|
case GUMBO_TOKEN_DOCTYPE:
|
4261
4395
|
parser_add_parse_error(parser, token);
|
4262
4396
|
ignore_token(parser);
|
4263
|
-
return
|
4397
|
+
return;
|
4264
4398
|
default:
|
4265
4399
|
// Fall through to the if-statements below.
|
4266
4400
|
break;
|
@@ -4304,10 +4438,9 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4304
4438
|
)
|
4305
4439
|
);
|
4306
4440
|
parser->_parser_state->_reprocess_current_token = true;
|
4307
|
-
return
|
4441
|
+
return;
|
4308
4442
|
}
|
4309
|
-
|
4310
|
-
assert(token->type == GUMBO_TOKEN_START_TAG);
|
4443
|
+
// This is a start tag so the next if's then branch will be taken.
|
4311
4444
|
}
|
4312
4445
|
|
4313
4446
|
if (token->type == GUMBO_TOKEN_START_TAG) {
|
@@ -4326,63 +4459,59 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
4326
4459
|
pop_current_node(parser);
|
4327
4460
|
acknowledge_self_closing_tag(parser);
|
4328
4461
|
}
|
4329
|
-
return
|
4462
|
+
return;
|
4330
4463
|
// </script> tags are handled like any other end tag, putting the script's
|
4331
4464
|
// text into a text node child and closing the current node.
|
4332
|
-
}
|
4333
|
-
|
4334
|
-
|
4335
|
-
|
4336
|
-
|
4337
|
-
|
4465
|
+
}
|
4466
|
+
assert(token->type == GUMBO_TOKEN_END_TAG);
|
4467
|
+
GumboNode* node = get_current_node(parser);
|
4468
|
+
GumboTag tag = token->v.end_tag.tag;
|
4469
|
+
const char* name = token->v.end_tag.name;
|
4470
|
+
assert(node != NULL);
|
4338
4471
|
|
4339
|
-
|
4340
|
-
|
4341
|
-
|
4342
|
-
|
4343
|
-
|
4344
|
-
|
4345
|
-
|
4346
|
-
|
4347
|
-
|
4348
|
-
|
4349
|
-
|
4350
|
-
|
4351
|
-
|
4352
|
-
|
4353
|
-
|
4354
|
-
// Pop all the nodes below the current one. Node is guaranteed to
|
4355
|
-
// be an element on the stack of open elements (set below), so
|
4356
|
-
// this loop is guaranteed to terminate.
|
4357
|
-
}
|
4358
|
-
return is_success;
|
4359
|
-
}
|
4360
|
-
--i;
|
4361
|
-
node = parser->_parser_state->_open_elements.data[i];
|
4362
|
-
if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
|
4363
|
-
// The loop continues only in foreign namespaces.
|
4364
|
-
break;
|
4472
|
+
if (!node_tagname_is(node, tag, name))
|
4473
|
+
parser_add_parse_error(parser, token);
|
4474
|
+
int i = parser->_parser_state->_open_elements.length;
|
4475
|
+
for (--i; i > 0;) {
|
4476
|
+
// Here we move up the stack until we find an HTML element (in which
|
4477
|
+
// case we do nothing) or we find the element that we're about to
|
4478
|
+
// close (in which case we pop everything we've seen until that
|
4479
|
+
// point.)
|
4480
|
+
gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
|
4481
|
+
if (node_tagname_is(node, tag, name)) {
|
4482
|
+
gumbo_debug("Matches.\n");
|
4483
|
+
while (node != pop_current_node(parser)) {
|
4484
|
+
// Pop all the nodes below the current one. Node is guaranteed to
|
4485
|
+
// be an element on the stack of open elements (set below), so
|
4486
|
+
// this loop is guaranteed to terminate.
|
4365
4487
|
}
|
4488
|
+
return;
|
4489
|
+
}
|
4490
|
+
--i;
|
4491
|
+
node = parser->_parser_state->_open_elements.data[i];
|
4492
|
+
if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
|
4493
|
+
// The loop continues only in foreign namespaces.
|
4494
|
+
break;
|
4366
4495
|
}
|
4367
|
-
assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
|
4368
|
-
if (i == 0)
|
4369
|
-
return is_success;
|
4370
|
-
// We can't call handle_token directly because the current node is still in
|
4371
|
-
// a foriegn namespace, so it would re-enter this and result in infinite
|
4372
|
-
// recursion.
|
4373
|
-
return handle_html_content(parser, token) && is_success;
|
4374
4496
|
}
|
4497
|
+
assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
|
4498
|
+
if (i == 0)
|
4499
|
+
return;
|
4500
|
+
// We can't call handle_token directly because the current node is still in
|
4501
|
+
// a foriegn namespace, so it would re-enter this and result in infinite
|
4502
|
+
// recursion.
|
4503
|
+
handle_html_content(parser, token);
|
4375
4504
|
}
|
4376
4505
|
|
4377
4506
|
// https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
|
4378
|
-
static
|
4507
|
+
static void handle_token(GumboParser* parser, GumboToken* token) {
|
4379
4508
|
if (
|
4380
4509
|
parser->_parser_state->_ignore_next_linefeed
|
4381
4510
|
&& token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n'
|
4382
4511
|
) {
|
4383
4512
|
parser->_parser_state->_ignore_next_linefeed = false;
|
4384
4513
|
ignore_token(parser);
|
4385
|
-
return
|
4514
|
+
return;
|
4386
4515
|
}
|
4387
4516
|
// This needs to be reset both here and in the conditional above to catch both
|
4388
4517
|
// the case where the next token is not whitespace (so we don't ignore
|
@@ -4424,9 +4553,9 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
|
|
4424
4553
|
token->type == GUMBO_TOKEN_NULL ||
|
4425
4554
|
token->type == GUMBO_TOKEN_WHITESPACE)) ||
|
4426
4555
|
token->type == GUMBO_TOKEN_EOF) {
|
4427
|
-
|
4556
|
+
handle_html_content(parser, token);
|
4428
4557
|
} else {
|
4429
|
-
|
4558
|
+
handle_in_foreign_content(parser, token);
|
4430
4559
|
}
|
4431
4560
|
}
|
4432
4561
|
|
@@ -4517,7 +4646,7 @@ static void fragment_parser_init (
|
|
4517
4646
|
break;
|
4518
4647
|
|
4519
4648
|
case GUMBO_TAG_SCRIPT:
|
4520
|
-
gumbo_tokenizer_set_state(parser,
|
4649
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
4521
4650
|
break;
|
4522
4651
|
|
4523
4652
|
case GUMBO_TAG_NOSCRIPT:
|
@@ -4554,7 +4683,7 @@ static void fragment_parser_init (
|
|
4554
4683
|
// 11.
|
4555
4684
|
if (ctx_has_form_ancestor
|
4556
4685
|
|| (ctx_tag == GUMBO_TAG_FORM
|
4557
|
-
|
4686
|
+
&& fragment_namespace == GUMBO_NAMESPACE_HTML)) {
|
4558
4687
|
static const GumboNode form_ancestor = {
|
4559
4688
|
.type = GUMBO_NODE_ELEMENT,
|
4560
4689
|
.parent = NULL,
|
@@ -4613,19 +4742,18 @@ GumboOutput* gumbo_parse_with_options (
|
|
4613
4742
|
|
4614
4743
|
const unsigned int max_tree_depth = options->max_tree_depth;
|
4615
4744
|
GumboToken token;
|
4616
|
-
bool has_error = false;
|
4617
4745
|
|
4618
4746
|
do {
|
4619
4747
|
if (state->_reprocess_current_token) {
|
4620
4748
|
state->_reprocess_current_token = false;
|
4621
4749
|
} else {
|
4622
|
-
GumboNode*
|
4623
|
-
|
4750
|
+
GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
|
4751
|
+
gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
4624
4752
|
&parser,
|
4625
|
-
|
4626
|
-
|
4753
|
+
adjusted_current_node &&
|
4754
|
+
adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
|
4627
4755
|
);
|
4628
|
-
|
4756
|
+
gumbo_lex(&parser, &token);
|
4629
4757
|
}
|
4630
4758
|
|
4631
4759
|
const char* token_type = "text";
|
@@ -4649,17 +4777,17 @@ GumboOutput* gumbo_parse_with_options (
|
|
4649
4777
|
break;
|
4650
4778
|
}
|
4651
4779
|
gumbo_debug (
|
4652
|
-
"Handling %s token @%
|
4780
|
+
"Handling %s token @%lu:%lu in state %u.\n",
|
4653
4781
|
(char*) token_type,
|
4654
|
-
token.position.line,
|
4655
|
-
token.position.column,
|
4782
|
+
(unsigned long)token.position.line,
|
4783
|
+
(unsigned long)token.position.column,
|
4656
4784
|
state->_insertion_mode
|
4657
4785
|
);
|
4658
4786
|
|
4659
4787
|
state->_current_token = &token;
|
4660
4788
|
state->_self_closing_flag_acknowledged = false;
|
4661
4789
|
|
4662
|
-
|
4790
|
+
handle_token(&parser, &token);
|
4663
4791
|
|
4664
4792
|
// Check for memory leaks when ownership is transferred from start tag
|
4665
4793
|
// tokens to nodes.
|
@@ -4671,19 +4799,25 @@ GumboOutput* gumbo_parse_with_options (
|
|
4671
4799
|
);
|
4672
4800
|
|
4673
4801
|
if (!state->_reprocess_current_token) {
|
4802
|
+
// If we're done with the token, check for unacknowledged self-closing
|
4803
|
+
// flags on start tags.
|
4674
4804
|
if (token.type == GUMBO_TOKEN_START_TAG &&
|
4675
4805
|
token.v.start_tag.is_self_closing &&
|
4676
4806
|
!state->_self_closing_flag_acknowledged) {
|
4677
|
-
GumboError* error =
|
4678
|
-
if (error)
|
4679
|
-
error
|
4807
|
+
GumboError* error = gumbo_add_error(&parser);
|
4808
|
+
if (error) {
|
4809
|
+
// This is essentially a tokenizer error that's only caught during
|
4810
|
+
// tree construction.
|
4811
|
+
error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
|
4812
|
+
error->original_text = token.original_text;
|
4813
|
+
error->position = token.position;
|
4814
|
+
}
|
4680
4815
|
}
|
4816
|
+
// Make sure we free the end tag's name since it doesn't get transferred
|
4817
|
+
// to a token.
|
4681
4818
|
if (token.type == GUMBO_TOKEN_END_TAG &&
|
4682
|
-
token.v.end_tag.
|
4683
|
-
|
4684
|
-
if (error)
|
4685
|
-
error->type = GUMBO_ERR_SELF_CLOSING_END_TAG;
|
4686
|
-
}
|
4819
|
+
token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
4820
|
+
gumbo_free(token.v.end_tag.name);
|
4687
4821
|
}
|
4688
4822
|
|
4689
4823
|
if (unlikely(state->_open_elements.length > max_tree_depth)) {
|
@@ -4697,7 +4831,7 @@ GumboOutput* gumbo_parse_with_options (
|
|
4697
4831
|
|
4698
4832
|
} while (
|
4699
4833
|
(token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token)
|
4700
|
-
&& !(options->stop_on_first_error &&
|
4834
|
+
&& !(options->stop_on_first_error && parser._output->document_error)
|
4701
4835
|
);
|
4702
4836
|
|
4703
4837
|
finish_parsing(&parser);
|
@@ -4725,6 +4859,8 @@ const char* gumbo_status_to_string(GumboOutputStatus status) {
|
|
4725
4859
|
return "OK";
|
4726
4860
|
case GUMBO_STATUS_OUT_OF_MEMORY:
|
4727
4861
|
return "System allocator returned NULL during parsing";
|
4862
|
+
case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
|
4863
|
+
return "Attributes per element limit exceeded";
|
4728
4864
|
case GUMBO_STATUS_TREE_TOO_DEEP:
|
4729
4865
|
return "Document tree depth limit exceeded";
|
4730
4866
|
default:
|