nokogumbo 2.0.0.pre.alpha → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,6 +7,7 @@
7
7
  #include "insertion_mode.h"
8
8
  #include "string_buffer.h"
9
9
  #include "token_type.h"
10
+ #include "tokenizer_states.h"
10
11
 
11
12
  #ifdef __cplusplus
12
13
  extern "C" {
@@ -15,85 +16,66 @@ extern "C" {
15
16
  struct GumboInternalParser;
16
17
 
17
18
  typedef enum {
19
+ // Defined errors.
20
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
21
+ GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
22
+ GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
23
+ GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
24
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
25
+ GUMBO_ERR_CDATA_IN_HTML_CONTENT,
26
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
27
+ GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
28
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
29
+ GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
30
+ GUMBO_ERR_DUPLICATE_ATTRIBUTE,
31
+ GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
32
+ GUMBO_ERR_EOF_BEFORE_TAG_NAME,
33
+ GUMBO_ERR_EOF_IN_CDATA,
34
+ GUMBO_ERR_EOF_IN_COMMENT,
35
+ GUMBO_ERR_EOF_IN_DOCTYPE,
36
+ GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
37
+ GUMBO_ERR_EOF_IN_TAG,
38
+ GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
39
+ GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
40
+ GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
41
+ GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
42
+ GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
43
+ GUMBO_ERR_MISSING_DOCTYPE_NAME,
44
+ GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
45
+ GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
46
+ GUMBO_ERR_MISSING_END_TAG_NAME,
47
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
48
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
49
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
50
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
51
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
52
+ GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
53
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
54
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
55
+ GUMBO_ERR_NESTED_COMMENT,
56
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
57
+ GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
58
+ GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
59
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
60
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
61
+ GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
62
+ GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
63
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
64
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
65
+ GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
66
+ GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
67
+ GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
68
+ GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
69
+ GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
70
+
71
+ // Encoding errors.
18
72
  GUMBO_ERR_UTF8_INVALID,
19
73
  GUMBO_ERR_UTF8_TRUNCATED,
20
- GUMBO_ERR_UTF8_NULL,
21
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
22
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
23
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
24
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
25
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
26
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
27
- GUMBO_ERR_TAG_EOF,
28
- GUMBO_ERR_TAG_INVALID,
29
- GUMBO_ERR_CLOSE_TAG_EMPTY,
30
- GUMBO_ERR_CLOSE_TAG_EOF,
31
- GUMBO_ERR_CLOSE_TAG_INVALID,
32
- GUMBO_ERR_SCRIPT_EOF,
33
- GUMBO_ERR_ATTR_NAME_EOF,
34
- GUMBO_ERR_ATTR_NAME_INVALID,
35
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
36
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
37
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
38
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
39
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
40
- GUMBO_ERR_ATTR_AFTER_EOF,
41
- GUMBO_ERR_ATTR_AFTER_INVALID,
42
- GUMBO_ERR_DUPLICATE_ATTR,
43
- GUMBO_ERR_SOLIDUS_EOF,
44
- GUMBO_ERR_SOLIDUS_INVALID,
45
- GUMBO_ERR_DASHES_OR_DOCTYPE,
46
- GUMBO_ERR_COMMENT_EOF,
47
- GUMBO_ERR_COMMENT_INVALID,
48
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
49
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
50
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
51
- GUMBO_ERR_COMMENT_END_BANG_EOF,
52
- GUMBO_ERR_DOCTYPE_EOF,
53
- GUMBO_ERR_DOCTYPE_INVALID,
54
- GUMBO_ERR_DOCTYPE_SPACE,
55
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
56
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
57
- GUMBO_ERR_DOCTYPE_END,
74
+
75
+ // Generic parser error.
58
76
  GUMBO_ERR_PARSER,
59
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
60
- GUMBO_ERR_SELF_CLOSING_END_TAG,
61
77
  } GumboErrorType;
62
78
 
63
- // Additional data for duplicated attributes.
64
- typedef struct GumboInternalDuplicateAttrError {
65
- // The name of the attribute. Owned by this struct.
66
- const char* name;
67
-
68
- // The (0-based) index within the attributes vector of the original
69
- // occurrence.
70
- unsigned int original_index;
71
-
72
- // The (0-based) index where the new occurrence would be.
73
- unsigned int new_index;
74
- } GumboDuplicateAttrError;
75
-
76
- // A simplified representation of the tokenizer state, designed to be more
77
- // useful to clients of this library than the internal representation. This
78
- // condenses the actual states used in the tokenizer state machine into a few
79
- // values that will be familiar to users of HTML.
80
- typedef enum {
81
- GUMBO_ERR_TOKENIZER_DATA,
82
- GUMBO_ERR_TOKENIZER_CHAR_REF,
83
- GUMBO_ERR_TOKENIZER_RCDATA,
84
- GUMBO_ERR_TOKENIZER_RAWTEXT,
85
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
86
- GUMBO_ERR_TOKENIZER_SCRIPT,
87
- GUMBO_ERR_TOKENIZER_TAG,
88
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
89
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
90
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
91
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
92
- GUMBO_ERR_TOKENIZER_COMMENT,
93
- GUMBO_ERR_TOKENIZER_DOCTYPE,
94
- GUMBO_ERR_TOKENIZER_CDATA,
95
- } GumboTokenizerErrorState;
96
-
97
79
  // Additional data for tokenizer errors.
98
80
  // This records the current state and codepoint encountered - this is usually
99
81
  // enough to reconstruct what went wrong and provide a friendly error message.
@@ -102,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
102
84
  int codepoint;
103
85
 
104
86
  // The state that the tokenizer was in at the time.
105
- GumboTokenizerErrorState state;
87
+ GumboTokenizerEnum state;
106
88
  } GumboTokenizerError;
107
89
 
108
90
  // Additional data for parse errors.
@@ -125,43 +107,25 @@ typedef struct GumboInternalParserError {
125
107
  // The overall error struct representing an error in decoding/tokenizing/parsing
126
108
  // the HTML. This contains an enumerated type flag, a source position, and then
127
109
  // a union of fields containing data specific to the error.
128
- typedef struct GumboInternalError {
110
+ struct GumboInternalError {
129
111
  // The type of error.
130
112
  GumboErrorType type;
131
113
 
132
114
  // The position within the source file where the error occurred.
133
115
  GumboSourcePosition position;
134
116
 
135
- // A pointer to the byte within the original source file text where the error
136
- // occurred (note that this is not the same as position.offset, as that gives
137
- // character-based instead of byte-based offsets).
138
- const char* original_text;
117
+ // The piece of text that caused the error.
118
+ GumboStringPiece original_text;
139
119
 
140
120
  // Type-specific error information.
141
121
  union {
142
- // The code point we encountered, for:
143
- // * GUMBO_ERR_UTF8_INVALID
144
- // * GUMBO_ERR_UTF8_TRUNCATED
145
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
146
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
147
- uint32_t codepoint;
148
-
149
122
  // Tokenizer errors.
150
123
  GumboTokenizerError tokenizer;
151
124
 
152
- // Short textual data, for:
153
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
154
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
155
- GumboStringPiece text;
156
-
157
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
158
- GumboDuplicateAttrError duplicate_attr;
159
-
160
- // Parser state, for GUMBO_ERR_PARSER and
161
- // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
162
- struct GumboInternalParserError parser;
125
+ // Parser errors.
126
+ GumboParserError parser;
163
127
  } v;
164
- } GumboError;
128
+ };
165
129
 
166
130
  // Adds a new error to the parser's error list, and returns a pointer to it so
167
131
  // that clients can fill out the rest of its fields. May return NULL if we're
@@ -177,32 +141,6 @@ void gumbo_destroy_errors(struct GumboInternalParser* errors);
177
141
  // Frees the memory used for a single GumboError.
178
142
  void gumbo_error_destroy(GumboError* error);
179
143
 
180
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
181
- // freshly-allocated buffer containing the error message text. The caller is
182
- // responsible for freeing the buffer.
183
- void gumbo_error_to_string (
184
- const GumboError* error,
185
- GumboStringBuffer* output
186
- );
187
-
188
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
189
- // with a freshly-allocated buffer containing the error message text. The
190
- // caller is responsible for freeing the buffer.
191
- void gumbo_caret_diagnostic_to_string (
192
- const GumboError* error,
193
- const char* source_text,
194
- size_t source_length,
195
- GumboStringBuffer* output
196
- );
197
-
198
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
199
- // of writing to a string.
200
- void gumbo_print_caret_diagnostic (
201
- const GumboError* error,
202
- const char* source_text,
203
- size_t source_length
204
- );
205
-
206
144
  #ifdef __cplusplus
207
145
  }
208
146
  #endif
@@ -817,10 +817,6 @@ typedef struct GumboInternalOutput {
817
817
 
818
818
  /**
819
819
  * A list of errors that occurred during the parse.
820
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
821
- * fleshed out and may change in the future. For this reason, the GumboError
822
- * header isn't part of the public API. Contact us if you need errors
823
- * reported so we can work out something appropriate for your use-case.
824
820
  */
825
821
  GumboVector /* GumboError */ errors;
826
822
 
@@ -866,6 +862,53 @@ const char* gumbo_status_to_string(GumboOutputStatus status);
866
862
  /** Release the memory used for the parse tree and parse errors. */
867
863
  void gumbo_destroy_output(GumboOutput* output);
868
864
 
865
+ /** Opaque GumboError type */
866
+ typedef struct GumboInternalError GumboError;
867
+
868
+ /**
869
+ * Returns the position of the error.
870
+ */
871
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
872
+
873
+ /**
874
+ * Returns a constant string representation of the error's code. This is owned
875
+ * by the library and should not be freed by the caller.
876
+ */
877
+ const char* gumbo_error_code(const GumboError* error);
878
+
879
+ /**
880
+ * Prints an error to a string. This stores a freshly-allocated buffer
881
+ * containing the error message text in output. The caller is responsible for
882
+ * freeing the buffer. The size of the error message is returned. The error
883
+ * message itself may not be NULL-terminated and may contain NULL bytes so the
884
+ * returned size must be used.
885
+ */
886
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
887
+
888
+ /**
889
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
890
+ * buffer containing the error message text in output. The caller is responsible for
891
+ * freeing the buffer. The size of the error message is returned. The error
892
+ * message itself may not be NULL-terminated and may contain NULL bytes so the
893
+ * returned size must be used.
894
+ */
895
+ size_t gumbo_caret_diagnostic_to_string (
896
+ const GumboError* error,
897
+ const char* source_text,
898
+ size_t source_length,
899
+ char** output
900
+ );
901
+
902
+ /**
903
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
904
+ * instead of writing to a string.
905
+ */
906
+ void gumbo_print_caret_diagnostic (
907
+ const GumboError* error,
908
+ const char* source_text,
909
+ size_t source_length
910
+ );
911
+
869
912
  #ifdef __cplusplus
870
913
  }
871
914
  #endif
@@ -31,6 +31,7 @@
31
31
  #include "replacement.h"
32
32
  #include "tokenizer.h"
33
33
  #include "tokenizer_states.h"
34
+ #include "token_buffer.h"
34
35
  #include "utf8.h"
35
36
  #include "util.h"
36
37
  #include "vector.h"
@@ -42,7 +43,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
42
43
 
43
44
  #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
44
45
  #define kGumboEmptySourcePosition (const GumboSourcePosition) \
45
- GUMBO_EMPTY_SOURCE_POSITION_INIT
46
+ GUMBO_EMPTY_SOURCE_POSITION_INIT
46
47
 
47
48
  const GumboOptions kGumboDefaultOptions = {
48
49
  .tab_stop = 8,
@@ -59,25 +60,6 @@ const GumboOptions kGumboDefaultOptions = {
59
60
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
60
61
  #define TERMINATOR {.data = NULL, .length = 0}
61
62
 
62
- static const GumboStringPiece kPublicIdHtml4_0 =
63
- STRING("-//W3C//DTD HTML 4.0//EN");
64
- static const GumboStringPiece kPublicIdHtml4_01 =
65
- STRING("-//W3C//DTD HTML 4.01//EN");
66
- static const GumboStringPiece kPublicIdXhtml1_0 =
67
- STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
68
- static const GumboStringPiece kPublicIdXhtml1_1 =
69
- STRING("-//W3C//DTD XHTML 1.1//EN");
70
- static const GumboStringPiece kSystemIdRecHtml4_0 =
71
- STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
72
- static const GumboStringPiece kSystemIdHtml4 =
73
- STRING("http://www.w3.org/TR/html4/strict.dtd");
74
- static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
75
- STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
76
- static const GumboStringPiece kSystemIdXhtml1_1 =
77
- STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
78
- static const GumboStringPiece kSystemIdLegacyCompat =
79
- STRING("about:legacy-compat");
80
-
81
63
  // The doctype arrays have an explicit terminator because we want to pass them
82
64
  // to a helper function, and passing them as a pointer discards sizeof
83
65
  // information. The SVG arrays are used only by one-off functions, and so loops
@@ -260,6 +242,9 @@ typedef struct GumboInternalParserState {
260
242
  // The accumulated text node buffer state.
261
243
  TextNodeBufferState _text_node;
262
244
 
245
+ // The accumulated character tokens in tables for error purposes.
246
+ GumboCharacterTokenBuffer _table_character_tokens;
247
+
263
248
  // The current token.
264
249
  GumboToken* _current_token;
265
250
 
@@ -365,6 +350,7 @@ static void parser_state_init(GumboParser* parser) {
365
350
  parser_state->_foster_parent_insertions = false;
366
351
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
367
352
  gumbo_string_buffer_init(&parser_state->_text_node._buffer);
353
+ gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
368
354
  gumbo_vector_init(10, &parser_state->_open_elements);
369
355
  gumbo_vector_init(5, &parser_state->_active_formatting_elements);
370
356
  gumbo_vector_init(5, &parser_state->_template_insertion_modes);
@@ -463,6 +449,7 @@ static void parser_state_destroy(GumboParser* parser) {
463
449
  gumbo_vector_destroy(&state->_open_elements);
464
450
  gumbo_vector_destroy(&state->_template_insertion_modes);
465
451
  gumbo_string_buffer_destroy(&state->_text_node._buffer);
452
+ gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
466
453
  gumbo_free(state);
467
454
  }
468
455
 
@@ -573,11 +560,11 @@ static bool tag_in (
573
560
  static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
574
561
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
575
562
  return token->v.start_tag.tag == tag;
576
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
563
+ }
564
+ if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
577
565
  return token->v.end_tag.tag == tag;
578
- } else {
579
- return false;
580
566
  }
567
+ return false;
581
568
  }
582
569
 
583
570
  static inline bool tagset_includes (
@@ -738,18 +725,18 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) {
738
725
  assert(0);
739
726
  }
740
727
 
741
- static GumboError* parser_add_parse_error (
728
+ static void parser_add_parse_error (
742
729
  GumboParser* parser,
743
730
  const GumboToken* token
744
731
  ) {
745
732
  gumbo_debug("Adding parse error.\n");
746
733
  GumboError* error = gumbo_add_error(parser);
747
734
  if (!error) {
748
- return NULL;
735
+ return;
749
736
  }
750
737
  error->type = GUMBO_ERR_PARSER;
751
738
  error->position = token->position;
752
- error->original_text = token->original_text.data;
739
+ error->original_text = token->original_text;
753
740
  GumboParserError* extra_data = &error->v.parser;
754
741
  extra_data->input_type = token->type;
755
742
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
@@ -772,7 +759,6 @@ static GumboError* parser_add_parse_error (
772
759
  &extra_data->tag_stack
773
760
  );
774
761
  }
775
- return error;
776
762
  }
777
763
 
778
764
  // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
@@ -1639,9 +1625,11 @@ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node)
1639
1625
  const GumboNodeType type = current->type;
1640
1626
  if (current == node) {
1641
1627
  return true;
1642
- } else if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1628
+ }
1629
+ if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1643
1630
  continue;
1644
- } else if (node_tag_in_set(current, &tags)) {
1631
+ }
1632
+ if (node_tag_in_set(current, &tags)) {
1645
1633
  return false;
1646
1634
  }
1647
1635
  }
@@ -1689,8 +1677,8 @@ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag t
1689
1677
  // Pass GUMBO_TAG_LAST to not exclude any of them.
1690
1678
  static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1691
1679
  static const TagSet tags = {
1692
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP),
1693
- TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)
1680
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
1681
+ TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
1694
1682
  };
1695
1683
  while (
1696
1684
  node_tag_in_set(get_current_node(parser), &tags)
@@ -1704,15 +1692,36 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1704
1692
  // https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
1705
1693
  static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1706
1694
  static const TagSet tags = {
1707
- TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1708
- TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1709
- TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)
1695
+ TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
1696
+ TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1697
+ TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
1710
1698
  };
1711
1699
  while (node_tag_in_set(get_current_node(parser), &tags)) {
1712
1700
  pop_current_node(parser);
1713
1701
  }
1714
1702
  }
1715
1703
 
1704
+ // This factors out the clauses in the "in body" insertion mode checking "if
1705
+ // there is a node in the stack of open elements that is not" one of a list of
1706
+ // elements in which case it's a parse error.
1707
+ // This is used in "an end-of-file token", "an end tag whose tag name is
1708
+ // 'body'", and "an end tag whose tag name is 'html'".
1709
+ static bool stack_contains_nonclosable_element (
1710
+ GumboParser* parser
1711
+ ) {
1712
+ static const TagSet tags = {
1713
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
1714
+ TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
1715
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
1716
+ };
1717
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1718
+ for (size_t i = 0; i < open_elements->length; ++i) {
1719
+ if (!node_tag_in_set(open_elements->data[i], &tags))
1720
+ return true;
1721
+ }
1722
+ return false;
1723
+ }
1724
+
1716
1725
  // This factors out the clauses relating to "act as if an end tag token with tag
1717
1726
  // name "table" had been seen. Returns true if there's a table element in table
1718
1727
  // scope which was successfully closed, false if not and the token should be
@@ -1756,13 +1765,15 @@ static bool close_table_cell (
1756
1765
  // https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
1757
1766
  // This holds the logic to determine whether we should close a <td> or a <th>.
1758
1767
  static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1768
+ GumboTag cell_tag;
1759
1769
  if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1760
1770
  assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1761
- return close_table_cell(parser, token, GUMBO_TAG_TD);
1771
+ cell_tag = GUMBO_TAG_TD;
1762
1772
  } else {
1763
1773
  assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1764
- return close_table_cell(parser, token, GUMBO_TAG_TH);
1774
+ cell_tag = GUMBO_TAG_TH;
1765
1775
  }
1776
+ return close_table_cell(parser, token, cell_tag);
1766
1777
  }
1767
1778
 
1768
1779
  // This factors out the "act as if an end tag of tag name 'select' had been
@@ -1862,13 +1873,13 @@ static bool maybe_implicitly_close_p_tag (
1862
1873
 
1863
1874
  // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1864
1875
  // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1865
- static void maybe_implicitly_close_list_tag (
1876
+ static bool maybe_implicitly_close_list_tag (
1866
1877
  GumboParser* parser,
1867
1878
  GumboToken* token,
1868
1879
  bool is_li
1869
1880
  ) {
1870
1881
  GumboParserState* state = parser->_parser_state;
1871
- state->_frameset_ok = false;
1882
+ set_frameset_not_ok(parser);
1872
1883
  for (int i = state->_open_elements.length; --i >= 0;) {
1873
1884
  const GumboNode* node = state->_open_elements.data[i];
1874
1885
  bool is_list_tag = is_li
@@ -1876,21 +1887,21 @@ static void maybe_implicitly_close_list_tag (
1876
1887
  : node_tag_in_set(node, &dd_dt_tags)
1877
1888
  ;
1878
1889
  if (is_list_tag) {
1879
- implicitly_close_tags (
1890
+ return implicitly_close_tags (
1880
1891
  parser,
1881
1892
  token,
1882
1893
  node->v.element.tag_namespace,
1883
1894
  node->v.element.tag
1884
1895
  );
1885
- return;
1886
1896
  }
1887
1897
  if (
1888
1898
  is_special_node(node)
1889
1899
  && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
1890
1900
  ) {
1891
- return;
1901
+ return true;
1892
1902
  }
1893
1903
  }
1904
+ return true;
1894
1905
  }
1895
1906
 
1896
1907
  static void merge_attributes (
@@ -2009,36 +2020,17 @@ static void adjust_mathml_attributes(GumboToken* token) {
2009
2020
  attr->name = gumbo_strdup("definitionURL");
2010
2021
  }
2011
2022
 
2012
- static bool doctype_matches (
2013
- const GumboTokenDocType* doctype,
2014
- const GumboStringPiece* public_id,
2015
- const GumboStringPiece* system_id,
2016
- bool allow_missing_system_id
2017
- ) {
2018
- return
2019
- !strcmp(doctype->public_identifier, public_id->data)
2020
- && (allow_missing_system_id || doctype->has_system_identifier)
2021
- && !strcmp(doctype->system_identifier, system_id->data);
2022
- }
2023
-
2024
2023
  static bool maybe_add_doctype_error (
2025
2024
  GumboParser* parser,
2026
2025
  const GumboToken* token
2027
2026
  ) {
2028
2027
  const GumboTokenDocType* doctype = &token->v.doc_type;
2029
- bool html_doctype = !strcmp(doctype->name, "html");
2030
- if ((!html_doctype || doctype->has_public_identifier ||
2031
- (doctype->has_system_identifier &&
2032
- !strcmp(
2033
- doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
2034
- !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
2035
- &kSystemIdRecHtml4_0, true) ||
2036
- doctype_matches(doctype, &kPublicIdHtml4_01,
2037
- &kSystemIdHtml4, true) ||
2038
- doctype_matches(doctype, &kPublicIdXhtml1_0,
2039
- &kSystemIdXhtmlStrict1_1, false) ||
2040
- doctype_matches(doctype, &kPublicIdXhtml1_1,
2041
- &kSystemIdXhtml1_1, false)))) {
2028
+ if (
2029
+ strcmp(doctype->name, "html")
2030
+ || doctype->has_public_identifier
2031
+ || (doctype->has_system_identifier
2032
+ && strcmp(doctype->system_identifier, "about:legacy-compat"))
2033
+ ) {
2042
2034
  parser_add_parse_error(parser, token);
2043
2035
  return false;
2044
2036
  }
@@ -2069,6 +2061,8 @@ static void remove_from_parent(GumboNode* node) {
2069
2061
 
2070
2062
  // https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
2071
2063
  // Also described in the "in body" handling for end formatting tags.
2064
+ // Returns true if the algorithm handled the token and false to indicate that
2065
+ // it should be handled according to "any other end tag."
2072
2066
  static bool adoption_agency_algorithm (
2073
2067
  GumboParser* parser,
2074
2068
  GumboToken* token,
@@ -2076,7 +2070,7 @@ static bool adoption_agency_algorithm (
2076
2070
  ) {
2077
2071
  GumboParserState* state = parser->_parser_state;
2078
2072
  gumbo_debug("Entering adoption agency algorithm.\n");
2079
- // Step 1.
2073
+ // Step 2.
2080
2074
  GumboNode* current_node = get_current_node(parser);
2081
2075
  if (
2082
2076
  current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
@@ -2087,18 +2081,19 @@ static bool adoption_agency_algorithm (
2087
2081
  )
2088
2082
  ) {
2089
2083
  pop_current_node(parser);
2090
- return false;
2084
+ return true;
2091
2085
  }
2092
- // Steps 2-4 & 20:
2086
+ // Steps 3-5 & 21:
2093
2087
  for (unsigned int i = 0; i < 8; ++i) {
2094
- // Step 5.
2088
+ // Step 6.
2095
2089
  GumboNode* formatting_node = NULL;
2096
2090
  int formatting_node_in_open_elements = -1;
2097
2091
  for (int j = state->_active_formatting_elements.length; --j >= 0;) {
2098
2092
  GumboNode* current_node = state->_active_formatting_elements.data[j];
2099
2093
  if (current_node == &kActiveFormattingScopeMarker) {
2100
2094
  gumbo_debug("Broke on scope marker; aborting.\n");
2101
- // Last scope marker; abort the algorithm.
2095
+ // Last scope marker; abort the algorithm and handle according to "any
2096
+ // other end tag."
2102
2097
  return false;
2103
2098
  }
2104
2099
  if (node_html_tag_is(current_node, subject)) {
@@ -2124,7 +2119,7 @@ static bool adoption_agency_algorithm (
2124
2119
  return false;
2125
2120
  }
2126
2121
 
2127
- // Step 6
2122
+ // Step 7
2128
2123
  if (formatting_node_in_open_elements == -1) {
2129
2124
  gumbo_debug("Formatting node not on stack of open elements.\n");
2130
2125
  parser_add_parse_error(parser, token);
@@ -2132,17 +2127,17 @@ static bool adoption_agency_algorithm (
2132
2127
  formatting_node,
2133
2128
  &state->_active_formatting_elements
2134
2129
  );
2135
- return false;
2130
+ return true;
2136
2131
  }
2137
2132
 
2138
- // Step 7
2133
+ // Step 8
2139
2134
  if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
2140
2135
  parser_add_parse_error(parser, token);
2141
2136
  gumbo_debug("Element not in scope.\n");
2142
- return false;
2137
+ return true;
2143
2138
  }
2144
2139
 
2145
- // Step 8
2140
+ // Step 9
2146
2141
  if (formatting_node != get_current_node(parser)) {
2147
2142
  parser_add_parse_error(parser, token); // But continue onwards.
2148
2143
  }
@@ -2150,7 +2145,7 @@ static bool adoption_agency_algorithm (
2150
2145
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
2151
2146
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
2152
2147
 
2153
- // Step 9 & 10
2148
+ // Step 10
2154
2149
  GumboNode* furthest_block = NULL;
2155
2150
  for (
2156
2151
  unsigned int j = formatting_node_in_open_elements;
@@ -2160,32 +2155,27 @@ static bool adoption_agency_algorithm (
2160
2155
  assert(j > 0);
2161
2156
  GumboNode* current = state->_open_elements.data[j];
2162
2157
  if (is_special_node(current)) {
2163
- // Step 9.
2164
2158
  furthest_block = current;
2165
2159
  break;
2166
2160
  }
2167
2161
  }
2162
+ // Step 11.
2168
2163
  if (!furthest_block) {
2169
- // Step 10.
2170
- while (get_current_node(parser) != formatting_node) {
2171
- pop_current_node(parser);
2172
- }
2173
- // And the formatting element itself.
2174
- pop_current_node(parser);
2164
+ while (pop_current_node(parser) != formatting_node)
2165
+ ;
2175
2166
  gumbo_vector_remove (
2176
2167
  formatting_node,
2177
2168
  &state->_active_formatting_elements
2178
2169
  );
2179
- return false;
2170
+ return true;
2180
2171
  }
2181
2172
  assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
2182
- assert(furthest_block);
2183
2173
 
2184
- // Step 11.
2174
+ // Step 12.
2185
2175
  // Elements may be moved and reparented by this algorithm, so
2186
2176
  // common_ancestor is not necessarily the same as formatting_node->parent.
2187
2177
  GumboNode* common_ancestor = state->_open_elements.data [
2188
- gumbo_vector_index_of(&state->_open_elements, formatting_node) - 1
2178
+ formatting_node_in_open_elements - 1
2189
2179
  ];
2190
2180
  gumbo_debug (
2191
2181
  "Common ancestor tag = %s, furthest block tag = %s.\n",
@@ -2193,24 +2183,24 @@ static bool adoption_agency_algorithm (
2193
2183
  gumbo_normalized_tagname(furthest_block->v.element.tag)
2194
2184
  );
2195
2185
 
2196
- // Step 12.
2186
+ // Step 13.
2197
2187
  int bookmark = 1 + gumbo_vector_index_of (
2198
2188
  &state->_active_formatting_elements,
2199
2189
  formatting_node
2200
2190
  );
2201
2191
  gumbo_debug("Bookmark at %d.\n", bookmark);
2202
- // Step 13.
2192
+ // Step 14.
2203
2193
  GumboNode* node = furthest_block;
2204
2194
  GumboNode* last_node = furthest_block;
2205
2195
  // Must be stored explicitly, in case node is removed from the stack of open
2206
- // elements, to handle step 9.4.
2196
+ // elements, to handle step 14.3.
2207
2197
  int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
2208
2198
  assert(saved_node_index > 0);
2209
- // Step 13.1.
2199
+ // Step 14.1.
2210
2200
  for (int j = 0;;) {
2211
- // Step 13.2.
2201
+ // Step 14.2.
2212
2202
  ++j;
2213
- // Step 13.3.
2203
+ // Step 14.3.
2214
2204
  int node_index = gumbo_vector_index_of(&state->_open_elements, node);
2215
2205
  gumbo_debug (
2216
2206
  "Current index: %d, last index: %d.\n",
@@ -2225,16 +2215,16 @@ static bool adoption_agency_algorithm (
2225
2215
  assert((unsigned int) node_index < state->_open_elements.capacity);
2226
2216
  node = state->_open_elements.data[node_index];
2227
2217
  assert(node->parent);
2218
+ // Step 14.4.
2228
2219
  if (node == formatting_node) {
2229
- // Step 13.4.
2230
2220
  break;
2231
2221
  }
2232
2222
  int formatting_index = gumbo_vector_index_of (
2233
2223
  &state->_active_formatting_elements,
2234
2224
  node
2235
2225
  );
2226
+ // Step 14.5.
2236
2227
  if (j > 3 && formatting_index != -1) {
2237
- // Step 13.5.
2238
2228
  gumbo_debug("Removing formatting element at %d.\n", formatting_index);
2239
2229
  gumbo_vector_remove_at (
2240
2230
  formatting_index,
@@ -2249,11 +2239,11 @@ static bool adoption_agency_algorithm (
2249
2239
  continue;
2250
2240
  }
2251
2241
  if (formatting_index == -1) {
2252
- // Step 13.6.
2242
+ // Step 14.6.
2253
2243
  gumbo_vector_remove_at(node_index, &state->_open_elements);
2254
2244
  continue;
2255
2245
  }
2256
- // Step 13.7.
2246
+ // Step 14.7.
2257
2247
  // "common ancestor as the intended parent" doesn't actually mean insert
2258
2248
  // it into the common ancestor; that happens below.
2259
2249
  node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
@@ -2261,21 +2251,21 @@ static bool adoption_agency_algorithm (
2261
2251
  state->_active_formatting_elements.data[formatting_index] = node;
2262
2252
  assert(node_index >= 0);
2263
2253
  state->_open_elements.data[node_index] = node;
2264
- // Step 13.8.
2254
+ // Step 14.8.
2265
2255
  if (last_node == furthest_block) {
2266
2256
  bookmark = formatting_index + 1;
2267
2257
  gumbo_debug("Bookmark moved to %d.\n", bookmark);
2268
2258
  assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2269
2259
  }
2270
- // Step 13.9.
2260
+ // Step 14.9.
2271
2261
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2272
2262
  remove_from_parent(last_node);
2273
2263
  append_node(node, last_node);
2274
- // Step 13.10.
2264
+ // Step 14.10.
2275
2265
  last_node = node;
2276
- } // Step 13.11.
2266
+ } // Step 14.11.
2277
2267
 
2278
- // Step 14.
2268
+ // Step 15.
2279
2269
  gumbo_debug (
2280
2270
  "Removing %s node from parent ",
2281
2271
  gumbo_normalized_tagname(last_node->v.element.tag)
@@ -2292,14 +2282,14 @@ static bool adoption_agency_algorithm (
2292
2282
  );
2293
2283
  insert_node(last_node, location);
2294
2284
 
2295
- // Step 15.
2285
+ // Step 16.
2296
2286
  GumboNode* new_formatting_node = clone_node (
2297
2287
  formatting_node,
2298
2288
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
2299
2289
  );
2300
2290
  formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2301
2291
 
2302
- // Step 16. Instead of appending nodes one-by-one, we swap the children
2292
+ // Step 17. Instead of appending nodes one-by-one, we swap the children
2303
2293
  // vector of furthest_block with the empty children of new_formatting_node,
2304
2294
  // reducing memory traffic and allocations. We still have to reset their
2305
2295
  // parent pointers, though.
@@ -2313,10 +2303,10 @@ static bool adoption_agency_algorithm (
2313
2303
  child->parent = new_formatting_node;
2314
2304
  }
2315
2305
 
2316
- // Step 17.
2306
+ // Step 18.
2317
2307
  append_node(furthest_block, new_formatting_node);
2318
2308
 
2319
- // Step 18.
2309
+ // Step 19.
2320
2310
  // If the formatting node was before the bookmark, it may shift over all
2321
2311
  // indices after it, so we need to explicitly find the index and possibly
2322
2312
  // adjust the bookmark.
@@ -2344,7 +2334,7 @@ static bool adoption_agency_algorithm (
2344
2334
  &state->_active_formatting_elements
2345
2335
  );
2346
2336
 
2347
- // Step 19.
2337
+ // Step 20.
2348
2338
  gumbo_vector_remove(formatting_node, &state->_open_elements);
2349
2339
  int insert_at = 1 + gumbo_vector_index_of (
2350
2340
  &state->_open_elements,
@@ -2357,7 +2347,7 @@ static bool adoption_agency_algorithm (
2357
2347
  insert_at,
2358
2348
  &state->_open_elements
2359
2349
  );
2360
- } // Step 20.
2350
+ } // Step 21.
2361
2351
  return true;
2362
2352
  }
2363
2353
 
@@ -2406,10 +2396,12 @@ static bool handle_initial(GumboParser* parser, GumboToken* token) {
2406
2396
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2407
2397
  ignore_token(parser);
2408
2398
  return true;
2409
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2399
+ }
2400
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2410
2401
  append_comment_node(parser, get_document_node(parser), token);
2411
2402
  return true;
2412
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403
+ }
2404
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2413
2405
  document->has_doctype = true;
2414
2406
  document->name = token->v.doc_type.name;
2415
2407
  document->public_identifier = token->v.doc_type.public_identifier;
@@ -2431,95 +2423,108 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2431
2423
  parser_add_parse_error(parser, token);
2432
2424
  ignore_token(parser);
2433
2425
  return false;
2434
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2426
+ }
2427
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2435
2428
  append_comment_node(parser, get_document_node(parser), token);
2436
2429
  return true;
2437
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2430
+ }
2431
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2438
2432
  ignore_token(parser);
2439
2433
  return true;
2440
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2434
+ }
2435
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2441
2436
  GumboNode* html_node = insert_element_from_token(parser, token);
2442
2437
  parser->_output->root = html_node;
2443
2438
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2444
2439
  return true;
2445
- } else if (
2440
+ }
2441
+ if (
2446
2442
  token->type == GUMBO_TOKEN_END_TAG
2447
2443
  && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2448
2444
  ) {
2449
2445
  parser_add_parse_error(parser, token);
2450
2446
  ignore_token(parser);
2451
2447
  return false;
2452
- } else {
2453
- GumboNode* html_node = insert_element_of_tag_type (
2454
- parser,
2455
- GUMBO_TAG_HTML,
2456
- GUMBO_INSERTION_IMPLIED
2457
- );
2458
- assert(html_node);
2459
- parser->_output->root = html_node;
2460
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2461
- parser->_parser_state->_reprocess_current_token = true;
2462
- return true;
2463
2448
  }
2449
+ GumboNode* html_node = insert_element_of_tag_type (
2450
+ parser,
2451
+ GUMBO_TAG_HTML,
2452
+ GUMBO_INSERTION_IMPLIED
2453
+ );
2454
+ assert(html_node);
2455
+ parser->_output->root = html_node;
2456
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2457
+ parser->_parser_state->_reprocess_current_token = true;
2458
+ return true;
2464
2459
  }
2465
2460
 
2461
+ // Forward declarations because of mutual dependencies.
2462
+ static bool handle_token(GumboParser* parser, GumboToken* token);
2463
+ static bool handle_in_body(GumboParser* parser, GumboToken* token);
2464
+ static bool handle_in_template(GumboParser* parser, GumboToken* token);
2465
+
2466
2466
  // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
2467
2467
  static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2468
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2469
- parser_add_parse_error(parser, token);
2468
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2470
2469
  ignore_token(parser);
2471
- return false;
2472
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2470
+ return true;
2471
+ }
2472
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2473
2473
  append_comment_node(parser, get_current_node(parser), token);
2474
2474
  return true;
2475
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2475
+ }
2476
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2477
+ parser_add_parse_error(parser, token);
2476
2478
  ignore_token(parser);
2477
- return true;
2478
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2479
+ return false;
2480
+ }
2481
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2482
+ return handle_in_body(parser, token);
2483
+ }
2484
+ if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2479
2485
  GumboNode* node = insert_element_from_token(parser, token);
2480
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2481
2486
  parser->_parser_state->_head_element = node;
2487
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2482
2488
  return true;
2483
- } else if (
2489
+ }
2490
+ if (
2484
2491
  token->type == GUMBO_TOKEN_END_TAG
2485
- && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2492
+ && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2486
2493
  ) {
2487
2494
  parser_add_parse_error(parser, token);
2488
2495
  ignore_token(parser);
2489
2496
  return false;
2490
- } else {
2491
- GumboNode* node = insert_element_of_tag_type (
2492
- parser,
2493
- GUMBO_TAG_HEAD,
2494
- GUMBO_INSERTION_IMPLIED
2495
- );
2496
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2497
- parser->_parser_state->_head_element = node;
2498
- parser->_parser_state->_reprocess_current_token = true;
2499
- return true;
2500
2497
  }
2498
+ GumboNode* node = insert_element_of_tag_type (
2499
+ parser,
2500
+ GUMBO_TAG_HEAD,
2501
+ GUMBO_INSERTION_IMPLIED
2502
+ );
2503
+ parser->_parser_state->_head_element = node;
2504
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2505
+ parser->_parser_state->_reprocess_current_token = true;
2506
+ return true;
2501
2507
  }
2502
2508
 
2503
- // Forward declarations because of mutual dependencies.
2504
- static bool handle_token(GumboParser* parser, GumboToken* token);
2505
- static bool handle_in_body(GumboParser* parser, GumboToken* token);
2506
- static bool handle_in_template(GumboParser* parser, GumboToken* token);
2507
-
2508
2509
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
2509
2510
  static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2510
2511
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2511
2512
  insert_text_token(parser, token);
2512
2513
  return true;
2513
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2514
+ }
2515
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2516
+ append_comment_node(parser, get_current_node(parser), token);
2517
+ return true;
2518
+ }
2519
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2514
2520
  parser_add_parse_error(parser, token);
2515
2521
  ignore_token(parser);
2516
2522
  return false;
2517
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2518
- append_comment_node(parser, get_current_node(parser), token);
2519
- return true;
2520
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2523
+ }
2524
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2521
2525
  return handle_in_body(parser, token);
2522
- } else if (
2526
+ }
2527
+ if (
2523
2528
  tag_in(token, kStartTag, &(const TagSet) {
2524
2529
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
2525
2530
  })
@@ -2528,7 +2533,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2528
2533
  pop_current_node(parser);
2529
2534
  acknowledge_self_closing_tag(parser);
2530
2535
  return true;
2531
- } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2536
+ }
2537
+ if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2532
2538
  insert_element_from_token(parser, token);
2533
2539
  pop_current_node(parser);
2534
2540
  acknowledge_self_closing_tag(parser);
@@ -2537,42 +2543,50 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2537
2543
  // should specifically look for that string in the document and re-encode it
2538
2544
  // before passing to Gumbo.
2539
2545
  return true;
2540
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2546
+ }
2547
+ if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2541
2548
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2542
2549
  return true;
2543
- } else if (
2550
+ }
2551
+ if (
2544
2552
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2545
2553
  ) {
2546
2554
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2547
2555
  return true;
2548
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2556
+ }
2557
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2549
2558
  insert_element_from_token(parser, token);
2550
2559
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2551
2560
  return true;
2552
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2553
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2561
+ }
2562
+ if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2563
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
2554
2564
  return true;
2555
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2565
+ }
2566
+ if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2556
2567
  GumboNode* head = pop_current_node(parser);
2557
2568
  UNUSED_IF_NDEBUG(head);
2558
2569
  assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2559
2570
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2560
2571
  return true;
2561
- } else if (
2572
+ }
2573
+ if (
2562
2574
  tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2563
2575
  ) {
2564
2576
  pop_current_node(parser);
2565
2577
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2566
2578
  parser->_parser_state->_reprocess_current_token = true;
2567
2579
  return true;
2568
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2580
+ }
2581
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2569
2582
  insert_element_from_token(parser, token);
2570
2583
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
2571
- parser->_parser_state->_frameset_ok = false;
2584
+ set_frameset_not_ok(parser);
2572
2585
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2573
2586
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2574
2587
  return true;
2575
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2588
+ }
2589
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2576
2590
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2577
2591
  parser_add_parse_error(parser, token);
2578
2592
  ignore_token(parser);
@@ -2590,19 +2604,18 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2590
2604
  pop_template_insertion_mode(parser);
2591
2605
  reset_insertion_mode_appropriately(parser);
2592
2606
  return success;
2593
- } else if (
2607
+ }
2608
+ if (
2594
2609
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2595
2610
  || (token->type == GUMBO_TOKEN_END_TAG)
2596
2611
  ) {
2597
2612
  parser_add_parse_error(parser, token);
2598
2613
  ignore_token(parser);
2599
2614
  return false;
2600
- } else {
2601
- pop_current_node(parser);
2602
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2603
- parser->_parser_state->_reprocess_current_token = true;
2604
- return true;
2605
2615
  }
2616
+ pop_current_node(parser);
2617
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2618
+ parser->_parser_state->_reprocess_current_token = true;
2606
2619
  return true;
2607
2620
  }
2608
2621
 
@@ -2611,15 +2624,18 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2611
2624
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2612
2625
  parser_add_parse_error(parser, token);
2613
2626
  return false;
2614
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2627
+ }
2628
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2615
2629
  return handle_in_body(parser, token);
2616
- } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2630
+ }
2631
+ if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2617
2632
  const GumboNode* node = pop_current_node(parser);
2618
2633
  assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2619
2634
  UNUSED_IF_NDEBUG(node);
2620
2635
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2621
2636
  return true;
2622
- } else if (
2637
+ }
2638
+ if (
2623
2639
  token->type == GUMBO_TOKEN_WHITESPACE
2624
2640
  || token->type == GUMBO_TOKEN_COMMENT
2625
2641
  || tag_in (token, kStartTag, &(const TagSet) {
@@ -2628,7 +2644,8 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2628
2644
  })
2629
2645
  ) {
2630
2646
  return handle_in_head(parser, token);
2631
- } else if (
2647
+ }
2648
+ if (
2632
2649
  tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
2633
2650
  || (
2634
2651
  token->type == GUMBO_TOKEN_END_TAG
@@ -2638,15 +2655,14 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2638
2655
  parser_add_parse_error(parser, token);
2639
2656
  ignore_token(parser);
2640
2657
  return false;
2641
- } else {
2642
- parser_add_parse_error(parser, token);
2643
- const GumboNode* node = pop_current_node(parser);
2644
- assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2645
- UNUSED_IF_NDEBUG(node);
2646
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2647
- parser->_parser_state->_reprocess_current_token = true;
2648
- return false;
2649
2658
  }
2659
+ parser_add_parse_error(parser, token);
2660
+ const GumboNode* node = pop_current_node(parser);
2661
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2662
+ UNUSED_IF_NDEBUG(node);
2663
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2664
+ parser->_parser_state->_reprocess_current_token = true;
2665
+ return false;
2650
2666
  }
2651
2667
 
2652
2668
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
@@ -2655,25 +2671,31 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2655
2671
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2656
2672
  insert_text_token(parser, token);
2657
2673
  return true;
2658
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2674
+ }
2675
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2676
+ append_comment_node(parser, get_current_node(parser), token);
2677
+ return true;
2678
+ }
2679
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2659
2680
  parser_add_parse_error(parser, token);
2660
2681
  ignore_token(parser);
2661
2682
  return false;
2662
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2663
- append_comment_node(parser, get_current_node(parser), token);
2664
- return true;
2665
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2683
+ }
2684
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2666
2685
  return handle_in_body(parser, token);
2667
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2686
+ }
2687
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2668
2688
  insert_element_from_token(parser, token);
2669
- state->_frameset_ok = false;
2689
+ set_frameset_not_ok(parser);
2670
2690
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2671
2691
  return true;
2672
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2692
+ }
2693
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2673
2694
  insert_element_from_token(parser, token);
2674
2695
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2675
2696
  return true;
2676
- } else if (
2697
+ }
2698
+ if (
2677
2699
  tag_in(token, kStartTag, &(const TagSet) {
2678
2700
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
2679
2701
  TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
@@ -2685,12 +2707,14 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2685
2707
  // pending character tokens that should be attached to the root.
2686
2708
  maybe_flush_text_node_buffer(parser);
2687
2709
  gumbo_vector_add(state->_head_element, &state->_open_elements);
2688
- bool result = handle_in_head(parser, token);
2710
+ handle_in_head(parser, token);
2689
2711
  gumbo_vector_remove(state->_head_element, &state->_open_elements);
2690
- return result;
2691
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2712
+ return false;
2713
+ }
2714
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2692
2715
  return handle_in_head(parser, token);
2693
- } else if (
2716
+ }
2717
+ if (
2694
2718
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2695
2719
  || (
2696
2720
  token->type == GUMBO_TOKEN_END_TAG
@@ -2700,12 +2724,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2700
2724
  parser_add_parse_error(parser, token);
2701
2725
  ignore_token(parser);
2702
2726
  return false;
2703
- } else {
2704
- insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2705
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2706
- state->_reprocess_current_token = true;
2707
- return true;
2708
2727
  }
2728
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2729
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2730
+ state->_reprocess_current_token = true;
2731
+ return true;
2709
2732
  }
2710
2733
 
2711
2734
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
@@ -2716,11 +2739,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2716
2739
  parser_add_parse_error(parser, token);
2717
2740
  ignore_token(parser);
2718
2741
  return false;
2719
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2742
+ }
2743
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2720
2744
  reconstruct_active_formatting_elements(parser);
2721
2745
  insert_text_token(parser, token);
2722
2746
  return true;
2723
- } else if (
2747
+ }
2748
+ if (
2724
2749
  token->type == GUMBO_TOKEN_CHARACTER
2725
2750
  || token->type == GUMBO_TOKEN_CDATA
2726
2751
  ) {
@@ -2728,14 +2753,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2728
2753
  insert_text_token(parser, token);
2729
2754
  set_frameset_not_ok(parser);
2730
2755
  return true;
2731
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2756
+ }
2757
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2732
2758
  append_comment_node(parser, get_current_node(parser), token);
2733
2759
  return true;
2734
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2760
+ }
2761
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2735
2762
  parser_add_parse_error(parser, token);
2736
2763
  ignore_token(parser);
2737
2764
  return false;
2738
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2765
+ }
2766
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2739
2767
  parser_add_parse_error(parser, token);
2740
2768
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2741
2769
  ignore_token(parser);
@@ -2745,7 +2773,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2745
2773
  assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2746
2774
  merge_attributes(token, parser->_output->root);
2747
2775
  return false;
2748
- } else if (
2776
+ }
2777
+ if (
2749
2778
  tag_in(token, kStartTag, &(const TagSet) {
2750
2779
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2751
2780
  TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
@@ -2754,7 +2783,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2754
2783
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
2755
2784
  ) {
2756
2785
  return handle_in_head(parser, token);
2757
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2786
+ }
2787
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2758
2788
  parser_add_parse_error(parser, token);
2759
2789
  if (
2760
2790
  state->_open_elements.length < 2
@@ -2762,12 +2792,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2762
2792
  || has_open_element(parser, GUMBO_TAG_TEMPLATE)
2763
2793
  ) {
2764
2794
  ignore_token(parser);
2765
- return false;
2795
+ } else {
2796
+ set_frameset_not_ok(parser);
2797
+ merge_attributes(token, state->_open_elements.data[1]);
2766
2798
  }
2767
- state->_frameset_ok = false;
2768
- merge_attributes(token, state->_open_elements.data[1]);
2769
2799
  return false;
2770
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2800
+ }
2801
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2771
2802
  parser_add_parse_error(parser, token);
2772
2803
  if (
2773
2804
  state->_open_elements.length < 2
@@ -2808,64 +2839,64 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2808
2839
  insert_element_from_token(parser, token);
2809
2840
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2810
2841
  return true;
2811
- } else if (token->type == GUMBO_TOKEN_EOF) {
2812
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2813
- if (
2814
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2815
- TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT),
2816
- TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2817
- })
2818
- ) {
2819
- parser_add_parse_error(parser, token);
2820
- }
2821
- }
2842
+ }
2843
+ if (token->type == GUMBO_TOKEN_EOF) {
2822
2844
  if (get_current_template_insertion_mode(parser) !=
2823
2845
  GUMBO_INSERTION_MODE_INITIAL) {
2824
2846
  return handle_in_template(parser, token);
2825
2847
  }
2848
+ if (stack_contains_nonclosable_element(parser)) {
2849
+ parser_add_parse_error(parser, token);
2850
+ return false;
2851
+ }
2826
2852
  return true;
2827
- } else if (tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML)})) {
2853
+ }
2854
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
2828
2855
  if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2829
2856
  parser_add_parse_error(parser, token);
2830
2857
  ignore_token(parser);
2831
2858
  return false;
2832
2859
  }
2833
2860
  bool success = true;
2834
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2835
- if (
2836
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2837
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
2838
- TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
2839
- TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2840
- })
2841
- ) {
2842
- parser_add_parse_error(parser, token);
2843
- success = false;
2844
- break;
2845
- }
2861
+ if (stack_contains_nonclosable_element(parser)) {
2862
+ parser_add_parse_error(parser, token);
2863
+ success = false;
2846
2864
  }
2865
+ GumboNode* body = state->_open_elements.data[1];
2866
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2867
+ record_end_of_element(state->_current_token, &body->v.element);
2847
2868
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2848
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2849
- parser->_parser_state->_reprocess_current_token = true;
2850
- } else {
2851
- GumboNode* body = state->_open_elements.data[1];
2852
- assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2853
- record_end_of_element(state->_current_token, &body->v.element);
2869
+ return success;
2870
+ }
2871
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2872
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2873
+ parser_add_parse_error(parser, token);
2874
+ ignore_token(parser);
2875
+ return false;
2854
2876
  }
2877
+ bool success = true;
2878
+ if (stack_contains_nonclosable_element(parser)) {
2879
+ parser_add_parse_error(parser, token);
2880
+ success = false;
2881
+ }
2882
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2883
+ parser->_parser_state->_reprocess_current_token = true;
2855
2884
  return success;
2856
- } else if (
2885
+ }
2886
+ if (
2857
2887
  tag_in(token, kStartTag, &(const TagSet) {
2858
2888
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
2859
2889
  TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2860
2890
  TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2861
- TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2891
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2862
2892
  TAG(SUMMARY), TAG(UL)
2863
2893
  })
2864
2894
  ) {
2865
2895
  bool result = maybe_implicitly_close_p_tag(parser, token);
2866
2896
  insert_element_from_token(parser, token);
2867
2897
  return result;
2868
- } else if (tag_in(token, kStartTag, &heading_tags)) {
2898
+ }
2899
+ if (tag_in(token, kStartTag, &heading_tags)) {
2869
2900
  bool result = maybe_implicitly_close_p_tag(parser, token);
2870
2901
  if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
2871
2902
  parser_add_parse_error(parser, token);
@@ -2874,13 +2905,15 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2874
2905
  }
2875
2906
  insert_element_from_token(parser, token);
2876
2907
  return result;
2877
- } else if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2908
+ }
2909
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2878
2910
  bool result = maybe_implicitly_close_p_tag(parser, token);
2879
2911
  insert_element_from_token(parser, token);
2880
2912
  state->_ignore_next_linefeed = true;
2881
- state->_frameset_ok = false;
2913
+ set_frameset_not_ok(parser);
2882
2914
  return result;
2883
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2915
+ }
2916
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2884
2917
  if (
2885
2918
  state->_form_element != NULL
2886
2919
  && !has_open_element(parser, GUMBO_TAG_TEMPLATE)
@@ -2896,38 +2929,42 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2896
2929
  state->_form_element = form_element;
2897
2930
  }
2898
2931
  return result;
2899
- } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2900
- maybe_implicitly_close_list_tag(parser, token, true);
2901
- bool result = maybe_implicitly_close_p_tag(parser, token);
2932
+ }
2933
+ if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2934
+ bool result = maybe_implicitly_close_list_tag(parser, token, true);
2935
+ result = maybe_implicitly_close_p_tag(parser, token) && result;
2902
2936
  insert_element_from_token(parser, token);
2903
2937
  return result;
2904
- } else if (tag_in(token, kStartTag, &dd_dt_tags)) {
2905
- maybe_implicitly_close_list_tag(parser, token, false);
2906
- bool result = maybe_implicitly_close_p_tag(parser, token);
2938
+ }
2939
+ if (tag_in(token, kStartTag, &dd_dt_tags)) {
2940
+ bool result = maybe_implicitly_close_list_tag(parser, token, false);
2941
+ result = maybe_implicitly_close_p_tag(parser, token) && result;
2907
2942
  insert_element_from_token(parser, token);
2908
2943
  return result;
2909
- } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2944
+ }
2945
+ if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2910
2946
  bool result = maybe_implicitly_close_p_tag(parser, token);
2911
2947
  insert_element_from_token(parser, token);
2912
2948
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2913
2949
  return result;
2914
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2950
+ }
2951
+ if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2952
+ bool success = true;
2915
2953
  if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2916
2954
  parser_add_parse_error(parser, token);
2917
- implicitly_close_tags (
2918
- parser,
2919
- token,
2920
- GUMBO_NAMESPACE_HTML,
2921
- GUMBO_TAG_BUTTON
2922
- );
2923
- state->_reprocess_current_token = true;
2924
- return false;
2955
+ success = false;
2956
+ // We don't want to use implicitly_close_tags here because it may add an
2957
+ // error and we've already added the only error the standard specifies.
2958
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2959
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
2960
+ ;
2925
2961
  }
2926
2962
  reconstruct_active_formatting_elements(parser);
2927
2963
  insert_element_from_token(parser, token);
2928
- state->_frameset_ok = false;
2929
- return true;
2930
- } else if (
2964
+ set_frameset_not_ok(parser);
2965
+ return success;
2966
+ }
2967
+ if (
2931
2968
  tag_in(token, kEndTag, &(const TagSet) {
2932
2969
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
2933
2970
  TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
@@ -2942,14 +2979,14 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2942
2979
  ignore_token(parser);
2943
2980
  return false;
2944
2981
  }
2945
- implicitly_close_tags (
2982
+ return implicitly_close_tags (
2946
2983
  parser,
2947
2984
  token,
2948
2985
  GUMBO_NAMESPACE_HTML,
2949
2986
  token->v.end_tag.tag
2950
2987
  );
2951
- return true;
2952
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2988
+ }
2989
+ if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2953
2990
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2954
2991
  if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2955
2992
  parser_add_parse_error(parser, token);
@@ -2960,7 +2997,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2960
2997
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2961
2998
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2962
2999
  parser_add_parse_error(parser, token);
2963
- return false;
3000
+ success = false;
2964
3001
  }
2965
3002
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2966
3003
  ;
@@ -2992,7 +3029,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2992
3029
  gumbo_vector_remove_at(index, open_elements);
2993
3030
  return result;
2994
3031
  }
2995
- } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3032
+ }
3033
+ if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3034
+ bool success = true;
2996
3035
  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2997
3036
  parser_add_parse_error(parser, token);
2998
3037
  // reconstruct_active_formatting_elements(parser);
@@ -3001,16 +3040,16 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3001
3040
  GUMBO_TAG_P,
3002
3041
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3003
3042
  );
3004
- state->_reprocess_current_token = true;
3005
- return false;
3043
+ success = false;
3006
3044
  }
3007
3045
  return implicitly_close_tags (
3008
3046
  parser,
3009
3047
  token,
3010
3048
  GUMBO_NAMESPACE_HTML,
3011
3049
  GUMBO_TAG_P
3012
- );
3013
- } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3050
+ ) && success;
3051
+ }
3052
+ if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3014
3053
  if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
3015
3054
  parser_add_parse_error(parser, token);
3016
3055
  ignore_token(parser);
@@ -3022,8 +3061,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3022
3061
  GUMBO_NAMESPACE_HTML,
3023
3062
  GUMBO_TAG_LI
3024
3063
  );
3025
- } else if (tag_in(token, kEndTag, &dd_dt_tags)) {
3026
- assert(token->type == GUMBO_TOKEN_END_TAG);
3064
+ }
3065
+ if (tag_in(token, kEndTag, &dd_dt_tags)) {
3027
3066
  GumboTag token_tag = token->v.end_tag.tag;
3028
3067
  if (!has_an_element_in_scope(parser, token_tag)) {
3029
3068
  parser_add_parse_error(parser, token);
@@ -3036,7 +3075,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3036
3075
  GUMBO_NAMESPACE_HTML,
3037
3076
  token_tag
3038
3077
  );
3039
- } else if (tag_in(token, kEndTag, &heading_tags)) {
3078
+ }
3079
+ if (tag_in(token, kEndTag, &heading_tags)) {
3040
3080
  if (
3041
3081
  !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
3042
3082
  GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
@@ -3047,30 +3087,31 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3047
3087
  parser_add_parse_error(parser, token);
3048
3088
  ignore_token(parser);
3049
3089
  return false;
3050
- } else {
3051
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3052
- const GumboNode* current_node = get_current_node(parser);
3053
- bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
3054
- if (!success) {
3055
- // There're children of the heading currently open; close them below and
3056
- // record a parse error.
3057
- // TODO(jdtang): Add a way to distinguish this error case from the one
3058
- // above.
3059
- parser_add_parse_error(parser, token);
3060
- }
3061
- do {
3062
- current_node = pop_current_node(parser);
3063
- } while (!node_tag_in_set(current_node, &heading_tags));
3064
- return success;
3065
3090
  }
3066
- } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3091
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3092
+ const GumboNode* current_node = get_current_node(parser);
3093
+ bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
3094
+ if (!success) {
3095
+ // There're children of the heading currently open; close them below and
3096
+ // record a parse error.
3097
+ // TODO(jdtang): Add a way to distinguish this error case from the one
3098
+ // above.
3099
+ parser_add_parse_error(parser, token);
3100
+ }
3101
+ do {
3102
+ current_node = pop_current_node(parser);
3103
+ } while (!node_tag_in_set(current_node, &heading_tags));
3104
+ return success;
3105
+ }
3106
+ if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3067
3107
  bool success = true;
3068
3108
  int last_a;
3069
3109
  int has_matching_a = find_last_anchor_index(parser, &last_a);
3070
3110
  if (has_matching_a) {
3071
3111
  assert(has_matching_a == 1);
3072
3112
  parser_add_parse_error(parser, token);
3073
- adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
3113
+ bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
3114
+ assert(handled);
3074
3115
  // The adoption agency algorithm usually removes all instances of <a>
3075
3116
  // from the list of active formatting elements, but in case it doesn't,
3076
3117
  // we're supposed to do this. (The conditions where it might not are
@@ -3087,7 +3128,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3087
3128
  reconstruct_active_formatting_elements(parser);
3088
3129
  add_formatting_element(parser, insert_element_from_token(parser, token));
3089
3130
  return success;
3090
- } else if (
3131
+ }
3132
+ if (
3091
3133
  tag_in(token, kStartTag, &(const TagSet) {
3092
3134
  TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
3093
3135
  TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
@@ -3096,27 +3138,33 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3096
3138
  reconstruct_active_formatting_elements(parser);
3097
3139
  add_formatting_element(parser, insert_element_from_token(parser, token));
3098
3140
  return true;
3099
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3141
+ }
3142
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3100
3143
  bool result = true;
3101
3144
  reconstruct_active_formatting_elements(parser);
3102
3145
  if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
3103
3146
  result = false;
3104
3147
  parser_add_parse_error(parser, token);
3105
- adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
3148
+ bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
3149
+ assert(handled);
3106
3150
  reconstruct_active_formatting_elements(parser);
3107
3151
  }
3108
3152
  insert_element_from_token(parser, token);
3109
3153
  add_formatting_element(parser, get_current_node(parser));
3110
3154
  return result;
3111
- } else if (
3155
+ }
3156
+ if (
3112
3157
  tag_in(token, kEndTag, &(const TagSet) {
3113
3158
  TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
3114
3159
  TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
3115
3160
  TAG(U)
3116
3161
  })
3117
3162
  ) {
3118
- return adoption_agency_algorithm(parser, token, token->v.end_tag.tag);
3119
- } else if (
3163
+ if (!adoption_agency_algorithm(parser, token, token->v.end_tag.tag))
3164
+ goto any_other_end_tag;
3165
+ return true;
3166
+ }
3167
+ if (
3120
3168
  tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3121
3169
  ) {
3122
3170
  reconstruct_active_formatting_elements(parser);
@@ -3124,19 +3172,21 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3124
3172
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3125
3173
  set_frameset_not_ok(parser);
3126
3174
  return true;
3127
- } else if (
3175
+ }
3176
+ if (
3128
3177
  tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3129
3178
  ) {
3130
3179
  GumboTag token_tag = token->v.end_tag.tag;
3131
- if (!has_an_element_in_table_scope(parser, token_tag)) {
3180
+ if (!has_an_element_in_scope(parser, token_tag)) {
3132
3181
  parser_add_parse_error(parser, token);
3133
3182
  ignore_token(parser);
3134
3183
  return false;
3135
3184
  }
3136
- implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3185
+ bool success = implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3137
3186
  clear_active_formatting_elements(parser);
3138
- return true;
3139
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3187
+ return success;
3188
+ }
3189
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3140
3190
  if (
3141
3191
  get_document_node(parser)->v.document.doc_type_quirks_mode
3142
3192
  != GUMBO_DOCTYPE_QUIRKS
@@ -3147,74 +3197,88 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3147
3197
  set_frameset_not_ok(parser);
3148
3198
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3149
3199
  return true;
3150
- } else if (
3200
+ }
3201
+ if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3202
+ parser_add_parse_error(parser, token);
3203
+ reconstruct_active_formatting_elements(parser);
3204
+ insert_element_of_tag_type (
3205
+ parser,
3206
+ GUMBO_TAG_BR,
3207
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3208
+ );
3209
+ pop_current_node(parser);
3210
+ acknowledge_self_closing_tag(parser);
3211
+ set_frameset_not_ok(parser);
3212
+ return false;
3213
+ }
3214
+ if (
3151
3215
  tag_in(token, kStartTag, &(const TagSet) {
3152
3216
  TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
3153
3217
  TAG(WBR)
3154
3218
  })
3155
3219
  ) {
3156
- bool success = true;
3157
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3158
- success = false;
3220
+ bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
3221
+ if (is_image) {
3159
3222
  parser_add_parse_error(parser, token);
3160
3223
  token->v.start_tag.tag = GUMBO_TAG_IMG;
3161
3224
  }
3162
3225
  reconstruct_active_formatting_elements(parser);
3163
3226
  GumboNode* node = insert_element_from_token(parser, token);
3164
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3165
- success = false;
3166
- parser_add_parse_error(parser, token);
3167
- node->v.element.tag = GUMBO_TAG_IMG;
3227
+ if (is_image)
3168
3228
  node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
3169
- }
3170
3229
  pop_current_node(parser);
3171
3230
  acknowledge_self_closing_tag(parser);
3172
3231
  set_frameset_not_ok(parser);
3173
- return success;
3174
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3175
- if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
3176
- // Must be before the element is inserted, as that takes ownership of the
3177
- // token's attribute vector.
3178
- set_frameset_not_ok(parser);
3179
- }
3232
+ return !is_image;
3233
+ }
3234
+ if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3180
3235
  reconstruct_active_formatting_elements(parser);
3181
- insert_element_from_token(parser, token);
3236
+ GumboNode *input = insert_element_from_token(parser, token);
3182
3237
  pop_current_node(parser);
3183
3238
  acknowledge_self_closing_tag(parser);
3239
+ if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
3240
+ set_frameset_not_ok(parser);
3184
3241
  return true;
3185
- } else if (
3242
+ }
3243
+ if (
3186
3244
  tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
3187
3245
  ) {
3188
3246
  insert_element_from_token(parser, token);
3189
3247
  pop_current_node(parser);
3190
3248
  acknowledge_self_closing_tag(parser);
3191
3249
  return true;
3192
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3250
+ }
3251
+ if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3193
3252
  bool result = maybe_implicitly_close_p_tag(parser, token);
3194
3253
  insert_element_from_token(parser, token);
3195
3254
  pop_current_node(parser);
3196
3255
  acknowledge_self_closing_tag(parser);
3197
3256
  set_frameset_not_ok(parser);
3198
3257
  return result;
3199
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3258
+ }
3259
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3200
3260
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
3201
3261
  parser->_parser_state->_ignore_next_linefeed = true;
3202
3262
  set_frameset_not_ok(parser);
3203
3263
  return true;
3204
- } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3264
+ }
3265
+ if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3205
3266
  bool result = maybe_implicitly_close_p_tag(parser, token);
3206
3267
  reconstruct_active_formatting_elements(parser);
3207
3268
  set_frameset_not_ok(parser);
3208
3269
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3209
3270
  return result;
3210
- } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3271
+ }
3272
+ if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3211
3273
  set_frameset_not_ok(parser);
3212
3274
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3213
3275
  return true;
3214
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3276
+ }
3277
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3215
3278
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3216
3279
  return true;
3217
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3280
+ }
3281
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3218
3282
  reconstruct_active_formatting_elements(parser);
3219
3283
  insert_element_from_token(parser, token);
3220
3284
  set_frameset_not_ok(parser);
@@ -3231,8 +3295,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3231
3295
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
3232
3296
  }
3233
3297
  return true;
3234
- } else if (
3235
- tag_in(token, kStartTag, &(const TagSet){TAG(OPTION), TAG(OPTGROUP)})
3298
+ }
3299
+ if (
3300
+ tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
3236
3301
  ) {
3237
3302
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3238
3303
  pop_current_node(parser);
@@ -3240,40 +3305,34 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3240
3305
  reconstruct_active_formatting_elements(parser);
3241
3306
  insert_element_from_token(parser, token);
3242
3307
  return true;
3243
- } else if (
3244
- tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})
3245
- ) {
3308
+ }
3309
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
3246
3310
  bool success = true;
3247
- GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)})
3248
- ? GUMBO_TAG_RTC
3249
- : GUMBO_TAG_LAST
3250
- ;
3251
3311
  if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3252
- generate_implied_end_tags(parser, exception);
3312
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3313
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
3314
+ parser_add_parse_error(parser, token);
3315
+ success = false;
3316
+ }
3253
3317
  }
3254
- if (
3255
- !node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)
3256
- && !(
3257
- exception == GUMBO_TAG_LAST
3258
- || node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC)
3259
- )
3260
- ) {
3261
- parser_add_parse_error(parser, token);
3262
- success = false;
3318
+ insert_element_from_token(parser, token);
3319
+ return success;
3320
+ }
3321
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
3322
+ bool success = true;
3323
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3324
+ generate_implied_end_tags(parser, GUMBO_TAG_RTC);
3325
+ GumboNode* current = get_current_node(parser);
3326
+ if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
3327
+ !node_html_tag_is(current, GUMBO_TAG_RTC)) {
3328
+ parser_add_parse_error(parser, token);
3329
+ success = false;
3330
+ }
3263
3331
  }
3264
3332
  insert_element_from_token(parser, token);
3265
3333
  return success;
3266
- } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3267
- parser_add_parse_error(parser, token);
3268
- reconstruct_active_formatting_elements(parser);
3269
- insert_element_of_tag_type (
3270
- parser,
3271
- GUMBO_TAG_BR,
3272
- GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3273
- );
3274
- pop_current_node(parser);
3275
- return false;
3276
- } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3334
+ }
3335
+ if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3277
3336
  reconstruct_active_formatting_elements(parser);
3278
3337
  adjust_mathml_attributes(token);
3279
3338
  adjust_foreign_attributes(token);
@@ -3283,7 +3342,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3283
3342
  acknowledge_self_closing_tag(parser);
3284
3343
  }
3285
3344
  return true;
3286
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3345
+ }
3346
+ if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3287
3347
  reconstruct_active_formatting_elements(parser);
3288
3348
  adjust_svg_attributes(token);
3289
3349
  adjust_foreign_attributes(token);
@@ -3293,7 +3353,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3293
3353
  acknowledge_self_closing_tag(parser);
3294
3354
  }
3295
3355
  return true;
3296
- } else if (
3356
+ }
3357
+ if (
3297
3358
  tag_in(token, kStartTag, &(const TagSet) {
3298
3359
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
3299
3360
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3302,48 +3363,49 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3302
3363
  parser_add_parse_error(parser, token);
3303
3364
  ignore_token(parser);
3304
3365
  return false;
3305
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3366
+ }
3367
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3306
3368
  reconstruct_active_formatting_elements(parser);
3307
3369
  insert_element_from_token(parser, token);
3308
3370
  return true;
3309
- } else {
3310
- assert(token->type == GUMBO_TOKEN_END_TAG);
3311
- GumboTag end_tag = token->v.end_tag.tag;
3312
- const char *end_tagname = token->v.end_tag.name;
3313
- assert(state->_open_elements.length > 0);
3314
- assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3315
- // Walk up the stack of open elements until we find one that either:
3316
- // a) Matches the tag name we saw
3317
- // b) Is in the "special" category.
3318
- // If we see a), implicitly close everything up to and including it. If we
3319
- // see b), then record a parse error, don't close anything (except the
3320
- // implied end tags) and ignore the end tag token.
3321
- for (int i = state->_open_elements.length; --i >= 0;) {
3322
- const GumboNode* node = state->_open_elements.data[i];
3323
- if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
3324
- generate_implied_end_tags(parser, end_tag);
3325
- // TODO(jdtang): Do I need to add a parse error here? The condition in
3326
- // the spec seems like it's the inverse of the loop condition above, and
3327
- // so would never fire.
3328
- // sfc: Yes, an error is needed here.
3329
- // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
3330
- // foo is the "current node" but sarcasm is node.
3331
- // XXX: Write a test for this.
3332
- if (node != get_current_node(parser))
3333
- parser_add_parse_error(parser, token);
3334
- while (node != pop_current_node(parser))
3335
- ; // Pop everything.
3336
- return true;
3337
- } else if (is_special_node(node)) {
3371
+ }
3372
+ any_other_end_tag:
3373
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3374
+ GumboTag end_tag = token->v.end_tag.tag;
3375
+ const char *end_tagname = token->v.end_tag.name;
3376
+ assert(state->_open_elements.length > 0);
3377
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3378
+ // Walk up the stack of open elements until we find one that either:
3379
+ // a) Matches the tag name we saw
3380
+ // b) Is in the "special" category.
3381
+ // If we see a), implicitly close everything up to and including it. If we
3382
+ // see b), then record a parse error, don't close anything (except the
3383
+ // implied end tags) and ignore the end tag token.
3384
+ for (int i = state->_open_elements.length; --i >= 0;) {
3385
+ const GumboNode* node = state->_open_elements.data[i];
3386
+ if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
3387
+ generate_implied_end_tags(parser, end_tag);
3388
+ // TODO(jdtang): Do I need to add a parse error here? The condition in
3389
+ // the spec seems like it's the inverse of the loop condition above, and
3390
+ // so would never fire.
3391
+ // sfc: Yes, an error is needed here.
3392
+ // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
3393
+ // foo is the "current node" but sarcasm is node.
3394
+ // XXX: Write a test for this.
3395
+ if (node != get_current_node(parser))
3338
3396
  parser_add_parse_error(parser, token);
3339
- ignore_token(parser);
3340
- return false;
3341
- }
3397
+ while (node != pop_current_node(parser))
3398
+ ; // Pop everything.
3399
+ return true;
3400
+ } else if (is_special_node(node)) {
3401
+ parser_add_parse_error(parser, token);
3402
+ ignore_token(parser);
3403
+ return false;
3342
3404
  }
3343
- // <html> is in the special category, so we should never get here.
3344
- assert(0);
3345
- return false;
3346
3405
  }
3406
+ // <html> is in the special category, so we should never get here.
3407
+ assert(0);
3408
+ return false;
3347
3409
  }
3348
3410
 
3349
3411
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
@@ -3353,30 +3415,36 @@ static bool handle_text(GumboParser* parser, GumboToken* token) {
3353
3415
  || token->type == GUMBO_TOKEN_WHITESPACE
3354
3416
  ) {
3355
3417
  insert_text_token(parser, token);
3356
- } else {
3357
- // We provide only bare-bones script handling that doesn't involve any of
3358
- // the parser-pause/already-started/script-nesting flags or re-entrant
3359
- // invocations of the tokenizer. Because the intended usage of this library
3360
- // is mostly for templating, refactoring, and static-analysis libraries, we
3361
- // provide the script body as a text-node child of the <script> element.
3362
- // This behavior doesn't support document.write of partial HTML elements,
3363
- // but should be adequate for almost all other scripting support.
3364
- if (token->type == GUMBO_TOKEN_EOF) {
3365
- parser_add_parse_error(parser, token);
3366
- parser->_parser_state->_reprocess_current_token = true;
3367
- }
3368
- pop_current_node(parser);
3369
- set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3418
+ return true;
3370
3419
  }
3371
- return true;
3420
+ // We provide only bare-bones script handling that doesn't involve any of
3421
+ // the parser-pause/already-started/script-nesting flags or re-entrant
3422
+ // invocations of the tokenizer. Because the intended usage of this library
3423
+ // is mostly for templating, refactoring, and static-analysis libraries, we
3424
+ // provide the script body as a text-node child of the <script> element.
3425
+ // This behavior doesn't support document.write of partial HTML elements,
3426
+ // but should be adequate for almost all other scripting support.
3427
+ bool success = true;
3428
+ if (token->type == GUMBO_TOKEN_EOF) {
3429
+ parser_add_parse_error(parser, token);
3430
+ success = false;
3431
+ parser->_parser_state->_reprocess_current_token = true;
3432
+ }
3433
+ pop_current_node(parser);
3434
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3435
+ return success;
3372
3436
  }
3373
3437
 
3374
3438
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
3375
3439
  static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3376
3440
  GumboParserState* state = parser->_parser_state;
3377
3441
  if (
3378
- token->type == GUMBO_TOKEN_CHARACTER
3379
- || token->type == GUMBO_TOKEN_WHITESPACE
3442
+ (token->type == GUMBO_TOKEN_CHARACTER
3443
+ || token->type == GUMBO_TOKEN_WHITESPACE
3444
+ || token->type == GUMBO_TOKEN_NULL)
3445
+ && node_tag_in_set(get_current_node(parser), &(const TagSet) {
3446
+ TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3447
+ })
3380
3448
  ) {
3381
3449
  // The "pending table character tokens" list described in the spec is
3382
3450
  // nothing more than the TextNodeBufferState. We accumulate text tokens as
@@ -3384,71 +3452,87 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3384
3452
  // we set _foster_parent_insertions if there're non-whitespace characters in
3385
3453
  // the buffer.
3386
3454
  assert(state->_text_node._buffer.length == 0);
3455
+ assert(state->_table_character_tokens.length == 0);
3387
3456
  state->_original_insertion_mode = state->_insertion_mode;
3388
3457
  state->_reprocess_current_token = true;
3389
3458
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3390
3459
  return true;
3391
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3460
+ }
3461
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3462
+ append_comment_node(parser, get_current_node(parser), token);
3463
+ return true;
3464
+ }
3465
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3392
3466
  parser_add_parse_error(parser, token);
3393
3467
  ignore_token(parser);
3394
3468
  return false;
3395
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3396
- append_comment_node(parser, get_current_node(parser), token);
3397
- return true;
3398
- } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3469
+ }
3470
+ if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3399
3471
  clear_stack_to_table_context(parser);
3400
3472
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3401
3473
  insert_element_from_token(parser, token);
3402
3474
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3403
3475
  return true;
3404
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3476
+ }
3477
+ if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3405
3478
  clear_stack_to_table_context(parser);
3406
3479
  insert_element_from_token(parser, token);
3407
3480
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3408
3481
  return true;
3409
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3482
+ }
3483
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3410
3484
  clear_stack_to_table_context(parser);
3411
3485
  insert_element_of_tag_type (
3412
3486
  parser,
3413
3487
  GUMBO_TAG_COLGROUP,
3414
3488
  GUMBO_INSERTION_IMPLIED
3415
3489
  );
3416
- parser->_parser_state->_reprocess_current_token = true;
3490
+ state->_reprocess_current_token = true;
3417
3491
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3418
3492
  return true;
3419
- } else if (
3493
+ }
3494
+ if (
3420
3495
  tag_in(token, kStartTag, &(const TagSet) {
3421
- TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), TAG(TH), TAG(TR)
3496
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3422
3497
  })
3423
3498
  ) {
3424
3499
  clear_stack_to_table_context(parser);
3500
+ insert_element_from_token(parser, token);
3425
3501
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3426
- if (tag_in(token, kStartTag, &(const TagSet){TAG(TD), TAG(TH), TAG(TR)})) {
3427
- insert_element_of_tag_type (
3428
- parser,
3429
- GUMBO_TAG_TBODY,
3430
- GUMBO_INSERTION_IMPLIED
3431
- );
3432
- state->_reprocess_current_token = true;
3433
- } else {
3434
- insert_element_from_token(parser, token);
3435
- }
3436
3502
  return true;
3437
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3503
+ }
3504
+ if (
3505
+ tag_in(token, kStartTag, &(const TagSet) {
3506
+ TAG(TD), TAG(TH), TAG(TR)
3507
+ })
3508
+ ) {
3509
+ clear_stack_to_table_context(parser);
3510
+ insert_element_of_tag_type (
3511
+ parser,
3512
+ GUMBO_TAG_TBODY,
3513
+ GUMBO_INSERTION_IMPLIED
3514
+ );
3515
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3516
+ state->_reprocess_current_token = true;
3517
+ return true;
3518
+ }
3519
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3438
3520
  parser_add_parse_error(parser, token);
3439
3521
  if (close_table(parser)) {
3440
- parser->_parser_state->_reprocess_current_token = true;
3522
+ state->_reprocess_current_token = true;
3441
3523
  } else {
3442
3524
  ignore_token(parser);
3443
3525
  }
3444
3526
  return false;
3445
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3527
+ }
3528
+ if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3446
3529
  if (!close_table(parser)) {
3447
3530
  parser_add_parse_error(parser, token);
3448
3531
  return false;
3449
3532
  }
3450
3533
  return true;
3451
- } else if (
3534
+ }
3535
+ if (
3452
3536
  tag_in(token, kEndTag, &(const TagSet) {
3453
3537
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3454
3538
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3457,20 +3541,24 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3457
3541
  parser_add_parse_error(parser, token);
3458
3542
  ignore_token(parser);
3459
3543
  return false;
3460
- } else if (
3544
+ }
3545
+ if (
3461
3546
  tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
3462
3547
  || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
3463
3548
  ) {
3464
3549
  return handle_in_head(parser, token);
3465
- } else if (
3550
+ }
3551
+ if (
3466
3552
  tag_is(token, kStartTag, GUMBO_TAG_INPUT)
3467
3553
  && attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
3468
3554
  ) {
3469
3555
  parser_add_parse_error(parser, token);
3470
3556
  insert_element_from_token(parser, token);
3471
3557
  pop_current_node(parser);
3558
+ acknowledge_self_closing_tag(parser);
3472
3559
  return false;
3473
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3560
+ }
3561
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3474
3562
  parser_add_parse_error(parser, token);
3475
3563
  if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3476
3564
  ignore_token(parser);
@@ -3479,15 +3567,16 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3479
3567
  state->_form_element = insert_element_from_token(parser, token);
3480
3568
  pop_current_node(parser);
3481
3569
  return false;
3482
- } else if (token->type == GUMBO_TOKEN_EOF) {
3570
+ }
3571
+ if (token->type == GUMBO_TOKEN_EOF) {
3483
3572
  return handle_in_body(parser, token);
3484
- } else {
3485
- parser_add_parse_error(parser, token);
3486
- state->_foster_parent_insertions = true;
3487
- bool result = handle_in_body(parser, token);
3488
- state->_foster_parent_insertions = false;
3489
- return result;
3490
3573
  }
3574
+ // foster-parenting-start-tag or foster-parenting-end-tag error
3575
+ parser_add_parse_error(parser, token);
3576
+ state->_foster_parent_insertions = true;
3577
+ bool result = handle_in_body(parser, token);
3578
+ state->_foster_parent_insertions = false;
3579
+ return result;
3491
3580
  }
3492
3581
 
3493
3582
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
@@ -3496,40 +3585,38 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3496
3585
  parser_add_parse_error(parser, token);
3497
3586
  ignore_token(parser);
3498
3587
  return false;
3499
- } else if (
3500
- token->type == GUMBO_TOKEN_CHARACTER
3501
- || token->type == GUMBO_TOKEN_WHITESPACE
3502
- ) {
3588
+ }
3589
+ GumboParserState* state = parser->_parser_state;
3590
+ // Non-whitespace tokens will cause parse errors later.
3591
+ // It's not entirely clear from the spec how this is supposed to work.
3592
+ // https://github.com/whatwg/html/issues/4046
3593
+ if (token->type == GUMBO_TOKEN_WHITESPACE
3594
+ || token->type == GUMBO_TOKEN_CHARACTER) {
3503
3595
  insert_text_token(parser, token);
3596
+ gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
3504
3597
  return true;
3505
- } else {
3506
- GumboParserState* state = parser->_parser_state;
3507
- GumboStringBuffer* buffer = &state->_text_node._buffer;
3508
- const char* data = buffer->data;
3509
- // Note that TextNodeBuffer may contain UTF-8 characters, but the
3510
- // presence of any one byte that is not whitespace means we flip
3511
- // the flag, so this loop is still valid.
3598
+ }
3599
+
3600
+ GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
3601
+ if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
3602
+ // Each character in buffer is an error. Unfortunately, that means we need
3603
+ // to emit a bunch of errors at the appropriate locations.
3512
3604
  for (size_t i = 0, n = buffer->length; i < n; ++i) {
3513
- switch (data[i]) {
3514
- case '\t':
3515
- case '\n':
3516
- case '\f':
3517
- case '\r':
3518
- case ' ':
3519
- continue;
3520
- default:
3521
- state->_foster_parent_insertions = true;
3522
- reconstruct_active_formatting_elements(parser);
3523
- goto loopbreak;
3524
- }
3605
+ GumboToken tok;
3606
+ gumbo_character_token_buffer_get(buffer, i, &tok);
3607
+ // foster-parenting-character error
3608
+ parser_add_parse_error(parser, &tok);
3525
3609
  }
3526
- loopbreak:
3527
- maybe_flush_text_node_buffer(parser);
3528
- state->_foster_parent_insertions = false;
3529
- state->_reprocess_current_token = true;
3530
- state->_insertion_mode = state->_original_insertion_mode;
3531
- return true;
3610
+ state->_foster_parent_insertions = true;
3611
+ set_frameset_not_ok(parser);
3612
+ reconstruct_active_formatting_elements(parser);
3532
3613
  }
3614
+ maybe_flush_text_node_buffer(parser);
3615
+ gumbo_character_token_buffer_clear(buffer);
3616
+ state->_foster_parent_insertions = false;
3617
+ state->_reprocess_current_token = true;
3618
+ state->_insertion_mode = state->_original_insertion_mode;
3619
+ return true;
3533
3620
  }
3534
3621
 
3535
3622
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
@@ -3539,19 +3626,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3539
3626
  parser_add_parse_error(parser, token);
3540
3627
  ignore_token(parser);
3541
3628
  return false;
3542
- } else {
3543
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3544
- bool result = true;
3545
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3546
- parser_add_parse_error(parser, token);
3547
- }
3548
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3549
- ;
3550
- clear_active_formatting_elements(parser);
3551
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3552
- return result;
3553
3629
  }
3554
- } else if (
3630
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3631
+ bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
3632
+ if (!result)
3633
+ parser_add_parse_error(parser, token);
3634
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3635
+ ;
3636
+ clear_active_formatting_elements(parser);
3637
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3638
+ return result;
3639
+ }
3640
+ if (
3555
3641
  tag_in(token, kStartTag, &(const TagSet) {
3556
3642
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3557
3643
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3563,13 +3649,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3563
3649
  ignore_token(parser);
3564
3650
  return false;
3565
3651
  }
3652
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3653
+ bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
3654
+ if (!result)
3655
+ parser_add_parse_error(parser, token);
3566
3656
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3567
3657
  ;
3568
3658
  clear_active_formatting_elements(parser);
3569
3659
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3570
3660
  parser->_parser_state->_reprocess_current_token = true;
3571
- return true;
3572
- } else if (
3661
+ return result;
3662
+ }
3663
+ if (
3573
3664
  tag_in(token, kEndTag, &(const TagSet) {
3574
3665
  TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
3575
3666
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3578,9 +3669,8 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3578
3669
  parser_add_parse_error(parser, token);
3579
3670
  ignore_token(parser);
3580
3671
  return false;
3581
- } else {
3582
- return handle_in_body(parser, token);
3583
3672
  }
3673
+ return handle_in_body(parser, token);
3584
3674
  }
3585
3675
 
3586
3676
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
@@ -3588,21 +3678,26 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3588
3678
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
3589
3679
  insert_text_token(parser, token);
3590
3680
  return true;
3591
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3681
+ }
3682
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3683
+ append_comment_node(parser, get_current_node(parser), token);
3684
+ return true;
3685
+ }
3686
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3592
3687
  parser_add_parse_error(parser, token);
3593
3688
  ignore_token(parser);
3594
3689
  return false;
3595
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3596
- append_comment_node(parser, get_current_node(parser), token);
3597
- return true;
3598
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3690
+ }
3691
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3599
3692
  return handle_in_body(parser, token);
3600
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3693
+ }
3694
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3601
3695
  insert_element_from_token(parser, token);
3602
3696
  pop_current_node(parser);
3603
3697
  acknowledge_self_closing_tag(parser);
3604
3698
  return true;
3605
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3699
+ }
3700
+ if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3606
3701
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3607
3702
  parser_add_parse_error(parser, token);
3608
3703
  ignore_token(parser);
@@ -3611,28 +3706,30 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3611
3706
  pop_current_node(parser);
3612
3707
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3613
3708
  return false;
3614
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3709
+ }
3710
+ if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3615
3711
  parser_add_parse_error(parser, token);
3616
3712
  ignore_token(parser);
3617
3713
  return false;
3618
- } else if (
3714
+ }
3715
+ if (
3619
3716
  tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
3620
3717
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3621
3718
  ) {
3622
3719
  return handle_in_head(parser, token);
3623
- } else if (token->type == GUMBO_TOKEN_EOF) {
3720
+ }
3721
+ if (token->type == GUMBO_TOKEN_EOF) {
3624
3722
  return handle_in_body(parser, token);
3625
- } else {
3626
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3627
- parser_add_parse_error(parser, token);
3628
- ignore_token(parser);
3629
- return false;
3630
- }
3631
- pop_current_node(parser);
3632
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3633
- parser->_parser_state->_reprocess_current_token = true;
3634
- return true;
3635
3723
  }
3724
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3725
+ parser_add_parse_error(parser, token);
3726
+ ignore_token(parser);
3727
+ return false;
3728
+ }
3729
+ pop_current_node(parser);
3730
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3731
+ parser->_parser_state->_reprocess_current_token = true;
3732
+ return true;
3636
3733
  }
3637
3734
 
3638
3735
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
@@ -3642,14 +3739,16 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3642
3739
  insert_element_from_token(parser, token);
3643
3740
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3644
3741
  return true;
3645
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
3742
+ }
3743
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3646
3744
  parser_add_parse_error(parser, token);
3647
3745
  clear_stack_to_table_body_context(parser);
3648
3746
  insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3649
- parser->_parser_state->_reprocess_current_token = true;
3650
3747
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3748
+ parser->_parser_state->_reprocess_current_token = true;
3651
3749
  return false;
3652
- } else if (
3750
+ }
3751
+ if (
3653
3752
  tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3654
3753
  ) {
3655
3754
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
@@ -3661,7 +3760,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3661
3760
  pop_current_node(parser);
3662
3761
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3663
3762
  return true;
3664
- } else if (
3763
+ }
3764
+ if (
3665
3765
  tag_in(token, kStartTag, &(const TagSet) {
3666
3766
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3667
3767
  TAG(THEAD)
@@ -3684,18 +3784,18 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3684
3784
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3685
3785
  parser->_parser_state->_reprocess_current_token = true;
3686
3786
  return true;
3687
- } else if (
3787
+ }
3788
+ if (
3688
3789
  tag_in(token, kEndTag, &(const TagSet) {
3689
- TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), TAG(COLGROUP),
3690
- TAG(HTML), TAG(TD), TAG(TH)
3790
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
3791
+ TAG(TH), TAG(TR)
3691
3792
  })
3692
3793
  ) {
3693
3794
  parser_add_parse_error(parser, token);
3694
3795
  ignore_token(parser);
3695
3796
  return false;
3696
- } else {
3697
- return handle_in_table(parser, token);
3698
3797
  }
3798
+ return handle_in_table(parser, token);
3699
3799
  }
3700
3800
 
3701
3801
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
@@ -3706,18 +3806,19 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3706
3806
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3707
3807
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3708
3808
  return true;
3709
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3809
+ }
3810
+ if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3710
3811
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3711
3812
  parser_add_parse_error(parser, token);
3712
3813
  ignore_token(parser);
3713
3814
  return false;
3714
- } else {
3715
- clear_stack_to_table_row_context(parser);
3716
- pop_current_node(parser);
3717
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3718
- return true;
3719
3815
  }
3720
- } else if (
3816
+ clear_stack_to_table_row_context(parser);
3817
+ pop_current_node(parser);
3818
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3819
+ return true;
3820
+ }
3821
+ if (
3721
3822
  tag_in(token, kStartTag, &(const TagSet) {
3722
3823
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3723
3824
  TAG(THEAD), TAG(TR)
@@ -3728,31 +3829,32 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3728
3829
  parser_add_parse_error(parser, token);
3729
3830
  ignore_token(parser);
3730
3831
  return false;
3731
- } else {
3732
- clear_stack_to_table_row_context(parser);
3733
- pop_current_node(parser);
3734
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3735
- parser->_parser_state->_reprocess_current_token = true;
3736
- return true;
3737
3832
  }
3738
- } else if (
3833
+ clear_stack_to_table_row_context(parser);
3834
+ pop_current_node(parser);
3835
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3836
+ parser->_parser_state->_reprocess_current_token = true;
3837
+ return true;
3838
+ }
3839
+ if (
3739
3840
  tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3740
3841
  ) {
3741
- if (
3742
- !has_an_element_in_table_scope(parser, token->v.end_tag.tag)
3743
- || !has_an_element_in_table_scope(parser, GUMBO_TAG_TR)
3744
- ) {
3842
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3745
3843
  parser_add_parse_error(parser, token);
3746
3844
  ignore_token(parser);
3747
3845
  return false;
3748
- } else {
3749
- clear_stack_to_table_row_context(parser);
3750
- pop_current_node(parser);
3751
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3752
- parser->_parser_state->_reprocess_current_token = true;
3846
+ }
3847
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3848
+ ignore_token(parser);
3753
3849
  return true;
3754
3850
  }
3755
- } else if (
3851
+ clear_stack_to_table_row_context(parser);
3852
+ pop_current_node(parser);
3853
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3854
+ parser->_parser_state->_reprocess_current_token = true;
3855
+ return true;
3856
+ }
3857
+ if (
3756
3858
  tag_in(token, kEndTag, &(const TagSet) {
3757
3859
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3758
3860
  TAG(TD), TAG(TH)
@@ -3761,9 +3863,8 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3761
3863
  parser_add_parse_error(parser, token);
3762
3864
  ignore_token(parser);
3763
3865
  return false;
3764
- } else {
3765
- return handle_in_table(parser, token);
3766
3866
  }
3867
+ return handle_in_table(parser, token);
3767
3868
  }
3768
3869
 
3769
3870
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
@@ -3776,7 +3877,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3776
3877
  return false;
3777
3878
  }
3778
3879
  return close_table_cell(parser, token, token_tag);
3779
- } else if (
3880
+ }
3881
+ if (
3780
3882
  tag_in(token, kStartTag, &(const TagSet) {
3781
3883
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3782
3884
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3794,7 +3896,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3794
3896
  }
3795
3897
  parser->_parser_state->_reprocess_current_token = true;
3796
3898
  return close_current_cell(parser, token);
3797
- } else if (
3899
+ }
3900
+ if (
3798
3901
  tag_in(token, kEndTag, &(const TagSet) {
3799
3902
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
3800
3903
  })
@@ -3802,7 +3905,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3802
3905
  parser_add_parse_error(parser, token);
3803
3906
  ignore_token(parser);
3804
3907
  return false;
3805
- } else if (
3908
+ }
3909
+ if (
3806
3910
  tag_in(token, kEndTag, &(const TagSet) {
3807
3911
  TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3808
3912
  })
@@ -3814,9 +3918,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3814
3918
  }
3815
3919
  parser->_parser_state->_reprocess_current_token = true;
3816
3920
  return close_current_cell(parser, token);
3817
- } else {
3818
- return handle_in_body(parser, token);
3819
3921
  }
3922
+ return handle_in_body(parser, token);
3820
3923
  }
3821
3924
 
3822
3925
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
@@ -3825,28 +3928,34 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3825
3928
  parser_add_parse_error(parser, token);
3826
3929
  ignore_token(parser);
3827
3930
  return false;
3828
- } else if (
3931
+ }
3932
+ if (
3829
3933
  token->type == GUMBO_TOKEN_CHARACTER
3830
3934
  || token->type == GUMBO_TOKEN_WHITESPACE
3831
3935
  ) {
3832
3936
  insert_text_token(parser, token);
3833
3937
  return true;
3834
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3938
+ }
3939
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3940
+ append_comment_node(parser, get_current_node(parser), token);
3941
+ return true;
3942
+ }
3943
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3835
3944
  parser_add_parse_error(parser, token);
3836
3945
  ignore_token(parser);
3837
3946
  return false;
3838
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3839
- append_comment_node(parser, get_current_node(parser), token);
3840
- return true;
3841
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3947
+ }
3948
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3842
3949
  return handle_in_body(parser, token);
3843
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3950
+ }
3951
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3844
3952
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3845
3953
  pop_current_node(parser);
3846
3954
  }
3847
3955
  insert_element_from_token(parser, token);
3848
3956
  return true;
3849
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3957
+ }
3958
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3850
3959
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3851
3960
  pop_current_node(parser);
3852
3961
  }
@@ -3855,7 +3964,8 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3855
3964
  }
3856
3965
  insert_element_from_token(parser, token);
3857
3966
  return true;
3858
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3967
+ }
3968
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3859
3969
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
3860
3970
  if (
3861
3971
  node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
@@ -3869,21 +3979,21 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3869
3979
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3870
3980
  pop_current_node(parser);
3871
3981
  return true;
3872
- } else {
3873
- parser_add_parse_error(parser, token);
3874
- ignore_token(parser);
3875
- return false;
3876
3982
  }
3877
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3983
+ parser_add_parse_error(parser, token);
3984
+ ignore_token(parser);
3985
+ return false;
3986
+ }
3987
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3878
3988
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3879
3989
  pop_current_node(parser);
3880
3990
  return true;
3881
- } else {
3882
- parser_add_parse_error(parser, token);
3883
- ignore_token(parser);
3884
- return false;
3885
3991
  }
3886
- } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3992
+ parser_add_parse_error(parser, token);
3993
+ ignore_token(parser);
3994
+ return false;
3995
+ }
3996
+ if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3887
3997
  if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3888
3998
  parser_add_parse_error(parser, token);
3889
3999
  ignore_token(parser);
@@ -3891,14 +4001,16 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3891
4001
  }
3892
4002
  close_current_select(parser);
3893
4003
  return true;
3894
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
4004
+ }
4005
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3895
4006
  parser_add_parse_error(parser, token);
3896
4007
  ignore_token(parser);
3897
4008
  if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3898
4009
  close_current_select(parser);
3899
4010
  }
3900
4011
  return false;
3901
- } else if (
4012
+ }
4013
+ if (
3902
4014
  tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
3903
4015
  ) {
3904
4016
  parser_add_parse_error(parser, token);
@@ -3909,18 +4021,18 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3909
4021
  parser->_parser_state->_reprocess_current_token = true;
3910
4022
  }
3911
4023
  return false;
3912
- } else if (
4024
+ }
4025
+ if (
3913
4026
  tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
3914
4027
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3915
4028
  ) {
3916
4029
  return handle_in_head(parser, token);
3917
- } else if (token->type == GUMBO_TOKEN_EOF) {
3918
- return handle_in_body(parser, token);
3919
- } else {
3920
- parser_add_parse_error(parser, token);
3921
- ignore_token(parser);
3922
- return false;
3923
4030
  }
4031
+ if (token->type == GUMBO_TOKEN_EOF)
4032
+ return handle_in_body(parser, token);
4033
+ parser_add_parse_error(parser, token);
4034
+ ignore_token(parser);
4035
+ return false;
3924
4036
  }
3925
4037
 
3926
4038
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
@@ -3934,22 +4046,18 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3934
4046
  close_current_select(parser);
3935
4047
  parser->_parser_state->_reprocess_current_token = true;
3936
4048
  return false;
3937
- } else if (tag_in(token, kEndTag, &tags)) {
4049
+ }
4050
+ if (tag_in(token, kEndTag, &tags)) {
3938
4051
  parser_add_parse_error(parser, token);
3939
4052
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3940
4053
  ignore_token(parser);
3941
4054
  return false;
3942
- } else {
3943
- close_current_select(parser);
3944
- // close_current_select already does the
3945
- // reset_insertion_mode_appropriately
3946
- // reset_insertion_mode_appropriately(parser);
3947
- parser->_parser_state->_reprocess_current_token = true;
3948
- return false;
3949
4055
  }
3950
- } else {
3951
- return handle_in_select(parser, token);
4056
+ close_current_select(parser);
4057
+ parser->_parser_state->_reprocess_current_token = true;
4058
+ return false;
3952
4059
  }
4060
+ return handle_in_select(parser, token);
3953
4061
  }
3954
4062
 
3955
4063
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
@@ -3973,7 +4081,8 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3973
4081
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3974
4082
  ) {
3975
4083
  return handle_in_head(parser, token);
3976
- } else if (
4084
+ }
4085
+ if (
3977
4086
  tag_in(token, kStartTag, &(const TagSet) {
3978
4087
  TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3979
4088
  })
@@ -3983,35 +4092,41 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3983
4092
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3984
4093
  state->_reprocess_current_token = true;
3985
4094
  return true;
3986
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
4095
+ }
4096
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3987
4097
  pop_template_insertion_mode(parser);
3988
4098
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3989
4099
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3990
4100
  state->_reprocess_current_token = true;
3991
4101
  return true;
3992
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
4102
+ }
4103
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3993
4104
  pop_template_insertion_mode(parser);
3994
4105
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3995
4106
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3996
4107
  state->_reprocess_current_token = true;
3997
4108
  return true;
3998
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
4109
+ }
4110
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3999
4111
  pop_template_insertion_mode(parser);
4000
4112
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4001
4113
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4002
4114
  state->_reprocess_current_token = true;
4003
4115
  return true;
4004
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
4116
+ }
4117
+ if (token->type == GUMBO_TOKEN_START_TAG) {
4005
4118
  pop_template_insertion_mode(parser);
4006
4119
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4007
4120
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4008
4121
  state->_reprocess_current_token = true;
4009
4122
  return true;
4010
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
4123
+ }
4124
+ if (token->type == GUMBO_TOKEN_END_TAG) {
4011
4125
  parser_add_parse_error(parser, token);
4012
4126
  ignore_token(parser);
4013
4127
  return false;
4014
- } else if (token->type == GUMBO_TOKEN_EOF) {
4128
+ }
4129
+ if (token->type == GUMBO_TOKEN_EOF) {
4015
4130
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
4016
4131
  // Stop parsing.
4017
4132
  return true;
@@ -4024,10 +4139,9 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
4024
4139
  reset_insertion_mode_appropriately(parser);
4025
4140
  state->_reprocess_current_token = true;
4026
4141
  return false;
4027
- } else {
4028
- assert(0);
4029
- return false;
4030
4142
  }
4143
+ assert(0 && "unreachable");
4144
+ return false;
4031
4145
  }
4032
4146
 
4033
4147
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
@@ -4037,16 +4151,22 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4037
4151
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4038
4152
  ) {
4039
4153
  return handle_in_body(parser, token);
4040
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4154
+ }
4155
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4041
4156
  GumboNode* html_node = parser->_output->root;
4042
4157
  assert(html_node != NULL);
4043
4158
  append_comment_node(parser, html_node, token);
4044
4159
  return true;
4045
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4160
+ }
4161
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4046
4162
  parser_add_parse_error(parser, token);
4047
4163
  ignore_token(parser);
4048
4164
  return false;
4049
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4165
+ }
4166
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4167
+ return handle_in_body(parser, token);
4168
+ }
4169
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4050
4170
  /* fragment case: ignore the closing HTML token */
4051
4171
  if (is_fragment_parser(parser)) {
4052
4172
  parser_add_parse_error(parser, token);
@@ -4061,14 +4181,14 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4061
4181
  &html->v.element
4062
4182
  );
4063
4183
  return true;
4064
- } else if (token->type == GUMBO_TOKEN_EOF) {
4184
+ }
4185
+ if (token->type == GUMBO_TOKEN_EOF) {
4065
4186
  return true;
4066
- } else {
4067
- parser_add_parse_error(parser, token);
4068
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4069
- parser->_parser_state->_reprocess_current_token = true;
4070
- return false;
4071
4187
  }
4188
+ parser_add_parse_error(parser, token);
4189
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4190
+ parser->_parser_state->_reprocess_current_token = true;
4191
+ return false;
4072
4192
  }
4073
4193
 
4074
4194
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
@@ -4076,19 +4196,24 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4076
4196
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4077
4197
  insert_text_token(parser, token);
4078
4198
  return true;
4079
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4199
+ }
4200
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4080
4201
  append_comment_node(parser, get_current_node(parser), token);
4081
4202
  return true;
4082
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4203
+ }
4204
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4083
4205
  parser_add_parse_error(parser, token);
4084
4206
  ignore_token(parser);
4085
4207
  return false;
4086
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4208
+ }
4209
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4087
4210
  return handle_in_body(parser, token);
4088
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4211
+ }
4212
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4089
4213
  insert_element_from_token(parser, token);
4090
4214
  return true;
4091
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4215
+ }
4216
+ if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4092
4217
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4093
4218
  parser_add_parse_error(parser, token);
4094
4219
  ignore_token(parser);
@@ -4102,24 +4227,26 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4102
4227
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
4103
4228
  }
4104
4229
  return true;
4105
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4230
+ }
4231
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4106
4232
  insert_element_from_token(parser, token);
4107
4233
  pop_current_node(parser);
4108
4234
  acknowledge_self_closing_tag(parser);
4109
4235
  return true;
4110
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4236
+ }
4237
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4111
4238
  return handle_in_head(parser, token);
4112
- } else if (token->type == GUMBO_TOKEN_EOF) {
4239
+ }
4240
+ if (token->type == GUMBO_TOKEN_EOF) {
4113
4241
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4114
4242
  parser_add_parse_error(parser, token);
4115
4243
  return false;
4116
4244
  }
4117
4245
  return true;
4118
- } else {
4119
- parser_add_parse_error(parser, token);
4120
- ignore_token(parser);
4121
- return false;
4122
4246
  }
4247
+ parser_add_parse_error(parser, token);
4248
+ ignore_token(parser);
4249
+ return false;
4123
4250
  }
4124
4251
 
4125
4252
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
@@ -4127,16 +4254,20 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4127
4254
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4128
4255
  insert_text_token(parser, token);
4129
4256
  return true;
4130
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4257
+ }
4258
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4131
4259
  append_comment_node(parser, get_current_node(parser), token);
4132
4260
  return true;
4133
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4261
+ }
4262
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4134
4263
  parser_add_parse_error(parser, token);
4135
4264
  ignore_token(parser);
4136
4265
  return false;
4137
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4266
+ }
4267
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4138
4268
  return handle_in_body(parser, token);
4139
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4269
+ }
4270
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4140
4271
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
4141
4272
  assert(node_html_tag_is(html, GUMBO_TAG_HTML));
4142
4273
  record_end_of_element (
@@ -4145,15 +4276,16 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4145
4276
  );
4146
4277
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
4147
4278
  return true;
4148
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4279
+ }
4280
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4149
4281
  return handle_in_head(parser, token);
4150
- } else if (token->type == GUMBO_TOKEN_EOF) {
4282
+ }
4283
+ if (token->type == GUMBO_TOKEN_EOF) {
4151
4284
  return true;
4152
- } else {
4153
- parser_add_parse_error(parser, token);
4154
- ignore_token(parser);
4155
- return false;
4156
4285
  }
4286
+ parser_add_parse_error(parser, token);
4287
+ ignore_token(parser);
4288
+ return false;
4157
4289
  }
4158
4290
 
4159
4291
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
@@ -4161,20 +4293,21 @@ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
4161
4293
  if (token->type == GUMBO_TOKEN_COMMENT) {
4162
4294
  append_comment_node(parser, get_document_node(parser), token);
4163
4295
  return true;
4164
- } else if (
4296
+ }
4297
+ if (
4165
4298
  token->type == GUMBO_TOKEN_DOCTYPE
4166
4299
  || token->type == GUMBO_TOKEN_WHITESPACE
4167
4300
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4168
4301
  ) {
4169
4302
  return handle_in_body(parser, token);
4170
- } else if (token->type == GUMBO_TOKEN_EOF) {
4303
+ }
4304
+ if (token->type == GUMBO_TOKEN_EOF) {
4171
4305
  return true;
4172
- } else {
4173
- parser_add_parse_error(parser, token);
4174
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4175
- parser->_parser_state->_reprocess_current_token = true;
4176
- return false;
4177
4306
  }
4307
+ parser_add_parse_error(parser, token);
4308
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4309
+ parser->_parser_state->_reprocess_current_token = true;
4310
+ return false;
4178
4311
  }
4179
4312
 
4180
4313
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
@@ -4185,21 +4318,23 @@ static bool handle_after_after_frameset (
4185
4318
  if (token->type == GUMBO_TOKEN_COMMENT) {
4186
4319
  append_comment_node(parser, get_document_node(parser), token);
4187
4320
  return true;
4188
- } else if (
4321
+ }
4322
+ if (
4189
4323
  token->type == GUMBO_TOKEN_DOCTYPE
4190
4324
  || token->type == GUMBO_TOKEN_WHITESPACE
4191
4325
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4192
4326
  ) {
4193
4327
  return handle_in_body(parser, token);
4194
- } else if (token->type == GUMBO_TOKEN_EOF) {
4328
+ }
4329
+ if (token->type == GUMBO_TOKEN_EOF) {
4195
4330
  return true;
4196
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4331
+ }
4332
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4197
4333
  return handle_in_head(parser, token);
4198
- } else {
4199
- parser_add_parse_error(parser, token);
4200
- ignore_token(parser);
4201
- return false;
4202
4334
  }
4335
+ parser_add_parse_error(parser, token);
4336
+ ignore_token(parser);
4337
+ return false;
4203
4338
  }
4204
4339
 
4205
4340
  // Function pointers for each insertion mode.
@@ -4306,8 +4441,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4306
4441
  parser->_parser_state->_reprocess_current_token = true;
4307
4442
  return false;
4308
4443
  }
4309
-
4310
- assert(token->type == GUMBO_TOKEN_START_TAG);
4444
+ // This is a start tag so the next if's then branch will be taken.
4311
4445
  }
4312
4446
 
4313
4447
  if (token->type == GUMBO_TOKEN_START_TAG) {
@@ -4329,49 +4463,48 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4329
4463
  return true;
4330
4464
  // </script> tags are handled like any other end tag, putting the script's
4331
4465
  // text into a text node child and closing the current node.
4332
- } else {
4333
- assert(token->type == GUMBO_TOKEN_END_TAG);
4334
- GumboNode* node = get_current_node(parser);
4335
- GumboTag tag = token->v.end_tag.tag;
4336
- const char* name = token->v.end_tag.name;
4337
- assert(node != NULL);
4466
+ }
4467
+ assert(token->type == GUMBO_TOKEN_END_TAG);
4468
+ GumboNode* node = get_current_node(parser);
4469
+ GumboTag tag = token->v.end_tag.tag;
4470
+ const char* name = token->v.end_tag.name;
4471
+ assert(node != NULL);
4338
4472
 
4339
- bool is_success = true;
4340
- if (!node_tagname_is(node, tag, name)) {
4341
- parser_add_parse_error(parser, token);
4342
- is_success = false;
4343
- }
4344
- int i = parser->_parser_state->_open_elements.length;
4345
- for (--i; i > 0;) {
4346
- // Here we move up the stack until we find an HTML element (in which
4347
- // case we do nothing) or we find the element that we're about to
4348
- // close (in which case we pop everything we've seen until that
4349
- // point.)
4350
- gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4351
- if (node_tagname_is(node, tag, name)) {
4352
- gumbo_debug("Matches.\n");
4353
- while (node != pop_current_node(parser)) {
4354
- // Pop all the nodes below the current one. Node is guaranteed to
4355
- // be an element on the stack of open elements (set below), so
4356
- // this loop is guaranteed to terminate.
4357
- }
4358
- return is_success;
4359
- }
4360
- --i;
4361
- node = parser->_parser_state->_open_elements.data[i];
4362
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4363
- // The loop continues only in foreign namespaces.
4364
- break;
4473
+ bool is_success = true;
4474
+ if (!node_tagname_is(node, tag, name)) {
4475
+ parser_add_parse_error(parser, token);
4476
+ is_success = false;
4477
+ }
4478
+ int i = parser->_parser_state->_open_elements.length;
4479
+ for (--i; i > 0;) {
4480
+ // Here we move up the stack until we find an HTML element (in which
4481
+ // case we do nothing) or we find the element that we're about to
4482
+ // close (in which case we pop everything we've seen until that
4483
+ // point.)
4484
+ gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4485
+ if (node_tagname_is(node, tag, name)) {
4486
+ gumbo_debug("Matches.\n");
4487
+ while (node != pop_current_node(parser)) {
4488
+ // Pop all the nodes below the current one. Node is guaranteed to
4489
+ // be an element on the stack of open elements (set below), so
4490
+ // this loop is guaranteed to terminate.
4365
4491
  }
4366
- }
4367
- assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4368
- if (i == 0)
4369
4492
  return is_success;
4370
- // We can't call handle_token directly because the current node is still in
4371
- // a foreign namespace, so it would re-enter this and result in infinite
4372
- // recursion.
4373
- return handle_html_content(parser, token) && is_success;
4493
+ }
4494
+ --i;
4495
+ node = parser->_parser_state->_open_elements.data[i];
4496
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4497
+ // The loop continues only in foreign namespaces.
4498
+ break;
4499
+ }
4374
4500
  }
4501
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4502
+ if (i == 0)
4503
+ return is_success;
4504
+ // We can't call handle_token directly because the current node is still in
4505
+ // a foriegn namespace, so it would re-enter this and result in infinite
4506
+ // recursion.
4507
+ return handle_html_content(parser, token) && is_success;
4375
4508
  }
4376
4509
 
4377
4510
  // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
@@ -4517,7 +4650,7 @@ static void fragment_parser_init (
4517
4650
  break;
4518
4651
 
4519
4652
  case GUMBO_TAG_SCRIPT:
4520
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4653
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
4521
4654
  break;
4522
4655
 
4523
4656
  case GUMBO_TAG_NOSCRIPT:
@@ -4554,7 +4687,7 @@ static void fragment_parser_init (
4554
4687
  // 11.
4555
4688
  if (ctx_has_form_ancestor
4556
4689
  || (ctx_tag == GUMBO_TAG_FORM
4557
- && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4690
+ && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4558
4691
  static const GumboNode form_ancestor = {
4559
4692
  .type = GUMBO_NODE_ELEMENT,
4560
4693
  .parent = NULL,
@@ -4619,11 +4752,11 @@ GumboOutput* gumbo_parse_with_options (
4619
4752
  if (state->_reprocess_current_token) {
4620
4753
  state->_reprocess_current_token = false;
4621
4754
  } else {
4622
- GumboNode* current_node = get_current_node(&parser);
4623
- gumbo_tokenizer_set_is_current_node_foreign (
4755
+ GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
4756
+ gumbo_tokenizer_set_is_adjusted_current_node_foreign (
4624
4757
  &parser,
4625
- current_node &&
4626
- current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4758
+ adjusted_current_node &&
4759
+ adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4627
4760
  );
4628
4761
  has_error = !gumbo_lex(&parser, &token) || has_error;
4629
4762
  }
@@ -4649,10 +4782,10 @@ GumboOutput* gumbo_parse_with_options (
4649
4782
  break;
4650
4783
  }
4651
4784
  gumbo_debug (
4652
- "Handling %s token @%zu:%zu in state %u.\n",
4785
+ "Handling %s token @%lu:%lu in state %u.\n",
4653
4786
  (char*) token_type,
4654
- token.position.line,
4655
- token.position.column,
4787
+ (unsigned long)token.position.line,
4788
+ (unsigned long)token.position.column,
4656
4789
  state->_insertion_mode
4657
4790
  );
4658
4791
 
@@ -4671,19 +4804,26 @@ GumboOutput* gumbo_parse_with_options (
4671
4804
  );
4672
4805
 
4673
4806
  if (!state->_reprocess_current_token) {
4807
+ // If we're done with the token, check for unacknowledged self-closing
4808
+ // flags on start tags.
4674
4809
  if (token.type == GUMBO_TOKEN_START_TAG &&
4675
4810
  token.v.start_tag.is_self_closing &&
4676
4811
  !state->_self_closing_flag_acknowledged) {
4677
- GumboError* error = parser_add_parse_error(&parser, &token);
4678
- if (error)
4679
- error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4812
+ has_error = true;
4813
+ GumboError* error = gumbo_add_error(&parser);
4814
+ if (error) {
4815
+ // This is essentially a tokenizer error that's only caught during
4816
+ // tree construction.
4817
+ error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
4818
+ error->original_text = token.original_text;
4819
+ error->position = token.position;
4820
+ }
4680
4821
  }
4822
+ // Make sure we free the end tag's name since it doesn't get transferred
4823
+ // to a token.
4681
4824
  if (token.type == GUMBO_TOKEN_END_TAG &&
4682
- token.v.end_tag.is_self_closing) {
4683
- GumboError* error = parser_add_parse_error(&parser, &token);
4684
- if (error)
4685
- error->type = GUMBO_ERR_SELF_CLOSING_END_TAG;
4686
- }
4825
+ token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4826
+ gumbo_free(token.v.end_tag.name);
4687
4827
  }
4688
4828
 
4689
4829
  if (unlikely(state->_open_elements.length > max_tree_depth)) {