nokogumbo 2.0.0.pre.alpha → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@
7
7
  #include "insertion_mode.h"
8
8
  #include "string_buffer.h"
9
9
  #include "token_type.h"
10
+ #include "tokenizer_states.h"
10
11
 
11
12
  #ifdef __cplusplus
12
13
  extern "C" {
@@ -15,85 +16,66 @@ extern "C" {
15
16
  struct GumboInternalParser;
16
17
 
17
18
  typedef enum {
19
+ // Defined errors.
20
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
21
+ GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
22
+ GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
23
+ GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
24
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
25
+ GUMBO_ERR_CDATA_IN_HTML_CONTENT,
26
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
27
+ GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
28
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
29
+ GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
30
+ GUMBO_ERR_DUPLICATE_ATTRIBUTE,
31
+ GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
32
+ GUMBO_ERR_EOF_BEFORE_TAG_NAME,
33
+ GUMBO_ERR_EOF_IN_CDATA,
34
+ GUMBO_ERR_EOF_IN_COMMENT,
35
+ GUMBO_ERR_EOF_IN_DOCTYPE,
36
+ GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
37
+ GUMBO_ERR_EOF_IN_TAG,
38
+ GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
39
+ GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
40
+ GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
41
+ GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
42
+ GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
43
+ GUMBO_ERR_MISSING_DOCTYPE_NAME,
44
+ GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
45
+ GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
46
+ GUMBO_ERR_MISSING_END_TAG_NAME,
47
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
48
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
49
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
50
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
51
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
52
+ GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
53
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
54
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
55
+ GUMBO_ERR_NESTED_COMMENT,
56
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
57
+ GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
58
+ GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
59
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
60
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
61
+ GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
62
+ GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
63
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
64
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
65
+ GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
66
+ GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
67
+ GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
68
+ GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
69
+ GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
70
+
71
+ // Encoding errors.
18
72
  GUMBO_ERR_UTF8_INVALID,
19
73
  GUMBO_ERR_UTF8_TRUNCATED,
20
- GUMBO_ERR_UTF8_NULL,
21
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
22
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
23
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
24
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
25
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
26
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
27
- GUMBO_ERR_TAG_EOF,
28
- GUMBO_ERR_TAG_INVALID,
29
- GUMBO_ERR_CLOSE_TAG_EMPTY,
30
- GUMBO_ERR_CLOSE_TAG_EOF,
31
- GUMBO_ERR_CLOSE_TAG_INVALID,
32
- GUMBO_ERR_SCRIPT_EOF,
33
- GUMBO_ERR_ATTR_NAME_EOF,
34
- GUMBO_ERR_ATTR_NAME_INVALID,
35
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
36
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
37
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
38
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
39
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
40
- GUMBO_ERR_ATTR_AFTER_EOF,
41
- GUMBO_ERR_ATTR_AFTER_INVALID,
42
- GUMBO_ERR_DUPLICATE_ATTR,
43
- GUMBO_ERR_SOLIDUS_EOF,
44
- GUMBO_ERR_SOLIDUS_INVALID,
45
- GUMBO_ERR_DASHES_OR_DOCTYPE,
46
- GUMBO_ERR_COMMENT_EOF,
47
- GUMBO_ERR_COMMENT_INVALID,
48
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
49
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
50
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
51
- GUMBO_ERR_COMMENT_END_BANG_EOF,
52
- GUMBO_ERR_DOCTYPE_EOF,
53
- GUMBO_ERR_DOCTYPE_INVALID,
54
- GUMBO_ERR_DOCTYPE_SPACE,
55
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
56
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
57
- GUMBO_ERR_DOCTYPE_END,
74
+
75
+ // Generic parser error.
58
76
  GUMBO_ERR_PARSER,
59
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
60
- GUMBO_ERR_SELF_CLOSING_END_TAG,
61
77
  } GumboErrorType;
62
78
 
63
- // Additional data for duplicated attributes.
64
- typedef struct GumboInternalDuplicateAttrError {
65
- // The name of the attribute. Owned by this struct.
66
- const char* name;
67
-
68
- // The (0-based) index within the attributes vector of the original
69
- // occurrence.
70
- unsigned int original_index;
71
-
72
- // The (0-based) index where the new occurrence would be.
73
- unsigned int new_index;
74
- } GumboDuplicateAttrError;
75
-
76
- // A simplified representation of the tokenizer state, designed to be more
77
- // useful to clients of this library than the internal representation. This
78
- // condenses the actual states used in the tokenizer state machine into a few
79
- // values that will be familiar to users of HTML.
80
- typedef enum {
81
- GUMBO_ERR_TOKENIZER_DATA,
82
- GUMBO_ERR_TOKENIZER_CHAR_REF,
83
- GUMBO_ERR_TOKENIZER_RCDATA,
84
- GUMBO_ERR_TOKENIZER_RAWTEXT,
85
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
86
- GUMBO_ERR_TOKENIZER_SCRIPT,
87
- GUMBO_ERR_TOKENIZER_TAG,
88
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
89
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
90
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
91
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
92
- GUMBO_ERR_TOKENIZER_COMMENT,
93
- GUMBO_ERR_TOKENIZER_DOCTYPE,
94
- GUMBO_ERR_TOKENIZER_CDATA,
95
- } GumboTokenizerErrorState;
96
-
97
79
  // Additional data for tokenizer errors.
98
80
  // This records the current state and codepoint encountered - this is usually
99
81
  // enough to reconstruct what went wrong and provide a friendly error message.
@@ -102,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
102
84
  int codepoint;
103
85
 
104
86
  // The state that the tokenizer was in at the time.
105
- GumboTokenizerErrorState state;
87
+ GumboTokenizerEnum state;
106
88
  } GumboTokenizerError;
107
89
 
108
90
  // Additional data for parse errors.
@@ -125,43 +107,25 @@ typedef struct GumboInternalParserError {
125
107
  // The overall error struct representing an error in decoding/tokenizing/parsing
126
108
  // the HTML. This contains an enumerated type flag, a source position, and then
127
109
  // a union of fields containing data specific to the error.
128
- typedef struct GumboInternalError {
110
+ struct GumboInternalError {
129
111
  // The type of error.
130
112
  GumboErrorType type;
131
113
 
132
114
  // The position within the source file where the error occurred.
133
115
  GumboSourcePosition position;
134
116
 
135
- // A pointer to the byte within the original source file text where the error
136
- // occurred (note that this is not the same as position.offset, as that gives
137
- // character-based instead of byte-based offsets).
138
- const char* original_text;
117
+ // The piece of text that caused the error.
118
+ GumboStringPiece original_text;
139
119
 
140
120
  // Type-specific error information.
141
121
  union {
142
- // The code point we encountered, for:
143
- // * GUMBO_ERR_UTF8_INVALID
144
- // * GUMBO_ERR_UTF8_TRUNCATED
145
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
146
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
147
- uint32_t codepoint;
148
-
149
122
  // Tokenizer errors.
150
123
  GumboTokenizerError tokenizer;
151
124
 
152
- // Short textual data, for:
153
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
154
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
155
- GumboStringPiece text;
156
-
157
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
158
- GumboDuplicateAttrError duplicate_attr;
159
-
160
- // Parser state, for GUMBO_ERR_PARSER and
161
- // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
162
- struct GumboInternalParserError parser;
125
+ // Parser errors.
126
+ GumboParserError parser;
163
127
  } v;
164
- } GumboError;
128
+ };
165
129
 
166
130
  // Adds a new error to the parser's error list, and returns a pointer to it so
167
131
  // that clients can fill out the rest of its fields. May return NULL if we're
@@ -177,32 +141,6 @@ void gumbo_destroy_errors(struct GumboInternalParser* errors);
177
141
  // Frees the memory used for a single GumboError.
178
142
  void gumbo_error_destroy(GumboError* error);
179
143
 
180
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
181
- // freshly-allocated buffer containing the error message text. The caller is
182
- // responsible for freeing the buffer.
183
- void gumbo_error_to_string (
184
- const GumboError* error,
185
- GumboStringBuffer* output
186
- );
187
-
188
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
189
- // with a freshly-allocated buffer containing the error message text. The
190
- // caller is responsible for freeing the buffer.
191
- void gumbo_caret_diagnostic_to_string (
192
- const GumboError* error,
193
- const char* source_text,
194
- size_t source_length,
195
- GumboStringBuffer* output
196
- );
197
-
198
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
199
- // of writing to a string.
200
- void gumbo_print_caret_diagnostic (
201
- const GumboError* error,
202
- const char* source_text,
203
- size_t source_length
204
- );
205
-
206
144
  #ifdef __cplusplus
207
145
  }
208
146
  #endif
@@ -817,10 +817,6 @@ typedef struct GumboInternalOutput {
817
817
 
818
818
  /**
819
819
  * A list of errors that occurred during the parse.
820
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
821
- * fleshed out and may change in the future. For this reason, the GumboError
822
- * header isn't part of the public API. Contact us if you need errors
823
- * reported so we can work out something appropriate for your use-case.
824
820
  */
825
821
  GumboVector /* GumboError */ errors;
826
822
 
@@ -866,6 +862,53 @@ const char* gumbo_status_to_string(GumboOutputStatus status);
866
862
  /** Release the memory used for the parse tree and parse errors. */
867
863
  void gumbo_destroy_output(GumboOutput* output);
868
864
 
865
+ /** Opaque GumboError type */
866
+ typedef struct GumboInternalError GumboError;
867
+
868
+ /**
869
+ * Returns the position of the error.
870
+ */
871
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
872
+
873
+ /**
874
+ * Returns a constant string representation of the error's code. This is owned
875
+ * by the library and should not be freed by the caller.
876
+ */
877
+ const char* gumbo_error_code(const GumboError* error);
878
+
879
+ /**
880
+ * Prints an error to a string. This stores a freshly-allocated buffer
881
+ * containing the error message text in output. The caller is responsible for
882
+ * freeing the buffer. The size of the error message is returned. The error
883
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
884
+ * returned size must be used.
885
+ */
886
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
887
+
888
+ /**
889
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
890
+ * buffer containing the error message text in output. The caller is responsible for
891
+ * freeing the buffer. The size of the error message is returned. The error
892
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
893
+ * returned size must be used.
894
+ */
895
+ size_t gumbo_caret_diagnostic_to_string (
896
+ const GumboError* error,
897
+ const char* source_text,
898
+ size_t source_length,
899
+ char** output
900
+ );
901
+
902
+ /**
903
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
904
+ * instead of writing to a string.
905
+ */
906
+ void gumbo_print_caret_diagnostic (
907
+ const GumboError* error,
908
+ const char* source_text,
909
+ size_t source_length
910
+ );
911
+
869
912
  #ifdef __cplusplus
870
913
  }
871
914
  #endif
@@ -31,6 +31,7 @@
31
31
  #include "replacement.h"
32
32
  #include "tokenizer.h"
33
33
  #include "tokenizer_states.h"
34
+ #include "token_buffer.h"
34
35
  #include "utf8.h"
35
36
  #include "util.h"
36
37
  #include "vector.h"
@@ -42,7 +43,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
42
43
 
43
44
  #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
44
45
  #define kGumboEmptySourcePosition (const GumboSourcePosition) \
45
- GUMBO_EMPTY_SOURCE_POSITION_INIT
46
+ GUMBO_EMPTY_SOURCE_POSITION_INIT
46
47
 
47
48
  const GumboOptions kGumboDefaultOptions = {
48
49
  .tab_stop = 8,
@@ -59,25 +60,6 @@ const GumboOptions kGumboDefaultOptions = {
59
60
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
60
61
  #define TERMINATOR {.data = NULL, .length = 0}
61
62
 
62
- static const GumboStringPiece kPublicIdHtml4_0 =
63
- STRING("-//W3C//DTD HTML 4.0//EN");
64
- static const GumboStringPiece kPublicIdHtml4_01 =
65
- STRING("-//W3C//DTD HTML 4.01//EN");
66
- static const GumboStringPiece kPublicIdXhtml1_0 =
67
- STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
68
- static const GumboStringPiece kPublicIdXhtml1_1 =
69
- STRING("-//W3C//DTD XHTML 1.1//EN");
70
- static const GumboStringPiece kSystemIdRecHtml4_0 =
71
- STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
72
- static const GumboStringPiece kSystemIdHtml4 =
73
- STRING("http://www.w3.org/TR/html4/strict.dtd");
74
- static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
75
- STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
76
- static const GumboStringPiece kSystemIdXhtml1_1 =
77
- STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
78
- static const GumboStringPiece kSystemIdLegacyCompat =
79
- STRING("about:legacy-compat");
80
-
81
63
  // The doctype arrays have an explicit terminator because we want to pass them
82
64
  // to a helper function, and passing them as a pointer discards sizeof
83
65
  // information. The SVG arrays are used only by one-off functions, and so loops
@@ -260,6 +242,9 @@ typedef struct GumboInternalParserState {
260
242
  // The accumulated text node buffer state.
261
243
  TextNodeBufferState _text_node;
262
244
 
245
+ // The accumulated character tokens in tables for error purposes.
246
+ GumboCharacterTokenBuffer _table_character_tokens;
247
+
263
248
  // The current token.
264
249
  GumboToken* _current_token;
265
250
 
@@ -365,6 +350,7 @@ static void parser_state_init(GumboParser* parser) {
365
350
  parser_state->_foster_parent_insertions = false;
366
351
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
367
352
  gumbo_string_buffer_init(&parser_state->_text_node._buffer);
353
+ gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
368
354
  gumbo_vector_init(10, &parser_state->_open_elements);
369
355
  gumbo_vector_init(5, &parser_state->_active_formatting_elements);
370
356
  gumbo_vector_init(5, &parser_state->_template_insertion_modes);
@@ -463,6 +449,7 @@ static void parser_state_destroy(GumboParser* parser) {
463
449
  gumbo_vector_destroy(&state->_open_elements);
464
450
  gumbo_vector_destroy(&state->_template_insertion_modes);
465
451
  gumbo_string_buffer_destroy(&state->_text_node._buffer);
452
+ gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
466
453
  gumbo_free(state);
467
454
  }
468
455
 
@@ -573,11 +560,11 @@ static bool tag_in (
573
560
  static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
574
561
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
575
562
  return token->v.start_tag.tag == tag;
576
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
563
+ }
564
+ if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
577
565
  return token->v.end_tag.tag == tag;
578
- } else {
579
- return false;
580
566
  }
567
+ return false;
581
568
  }
582
569
 
583
570
  static inline bool tagset_includes (
@@ -738,18 +725,18 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) {
738
725
  assert(0);
739
726
  }
740
727
 
741
- static GumboError* parser_add_parse_error (
728
+ static void parser_add_parse_error (
742
729
  GumboParser* parser,
743
730
  const GumboToken* token
744
731
  ) {
745
732
  gumbo_debug("Adding parse error.\n");
746
733
  GumboError* error = gumbo_add_error(parser);
747
734
  if (!error) {
748
- return NULL;
735
+ return;
749
736
  }
750
737
  error->type = GUMBO_ERR_PARSER;
751
738
  error->position = token->position;
752
- error->original_text = token->original_text.data;
739
+ error->original_text = token->original_text;
753
740
  GumboParserError* extra_data = &error->v.parser;
754
741
  extra_data->input_type = token->type;
755
742
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
@@ -772,7 +759,6 @@ static GumboError* parser_add_parse_error (
772
759
  &extra_data->tag_stack
773
760
  );
774
761
  }
775
- return error;
776
762
  }
777
763
 
778
764
  // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
@@ -1639,9 +1625,11 @@ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node)
1639
1625
  const GumboNodeType type = current->type;
1640
1626
  if (current == node) {
1641
1627
  return true;
1642
- } else if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1628
+ }
1629
+ if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1643
1630
  continue;
1644
- } else if (node_tag_in_set(current, &tags)) {
1631
+ }
1632
+ if (node_tag_in_set(current, &tags)) {
1645
1633
  return false;
1646
1634
  }
1647
1635
  }
@@ -1689,8 +1677,8 @@ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag t
1689
1677
  // Pass GUMBO_TAG_LAST to not exclude any of them.
1690
1678
  static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1691
1679
  static const TagSet tags = {
1692
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP),
1693
- TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)
1680
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
1681
+ TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
1694
1682
  };
1695
1683
  while (
1696
1684
  node_tag_in_set(get_current_node(parser), &tags)
@@ -1704,15 +1692,36 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1704
1692
  // https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
1705
1693
  static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1706
1694
  static const TagSet tags = {
1707
- TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1708
- TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1709
- TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)
1695
+ TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
1696
+ TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1697
+ TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
1710
1698
  };
1711
1699
  while (node_tag_in_set(get_current_node(parser), &tags)) {
1712
1700
  pop_current_node(parser);
1713
1701
  }
1714
1702
  }
1715
1703
 
1704
+ // This factors out the clauses in the "in body" insertion mode checking "if
1705
+ // there is a node in the stack of open elements that is not" one of a list of
1706
+ // elements in which case it's a parse error.
1707
+ // This is used in "an end-of-file token", "an end tag whose tag name is
1708
+ // 'body'", and "an end tag whose tag name is 'html'".
1709
+ static bool stack_contains_nonclosable_element (
1710
+ GumboParser* parser
1711
+ ) {
1712
+ static const TagSet tags = {
1713
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
1714
+ TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
1715
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
1716
+ };
1717
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1718
+ for (size_t i = 0; i < open_elements->length; ++i) {
1719
+ if (!node_tag_in_set(open_elements->data[i], &tags))
1720
+ return true;
1721
+ }
1722
+ return false;
1723
+ }
1724
+
1716
1725
  // This factors out the clauses relating to "act as if an end tag token with tag
1717
1726
  // name 'table' had been seen." Returns true if there's a table element in table
1718
1727
  // scope which was successfully closed, false if not and the token should be
@@ -1756,13 +1765,15 @@ static bool close_table_cell (
1756
1765
  // https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
1757
1766
  // This holds the logic to determine whether we should close a <td> or a <th>.
1758
1767
  static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1768
+ GumboTag cell_tag;
1759
1769
  if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1760
1770
  assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1761
- return close_table_cell(parser, token, GUMBO_TAG_TD);
1771
+ cell_tag = GUMBO_TAG_TD;
1762
1772
  } else {
1763
1773
  assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1764
- return close_table_cell(parser, token, GUMBO_TAG_TH);
1774
+ cell_tag = GUMBO_TAG_TH;
1765
1775
  }
1776
+ return close_table_cell(parser, token, cell_tag);
1766
1777
  }
1767
1778
 
1768
1779
  // This factors out the "act as if an end tag of tag name 'select' had been
@@ -1862,13 +1873,13 @@ static bool maybe_implicitly_close_p_tag (
1862
1873
 
1863
1874
  // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1864
1875
  // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1865
- static void maybe_implicitly_close_list_tag (
1876
+ static bool maybe_implicitly_close_list_tag (
1866
1877
  GumboParser* parser,
1867
1878
  GumboToken* token,
1868
1879
  bool is_li
1869
1880
  ) {
1870
1881
  GumboParserState* state = parser->_parser_state;
1871
- state->_frameset_ok = false;
1882
+ set_frameset_not_ok(parser);
1872
1883
  for (int i = state->_open_elements.length; --i >= 0;) {
1873
1884
  const GumboNode* node = state->_open_elements.data[i];
1874
1885
  bool is_list_tag = is_li
@@ -1876,21 +1887,21 @@ static void maybe_implicitly_close_list_tag (
1876
1887
  : node_tag_in_set(node, &dd_dt_tags)
1877
1888
  ;
1878
1889
  if (is_list_tag) {
1879
- implicitly_close_tags (
1890
+ return implicitly_close_tags (
1880
1891
  parser,
1881
1892
  token,
1882
1893
  node->v.element.tag_namespace,
1883
1894
  node->v.element.tag
1884
1895
  );
1885
- return;
1886
1896
  }
1887
1897
  if (
1888
1898
  is_special_node(node)
1889
1899
  && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
1890
1900
  ) {
1891
- return;
1901
+ return true;
1892
1902
  }
1893
1903
  }
1904
+ return true;
1894
1905
  }
1895
1906
 
1896
1907
  static void merge_attributes (
@@ -2009,36 +2020,17 @@ static void adjust_mathml_attributes(GumboToken* token) {
2009
2020
  attr->name = gumbo_strdup("definitionURL");
2010
2021
  }
2011
2022
 
2012
- static bool doctype_matches (
2013
- const GumboTokenDocType* doctype,
2014
- const GumboStringPiece* public_id,
2015
- const GumboStringPiece* system_id,
2016
- bool allow_missing_system_id
2017
- ) {
2018
- return
2019
- !strcmp(doctype->public_identifier, public_id->data)
2020
- && (allow_missing_system_id || doctype->has_system_identifier)
2021
- && !strcmp(doctype->system_identifier, system_id->data);
2022
- }
2023
-
2024
2023
  static bool maybe_add_doctype_error (
2025
2024
  GumboParser* parser,
2026
2025
  const GumboToken* token
2027
2026
  ) {
2028
2027
  const GumboTokenDocType* doctype = &token->v.doc_type;
2029
- bool html_doctype = !strcmp(doctype->name, "html");
2030
- if ((!html_doctype || doctype->has_public_identifier ||
2031
- (doctype->has_system_identifier &&
2032
- !strcmp(
2033
- doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
2034
- !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
2035
- &kSystemIdRecHtml4_0, true) ||
2036
- doctype_matches(doctype, &kPublicIdHtml4_01,
2037
- &kSystemIdHtml4, true) ||
2038
- doctype_matches(doctype, &kPublicIdXhtml1_0,
2039
- &kSystemIdXhtmlStrict1_1, false) ||
2040
- doctype_matches(doctype, &kPublicIdXhtml1_1,
2041
- &kSystemIdXhtml1_1, false)))) {
2028
+ if (
2029
+ strcmp(doctype->name, "html")
2030
+ || doctype->has_public_identifier
2031
+ || (doctype->has_system_identifier
2032
+ && strcmp(doctype->system_identifier, "about:legacy-compat"))
2033
+ ) {
2042
2034
  parser_add_parse_error(parser, token);
2043
2035
  return false;
2044
2036
  }
@@ -2069,6 +2061,8 @@ static void remove_from_parent(GumboNode* node) {
2069
2061
 
2070
2062
  // https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
2071
2063
  // Also described in the "in body" handling for end formatting tags.
2064
+ // Returns true if the algorithm handled the token and false to indicate that
2065
+ // it should be handled according to "any other end tag."
2072
2066
  static bool adoption_agency_algorithm (
2073
2067
  GumboParser* parser,
2074
2068
  GumboToken* token,
@@ -2076,7 +2070,7 @@ static bool adoption_agency_algorithm (
2076
2070
  ) {
2077
2071
  GumboParserState* state = parser->_parser_state;
2078
2072
  gumbo_debug("Entering adoption agency algorithm.\n");
2079
- // Step 1.
2073
+ // Step 2.
2080
2074
  GumboNode* current_node = get_current_node(parser);
2081
2075
  if (
2082
2076
  current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
@@ -2087,18 +2081,19 @@ static bool adoption_agency_algorithm (
2087
2081
  )
2088
2082
  ) {
2089
2083
  pop_current_node(parser);
2090
- return false;
2084
+ return true;
2091
2085
  }
2092
- // Steps 2-4 & 20:
2086
+ // Steps 3-5 & 21:
2093
2087
  for (unsigned int i = 0; i < 8; ++i) {
2094
- // Step 5.
2088
+ // Step 6.
2095
2089
  GumboNode* formatting_node = NULL;
2096
2090
  int formatting_node_in_open_elements = -1;
2097
2091
  for (int j = state->_active_formatting_elements.length; --j >= 0;) {
2098
2092
  GumboNode* current_node = state->_active_formatting_elements.data[j];
2099
2093
  if (current_node == &kActiveFormattingScopeMarker) {
2100
2094
  gumbo_debug("Broke on scope marker; aborting.\n");
2101
- // Last scope marker; abort the algorithm.
2095
+ // Last scope marker; abort the algorithm and handle according to "any
2096
+ // other end tag."
2102
2097
  return false;
2103
2098
  }
2104
2099
  if (node_html_tag_is(current_node, subject)) {
@@ -2124,7 +2119,7 @@ static bool adoption_agency_algorithm (
2124
2119
  return false;
2125
2120
  }
2126
2121
 
2127
- // Step 6
2122
+ // Step 7
2128
2123
  if (formatting_node_in_open_elements == -1) {
2129
2124
  gumbo_debug("Formatting node not on stack of open elements.\n");
2130
2125
  parser_add_parse_error(parser, token);
@@ -2132,17 +2127,17 @@ static bool adoption_agency_algorithm (
2132
2127
  formatting_node,
2133
2128
  &state->_active_formatting_elements
2134
2129
  );
2135
- return false;
2130
+ return true;
2136
2131
  }
2137
2132
 
2138
- // Step 7
2133
+ // Step 8
2139
2134
  if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
2140
2135
  parser_add_parse_error(parser, token);
2141
2136
  gumbo_debug("Element not in scope.\n");
2142
- return false;
2137
+ return true;
2143
2138
  }
2144
2139
 
2145
- // Step 8
2140
+ // Step 9
2146
2141
  if (formatting_node != get_current_node(parser)) {
2147
2142
  parser_add_parse_error(parser, token); // But continue onwards.
2148
2143
  }
@@ -2150,7 +2145,7 @@ static bool adoption_agency_algorithm (
2150
2145
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
2151
2146
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
2152
2147
 
2153
- // Step 9 & 10
2148
+ // Step 10
2154
2149
  GumboNode* furthest_block = NULL;
2155
2150
  for (
2156
2151
  unsigned int j = formatting_node_in_open_elements;
@@ -2160,32 +2155,27 @@ static bool adoption_agency_algorithm (
2160
2155
  assert(j > 0);
2161
2156
  GumboNode* current = state->_open_elements.data[j];
2162
2157
  if (is_special_node(current)) {
2163
- // Step 9.
2164
2158
  furthest_block = current;
2165
2159
  break;
2166
2160
  }
2167
2161
  }
2162
+ // Step 11.
2168
2163
  if (!furthest_block) {
2169
- // Step 10.
2170
- while (get_current_node(parser) != formatting_node) {
2171
- pop_current_node(parser);
2172
- }
2173
- // And the formatting element itself.
2174
- pop_current_node(parser);
2164
+ while (pop_current_node(parser) != formatting_node)
2165
+ ;
2175
2166
  gumbo_vector_remove (
2176
2167
  formatting_node,
2177
2168
  &state->_active_formatting_elements
2178
2169
  );
2179
- return false;
2170
+ return true;
2180
2171
  }
2181
2172
  assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
2182
- assert(furthest_block);
2183
2173
 
2184
- // Step 11.
2174
+ // Step 12.
2185
2175
  // Elements may be moved and reparented by this algorithm, so
2186
2176
  // common_ancestor is not necessarily the same as formatting_node->parent.
2187
2177
  GumboNode* common_ancestor = state->_open_elements.data [
2188
- gumbo_vector_index_of(&state->_open_elements, formatting_node) - 1
2178
+ formatting_node_in_open_elements - 1
2189
2179
  ];
2190
2180
  gumbo_debug (
2191
2181
  "Common ancestor tag = %s, furthest block tag = %s.\n",
@@ -2193,24 +2183,24 @@ static bool adoption_agency_algorithm (
2193
2183
  gumbo_normalized_tagname(furthest_block->v.element.tag)
2194
2184
  );
2195
2185
 
2196
- // Step 12.
2186
+ // Step 13.
2197
2187
  int bookmark = 1 + gumbo_vector_index_of (
2198
2188
  &state->_active_formatting_elements,
2199
2189
  formatting_node
2200
2190
  );
2201
2191
  gumbo_debug("Bookmark at %d.\n", bookmark);
2202
- // Step 13.
2192
+ // Step 14.
2203
2193
  GumboNode* node = furthest_block;
2204
2194
  GumboNode* last_node = furthest_block;
2205
2195
  // Must be stored explicitly, in case node is removed from the stack of open
2206
- // elements, to handle step 9.4.
2196
+ // elements, to handle step 14.3.
2207
2197
  int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
2208
2198
  assert(saved_node_index > 0);
2209
- // Step 13.1.
2199
+ // Step 14.1.
2210
2200
  for (int j = 0;;) {
2211
- // Step 13.2.
2201
+ // Step 14.2.
2212
2202
  ++j;
2213
- // Step 13.3.
2203
+ // Step 14.3.
2214
2204
  int node_index = gumbo_vector_index_of(&state->_open_elements, node);
2215
2205
  gumbo_debug (
2216
2206
  "Current index: %d, last index: %d.\n",
@@ -2225,16 +2215,16 @@ static bool adoption_agency_algorithm (
2225
2215
  assert((unsigned int) node_index < state->_open_elements.capacity);
2226
2216
  node = state->_open_elements.data[node_index];
2227
2217
  assert(node->parent);
2218
+ // Step 14.4.
2228
2219
  if (node == formatting_node) {
2229
- // Step 13.4.
2230
2220
  break;
2231
2221
  }
2232
2222
  int formatting_index = gumbo_vector_index_of (
2233
2223
  &state->_active_formatting_elements,
2234
2224
  node
2235
2225
  );
2226
+ // Step 14.5.
2236
2227
  if (j > 3 && formatting_index != -1) {
2237
- // Step 13.5.
2238
2228
  gumbo_debug("Removing formatting element at %d.\n", formatting_index);
2239
2229
  gumbo_vector_remove_at (
2240
2230
  formatting_index,
@@ -2249,11 +2239,11 @@ static bool adoption_agency_algorithm (
2249
2239
  continue;
2250
2240
  }
2251
2241
  if (formatting_index == -1) {
2252
- // Step 13.6.
2242
+ // Step 14.6.
2253
2243
  gumbo_vector_remove_at(node_index, &state->_open_elements);
2254
2244
  continue;
2255
2245
  }
2256
- // Step 13.7.
2246
+ // Step 14.7.
2257
2247
  // "common ancestor as the intended parent" doesn't actually mean insert
2258
2248
  // it into the common ancestor; that happens below.
2259
2249
  node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
@@ -2261,21 +2251,21 @@ static bool adoption_agency_algorithm (
2261
2251
  state->_active_formatting_elements.data[formatting_index] = node;
2262
2252
  assert(node_index >= 0);
2263
2253
  state->_open_elements.data[node_index] = node;
2264
- // Step 13.8.
2254
+ // Step 14.8.
2265
2255
  if (last_node == furthest_block) {
2266
2256
  bookmark = formatting_index + 1;
2267
2257
  gumbo_debug("Bookmark moved to %d.\n", bookmark);
2268
2258
  assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2269
2259
  }
2270
- // Step 13.9.
2260
+ // Step 14.9.
2271
2261
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2272
2262
  remove_from_parent(last_node);
2273
2263
  append_node(node, last_node);
2274
- // Step 13.10.
2264
+ // Step 14.10.
2275
2265
  last_node = node;
2276
- } // Step 13.11.
2266
+ } // Step 14.11.
2277
2267
 
2278
- // Step 14.
2268
+ // Step 15.
2279
2269
  gumbo_debug (
2280
2270
  "Removing %s node from parent ",
2281
2271
  gumbo_normalized_tagname(last_node->v.element.tag)
@@ -2292,14 +2282,14 @@ static bool adoption_agency_algorithm (
2292
2282
  );
2293
2283
  insert_node(last_node, location);
2294
2284
 
2295
- // Step 15.
2285
+ // Step 16.
2296
2286
  GumboNode* new_formatting_node = clone_node (
2297
2287
  formatting_node,
2298
2288
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
2299
2289
  );
2300
2290
  formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2301
2291
 
2302
- // Step 16. Instead of appending nodes one-by-one, we swap the children
2292
+ // Step 17. Instead of appending nodes one-by-one, we swap the children
2303
2293
  // vector of furthest_block with the empty children of new_formatting_node,
2304
2294
  // reducing memory traffic and allocations. We still have to reset their
2305
2295
  // parent pointers, though.
@@ -2313,10 +2303,10 @@ static bool adoption_agency_algorithm (
2313
2303
  child->parent = new_formatting_node;
2314
2304
  }
2315
2305
 
2316
- // Step 17.
2306
+ // Step 18.
2317
2307
  append_node(furthest_block, new_formatting_node);
2318
2308
 
2319
- // Step 18.
2309
+ // Step 19.
2320
2310
  // If the formatting node was before the bookmark, it may shift over all
2321
2311
  // indices after it, so we need to explicitly find the index and possibly
2322
2312
  // adjust the bookmark.
@@ -2344,7 +2334,7 @@ static bool adoption_agency_algorithm (
2344
2334
  &state->_active_formatting_elements
2345
2335
  );
2346
2336
 
2347
- // Step 19.
2337
+ // Step 20.
2348
2338
  gumbo_vector_remove(formatting_node, &state->_open_elements);
2349
2339
  int insert_at = 1 + gumbo_vector_index_of (
2350
2340
  &state->_open_elements,
@@ -2357,7 +2347,7 @@ static bool adoption_agency_algorithm (
2357
2347
  insert_at,
2358
2348
  &state->_open_elements
2359
2349
  );
2360
- } // Step 20.
2350
+ } // Step 21.
2361
2351
  return true;
2362
2352
  }
2363
2353
 
@@ -2406,10 +2396,12 @@ static bool handle_initial(GumboParser* parser, GumboToken* token) {
2406
2396
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2407
2397
  ignore_token(parser);
2408
2398
  return true;
2409
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2399
+ }
2400
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2410
2401
  append_comment_node(parser, get_document_node(parser), token);
2411
2402
  return true;
2412
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403
+ }
2404
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2413
2405
  document->has_doctype = true;
2414
2406
  document->name = token->v.doc_type.name;
2415
2407
  document->public_identifier = token->v.doc_type.public_identifier;
@@ -2431,95 +2423,108 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2431
2423
  parser_add_parse_error(parser, token);
2432
2424
  ignore_token(parser);
2433
2425
  return false;
2434
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2426
+ }
2427
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2435
2428
  append_comment_node(parser, get_document_node(parser), token);
2436
2429
  return true;
2437
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2430
+ }
2431
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2438
2432
  ignore_token(parser);
2439
2433
  return true;
2440
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2434
+ }
2435
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2441
2436
  GumboNode* html_node = insert_element_from_token(parser, token);
2442
2437
  parser->_output->root = html_node;
2443
2438
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2444
2439
  return true;
2445
- } else if (
2440
+ }
2441
+ if (
2446
2442
  token->type == GUMBO_TOKEN_END_TAG
2447
2443
  && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2448
2444
  ) {
2449
2445
  parser_add_parse_error(parser, token);
2450
2446
  ignore_token(parser);
2451
2447
  return false;
2452
- } else {
2453
- GumboNode* html_node = insert_element_of_tag_type (
2454
- parser,
2455
- GUMBO_TAG_HTML,
2456
- GUMBO_INSERTION_IMPLIED
2457
- );
2458
- assert(html_node);
2459
- parser->_output->root = html_node;
2460
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2461
- parser->_parser_state->_reprocess_current_token = true;
2462
- return true;
2463
2448
  }
2449
+ GumboNode* html_node = insert_element_of_tag_type (
2450
+ parser,
2451
+ GUMBO_TAG_HTML,
2452
+ GUMBO_INSERTION_IMPLIED
2453
+ );
2454
+ assert(html_node);
2455
+ parser->_output->root = html_node;
2456
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2457
+ parser->_parser_state->_reprocess_current_token = true;
2458
+ return true;
2464
2459
  }
2465
2460
 
2461
+ // Forward declarations because of mutual dependencies.
2462
+ static bool handle_token(GumboParser* parser, GumboToken* token);
2463
+ static bool handle_in_body(GumboParser* parser, GumboToken* token);
2464
+ static bool handle_in_template(GumboParser* parser, GumboToken* token);
2465
+
2466
2466
  // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
2467
2467
  static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2468
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2469
- parser_add_parse_error(parser, token);
2468
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2470
2469
  ignore_token(parser);
2471
- return false;
2472
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2470
+ return true;
2471
+ }
2472
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2473
2473
  append_comment_node(parser, get_current_node(parser), token);
2474
2474
  return true;
2475
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2475
+ }
2476
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2477
+ parser_add_parse_error(parser, token);
2476
2478
  ignore_token(parser);
2477
- return true;
2478
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2479
+ return false;
2480
+ }
2481
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2482
+ return handle_in_body(parser, token);
2483
+ }
2484
+ if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2479
2485
  GumboNode* node = insert_element_from_token(parser, token);
2480
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2481
2486
  parser->_parser_state->_head_element = node;
2487
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2482
2488
  return true;
2483
- } else if (
2489
+ }
2490
+ if (
2484
2491
  token->type == GUMBO_TOKEN_END_TAG
2485
- && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2492
+ && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2486
2493
  ) {
2487
2494
  parser_add_parse_error(parser, token);
2488
2495
  ignore_token(parser);
2489
2496
  return false;
2490
- } else {
2491
- GumboNode* node = insert_element_of_tag_type (
2492
- parser,
2493
- GUMBO_TAG_HEAD,
2494
- GUMBO_INSERTION_IMPLIED
2495
- );
2496
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2497
- parser->_parser_state->_head_element = node;
2498
- parser->_parser_state->_reprocess_current_token = true;
2499
- return true;
2500
2497
  }
2498
+ GumboNode* node = insert_element_of_tag_type (
2499
+ parser,
2500
+ GUMBO_TAG_HEAD,
2501
+ GUMBO_INSERTION_IMPLIED
2502
+ );
2503
+ parser->_parser_state->_head_element = node;
2504
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2505
+ parser->_parser_state->_reprocess_current_token = true;
2506
+ return true;
2501
2507
  }
2502
2508
 
2503
- // Forward declarations because of mutual dependencies.
2504
- static bool handle_token(GumboParser* parser, GumboToken* token);
2505
- static bool handle_in_body(GumboParser* parser, GumboToken* token);
2506
- static bool handle_in_template(GumboParser* parser, GumboToken* token);
2507
-
2508
2509
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
2509
2510
  static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2510
2511
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2511
2512
  insert_text_token(parser, token);
2512
2513
  return true;
2513
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2514
+ }
2515
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2516
+ append_comment_node(parser, get_current_node(parser), token);
2517
+ return true;
2518
+ }
2519
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2514
2520
  parser_add_parse_error(parser, token);
2515
2521
  ignore_token(parser);
2516
2522
  return false;
2517
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2518
- append_comment_node(parser, get_current_node(parser), token);
2519
- return true;
2520
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2523
+ }
2524
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2521
2525
  return handle_in_body(parser, token);
2522
- } else if (
2526
+ }
2527
+ if (
2523
2528
  tag_in(token, kStartTag, &(const TagSet) {
2524
2529
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
2525
2530
  })
@@ -2528,7 +2533,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2528
2533
  pop_current_node(parser);
2529
2534
  acknowledge_self_closing_tag(parser);
2530
2535
  return true;
2531
- } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2536
+ }
2537
+ if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2532
2538
  insert_element_from_token(parser, token);
2533
2539
  pop_current_node(parser);
2534
2540
  acknowledge_self_closing_tag(parser);
@@ -2537,42 +2543,50 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2537
2543
  // should specifically look for that string in the document and re-encode it
2538
2544
  // before passing to Gumbo.
2539
2545
  return true;
2540
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2546
+ }
2547
+ if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2541
2548
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2542
2549
  return true;
2543
- } else if (
2550
+ }
2551
+ if (
2544
2552
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2545
2553
  ) {
2546
2554
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2547
2555
  return true;
2548
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2556
+ }
2557
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2549
2558
  insert_element_from_token(parser, token);
2550
2559
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2551
2560
  return true;
2552
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2553
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2561
+ }
2562
+ if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2563
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
2554
2564
  return true;
2555
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2565
+ }
2566
+ if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2556
2567
  GumboNode* head = pop_current_node(parser);
2557
2568
  UNUSED_IF_NDEBUG(head);
2558
2569
  assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2559
2570
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2560
2571
  return true;
2561
- } else if (
2572
+ }
2573
+ if (
2562
2574
  tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2563
2575
  ) {
2564
2576
  pop_current_node(parser);
2565
2577
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2566
2578
  parser->_parser_state->_reprocess_current_token = true;
2567
2579
  return true;
2568
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2580
+ }
2581
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2569
2582
  insert_element_from_token(parser, token);
2570
2583
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
2571
- parser->_parser_state->_frameset_ok = false;
2584
+ set_frameset_not_ok(parser);
2572
2585
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2573
2586
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2574
2587
  return true;
2575
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2588
+ }
2589
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2576
2590
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2577
2591
  parser_add_parse_error(parser, token);
2578
2592
  ignore_token(parser);
@@ -2590,19 +2604,18 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2590
2604
  pop_template_insertion_mode(parser);
2591
2605
  reset_insertion_mode_appropriately(parser);
2592
2606
  return success;
2593
- } else if (
2607
+ }
2608
+ if (
2594
2609
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2595
2610
  || (token->type == GUMBO_TOKEN_END_TAG)
2596
2611
  ) {
2597
2612
  parser_add_parse_error(parser, token);
2598
2613
  ignore_token(parser);
2599
2614
  return false;
2600
- } else {
2601
- pop_current_node(parser);
2602
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2603
- parser->_parser_state->_reprocess_current_token = true;
2604
- return true;
2605
2615
  }
2616
+ pop_current_node(parser);
2617
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2618
+ parser->_parser_state->_reprocess_current_token = true;
2606
2619
  return true;
2607
2620
  }
2608
2621
 
@@ -2611,15 +2624,18 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2611
2624
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2612
2625
  parser_add_parse_error(parser, token);
2613
2626
  return false;
2614
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2627
+ }
2628
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2615
2629
  return handle_in_body(parser, token);
2616
- } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2630
+ }
2631
+ if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2617
2632
  const GumboNode* node = pop_current_node(parser);
2618
2633
  assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2619
2634
  UNUSED_IF_NDEBUG(node);
2620
2635
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2621
2636
  return true;
2622
- } else if (
2637
+ }
2638
+ if (
2623
2639
  token->type == GUMBO_TOKEN_WHITESPACE
2624
2640
  || token->type == GUMBO_TOKEN_COMMENT
2625
2641
  || tag_in (token, kStartTag, &(const TagSet) {
@@ -2628,7 +2644,8 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2628
2644
  })
2629
2645
  ) {
2630
2646
  return handle_in_head(parser, token);
2631
- } else if (
2647
+ }
2648
+ if (
2632
2649
  tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
2633
2650
  || (
2634
2651
  token->type == GUMBO_TOKEN_END_TAG
@@ -2638,15 +2655,14 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2638
2655
  parser_add_parse_error(parser, token);
2639
2656
  ignore_token(parser);
2640
2657
  return false;
2641
- } else {
2642
- parser_add_parse_error(parser, token);
2643
- const GumboNode* node = pop_current_node(parser);
2644
- assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2645
- UNUSED_IF_NDEBUG(node);
2646
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2647
- parser->_parser_state->_reprocess_current_token = true;
2648
- return false;
2649
2658
  }
2659
+ parser_add_parse_error(parser, token);
2660
+ const GumboNode* node = pop_current_node(parser);
2661
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2662
+ UNUSED_IF_NDEBUG(node);
2663
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2664
+ parser->_parser_state->_reprocess_current_token = true;
2665
+ return false;
2650
2666
  }
2651
2667
 
2652
2668
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
@@ -2655,25 +2671,31 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2655
2671
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2656
2672
  insert_text_token(parser, token);
2657
2673
  return true;
2658
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2674
+ }
2675
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2676
+ append_comment_node(parser, get_current_node(parser), token);
2677
+ return true;
2678
+ }
2679
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2659
2680
  parser_add_parse_error(parser, token);
2660
2681
  ignore_token(parser);
2661
2682
  return false;
2662
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2663
- append_comment_node(parser, get_current_node(parser), token);
2664
- return true;
2665
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2683
+ }
2684
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2666
2685
  return handle_in_body(parser, token);
2667
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2686
+ }
2687
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2668
2688
  insert_element_from_token(parser, token);
2669
- state->_frameset_ok = false;
2689
+ set_frameset_not_ok(parser);
2670
2690
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2671
2691
  return true;
2672
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2692
+ }
2693
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2673
2694
  insert_element_from_token(parser, token);
2674
2695
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2675
2696
  return true;
2676
- } else if (
2697
+ }
2698
+ if (
2677
2699
  tag_in(token, kStartTag, &(const TagSet) {
2678
2700
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
2679
2701
  TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
@@ -2685,12 +2707,14 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2685
2707
  // pending character tokens that should be attached to the root.
2686
2708
  maybe_flush_text_node_buffer(parser);
2687
2709
  gumbo_vector_add(state->_head_element, &state->_open_elements);
2688
- bool result = handle_in_head(parser, token);
2710
+ handle_in_head(parser, token);
2689
2711
  gumbo_vector_remove(state->_head_element, &state->_open_elements);
2690
- return result;
2691
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2712
+ return false;
2713
+ }
2714
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2692
2715
  return handle_in_head(parser, token);
2693
- } else if (
2716
+ }
2717
+ if (
2694
2718
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2695
2719
  || (
2696
2720
  token->type == GUMBO_TOKEN_END_TAG
@@ -2700,12 +2724,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2700
2724
  parser_add_parse_error(parser, token);
2701
2725
  ignore_token(parser);
2702
2726
  return false;
2703
- } else {
2704
- insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2705
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2706
- state->_reprocess_current_token = true;
2707
- return true;
2708
2727
  }
2728
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2729
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2730
+ state->_reprocess_current_token = true;
2731
+ return true;
2709
2732
  }
2710
2733
 
2711
2734
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
@@ -2716,11 +2739,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2716
2739
  parser_add_parse_error(parser, token);
2717
2740
  ignore_token(parser);
2718
2741
  return false;
2719
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2742
+ }
2743
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2720
2744
  reconstruct_active_formatting_elements(parser);
2721
2745
  insert_text_token(parser, token);
2722
2746
  return true;
2723
- } else if (
2747
+ }
2748
+ if (
2724
2749
  token->type == GUMBO_TOKEN_CHARACTER
2725
2750
  || token->type == GUMBO_TOKEN_CDATA
2726
2751
  ) {
@@ -2728,14 +2753,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2728
2753
  insert_text_token(parser, token);
2729
2754
  set_frameset_not_ok(parser);
2730
2755
  return true;
2731
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2756
+ }
2757
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2732
2758
  append_comment_node(parser, get_current_node(parser), token);
2733
2759
  return true;
2734
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2760
+ }
2761
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2735
2762
  parser_add_parse_error(parser, token);
2736
2763
  ignore_token(parser);
2737
2764
  return false;
2738
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2765
+ }
2766
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2739
2767
  parser_add_parse_error(parser, token);
2740
2768
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2741
2769
  ignore_token(parser);
@@ -2745,7 +2773,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2745
2773
  assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2746
2774
  merge_attributes(token, parser->_output->root);
2747
2775
  return false;
2748
- } else if (
2776
+ }
2777
+ if (
2749
2778
  tag_in(token, kStartTag, &(const TagSet) {
2750
2779
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2751
2780
  TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
@@ -2754,7 +2783,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2754
2783
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
2755
2784
  ) {
2756
2785
  return handle_in_head(parser, token);
2757
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2786
+ }
2787
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2758
2788
  parser_add_parse_error(parser, token);
2759
2789
  if (
2760
2790
  state->_open_elements.length < 2
@@ -2762,12 +2792,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2762
2792
  || has_open_element(parser, GUMBO_TAG_TEMPLATE)
2763
2793
  ) {
2764
2794
  ignore_token(parser);
2765
- return false;
2795
+ } else {
2796
+ set_frameset_not_ok(parser);
2797
+ merge_attributes(token, state->_open_elements.data[1]);
2766
2798
  }
2767
- state->_frameset_ok = false;
2768
- merge_attributes(token, state->_open_elements.data[1]);
2769
2799
  return false;
2770
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2800
+ }
2801
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2771
2802
  parser_add_parse_error(parser, token);
2772
2803
  if (
2773
2804
  state->_open_elements.length < 2
@@ -2808,64 +2839,64 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2808
2839
  insert_element_from_token(parser, token);
2809
2840
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2810
2841
  return true;
2811
- } else if (token->type == GUMBO_TOKEN_EOF) {
2812
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2813
- if (
2814
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2815
- TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT),
2816
- TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2817
- })
2818
- ) {
2819
- parser_add_parse_error(parser, token);
2820
- }
2821
- }
2842
+ }
2843
+ if (token->type == GUMBO_TOKEN_EOF) {
2822
2844
  if (get_current_template_insertion_mode(parser) !=
2823
2845
  GUMBO_INSERTION_MODE_INITIAL) {
2824
2846
  return handle_in_template(parser, token);
2825
2847
  }
2848
+ if (stack_contains_nonclosable_element(parser)) {
2849
+ parser_add_parse_error(parser, token);
2850
+ return false;
2851
+ }
2826
2852
  return true;
2827
- } else if (tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML)})) {
2853
+ }
2854
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
2828
2855
  if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2829
2856
  parser_add_parse_error(parser, token);
2830
2857
  ignore_token(parser);
2831
2858
  return false;
2832
2859
  }
2833
2860
  bool success = true;
2834
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2835
- if (
2836
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2837
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
2838
- TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
2839
- TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2840
- })
2841
- ) {
2842
- parser_add_parse_error(parser, token);
2843
- success = false;
2844
- break;
2845
- }
2861
+ if (stack_contains_nonclosable_element(parser)) {
2862
+ parser_add_parse_error(parser, token);
2863
+ success = false;
2846
2864
  }
2865
+ GumboNode* body = state->_open_elements.data[1];
2866
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2867
+ record_end_of_element(state->_current_token, &body->v.element);
2847
2868
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2848
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2849
- parser->_parser_state->_reprocess_current_token = true;
2850
- } else {
2851
- GumboNode* body = state->_open_elements.data[1];
2852
- assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2853
- record_end_of_element(state->_current_token, &body->v.element);
2869
+ return success;
2870
+ }
2871
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2872
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2873
+ parser_add_parse_error(parser, token);
2874
+ ignore_token(parser);
2875
+ return false;
2854
2876
  }
2877
+ bool success = true;
2878
+ if (stack_contains_nonclosable_element(parser)) {
2879
+ parser_add_parse_error(parser, token);
2880
+ success = false;
2881
+ }
2882
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2883
+ parser->_parser_state->_reprocess_current_token = true;
2855
2884
  return success;
2856
- } else if (
2885
+ }
2886
+ if (
2857
2887
  tag_in(token, kStartTag, &(const TagSet) {
2858
2888
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
2859
2889
  TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2860
2890
  TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2861
- TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2891
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2862
2892
  TAG(SUMMARY), TAG(UL)
2863
2893
  })
2864
2894
  ) {
2865
2895
  bool result = maybe_implicitly_close_p_tag(parser, token);
2866
2896
  insert_element_from_token(parser, token);
2867
2897
  return result;
2868
- } else if (tag_in(token, kStartTag, &heading_tags)) {
2898
+ }
2899
+ if (tag_in(token, kStartTag, &heading_tags)) {
2869
2900
  bool result = maybe_implicitly_close_p_tag(parser, token);
2870
2901
  if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
2871
2902
  parser_add_parse_error(parser, token);
@@ -2874,13 +2905,15 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2874
2905
  }
2875
2906
  insert_element_from_token(parser, token);
2876
2907
  return result;
2877
- } else if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2908
+ }
2909
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2878
2910
  bool result = maybe_implicitly_close_p_tag(parser, token);
2879
2911
  insert_element_from_token(parser, token);
2880
2912
  state->_ignore_next_linefeed = true;
2881
- state->_frameset_ok = false;
2913
+ set_frameset_not_ok(parser);
2882
2914
  return result;
2883
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2915
+ }
2916
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2884
2917
  if (
2885
2918
  state->_form_element != NULL
2886
2919
  && !has_open_element(parser, GUMBO_TAG_TEMPLATE)
@@ -2896,38 +2929,42 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2896
2929
  state->_form_element = form_element;
2897
2930
  }
2898
2931
  return result;
2899
- } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2900
- maybe_implicitly_close_list_tag(parser, token, true);
2901
- bool result = maybe_implicitly_close_p_tag(parser, token);
2932
+ }
2933
+ if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2934
+ bool result = maybe_implicitly_close_list_tag(parser, token, true);
2935
+ result = maybe_implicitly_close_p_tag(parser, token) && result;
2902
2936
  insert_element_from_token(parser, token);
2903
2937
  return result;
2904
- } else if (tag_in(token, kStartTag, &dd_dt_tags)) {
2905
- maybe_implicitly_close_list_tag(parser, token, false);
2906
- bool result = maybe_implicitly_close_p_tag(parser, token);
2938
+ }
2939
+ if (tag_in(token, kStartTag, &dd_dt_tags)) {
2940
+ bool result = maybe_implicitly_close_list_tag(parser, token, false);
2941
+ result = maybe_implicitly_close_p_tag(parser, token) && result;
2907
2942
  insert_element_from_token(parser, token);
2908
2943
  return result;
2909
- } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2944
+ }
2945
+ if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2910
2946
  bool result = maybe_implicitly_close_p_tag(parser, token);
2911
2947
  insert_element_from_token(parser, token);
2912
2948
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2913
2949
  return result;
2914
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2950
+ }
2951
+ if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2952
+ bool success = true;
2915
2953
  if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2916
2954
  parser_add_parse_error(parser, token);
2917
- implicitly_close_tags (
2918
- parser,
2919
- token,
2920
- GUMBO_NAMESPACE_HTML,
2921
- GUMBO_TAG_BUTTON
2922
- );
2923
- state->_reprocess_current_token = true;
2924
- return false;
2955
+ success = false;
2956
+ // We don't want to use implicitly_close_tags here because it may add an
2957
+ // error and we've already added the only error the standard specifies.
2958
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2959
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
2960
+ ;
2925
2961
  }
2926
2962
  reconstruct_active_formatting_elements(parser);
2927
2963
  insert_element_from_token(parser, token);
2928
- state->_frameset_ok = false;
2929
- return true;
2930
- } else if (
2964
+ set_frameset_not_ok(parser);
2965
+ return success;
2966
+ }
2967
+ if (
2931
2968
  tag_in(token, kEndTag, &(const TagSet) {
2932
2969
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
2933
2970
  TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
@@ -2942,14 +2979,14 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2942
2979
  ignore_token(parser);
2943
2980
  return false;
2944
2981
  }
2945
- implicitly_close_tags (
2982
+ return implicitly_close_tags (
2946
2983
  parser,
2947
2984
  token,
2948
2985
  GUMBO_NAMESPACE_HTML,
2949
2986
  token->v.end_tag.tag
2950
2987
  );
2951
- return true;
2952
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2988
+ }
2989
+ if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2953
2990
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2954
2991
  if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2955
2992
  parser_add_parse_error(parser, token);
@@ -2960,7 +2997,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2960
2997
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2961
2998
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2962
2999
  parser_add_parse_error(parser, token);
2963
- return false;
3000
+ success = false;
2964
3001
  }
2965
3002
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2966
3003
  ;
@@ -2992,7 +3029,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2992
3029
  gumbo_vector_remove_at(index, open_elements);
2993
3030
  return result;
2994
3031
  }
2995
- } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3032
+ }
3033
+ if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3034
+ bool success = true;
2996
3035
  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2997
3036
  parser_add_parse_error(parser, token);
2998
3037
  // reconstruct_active_formatting_elements(parser);
@@ -3001,16 +3040,16 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3001
3040
  GUMBO_TAG_P,
3002
3041
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3003
3042
  );
3004
- state->_reprocess_current_token = true;
3005
- return false;
3043
+ success = false;
3006
3044
  }
3007
3045
  return implicitly_close_tags (
3008
3046
  parser,
3009
3047
  token,
3010
3048
  GUMBO_NAMESPACE_HTML,
3011
3049
  GUMBO_TAG_P
3012
- );
3013
- } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3050
+ ) && success;
3051
+ }
3052
+ if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3014
3053
  if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
3015
3054
  parser_add_parse_error(parser, token);
3016
3055
  ignore_token(parser);
@@ -3022,8 +3061,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3022
3061
  GUMBO_NAMESPACE_HTML,
3023
3062
  GUMBO_TAG_LI
3024
3063
  );
3025
- } else if (tag_in(token, kEndTag, &dd_dt_tags)) {
3026
- assert(token->type == GUMBO_TOKEN_END_TAG);
3064
+ }
3065
+ if (tag_in(token, kEndTag, &dd_dt_tags)) {
3027
3066
  GumboTag token_tag = token->v.end_tag.tag;
3028
3067
  if (!has_an_element_in_scope(parser, token_tag)) {
3029
3068
  parser_add_parse_error(parser, token);
@@ -3036,7 +3075,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3036
3075
  GUMBO_NAMESPACE_HTML,
3037
3076
  token_tag
3038
3077
  );
3039
- } else if (tag_in(token, kEndTag, &heading_tags)) {
3078
+ }
3079
+ if (tag_in(token, kEndTag, &heading_tags)) {
3040
3080
  if (
3041
3081
  !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
3042
3082
  GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
@@ -3047,30 +3087,31 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3047
3087
  parser_add_parse_error(parser, token);
3048
3088
  ignore_token(parser);
3049
3089
  return false;
3050
- } else {
3051
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3052
- const GumboNode* current_node = get_current_node(parser);
3053
- bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
3054
- if (!success) {
3055
- // There're children of the heading currently open; close them below and
3056
- // record a parse error.
3057
- // TODO(jdtang): Add a way to distinguish this error case from the one
3058
- // above.
3059
- parser_add_parse_error(parser, token);
3060
- }
3061
- do {
3062
- current_node = pop_current_node(parser);
3063
- } while (!node_tag_in_set(current_node, &heading_tags));
3064
- return success;
3065
3090
  }
3066
- } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3091
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3092
+ const GumboNode* current_node = get_current_node(parser);
3093
+ bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
3094
+ if (!success) {
3095
+ // There're children of the heading currently open; close them below and
3096
+ // record a parse error.
3097
+ // TODO(jdtang): Add a way to distinguish this error case from the one
3098
+ // above.
3099
+ parser_add_parse_error(parser, token);
3100
+ }
3101
+ do {
3102
+ current_node = pop_current_node(parser);
3103
+ } while (!node_tag_in_set(current_node, &heading_tags));
3104
+ return success;
3105
+ }
3106
+ if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3067
3107
  bool success = true;
3068
3108
  int last_a;
3069
3109
  int has_matching_a = find_last_anchor_index(parser, &last_a);
3070
3110
  if (has_matching_a) {
3071
3111
  assert(has_matching_a == 1);
3072
3112
  parser_add_parse_error(parser, token);
3073
- adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
3113
+ bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
3114
+ assert(handled);
3074
3115
  // The adoption agency algorithm usually removes all instances of <a>
3075
3116
  // from the list of active formatting elements, but in case it doesn't,
3076
3117
  // we're supposed to do this. (The conditions where it might not are
@@ -3087,7 +3128,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3087
3128
  reconstruct_active_formatting_elements(parser);
3088
3129
  add_formatting_element(parser, insert_element_from_token(parser, token));
3089
3130
  return success;
3090
- } else if (
3131
+ }
3132
+ if (
3091
3133
  tag_in(token, kStartTag, &(const TagSet) {
3092
3134
  TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
3093
3135
  TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
@@ -3096,27 +3138,33 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3096
3138
  reconstruct_active_formatting_elements(parser);
3097
3139
  add_formatting_element(parser, insert_element_from_token(parser, token));
3098
3140
  return true;
3099
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3141
+ }
3142
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3100
3143
  bool result = true;
3101
3144
  reconstruct_active_formatting_elements(parser);
3102
3145
  if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
3103
3146
  result = false;
3104
3147
  parser_add_parse_error(parser, token);
3105
- adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
3148
+ bool handled = adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
3149
+ assert(handled);
3106
3150
  reconstruct_active_formatting_elements(parser);
3107
3151
  }
3108
3152
  insert_element_from_token(parser, token);
3109
3153
  add_formatting_element(parser, get_current_node(parser));
3110
3154
  return result;
3111
- } else if (
3155
+ }
3156
+ if (
3112
3157
  tag_in(token, kEndTag, &(const TagSet) {
3113
3158
  TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
3114
3159
  TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
3115
3160
  TAG(U)
3116
3161
  })
3117
3162
  ) {
3118
- return adoption_agency_algorithm(parser, token, token->v.end_tag.tag);
3119
- } else if (
3163
+ if (!adoption_agency_algorithm(parser, token, token->v.end_tag.tag))
3164
+ goto any_other_end_tag;
3165
+ return true;
3166
+ }
3167
+ if (
3120
3168
  tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3121
3169
  ) {
3122
3170
  reconstruct_active_formatting_elements(parser);
@@ -3124,19 +3172,21 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3124
3172
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3125
3173
  set_frameset_not_ok(parser);
3126
3174
  return true;
3127
- } else if (
3175
+ }
3176
+ if (
3128
3177
  tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3129
3178
  ) {
3130
3179
  GumboTag token_tag = token->v.end_tag.tag;
3131
- if (!has_an_element_in_table_scope(parser, token_tag)) {
3180
+ if (!has_an_element_in_scope(parser, token_tag)) {
3132
3181
  parser_add_parse_error(parser, token);
3133
3182
  ignore_token(parser);
3134
3183
  return false;
3135
3184
  }
3136
- implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3185
+ bool success = implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3137
3186
  clear_active_formatting_elements(parser);
3138
- return true;
3139
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3187
+ return success;
3188
+ }
3189
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3140
3190
  if (
3141
3191
  get_document_node(parser)->v.document.doc_type_quirks_mode
3142
3192
  != GUMBO_DOCTYPE_QUIRKS
@@ -3147,74 +3197,88 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3147
3197
  set_frameset_not_ok(parser);
3148
3198
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3149
3199
  return true;
3150
- } else if (
3200
+ }
3201
+ if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3202
+ parser_add_parse_error(parser, token);
3203
+ reconstruct_active_formatting_elements(parser);
3204
+ insert_element_of_tag_type (
3205
+ parser,
3206
+ GUMBO_TAG_BR,
3207
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3208
+ );
3209
+ pop_current_node(parser);
3210
+ acknowledge_self_closing_tag(parser);
3211
+ set_frameset_not_ok(parser);
3212
+ return false;
3213
+ }
3214
+ if (
3151
3215
  tag_in(token, kStartTag, &(const TagSet) {
3152
3216
  TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
3153
3217
  TAG(WBR)
3154
3218
  })
3155
3219
  ) {
3156
- bool success = true;
3157
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3158
- success = false;
3220
+ bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
3221
+ if (is_image) {
3159
3222
  parser_add_parse_error(parser, token);
3160
3223
  token->v.start_tag.tag = GUMBO_TAG_IMG;
3161
3224
  }
3162
3225
  reconstruct_active_formatting_elements(parser);
3163
3226
  GumboNode* node = insert_element_from_token(parser, token);
3164
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3165
- success = false;
3166
- parser_add_parse_error(parser, token);
3167
- node->v.element.tag = GUMBO_TAG_IMG;
3227
+ if (is_image)
3168
3228
  node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
3169
- }
3170
3229
  pop_current_node(parser);
3171
3230
  acknowledge_self_closing_tag(parser);
3172
3231
  set_frameset_not_ok(parser);
3173
- return success;
3174
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3175
- if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
3176
- // Must be before the element is inserted, as that takes ownership of the
3177
- // token's attribute vector.
3178
- set_frameset_not_ok(parser);
3179
- }
3232
+ return !is_image;
3233
+ }
3234
+ if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3180
3235
  reconstruct_active_formatting_elements(parser);
3181
- insert_element_from_token(parser, token);
3236
+ GumboNode *input = insert_element_from_token(parser, token);
3182
3237
  pop_current_node(parser);
3183
3238
  acknowledge_self_closing_tag(parser);
3239
+ if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
3240
+ set_frameset_not_ok(parser);
3184
3241
  return true;
3185
- } else if (
3242
+ }
3243
+ if (
3186
3244
  tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
3187
3245
  ) {
3188
3246
  insert_element_from_token(parser, token);
3189
3247
  pop_current_node(parser);
3190
3248
  acknowledge_self_closing_tag(parser);
3191
3249
  return true;
3192
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3250
+ }
3251
+ if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3193
3252
  bool result = maybe_implicitly_close_p_tag(parser, token);
3194
3253
  insert_element_from_token(parser, token);
3195
3254
  pop_current_node(parser);
3196
3255
  acknowledge_self_closing_tag(parser);
3197
3256
  set_frameset_not_ok(parser);
3198
3257
  return result;
3199
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3258
+ }
3259
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3200
3260
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
3201
3261
  parser->_parser_state->_ignore_next_linefeed = true;
3202
3262
  set_frameset_not_ok(parser);
3203
3263
  return true;
3204
- } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3264
+ }
3265
+ if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3205
3266
  bool result = maybe_implicitly_close_p_tag(parser, token);
3206
3267
  reconstruct_active_formatting_elements(parser);
3207
3268
  set_frameset_not_ok(parser);
3208
3269
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3209
3270
  return result;
3210
- } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3271
+ }
3272
+ if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3211
3273
  set_frameset_not_ok(parser);
3212
3274
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3213
3275
  return true;
3214
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3276
+ }
3277
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3215
3278
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3216
3279
  return true;
3217
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3280
+ }
3281
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3218
3282
  reconstruct_active_formatting_elements(parser);
3219
3283
  insert_element_from_token(parser, token);
3220
3284
  set_frameset_not_ok(parser);
@@ -3231,8 +3295,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3231
3295
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
3232
3296
  }
3233
3297
  return true;
3234
- } else if (
3235
- tag_in(token, kStartTag, &(const TagSet){TAG(OPTION), TAG(OPTGROUP)})
3298
+ }
3299
+ if (
3300
+ tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
3236
3301
  ) {
3237
3302
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3238
3303
  pop_current_node(parser);
@@ -3240,40 +3305,34 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3240
3305
  reconstruct_active_formatting_elements(parser);
3241
3306
  insert_element_from_token(parser, token);
3242
3307
  return true;
3243
- } else if (
3244
- tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})
3245
- ) {
3308
+ }
3309
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
3246
3310
  bool success = true;
3247
- GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)})
3248
- ? GUMBO_TAG_RTC
3249
- : GUMBO_TAG_LAST
3250
- ;
3251
3311
  if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3252
- generate_implied_end_tags(parser, exception);
3312
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3313
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
3314
+ parser_add_parse_error(parser, token);
3315
+ success = false;
3316
+ }
3253
3317
  }
3254
- if (
3255
- !node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)
3256
- && !(
3257
- exception == GUMBO_TAG_LAST
3258
- || node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC)
3259
- )
3260
- ) {
3261
- parser_add_parse_error(parser, token);
3262
- success = false;
3318
+ insert_element_from_token(parser, token);
3319
+ return success;
3320
+ }
3321
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
3322
+ bool success = true;
3323
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3324
+ generate_implied_end_tags(parser, GUMBO_TAG_RTC);
3325
+ GumboNode* current = get_current_node(parser);
3326
+ if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
3327
+ !node_html_tag_is(current, GUMBO_TAG_RTC)) {
3328
+ parser_add_parse_error(parser, token);
3329
+ success = false;
3330
+ }
3263
3331
  }
3264
3332
  insert_element_from_token(parser, token);
3265
3333
  return success;
3266
- } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3267
- parser_add_parse_error(parser, token);
3268
- reconstruct_active_formatting_elements(parser);
3269
- insert_element_of_tag_type (
3270
- parser,
3271
- GUMBO_TAG_BR,
3272
- GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3273
- );
3274
- pop_current_node(parser);
3275
- return false;
3276
- } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3334
+ }
3335
+ if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3277
3336
  reconstruct_active_formatting_elements(parser);
3278
3337
  adjust_mathml_attributes(token);
3279
3338
  adjust_foreign_attributes(token);
@@ -3283,7 +3342,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3283
3342
  acknowledge_self_closing_tag(parser);
3284
3343
  }
3285
3344
  return true;
3286
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3345
+ }
3346
+ if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3287
3347
  reconstruct_active_formatting_elements(parser);
3288
3348
  adjust_svg_attributes(token);
3289
3349
  adjust_foreign_attributes(token);
@@ -3293,7 +3353,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3293
3353
  acknowledge_self_closing_tag(parser);
3294
3354
  }
3295
3355
  return true;
3296
- } else if (
3356
+ }
3357
+ if (
3297
3358
  tag_in(token, kStartTag, &(const TagSet) {
3298
3359
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
3299
3360
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3302,48 +3363,49 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3302
3363
  parser_add_parse_error(parser, token);
3303
3364
  ignore_token(parser);
3304
3365
  return false;
3305
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3366
+ }
3367
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3306
3368
  reconstruct_active_formatting_elements(parser);
3307
3369
  insert_element_from_token(parser, token);
3308
3370
  return true;
3309
- } else {
3310
- assert(token->type == GUMBO_TOKEN_END_TAG);
3311
- GumboTag end_tag = token->v.end_tag.tag;
3312
- const char *end_tagname = token->v.end_tag.name;
3313
- assert(state->_open_elements.length > 0);
3314
- assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3315
- // Walk up the stack of open elements until we find one that either:
3316
- // a) Matches the tag name we saw
3317
- // b) Is in the "special" category.
3318
- // If we see a), implicitly close everything up to and including it. If we
3319
- // see b), then record a parse error, don't close anything (except the
3320
- // implied end tags) and ignore the end tag token.
3321
- for (int i = state->_open_elements.length; --i >= 0;) {
3322
- const GumboNode* node = state->_open_elements.data[i];
3323
- if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
3324
- generate_implied_end_tags(parser, end_tag);
3325
- // TODO(jdtang): Do I need to add a parse error here? The condition in
3326
- // the spec seems like it's the inverse of the loop condition above, and
3327
- // so would never fire.
3328
- // sfc: Yes, an error is needed here.
3329
- // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
3330
- // foo is the "current node" but sarcasm is node.
3331
- // XXX: Write a test for this.
3332
- if (node != get_current_node(parser))
3333
- parser_add_parse_error(parser, token);
3334
- while (node != pop_current_node(parser))
3335
- ; // Pop everything.
3336
- return true;
3337
- } else if (is_special_node(node)) {
3371
+ }
3372
+ any_other_end_tag:
3373
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3374
+ GumboTag end_tag = token->v.end_tag.tag;
3375
+ const char *end_tagname = token->v.end_tag.name;
3376
+ assert(state->_open_elements.length > 0);
3377
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3378
+ // Walk up the stack of open elements until we find one that either:
3379
+ // a) Matches the tag name we saw
3380
+ // b) Is in the "special" category.
3381
+ // If we see a), implicitly close everything up to and including it. If we
3382
+ // see b), then record a parse error, don't close anything (except the
3383
+ // implied end tags) and ignore the end tag token.
3384
+ for (int i = state->_open_elements.length; --i >= 0;) {
3385
+ const GumboNode* node = state->_open_elements.data[i];
3386
+ if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
3387
+ generate_implied_end_tags(parser, end_tag);
3388
+ // TODO(jdtang): Do I need to add a parse error here? The condition in
3389
+ // the spec seems like it's the inverse of the loop condition above, and
3390
+ // so would never fire.
3391
+ // sfc: Yes, an error is needed here.
3392
+ // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
3393
+ // foo is the "current node" but sarcasm is node.
3394
+ // XXX: Write a test for this.
3395
+ if (node != get_current_node(parser))
3338
3396
  parser_add_parse_error(parser, token);
3339
- ignore_token(parser);
3340
- return false;
3341
- }
3397
+ while (node != pop_current_node(parser))
3398
+ ; // Pop everything.
3399
+ return true;
3400
+ } else if (is_special_node(node)) {
3401
+ parser_add_parse_error(parser, token);
3402
+ ignore_token(parser);
3403
+ return false;
3342
3404
  }
3343
- // <html> is in the special category, so we should never get here.
3344
- assert(0);
3345
- return false;
3346
3405
  }
3406
+ // <html> is in the special category, so we should never get here.
3407
+ assert(0);
3408
+ return false;
3347
3409
  }
3348
3410
 
3349
3411
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
@@ -3353,30 +3415,36 @@ static bool handle_text(GumboParser* parser, GumboToken* token) {
3353
3415
  || token->type == GUMBO_TOKEN_WHITESPACE
3354
3416
  ) {
3355
3417
  insert_text_token(parser, token);
3356
- } else {
3357
- // We provide only bare-bones script handling that doesn't involve any of
3358
- // the parser-pause/already-started/script-nesting flags or re-entrant
3359
- // invocations of the tokenizer. Because the intended usage of this library
3360
- // is mostly for templating, refactoring, and static-analysis libraries, we
3361
- // provide the script body as a text-node child of the <script> element.
3362
- // This behavior doesn't support document.write of partial HTML elements,
3363
- // but should be adequate for almost all other scripting support.
3364
- if (token->type == GUMBO_TOKEN_EOF) {
3365
- parser_add_parse_error(parser, token);
3366
- parser->_parser_state->_reprocess_current_token = true;
3367
- }
3368
- pop_current_node(parser);
3369
- set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3418
+ return true;
3370
3419
  }
3371
- return true;
3420
+ // We provide only bare-bones script handling that doesn't involve any of
3421
+ // the parser-pause/already-started/script-nesting flags or re-entrant
3422
+ // invocations of the tokenizer. Because the intended usage of this library
3423
+ // is mostly for templating, refactoring, and static-analysis libraries, we
3424
+ // provide the script body as a text-node child of the <script> element.
3425
+ // This behavior doesn't support document.write of partial HTML elements,
3426
+ // but should be adequate for almost all other scripting support.
3427
+ bool success = true;
3428
+ if (token->type == GUMBO_TOKEN_EOF) {
3429
+ parser_add_parse_error(parser, token);
3430
+ success = false;
3431
+ parser->_parser_state->_reprocess_current_token = true;
3432
+ }
3433
+ pop_current_node(parser);
3434
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3435
+ return success;
3372
3436
  }
3373
3437
 
3374
3438
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
3375
3439
  static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3376
3440
  GumboParserState* state = parser->_parser_state;
3377
3441
  if (
3378
- token->type == GUMBO_TOKEN_CHARACTER
3379
- || token->type == GUMBO_TOKEN_WHITESPACE
3442
+ (token->type == GUMBO_TOKEN_CHARACTER
3443
+ || token->type == GUMBO_TOKEN_WHITESPACE
3444
+ || token->type == GUMBO_TOKEN_NULL)
3445
+ && node_tag_in_set(get_current_node(parser), &(const TagSet) {
3446
+ TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3447
+ })
3380
3448
  ) {
3381
3449
  // The "pending table character tokens" list described in the spec is
3382
3450
  // nothing more than the TextNodeBufferState. We accumulate text tokens as
@@ -3384,71 +3452,87 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3384
3452
  // we set _foster_parent_insertions if there're non-whitespace characters in
3385
3453
  // the buffer.
3386
3454
  assert(state->_text_node._buffer.length == 0);
3455
+ assert(state->_table_character_tokens.length == 0);
3387
3456
  state->_original_insertion_mode = state->_insertion_mode;
3388
3457
  state->_reprocess_current_token = true;
3389
3458
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3390
3459
  return true;
3391
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3460
+ }
3461
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3462
+ append_comment_node(parser, get_current_node(parser), token);
3463
+ return true;
3464
+ }
3465
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3392
3466
  parser_add_parse_error(parser, token);
3393
3467
  ignore_token(parser);
3394
3468
  return false;
3395
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3396
- append_comment_node(parser, get_current_node(parser), token);
3397
- return true;
3398
- } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3469
+ }
3470
+ if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3399
3471
  clear_stack_to_table_context(parser);
3400
3472
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3401
3473
  insert_element_from_token(parser, token);
3402
3474
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3403
3475
  return true;
3404
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3476
+ }
3477
+ if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3405
3478
  clear_stack_to_table_context(parser);
3406
3479
  insert_element_from_token(parser, token);
3407
3480
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3408
3481
  return true;
3409
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3482
+ }
3483
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3410
3484
  clear_stack_to_table_context(parser);
3411
3485
  insert_element_of_tag_type (
3412
3486
  parser,
3413
3487
  GUMBO_TAG_COLGROUP,
3414
3488
  GUMBO_INSERTION_IMPLIED
3415
3489
  );
3416
- parser->_parser_state->_reprocess_current_token = true;
3490
+ state->_reprocess_current_token = true;
3417
3491
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3418
3492
  return true;
3419
- } else if (
3493
+ }
3494
+ if (
3420
3495
  tag_in(token, kStartTag, &(const TagSet) {
3421
- TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), TAG(TH), TAG(TR)
3496
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3422
3497
  })
3423
3498
  ) {
3424
3499
  clear_stack_to_table_context(parser);
3500
+ insert_element_from_token(parser, token);
3425
3501
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3426
- if (tag_in(token, kStartTag, &(const TagSet){TAG(TD), TAG(TH), TAG(TR)})) {
3427
- insert_element_of_tag_type (
3428
- parser,
3429
- GUMBO_TAG_TBODY,
3430
- GUMBO_INSERTION_IMPLIED
3431
- );
3432
- state->_reprocess_current_token = true;
3433
- } else {
3434
- insert_element_from_token(parser, token);
3435
- }
3436
3502
  return true;
3437
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3503
+ }
3504
+ if (
3505
+ tag_in(token, kStartTag, &(const TagSet) {
3506
+ TAG(TD), TAG(TH), TAG(TR)
3507
+ })
3508
+ ) {
3509
+ clear_stack_to_table_context(parser);
3510
+ insert_element_of_tag_type (
3511
+ parser,
3512
+ GUMBO_TAG_TBODY,
3513
+ GUMBO_INSERTION_IMPLIED
3514
+ );
3515
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3516
+ state->_reprocess_current_token = true;
3517
+ return true;
3518
+ }
3519
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3438
3520
  parser_add_parse_error(parser, token);
3439
3521
  if (close_table(parser)) {
3440
- parser->_parser_state->_reprocess_current_token = true;
3522
+ state->_reprocess_current_token = true;
3441
3523
  } else {
3442
3524
  ignore_token(parser);
3443
3525
  }
3444
3526
  return false;
3445
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3527
+ }
3528
+ if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3446
3529
  if (!close_table(parser)) {
3447
3530
  parser_add_parse_error(parser, token);
3448
3531
  return false;
3449
3532
  }
3450
3533
  return true;
3451
- } else if (
3534
+ }
3535
+ if (
3452
3536
  tag_in(token, kEndTag, &(const TagSet) {
3453
3537
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3454
3538
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3457,20 +3541,24 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3457
3541
  parser_add_parse_error(parser, token);
3458
3542
  ignore_token(parser);
3459
3543
  return false;
3460
- } else if (
3544
+ }
3545
+ if (
3461
3546
  tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
3462
3547
  || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
3463
3548
  ) {
3464
3549
  return handle_in_head(parser, token);
3465
- } else if (
3550
+ }
3551
+ if (
3466
3552
  tag_is(token, kStartTag, GUMBO_TAG_INPUT)
3467
3553
  && attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
3468
3554
  ) {
3469
3555
  parser_add_parse_error(parser, token);
3470
3556
  insert_element_from_token(parser, token);
3471
3557
  pop_current_node(parser);
3558
+ acknowledge_self_closing_tag(parser);
3472
3559
  return false;
3473
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3560
+ }
3561
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3474
3562
  parser_add_parse_error(parser, token);
3475
3563
  if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3476
3564
  ignore_token(parser);
@@ -3479,15 +3567,16 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3479
3567
  state->_form_element = insert_element_from_token(parser, token);
3480
3568
  pop_current_node(parser);
3481
3569
  return false;
3482
- } else if (token->type == GUMBO_TOKEN_EOF) {
3570
+ }
3571
+ if (token->type == GUMBO_TOKEN_EOF) {
3483
3572
  return handle_in_body(parser, token);
3484
- } else {
3485
- parser_add_parse_error(parser, token);
3486
- state->_foster_parent_insertions = true;
3487
- bool result = handle_in_body(parser, token);
3488
- state->_foster_parent_insertions = false;
3489
- return result;
3490
3573
  }
3574
+ // foster-parenting-start-tag or foster-parenting-end-tag error
3575
+ parser_add_parse_error(parser, token);
3576
+ state->_foster_parent_insertions = true;
3577
+ bool result = handle_in_body(parser, token);
3578
+ state->_foster_parent_insertions = false;
3579
+ return result;
3491
3580
  }
3492
3581
 
3493
3582
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
@@ -3496,40 +3585,38 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3496
3585
  parser_add_parse_error(parser, token);
3497
3586
  ignore_token(parser);
3498
3587
  return false;
3499
- } else if (
3500
- token->type == GUMBO_TOKEN_CHARACTER
3501
- || token->type == GUMBO_TOKEN_WHITESPACE
3502
- ) {
3588
+ }
3589
+ GumboParserState* state = parser->_parser_state;
3590
+ // Non-whitespace tokens will cause parse errors later.
3591
+ // It's not entirely clear from the spec how this is supposed to work.
3592
+ // https://github.com/whatwg/html/issues/4046
3593
+ if (token->type == GUMBO_TOKEN_WHITESPACE
3594
+ || token->type == GUMBO_TOKEN_CHARACTER) {
3503
3595
  insert_text_token(parser, token);
3596
+ gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
3504
3597
  return true;
3505
- } else {
3506
- GumboParserState* state = parser->_parser_state;
3507
- GumboStringBuffer* buffer = &state->_text_node._buffer;
3508
- const char* data = buffer->data;
3509
- // Note that TextNodeBuffer may contain UTF-8 characters, but the
3510
- // presence of any one byte that is not whitespace means we flip
3511
- // the flag, so this loop is still valid.
3598
+ }
3599
+
3600
+ GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
3601
+ if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
3602
+ // Each character in buffer is an error. Unfortunately, that means we need
3603
+ // to emit a bunch of errors at the appropriate locations.
3512
3604
  for (size_t i = 0, n = buffer->length; i < n; ++i) {
3513
- switch (data[i]) {
3514
- case '\t':
3515
- case '\n':
3516
- case '\f':
3517
- case '\r':
3518
- case ' ':
3519
- continue;
3520
- default:
3521
- state->_foster_parent_insertions = true;
3522
- reconstruct_active_formatting_elements(parser);
3523
- goto loopbreak;
3524
- }
3605
+ GumboToken tok;
3606
+ gumbo_character_token_buffer_get(buffer, i, &tok);
3607
+ // foster-parenting-character error
3608
+ parser_add_parse_error(parser, &tok);
3525
3609
  }
3526
- loopbreak:
3527
- maybe_flush_text_node_buffer(parser);
3528
- state->_foster_parent_insertions = false;
3529
- state->_reprocess_current_token = true;
3530
- state->_insertion_mode = state->_original_insertion_mode;
3531
- return true;
3610
+ state->_foster_parent_insertions = true;
3611
+ set_frameset_not_ok(parser);
3612
+ reconstruct_active_formatting_elements(parser);
3532
3613
  }
3614
+ maybe_flush_text_node_buffer(parser);
3615
+ gumbo_character_token_buffer_clear(buffer);
3616
+ state->_foster_parent_insertions = false;
3617
+ state->_reprocess_current_token = true;
3618
+ state->_insertion_mode = state->_original_insertion_mode;
3619
+ return true;
3533
3620
  }
3534
3621
 
3535
3622
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
@@ -3539,19 +3626,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3539
3626
  parser_add_parse_error(parser, token);
3540
3627
  ignore_token(parser);
3541
3628
  return false;
3542
- } else {
3543
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3544
- bool result = true;
3545
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3546
- parser_add_parse_error(parser, token);
3547
- }
3548
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3549
- ;
3550
- clear_active_formatting_elements(parser);
3551
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3552
- return result;
3553
3629
  }
3554
- } else if (
3630
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3631
+ bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
3632
+ if (!result)
3633
+ parser_add_parse_error(parser, token);
3634
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3635
+ ;
3636
+ clear_active_formatting_elements(parser);
3637
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3638
+ return result;
3639
+ }
3640
+ if (
3555
3641
  tag_in(token, kStartTag, &(const TagSet) {
3556
3642
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3557
3643
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3563,13 +3649,18 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3563
3649
  ignore_token(parser);
3564
3650
  return false;
3565
3651
  }
3652
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3653
+ bool result = node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION);
3654
+ if (!result)
3655
+ parser_add_parse_error(parser, token);
3566
3656
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3567
3657
  ;
3568
3658
  clear_active_formatting_elements(parser);
3569
3659
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3570
3660
  parser->_parser_state->_reprocess_current_token = true;
3571
- return true;
3572
- } else if (
3661
+ return result;
3662
+ }
3663
+ if (
3573
3664
  tag_in(token, kEndTag, &(const TagSet) {
3574
3665
  TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
3575
3666
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3578,9 +3669,8 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3578
3669
  parser_add_parse_error(parser, token);
3579
3670
  ignore_token(parser);
3580
3671
  return false;
3581
- } else {
3582
- return handle_in_body(parser, token);
3583
3672
  }
3673
+ return handle_in_body(parser, token);
3584
3674
  }
3585
3675
 
3586
3676
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
@@ -3588,21 +3678,26 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3588
3678
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
3589
3679
  insert_text_token(parser, token);
3590
3680
  return true;
3591
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3681
+ }
3682
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3683
+ append_comment_node(parser, get_current_node(parser), token);
3684
+ return true;
3685
+ }
3686
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3592
3687
  parser_add_parse_error(parser, token);
3593
3688
  ignore_token(parser);
3594
3689
  return false;
3595
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3596
- append_comment_node(parser, get_current_node(parser), token);
3597
- return true;
3598
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3690
+ }
3691
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3599
3692
  return handle_in_body(parser, token);
3600
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3693
+ }
3694
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3601
3695
  insert_element_from_token(parser, token);
3602
3696
  pop_current_node(parser);
3603
3697
  acknowledge_self_closing_tag(parser);
3604
3698
  return true;
3605
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3699
+ }
3700
+ if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3606
3701
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3607
3702
  parser_add_parse_error(parser, token);
3608
3703
  ignore_token(parser);
@@ -3611,28 +3706,30 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3611
3706
  pop_current_node(parser);
3612
3707
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3613
3708
  return false;
3614
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3709
+ }
3710
+ if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3615
3711
  parser_add_parse_error(parser, token);
3616
3712
  ignore_token(parser);
3617
3713
  return false;
3618
- } else if (
3714
+ }
3715
+ if (
3619
3716
  tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
3620
3717
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3621
3718
  ) {
3622
3719
  return handle_in_head(parser, token);
3623
- } else if (token->type == GUMBO_TOKEN_EOF) {
3720
+ }
3721
+ if (token->type == GUMBO_TOKEN_EOF) {
3624
3722
  return handle_in_body(parser, token);
3625
- } else {
3626
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3627
- parser_add_parse_error(parser, token);
3628
- ignore_token(parser);
3629
- return false;
3630
- }
3631
- pop_current_node(parser);
3632
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3633
- parser->_parser_state->_reprocess_current_token = true;
3634
- return true;
3635
3723
  }
3724
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3725
+ parser_add_parse_error(parser, token);
3726
+ ignore_token(parser);
3727
+ return false;
3728
+ }
3729
+ pop_current_node(parser);
3730
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3731
+ parser->_parser_state->_reprocess_current_token = true;
3732
+ return true;
3636
3733
  }
3637
3734
 
3638
3735
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
@@ -3642,14 +3739,16 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3642
3739
  insert_element_from_token(parser, token);
3643
3740
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3644
3741
  return true;
3645
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
3742
+ }
3743
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3646
3744
  parser_add_parse_error(parser, token);
3647
3745
  clear_stack_to_table_body_context(parser);
3648
3746
  insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3649
- parser->_parser_state->_reprocess_current_token = true;
3650
3747
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3748
+ parser->_parser_state->_reprocess_current_token = true;
3651
3749
  return false;
3652
- } else if (
3750
+ }
3751
+ if (
3653
3752
  tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3654
3753
  ) {
3655
3754
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
@@ -3661,7 +3760,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3661
3760
  pop_current_node(parser);
3662
3761
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3663
3762
  return true;
3664
- } else if (
3763
+ }
3764
+ if (
3665
3765
  tag_in(token, kStartTag, &(const TagSet) {
3666
3766
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3667
3767
  TAG(THEAD)
@@ -3684,18 +3784,18 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3684
3784
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3685
3785
  parser->_parser_state->_reprocess_current_token = true;
3686
3786
  return true;
3687
- } else if (
3787
+ }
3788
+ if (
3688
3789
  tag_in(token, kEndTag, &(const TagSet) {
3689
- TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), TAG(COLGROUP),
3690
- TAG(HTML), TAG(TD), TAG(TH)
3790
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
3791
+ TAG(TH), TAG(TR)
3691
3792
  })
3692
3793
  ) {
3693
3794
  parser_add_parse_error(parser, token);
3694
3795
  ignore_token(parser);
3695
3796
  return false;
3696
- } else {
3697
- return handle_in_table(parser, token);
3698
3797
  }
3798
+ return handle_in_table(parser, token);
3699
3799
  }
3700
3800
 
3701
3801
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
@@ -3706,18 +3806,19 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3706
3806
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3707
3807
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3708
3808
  return true;
3709
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3809
+ }
3810
+ if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3710
3811
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3711
3812
  parser_add_parse_error(parser, token);
3712
3813
  ignore_token(parser);
3713
3814
  return false;
3714
- } else {
3715
- clear_stack_to_table_row_context(parser);
3716
- pop_current_node(parser);
3717
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3718
- return true;
3719
3815
  }
3720
- } else if (
3816
+ clear_stack_to_table_row_context(parser);
3817
+ pop_current_node(parser);
3818
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3819
+ return true;
3820
+ }
3821
+ if (
3721
3822
  tag_in(token, kStartTag, &(const TagSet) {
3722
3823
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3723
3824
  TAG(THEAD), TAG(TR)
@@ -3728,31 +3829,32 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3728
3829
  parser_add_parse_error(parser, token);
3729
3830
  ignore_token(parser);
3730
3831
  return false;
3731
- } else {
3732
- clear_stack_to_table_row_context(parser);
3733
- pop_current_node(parser);
3734
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3735
- parser->_parser_state->_reprocess_current_token = true;
3736
- return true;
3737
3832
  }
3738
- } else if (
3833
+ clear_stack_to_table_row_context(parser);
3834
+ pop_current_node(parser);
3835
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3836
+ parser->_parser_state->_reprocess_current_token = true;
3837
+ return true;
3838
+ }
3839
+ if (
3739
3840
  tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3740
3841
  ) {
3741
- if (
3742
- !has_an_element_in_table_scope(parser, token->v.end_tag.tag)
3743
- || !has_an_element_in_table_scope(parser, GUMBO_TAG_TR)
3744
- ) {
3842
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3745
3843
  parser_add_parse_error(parser, token);
3746
3844
  ignore_token(parser);
3747
3845
  return false;
3748
- } else {
3749
- clear_stack_to_table_row_context(parser);
3750
- pop_current_node(parser);
3751
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3752
- parser->_parser_state->_reprocess_current_token = true;
3846
+ }
3847
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3848
+ ignore_token(parser);
3753
3849
  return true;
3754
3850
  }
3755
- } else if (
3851
+ clear_stack_to_table_row_context(parser);
3852
+ pop_current_node(parser);
3853
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3854
+ parser->_parser_state->_reprocess_current_token = true;
3855
+ return true;
3856
+ }
3857
+ if (
3756
3858
  tag_in(token, kEndTag, &(const TagSet) {
3757
3859
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3758
3860
  TAG(TD), TAG(TH)
@@ -3761,9 +3863,8 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3761
3863
  parser_add_parse_error(parser, token);
3762
3864
  ignore_token(parser);
3763
3865
  return false;
3764
- } else {
3765
- return handle_in_table(parser, token);
3766
3866
  }
3867
+ return handle_in_table(parser, token);
3767
3868
  }
3768
3869
 
3769
3870
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
@@ -3776,7 +3877,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3776
3877
  return false;
3777
3878
  }
3778
3879
  return close_table_cell(parser, token, token_tag);
3779
- } else if (
3880
+ }
3881
+ if (
3780
3882
  tag_in(token, kStartTag, &(const TagSet) {
3781
3883
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3782
3884
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3794,7 +3896,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3794
3896
  }
3795
3897
  parser->_parser_state->_reprocess_current_token = true;
3796
3898
  return close_current_cell(parser, token);
3797
- } else if (
3899
+ }
3900
+ if (
3798
3901
  tag_in(token, kEndTag, &(const TagSet) {
3799
3902
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
3800
3903
  })
@@ -3802,7 +3905,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3802
3905
  parser_add_parse_error(parser, token);
3803
3906
  ignore_token(parser);
3804
3907
  return false;
3805
- } else if (
3908
+ }
3909
+ if (
3806
3910
  tag_in(token, kEndTag, &(const TagSet) {
3807
3911
  TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3808
3912
  })
@@ -3814,9 +3918,8 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3814
3918
  }
3815
3919
  parser->_parser_state->_reprocess_current_token = true;
3816
3920
  return close_current_cell(parser, token);
3817
- } else {
3818
- return handle_in_body(parser, token);
3819
3921
  }
3922
+ return handle_in_body(parser, token);
3820
3923
  }
3821
3924
 
3822
3925
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
@@ -3825,28 +3928,34 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3825
3928
  parser_add_parse_error(parser, token);
3826
3929
  ignore_token(parser);
3827
3930
  return false;
3828
- } else if (
3931
+ }
3932
+ if (
3829
3933
  token->type == GUMBO_TOKEN_CHARACTER
3830
3934
  || token->type == GUMBO_TOKEN_WHITESPACE
3831
3935
  ) {
3832
3936
  insert_text_token(parser, token);
3833
3937
  return true;
3834
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3938
+ }
3939
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3940
+ append_comment_node(parser, get_current_node(parser), token);
3941
+ return true;
3942
+ }
3943
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3835
3944
  parser_add_parse_error(parser, token);
3836
3945
  ignore_token(parser);
3837
3946
  return false;
3838
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3839
- append_comment_node(parser, get_current_node(parser), token);
3840
- return true;
3841
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3947
+ }
3948
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3842
3949
  return handle_in_body(parser, token);
3843
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3950
+ }
3951
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3844
3952
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3845
3953
  pop_current_node(parser);
3846
3954
  }
3847
3955
  insert_element_from_token(parser, token);
3848
3956
  return true;
3849
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3957
+ }
3958
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3850
3959
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3851
3960
  pop_current_node(parser);
3852
3961
  }
@@ -3855,7 +3964,8 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3855
3964
  }
3856
3965
  insert_element_from_token(parser, token);
3857
3966
  return true;
3858
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3967
+ }
3968
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3859
3969
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
3860
3970
  if (
3861
3971
  node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
@@ -3869,21 +3979,21 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3869
3979
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3870
3980
  pop_current_node(parser);
3871
3981
  return true;
3872
- } else {
3873
- parser_add_parse_error(parser, token);
3874
- ignore_token(parser);
3875
- return false;
3876
3982
  }
3877
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3983
+ parser_add_parse_error(parser, token);
3984
+ ignore_token(parser);
3985
+ return false;
3986
+ }
3987
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3878
3988
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3879
3989
  pop_current_node(parser);
3880
3990
  return true;
3881
- } else {
3882
- parser_add_parse_error(parser, token);
3883
- ignore_token(parser);
3884
- return false;
3885
3991
  }
3886
- } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3992
+ parser_add_parse_error(parser, token);
3993
+ ignore_token(parser);
3994
+ return false;
3995
+ }
3996
+ if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3887
3997
  if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3888
3998
  parser_add_parse_error(parser, token);
3889
3999
  ignore_token(parser);
@@ -3891,14 +4001,16 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3891
4001
  }
3892
4002
  close_current_select(parser);
3893
4003
  return true;
3894
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
4004
+ }
4005
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3895
4006
  parser_add_parse_error(parser, token);
3896
4007
  ignore_token(parser);
3897
4008
  if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3898
4009
  close_current_select(parser);
3899
4010
  }
3900
4011
  return false;
3901
- } else if (
4012
+ }
4013
+ if (
3902
4014
  tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
3903
4015
  ) {
3904
4016
  parser_add_parse_error(parser, token);
@@ -3909,18 +4021,18 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3909
4021
  parser->_parser_state->_reprocess_current_token = true;
3910
4022
  }
3911
4023
  return false;
3912
- } else if (
4024
+ }
4025
+ if (
3913
4026
  tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
3914
4027
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3915
4028
  ) {
3916
4029
  return handle_in_head(parser, token);
3917
- } else if (token->type == GUMBO_TOKEN_EOF) {
3918
- return handle_in_body(parser, token);
3919
- } else {
3920
- parser_add_parse_error(parser, token);
3921
- ignore_token(parser);
3922
- return false;
3923
4030
  }
4031
+ if (token->type == GUMBO_TOKEN_EOF)
4032
+ return handle_in_body(parser, token);
4033
+ parser_add_parse_error(parser, token);
4034
+ ignore_token(parser);
4035
+ return false;
3924
4036
  }
3925
4037
 
3926
4038
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
@@ -3934,22 +4046,18 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3934
4046
  close_current_select(parser);
3935
4047
  parser->_parser_state->_reprocess_current_token = true;
3936
4048
  return false;
3937
- } else if (tag_in(token, kEndTag, &tags)) {
4049
+ }
4050
+ if (tag_in(token, kEndTag, &tags)) {
3938
4051
  parser_add_parse_error(parser, token);
3939
4052
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3940
4053
  ignore_token(parser);
3941
4054
  return false;
3942
- } else {
3943
- close_current_select(parser);
3944
- // close_current_select already does the
3945
- // reset_insertion_mode_appropriately
3946
- // reset_insertion_mode_appropriately(parser);
3947
- parser->_parser_state->_reprocess_current_token = true;
3948
- return false;
3949
4055
  }
3950
- } else {
3951
- return handle_in_select(parser, token);
4056
+ close_current_select(parser);
4057
+ parser->_parser_state->_reprocess_current_token = true;
4058
+ return false;
3952
4059
  }
4060
+ return handle_in_select(parser, token);
3953
4061
  }
3954
4062
 
3955
4063
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
@@ -3973,7 +4081,8 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3973
4081
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3974
4082
  ) {
3975
4083
  return handle_in_head(parser, token);
3976
- } else if (
4084
+ }
4085
+ if (
3977
4086
  tag_in(token, kStartTag, &(const TagSet) {
3978
4087
  TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3979
4088
  })
@@ -3983,35 +4092,41 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3983
4092
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3984
4093
  state->_reprocess_current_token = true;
3985
4094
  return true;
3986
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
4095
+ }
4096
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3987
4097
  pop_template_insertion_mode(parser);
3988
4098
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3989
4099
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3990
4100
  state->_reprocess_current_token = true;
3991
4101
  return true;
3992
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
4102
+ }
4103
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3993
4104
  pop_template_insertion_mode(parser);
3994
4105
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3995
4106
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3996
4107
  state->_reprocess_current_token = true;
3997
4108
  return true;
3998
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
4109
+ }
4110
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3999
4111
  pop_template_insertion_mode(parser);
4000
4112
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4001
4113
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4002
4114
  state->_reprocess_current_token = true;
4003
4115
  return true;
4004
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
4116
+ }
4117
+ if (token->type == GUMBO_TOKEN_START_TAG) {
4005
4118
  pop_template_insertion_mode(parser);
4006
4119
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4007
4120
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4008
4121
  state->_reprocess_current_token = true;
4009
4122
  return true;
4010
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
4123
+ }
4124
+ if (token->type == GUMBO_TOKEN_END_TAG) {
4011
4125
  parser_add_parse_error(parser, token);
4012
4126
  ignore_token(parser);
4013
4127
  return false;
4014
- } else if (token->type == GUMBO_TOKEN_EOF) {
4128
+ }
4129
+ if (token->type == GUMBO_TOKEN_EOF) {
4015
4130
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
4016
4131
  // Stop parsing.
4017
4132
  return true;
@@ -4024,10 +4139,9 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
4024
4139
  reset_insertion_mode_appropriately(parser);
4025
4140
  state->_reprocess_current_token = true;
4026
4141
  return false;
4027
- } else {
4028
- assert(0);
4029
- return false;
4030
4142
  }
4143
+ assert(0 && "unreachable");
4144
+ return false;
4031
4145
  }
4032
4146
 
4033
4147
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
@@ -4037,16 +4151,22 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4037
4151
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4038
4152
  ) {
4039
4153
  return handle_in_body(parser, token);
4040
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4154
+ }
4155
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4041
4156
  GumboNode* html_node = parser->_output->root;
4042
4157
  assert(html_node != NULL);
4043
4158
  append_comment_node(parser, html_node, token);
4044
4159
  return true;
4045
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4160
+ }
4161
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4046
4162
  parser_add_parse_error(parser, token);
4047
4163
  ignore_token(parser);
4048
4164
  return false;
4049
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4165
+ }
4166
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4167
+ return handle_in_body(parser, token);
4168
+ }
4169
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4050
4170
  /* fragment case: ignore the closing HTML token */
4051
4171
  if (is_fragment_parser(parser)) {
4052
4172
  parser_add_parse_error(parser, token);
@@ -4061,14 +4181,14 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4061
4181
  &html->v.element
4062
4182
  );
4063
4183
  return true;
4064
- } else if (token->type == GUMBO_TOKEN_EOF) {
4184
+ }
4185
+ if (token->type == GUMBO_TOKEN_EOF) {
4065
4186
  return true;
4066
- } else {
4067
- parser_add_parse_error(parser, token);
4068
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4069
- parser->_parser_state->_reprocess_current_token = true;
4070
- return false;
4071
4187
  }
4188
+ parser_add_parse_error(parser, token);
4189
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4190
+ parser->_parser_state->_reprocess_current_token = true;
4191
+ return false;
4072
4192
  }
4073
4193
 
4074
4194
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
@@ -4076,19 +4196,24 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4076
4196
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4077
4197
  insert_text_token(parser, token);
4078
4198
  return true;
4079
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4199
+ }
4200
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4080
4201
  append_comment_node(parser, get_current_node(parser), token);
4081
4202
  return true;
4082
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4203
+ }
4204
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4083
4205
  parser_add_parse_error(parser, token);
4084
4206
  ignore_token(parser);
4085
4207
  return false;
4086
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4208
+ }
4209
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4087
4210
  return handle_in_body(parser, token);
4088
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4211
+ }
4212
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4089
4213
  insert_element_from_token(parser, token);
4090
4214
  return true;
4091
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4215
+ }
4216
+ if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4092
4217
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4093
4218
  parser_add_parse_error(parser, token);
4094
4219
  ignore_token(parser);
@@ -4102,24 +4227,26 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4102
4227
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
4103
4228
  }
4104
4229
  return true;
4105
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4230
+ }
4231
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4106
4232
  insert_element_from_token(parser, token);
4107
4233
  pop_current_node(parser);
4108
4234
  acknowledge_self_closing_tag(parser);
4109
4235
  return true;
4110
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4236
+ }
4237
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4111
4238
  return handle_in_head(parser, token);
4112
- } else if (token->type == GUMBO_TOKEN_EOF) {
4239
+ }
4240
+ if (token->type == GUMBO_TOKEN_EOF) {
4113
4241
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4114
4242
  parser_add_parse_error(parser, token);
4115
4243
  return false;
4116
4244
  }
4117
4245
  return true;
4118
- } else {
4119
- parser_add_parse_error(parser, token);
4120
- ignore_token(parser);
4121
- return false;
4122
4246
  }
4247
+ parser_add_parse_error(parser, token);
4248
+ ignore_token(parser);
4249
+ return false;
4123
4250
  }
4124
4251
 
4125
4252
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
@@ -4127,16 +4254,20 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4127
4254
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4128
4255
  insert_text_token(parser, token);
4129
4256
  return true;
4130
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4257
+ }
4258
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4131
4259
  append_comment_node(parser, get_current_node(parser), token);
4132
4260
  return true;
4133
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4261
+ }
4262
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4134
4263
  parser_add_parse_error(parser, token);
4135
4264
  ignore_token(parser);
4136
4265
  return false;
4137
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4266
+ }
4267
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4138
4268
  return handle_in_body(parser, token);
4139
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4269
+ }
4270
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4140
4271
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
4141
4272
  assert(node_html_tag_is(html, GUMBO_TAG_HTML));
4142
4273
  record_end_of_element (
@@ -4145,15 +4276,16 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4145
4276
  );
4146
4277
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
4147
4278
  return true;
4148
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4279
+ }
4280
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4149
4281
  return handle_in_head(parser, token);
4150
- } else if (token->type == GUMBO_TOKEN_EOF) {
4282
+ }
4283
+ if (token->type == GUMBO_TOKEN_EOF) {
4151
4284
  return true;
4152
- } else {
4153
- parser_add_parse_error(parser, token);
4154
- ignore_token(parser);
4155
- return false;
4156
4285
  }
4286
+ parser_add_parse_error(parser, token);
4287
+ ignore_token(parser);
4288
+ return false;
4157
4289
  }
4158
4290
 
4159
4291
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
@@ -4161,20 +4293,21 @@ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
4161
4293
  if (token->type == GUMBO_TOKEN_COMMENT) {
4162
4294
  append_comment_node(parser, get_document_node(parser), token);
4163
4295
  return true;
4164
- } else if (
4296
+ }
4297
+ if (
4165
4298
  token->type == GUMBO_TOKEN_DOCTYPE
4166
4299
  || token->type == GUMBO_TOKEN_WHITESPACE
4167
4300
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4168
4301
  ) {
4169
4302
  return handle_in_body(parser, token);
4170
- } else if (token->type == GUMBO_TOKEN_EOF) {
4303
+ }
4304
+ if (token->type == GUMBO_TOKEN_EOF) {
4171
4305
  return true;
4172
- } else {
4173
- parser_add_parse_error(parser, token);
4174
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4175
- parser->_parser_state->_reprocess_current_token = true;
4176
- return false;
4177
4306
  }
4307
+ parser_add_parse_error(parser, token);
4308
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4309
+ parser->_parser_state->_reprocess_current_token = true;
4310
+ return false;
4178
4311
  }
4179
4312
 
4180
4313
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
@@ -4185,21 +4318,23 @@ static bool handle_after_after_frameset (
4185
4318
  if (token->type == GUMBO_TOKEN_COMMENT) {
4186
4319
  append_comment_node(parser, get_document_node(parser), token);
4187
4320
  return true;
4188
- } else if (
4321
+ }
4322
+ if (
4189
4323
  token->type == GUMBO_TOKEN_DOCTYPE
4190
4324
  || token->type == GUMBO_TOKEN_WHITESPACE
4191
4325
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4192
4326
  ) {
4193
4327
  return handle_in_body(parser, token);
4194
- } else if (token->type == GUMBO_TOKEN_EOF) {
4328
+ }
4329
+ if (token->type == GUMBO_TOKEN_EOF) {
4195
4330
  return true;
4196
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4331
+ }
4332
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4197
4333
  return handle_in_head(parser, token);
4198
- } else {
4199
- parser_add_parse_error(parser, token);
4200
- ignore_token(parser);
4201
- return false;
4202
4334
  }
4335
+ parser_add_parse_error(parser, token);
4336
+ ignore_token(parser);
4337
+ return false;
4203
4338
  }
4204
4339
 
4205
4340
  // Function pointers for each insertion mode.
@@ -4306,8 +4441,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4306
4441
  parser->_parser_state->_reprocess_current_token = true;
4307
4442
  return false;
4308
4443
  }
4309
-
4310
- assert(token->type == GUMBO_TOKEN_START_TAG);
4444
+ // This is a start tag so the next if's then branch will be taken.
4311
4445
  }
4312
4446
 
4313
4447
  if (token->type == GUMBO_TOKEN_START_TAG) {
@@ -4329,49 +4463,48 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4329
4463
  return true;
4330
4464
  // </script> tags are handled like any other end tag, putting the script's
4331
4465
  // text into a text node child and closing the current node.
4332
- } else {
4333
- assert(token->type == GUMBO_TOKEN_END_TAG);
4334
- GumboNode* node = get_current_node(parser);
4335
- GumboTag tag = token->v.end_tag.tag;
4336
- const char* name = token->v.end_tag.name;
4337
- assert(node != NULL);
4466
+ }
4467
+ assert(token->type == GUMBO_TOKEN_END_TAG);
4468
+ GumboNode* node = get_current_node(parser);
4469
+ GumboTag tag = token->v.end_tag.tag;
4470
+ const char* name = token->v.end_tag.name;
4471
+ assert(node != NULL);
4338
4472
 
4339
- bool is_success = true;
4340
- if (!node_tagname_is(node, tag, name)) {
4341
- parser_add_parse_error(parser, token);
4342
- is_success = false;
4343
- }
4344
- int i = parser->_parser_state->_open_elements.length;
4345
- for (--i; i > 0;) {
4346
- // Here we move up the stack until we find an HTML element (in which
4347
- // case we do nothing) or we find the element that we're about to
4348
- // close (in which case we pop everything we've seen until that
4349
- // point.)
4350
- gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4351
- if (node_tagname_is(node, tag, name)) {
4352
- gumbo_debug("Matches.\n");
4353
- while (node != pop_current_node(parser)) {
4354
- // Pop all the nodes below the current one. Node is guaranteed to
4355
- // be an element on the stack of open elements (set below), so
4356
- // this loop is guaranteed to terminate.
4357
- }
4358
- return is_success;
4359
- }
4360
- --i;
4361
- node = parser->_parser_state->_open_elements.data[i];
4362
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4363
- // The loop continues only in foreign namespaces.
4364
- break;
4473
+ bool is_success = true;
4474
+ if (!node_tagname_is(node, tag, name)) {
4475
+ parser_add_parse_error(parser, token);
4476
+ is_success = false;
4477
+ }
4478
+ int i = parser->_parser_state->_open_elements.length;
4479
+ for (--i; i > 0;) {
4480
+ // Here we move up the stack until we find an HTML element (in which
4481
+ // case we do nothing) or we find the element that we're about to
4482
+ // close (in which case we pop everything we've seen until that
4483
+ // point.)
4484
+ gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4485
+ if (node_tagname_is(node, tag, name)) {
4486
+ gumbo_debug("Matches.\n");
4487
+ while (node != pop_current_node(parser)) {
4488
+ // Pop all the nodes below the current one. Node is guaranteed to
4489
+ // be an element on the stack of open elements (set below), so
4490
+ // this loop is guaranteed to terminate.
4365
4491
  }
4366
- }
4367
- assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4368
- if (i == 0)
4369
4492
  return is_success;
4370
- // We can't call handle_token directly because the current node is still in
4371
- // a foriegn namespace, so it would re-enter this and result in infinite
4372
- // recursion.
4373
- return handle_html_content(parser, token) && is_success;
4493
+ }
4494
+ --i;
4495
+ node = parser->_parser_state->_open_elements.data[i];
4496
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4497
+ // The loop continues only in foreign namespaces.
4498
+ break;
4499
+ }
4374
4500
  }
4501
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4502
+ if (i == 0)
4503
+ return is_success;
4504
+ // We can't call handle_token directly because the current node is still in
4505
+ // a foreign namespace, so it would re-enter this and result in infinite
4506
+ // recursion.
4507
+ return handle_html_content(parser, token) && is_success;
4375
4508
  }
4376
4509
 
4377
4510
  // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
@@ -4517,7 +4650,7 @@ static void fragment_parser_init (
4517
4650
  break;
4518
4651
 
4519
4652
  case GUMBO_TAG_SCRIPT:
4520
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4653
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
4521
4654
  break;
4522
4655
 
4523
4656
  case GUMBO_TAG_NOSCRIPT:
@@ -4554,7 +4687,7 @@ static void fragment_parser_init (
4554
4687
  // 11.
4555
4688
  if (ctx_has_form_ancestor
4556
4689
  || (ctx_tag == GUMBO_TAG_FORM
4557
- && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4690
+ && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4558
4691
  static const GumboNode form_ancestor = {
4559
4692
  .type = GUMBO_NODE_ELEMENT,
4560
4693
  .parent = NULL,
@@ -4619,11 +4752,11 @@ GumboOutput* gumbo_parse_with_options (
4619
4752
  if (state->_reprocess_current_token) {
4620
4753
  state->_reprocess_current_token = false;
4621
4754
  } else {
4622
- GumboNode* current_node = get_current_node(&parser);
4623
- gumbo_tokenizer_set_is_current_node_foreign (
4755
+ GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
4756
+ gumbo_tokenizer_set_is_adjusted_current_node_foreign (
4624
4757
  &parser,
4625
- current_node &&
4626
- current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4758
+ adjusted_current_node &&
4759
+ adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4627
4760
  );
4628
4761
  has_error = !gumbo_lex(&parser, &token) || has_error;
4629
4762
  }
@@ -4649,10 +4782,10 @@ GumboOutput* gumbo_parse_with_options (
4649
4782
  break;
4650
4783
  }
4651
4784
  gumbo_debug (
4652
- "Handling %s token @%zu:%zu in state %u.\n",
4785
+ "Handling %s token @%lu:%lu in state %u.\n",
4653
4786
  (char*) token_type,
4654
- token.position.line,
4655
- token.position.column,
4787
+ (unsigned long)token.position.line,
4788
+ (unsigned long)token.position.column,
4656
4789
  state->_insertion_mode
4657
4790
  );
4658
4791
 
@@ -4671,19 +4804,26 @@ GumboOutput* gumbo_parse_with_options (
4671
4804
  );
4672
4805
 
4673
4806
  if (!state->_reprocess_current_token) {
4807
+ // If we're done with the token, check for unacknowledged self-closing
4808
+ // flags on start tags.
4674
4809
  if (token.type == GUMBO_TOKEN_START_TAG &&
4675
4810
  token.v.start_tag.is_self_closing &&
4676
4811
  !state->_self_closing_flag_acknowledged) {
4677
- GumboError* error = parser_add_parse_error(&parser, &token);
4678
- if (error)
4679
- error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4812
+ has_error = true;
4813
+ GumboError* error = gumbo_add_error(&parser);
4814
+ if (error) {
4815
+ // This is essentially a tokenizer error that's only caught during
4816
+ // tree construction.
4817
+ error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
4818
+ error->original_text = token.original_text;
4819
+ error->position = token.position;
4820
+ }
4680
4821
  }
4822
+ // Make sure we free the end tag's name since it doesn't get transferred
4823
+ // to a token.
4681
4824
  if (token.type == GUMBO_TOKEN_END_TAG &&
4682
- token.v.end_tag.is_self_closing) {
4683
- GumboError* error = parser_add_parse_error(&parser, &token);
4684
- if (error)
4685
- error->type = GUMBO_ERR_SELF_CLOSING_END_TAG;
4686
- }
4825
+ token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4826
+ gumbo_free(token.v.end_tag.name);
4687
4827
  }
4688
4828
 
4689
4829
  if (unlikely(state->_open_elements.length > max_tree_depth)) {