nokogumbo 2.0.0.pre.alpha → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,6 +7,7 @@
7
7
  #include "insertion_mode.h"
8
8
  #include "string_buffer.h"
9
9
  #include "token_type.h"
10
+ #include "tokenizer_states.h"
10
11
 
11
12
  #ifdef __cplusplus
12
13
  extern "C" {
@@ -15,85 +16,66 @@ extern "C" {
15
16
  struct GumboInternalParser;
16
17
 
17
18
  typedef enum {
19
+ // Defined errors.
20
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
21
+ GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
22
+ GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
23
+ GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
24
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
25
+ GUMBO_ERR_CDATA_IN_HTML_CONTENT,
26
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
27
+ GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
28
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
29
+ GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
30
+ GUMBO_ERR_DUPLICATE_ATTRIBUTE,
31
+ GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
32
+ GUMBO_ERR_EOF_BEFORE_TAG_NAME,
33
+ GUMBO_ERR_EOF_IN_CDATA,
34
+ GUMBO_ERR_EOF_IN_COMMENT,
35
+ GUMBO_ERR_EOF_IN_DOCTYPE,
36
+ GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
37
+ GUMBO_ERR_EOF_IN_TAG,
38
+ GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
39
+ GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
40
+ GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
41
+ GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
42
+ GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
43
+ GUMBO_ERR_MISSING_DOCTYPE_NAME,
44
+ GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
45
+ GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
46
+ GUMBO_ERR_MISSING_END_TAG_NAME,
47
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
48
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
49
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
50
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
51
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
52
+ GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
53
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
54
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
55
+ GUMBO_ERR_NESTED_COMMENT,
56
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
57
+ GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
58
+ GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
59
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
60
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
61
+ GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
62
+ GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
63
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
64
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
65
+ GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
66
+ GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
67
+ GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
68
+ GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
69
+ GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
70
+
71
+ // Encoding errors.
18
72
  GUMBO_ERR_UTF8_INVALID,
19
73
  GUMBO_ERR_UTF8_TRUNCATED,
20
- GUMBO_ERR_UTF8_NULL,
21
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
22
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
23
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
24
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
25
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
26
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
27
- GUMBO_ERR_TAG_EOF,
28
- GUMBO_ERR_TAG_INVALID,
29
- GUMBO_ERR_CLOSE_TAG_EMPTY,
30
- GUMBO_ERR_CLOSE_TAG_EOF,
31
- GUMBO_ERR_CLOSE_TAG_INVALID,
32
- GUMBO_ERR_SCRIPT_EOF,
33
- GUMBO_ERR_ATTR_NAME_EOF,
34
- GUMBO_ERR_ATTR_NAME_INVALID,
35
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
36
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
37
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
38
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
39
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
40
- GUMBO_ERR_ATTR_AFTER_EOF,
41
- GUMBO_ERR_ATTR_AFTER_INVALID,
42
- GUMBO_ERR_DUPLICATE_ATTR,
43
- GUMBO_ERR_SOLIDUS_EOF,
44
- GUMBO_ERR_SOLIDUS_INVALID,
45
- GUMBO_ERR_DASHES_OR_DOCTYPE,
46
- GUMBO_ERR_COMMENT_EOF,
47
- GUMBO_ERR_COMMENT_INVALID,
48
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
49
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
50
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
51
- GUMBO_ERR_COMMENT_END_BANG_EOF,
52
- GUMBO_ERR_DOCTYPE_EOF,
53
- GUMBO_ERR_DOCTYPE_INVALID,
54
- GUMBO_ERR_DOCTYPE_SPACE,
55
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
56
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
57
- GUMBO_ERR_DOCTYPE_END,
74
+
75
+ // Generic parser error.
58
76
  GUMBO_ERR_PARSER,
59
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
60
- GUMBO_ERR_SELF_CLOSING_END_TAG,
61
77
  } GumboErrorType;
62
78
 
63
- // Additional data for duplicated attributes.
64
- typedef struct GumboInternalDuplicateAttrError {
65
- // The name of the attribute. Owned by this struct.
66
- const char* name;
67
-
68
- // The (0-based) index within the attributes vector of the original
69
- // occurrence.
70
- unsigned int original_index;
71
-
72
- // The (0-based) index where the new occurrence would be.
73
- unsigned int new_index;
74
- } GumboDuplicateAttrError;
75
-
76
- // A simplified representation of the tokenizer state, designed to be more
77
- // useful to clients of this library than the internal representation. This
78
- // condenses the actual states used in the tokenizer state machine into a few
79
- // values that will be familiar to users of HTML.
80
- typedef enum {
81
- GUMBO_ERR_TOKENIZER_DATA,
82
- GUMBO_ERR_TOKENIZER_CHAR_REF,
83
- GUMBO_ERR_TOKENIZER_RCDATA,
84
- GUMBO_ERR_TOKENIZER_RAWTEXT,
85
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
86
- GUMBO_ERR_TOKENIZER_SCRIPT,
87
- GUMBO_ERR_TOKENIZER_TAG,
88
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
89
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
90
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
91
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
92
- GUMBO_ERR_TOKENIZER_COMMENT,
93
- GUMBO_ERR_TOKENIZER_DOCTYPE,
94
- GUMBO_ERR_TOKENIZER_CDATA,
95
- } GumboTokenizerErrorState;
96
-
97
79
  // Additional data for tokenizer errors.
98
80
  // This records the current state and codepoint encountered - this is usually
99
81
  // enough to reconstruct what went wrong and provide a friendly error message.
@@ -102,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
102
84
  int codepoint;
103
85
 
104
86
  // The state that the tokenizer was in at the time.
105
- GumboTokenizerErrorState state;
87
+ GumboTokenizerEnum state;
106
88
  } GumboTokenizerError;
107
89
 
108
90
  // Additional data for parse errors.
@@ -125,43 +107,25 @@ typedef struct GumboInternalParserError {
125
107
  // The overall error struct representing an error in decoding/tokenizing/parsing
126
108
  // the HTML. This contains an enumerated type flag, a source position, and then
127
109
  // a union of fields containing data specific to the error.
128
- typedef struct GumboInternalError {
110
+ struct GumboInternalError {
129
111
  // The type of error.
130
112
  GumboErrorType type;
131
113
 
132
114
  // The position within the source file where the error occurred.
133
115
  GumboSourcePosition position;
134
116
 
135
- // A pointer to the byte within the original source file text where the error
136
- // occurred (note that this is not the same as position.offset, as that gives
137
- // character-based instead of byte-based offsets).
138
- const char* original_text;
117
+ // The piece of text that caused the error.
118
+ GumboStringPiece original_text;
139
119
 
140
120
  // Type-specific error information.
141
121
  union {
142
- // The code point we encountered, for:
143
- // * GUMBO_ERR_UTF8_INVALID
144
- // * GUMBO_ERR_UTF8_TRUNCATED
145
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
146
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
147
- uint32_t codepoint;
148
-
149
122
  // Tokenizer errors.
150
123
  GumboTokenizerError tokenizer;
151
124
 
152
- // Short textual data, for:
153
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
154
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
155
- GumboStringPiece text;
156
-
157
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
158
- GumboDuplicateAttrError duplicate_attr;
159
-
160
- // Parser state, for GUMBO_ERR_PARSER and
161
- // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
162
- struct GumboInternalParserError parser;
125
+ // Parser errors.
126
+ GumboParserError parser;
163
127
  } v;
164
- } GumboError;
128
+ };
165
129
 
166
130
  // Adds a new error to the parser's error list, and returns a pointer to it so
167
131
  // that clients can fill out the rest of its fields. May return NULL if we're
@@ -177,32 +141,6 @@ void gumbo_destroy_errors(struct GumboInternalParser* errors);
177
141
  // Frees the memory used for a single GumboError.
178
142
  void gumbo_error_destroy(GumboError* error);
179
143
 
180
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
181
- // freshly-allocated buffer containing the error message text. The caller is
182
- // responsible for freeing the buffer.
183
- void gumbo_error_to_string (
184
- const GumboError* error,
185
- GumboStringBuffer* output
186
- );
187
-
188
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
189
- // with a freshly-allocated buffer containing the error message text. The
190
- // caller is responsible for freeing the buffer.
191
- void gumbo_caret_diagnostic_to_string (
192
- const GumboError* error,
193
- const char* source_text,
194
- size_t source_length,
195
- GumboStringBuffer* output
196
- );
197
-
198
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
199
- // of writing to a string.
200
- void gumbo_print_caret_diagnostic (
201
- const GumboError* error,
202
- const char* source_text,
203
- size_t source_length
204
- );
205
-
206
144
  #ifdef __cplusplus
207
145
  }
208
146
  #endif
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
706
706
  */
707
707
  bool stop_on_first_error;
708
708
 
709
+ /**
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
715
+ */
716
+ int max_attributes;
717
+
709
718
  /**
710
719
  * Maximum allowed depth for the parse tree. If this limit is exceeded,
711
720
  * the parser will return early with a partial document and the returned
@@ -796,6 +805,16 @@ typedef enum {
796
805
  */
797
806
  GUMBO_STATUS_TREE_TOO_DEEP,
798
807
 
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
799
818
  // Currently unused
800
819
  GUMBO_STATUS_OUT_OF_MEMORY,
801
820
  } GumboOutputStatus;
@@ -817,13 +836,17 @@ typedef struct GumboInternalOutput {
817
836
 
818
837
  /**
819
838
  * A list of errors that occurred during the parse.
820
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
821
- * fleshed out and may change in the future. For this reason, the GumboError
822
- * header isn't part of the public API. Contact us if you need errors
823
- * reported so we can work out something appropriate for your use-case.
824
839
  */
825
840
  GumboVector /* GumboError */ errors;
826
841
 
842
+ /**
843
+ * True if the parser encountered an error.
844
+ *
845
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
846
+ * option was set to 0.
847
+ */
848
+ bool document_error;
849
+
827
850
  /**
828
851
  * A status code indicating whether parsing finished successfully or was
829
852
  * stopped mid-document due to exceptional circumstances.
@@ -866,6 +889,53 @@ const char* gumbo_status_to_string(GumboOutputStatus status);
866
889
  /** Release the memory used for the parse tree and parse errors. */
867
890
  void gumbo_destroy_output(GumboOutput* output);
868
891
 
892
+ /** Opaque GumboError type */
893
+ typedef struct GumboInternalError GumboError;
894
+
895
+ /**
896
+ * Returns the position of the error.
897
+ */
898
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
899
+
900
+ /**
901
+ * Returns a constant string representation of the error's code. This is owned
902
+ * by the library and should not be freed by the caller.
903
+ */
904
+ const char* gumbo_error_code(const GumboError* error);
905
+
906
+ /**
907
+ * Prints an error to a string. This stores a freshly-allocated buffer
908
+ * containing the error message text in output. The caller is responsible for
909
+ * freeing the buffer. The size of the error message is returned. The error
910
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
911
+ * returned size must be used.
912
+ */
913
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
914
+
915
+ /**
916
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
917
+ * buffer containing the error message text in output. The caller is responsible for
918
+ * freeing the buffer. The size of the error message is returned. The error
919
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
920
+ * returned size must be used.
921
+ */
922
+ size_t gumbo_caret_diagnostic_to_string (
923
+ const GumboError* error,
924
+ const char* source_text,
925
+ size_t source_length,
926
+ char** output
927
+ );
928
+
929
+ /**
930
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
931
+ * instead of writing to a string.
932
+ */
933
+ void gumbo_print_caret_diagnostic (
934
+ const GumboError* error,
935
+ const char* source_text,
936
+ size_t source_length
937
+ );
938
+
869
939
  #ifdef __cplusplus
870
940
  }
871
941
  #endif
@@ -31,6 +31,7 @@
31
31
  #include "replacement.h"
32
32
  #include "tokenizer.h"
33
33
  #include "tokenizer_states.h"
34
+ #include "token_buffer.h"
34
35
  #include "utf8.h"
35
36
  #include "util.h"
36
37
  #include "vector.h"
@@ -42,11 +43,12 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
42
43
 
43
44
  #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
44
45
  #define kGumboEmptySourcePosition (const GumboSourcePosition) \
45
- GUMBO_EMPTY_SOURCE_POSITION_INIT
46
+ GUMBO_EMPTY_SOURCE_POSITION_INIT
46
47
 
47
48
  const GumboOptions kGumboDefaultOptions = {
48
49
  .tab_stop = 8,
49
50
  .stop_on_first_error = false,
51
+ .max_attributes = 400,
50
52
  .max_tree_depth = 400,
51
53
  .max_errors = -1,
52
54
  .fragment_context = NULL,
@@ -59,25 +61,6 @@ const GumboOptions kGumboDefaultOptions = {
59
61
  #define STRING(s) {.data = s, .length = sizeof(s) - 1}
60
62
  #define TERMINATOR {.data = NULL, .length = 0}
61
63
 
62
- static const GumboStringPiece kPublicIdHtml4_0 =
63
- STRING("-//W3C//DTD HTML 4.0//EN");
64
- static const GumboStringPiece kPublicIdHtml4_01 =
65
- STRING("-//W3C//DTD HTML 4.01//EN");
66
- static const GumboStringPiece kPublicIdXhtml1_0 =
67
- STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
68
- static const GumboStringPiece kPublicIdXhtml1_1 =
69
- STRING("-//W3C//DTD XHTML 1.1//EN");
70
- static const GumboStringPiece kSystemIdRecHtml4_0 =
71
- STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
72
- static const GumboStringPiece kSystemIdHtml4 =
73
- STRING("http://www.w3.org/TR/html4/strict.dtd");
74
- static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
75
- STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
76
- static const GumboStringPiece kSystemIdXhtml1_1 =
77
- STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
78
- static const GumboStringPiece kSystemIdLegacyCompat =
79
- STRING("about:legacy-compat");
80
-
81
64
  // The doctype arrays have an explicit terminator because we want to pass them
82
65
  // to a helper function, and passing them as a pointer discards sizeof
83
66
  // information. The SVG arrays are used only by one-off functions, and so loops
@@ -260,6 +243,9 @@ typedef struct GumboInternalParserState {
260
243
  // The accumulated text node buffer state.
261
244
  TextNodeBufferState _text_node;
262
245
 
246
+ // The accumulated character tokens in tables for error purposes.
247
+ GumboCharacterTokenBuffer _table_character_tokens;
248
+
263
249
  // The current token.
264
250
  GumboToken* _current_token;
265
251
 
@@ -351,6 +337,7 @@ static void output_init(GumboParser* parser) {
351
337
  GumboOutput* output = gumbo_alloc(sizeof(GumboOutput));
352
338
  output->root = NULL;
353
339
  output->document = new_document_node();
340
+ output->document_error = false;
354
341
  output->status = GUMBO_STATUS_OK;
355
342
  parser->_output = output;
356
343
  gumbo_init_errors(parser);
@@ -365,6 +352,7 @@ static void parser_state_init(GumboParser* parser) {
365
352
  parser_state->_foster_parent_insertions = false;
366
353
  parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
367
354
  gumbo_string_buffer_init(&parser_state->_text_node._buffer);
355
+ gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
368
356
  gumbo_vector_init(10, &parser_state->_open_elements);
369
357
  gumbo_vector_init(5, &parser_state->_active_formatting_elements);
370
358
  gumbo_vector_init(5, &parser_state->_template_insertion_modes);
@@ -463,6 +451,7 @@ static void parser_state_destroy(GumboParser* parser) {
463
451
  gumbo_vector_destroy(&state->_open_elements);
464
452
  gumbo_vector_destroy(&state->_template_insertion_modes);
465
453
  gumbo_string_buffer_destroy(&state->_text_node._buffer);
454
+ gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
466
455
  gumbo_free(state);
467
456
  }
468
457
 
@@ -573,11 +562,11 @@ static bool tag_in (
573
562
  static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
574
563
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
575
564
  return token->v.start_tag.tag == tag;
576
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
565
+ }
566
+ if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
577
567
  return token->v.end_tag.tag == tag;
578
- } else {
579
- return false;
580
568
  }
569
+ return false;
581
570
  }
582
571
 
583
572
  static inline bool tagset_includes (
@@ -621,6 +610,14 @@ static bool node_qualified_tagname_is (
621
610
  return !gumbo_ascii_strcasecmp(element_name, name);
622
611
  }
623
612
 
613
+ static bool node_html_tagname_is (
614
+ const GumboNode* node,
615
+ GumboTag tag,
616
+ const char *name
617
+ ) {
618
+ return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name);
619
+ }
620
+
624
621
  static bool node_tagname_is (
625
622
  const GumboNode* node,
626
623
  GumboTag tag,
@@ -646,7 +643,6 @@ static bool node_qualified_tag_is (
646
643
 
647
644
  // Like node_tag_in, but for the single-tag case in the HTML namespace
648
645
  static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
649
- assert(tag != GUMBO_TAG_UNKNOWN);
650
646
  return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
651
647
  }
652
648
 
@@ -738,18 +734,18 @@ static void reset_insertion_mode_appropriately(GumboParser* parser) {
738
734
  assert(0);
739
735
  }
740
736
 
741
- static GumboError* parser_add_parse_error (
737
+ static void parser_add_parse_error (
742
738
  GumboParser* parser,
743
739
  const GumboToken* token
744
740
  ) {
745
741
  gumbo_debug("Adding parse error.\n");
746
742
  GumboError* error = gumbo_add_error(parser);
747
743
  if (!error) {
748
- return NULL;
744
+ return;
749
745
  }
750
746
  error->type = GUMBO_ERR_PARSER;
751
747
  error->position = token->position;
752
- error->original_text = token->original_text.data;
748
+ error->original_text = token->original_text;
753
749
  GumboParserError* extra_data = &error->v.parser;
754
750
  extra_data->input_type = token->type;
755
751
  extra_data->input_tag = GUMBO_TAG_UNKNOWN;
@@ -772,7 +768,6 @@ static GumboError* parser_add_parse_error (
772
768
  &extra_data->tag_stack
773
769
  );
774
770
  }
775
- return error;
776
771
  }
777
772
 
778
773
  // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
@@ -1639,9 +1634,11 @@ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node)
1639
1634
  const GumboNodeType type = current->type;
1640
1635
  if (current == node) {
1641
1636
  return true;
1642
- } else if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1637
+ }
1638
+ if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1643
1639
  continue;
1644
- } else if (node_tag_in_set(current, &tags)) {
1640
+ }
1641
+ if (node_tag_in_set(current, &tags)) {
1645
1642
  return false;
1646
1643
  }
1647
1644
  }
@@ -1687,14 +1684,18 @@ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag t
1687
1684
  // https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags
1688
1685
  // "exception" is the "element to exclude from the process" listed in the spec.
1689
1686
  // Pass GUMBO_TAG_LAST to not exclude any of them.
1690
- static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1687
+ static void generate_implied_end_tags (
1688
+ GumboParser* parser,
1689
+ GumboTag exception,
1690
+ const char* exception_name
1691
+ ) {
1691
1692
  static const TagSet tags = {
1692
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP),
1693
- TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)
1693
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
1694
+ TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
1694
1695
  };
1695
1696
  while (
1696
1697
  node_tag_in_set(get_current_node(parser), &tags)
1697
- && !node_html_tag_is(get_current_node(parser), exception)
1698
+ && !node_html_tagname_is(get_current_node(parser), exception, exception_name)
1698
1699
  ) {
1699
1700
  pop_current_node(parser);
1700
1701
  }
@@ -1704,15 +1705,36 @@ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1704
1705
  // https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
1705
1706
  static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1706
1707
  static const TagSet tags = {
1707
- TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1708
- TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1709
- TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)
1708
+ TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
1709
+ TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1710
+ TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
1710
1711
  };
1711
1712
  while (node_tag_in_set(get_current_node(parser), &tags)) {
1712
1713
  pop_current_node(parser);
1713
1714
  }
1714
1715
  }
1715
1716
 
1717
+ // This factors out the clauses in the "in body" insertion mode checking "if
1718
+ // there is a node in the stack of open elements that is not" one of a list of
1719
+ // elements in which case it's a parse error.
1720
+ // This is used in "an end-of-file token", "an end tag whose tag name is
1721
+ // 'body'", and "an end tag whose tag name is 'html'".
1722
+ static bool stack_contains_nonclosable_element (
1723
+ GumboParser* parser
1724
+ ) {
1725
+ static const TagSet tags = {
1726
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
1727
+ TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
1728
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
1729
+ };
1730
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1731
+ for (size_t i = 0; i < open_elements->length; ++i) {
1732
+ if (!node_tag_in_set(open_elements->data[i], &tags))
1733
+ return true;
1734
+ }
1735
+ return false;
1736
+ }
1737
+
1716
1738
  // This factors out the clauses relating to "act as if an end tag token with tag
1717
1739
  // name "table" had been seen. Returns true if there's a table element in table
1718
1740
  // scope which was successfully closed, false if not and the token should be
@@ -1732,37 +1754,35 @@ static bool close_table(GumboParser* parser) {
1732
1754
 
1733
1755
  // This factors out the clauses relating to "act as if an end tag token with tag
1734
1756
  // name `cell_tag` had been seen".
1735
- static bool close_table_cell (
1757
+ static void close_table_cell (
1736
1758
  GumboParser* parser,
1737
1759
  const GumboToken* token,
1738
1760
  GumboTag cell_tag
1739
1761
  ) {
1740
- bool result = true;
1741
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1762
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
1742
1763
  const GumboNode* node = get_current_node(parser);
1743
- if (!node_html_tag_is(node, cell_tag)) {
1764
+ if (!node_html_tag_is(node, cell_tag))
1744
1765
  parser_add_parse_error(parser, token);
1745
- result = false;
1746
- }
1747
1766
  do {
1748
1767
  node = pop_current_node(parser);
1749
1768
  } while (!node_html_tag_is(node, cell_tag));
1750
1769
 
1751
1770
  clear_active_formatting_elements(parser);
1752
1771
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1753
- return result;
1754
1772
  }
1755
1773
 
1756
1774
  // https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
1757
1775
  // This holds the logic to determine whether we should close a <td> or a <th>.
1758
- static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1776
+ static void close_current_cell(GumboParser* parser, const GumboToken* token) {
1777
+ GumboTag cell_tag;
1759
1778
  if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1760
1779
  assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1761
- return close_table_cell(parser, token, GUMBO_TAG_TD);
1780
+ cell_tag = GUMBO_TAG_TD;
1762
1781
  } else {
1763
1782
  assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1764
- return close_table_cell(parser, token, GUMBO_TAG_TH);
1783
+ cell_tag = GUMBO_TAG_TH;
1765
1784
  }
1785
+ close_table_cell(parser, token, cell_tag);
1766
1786
  }
1767
1787
 
1768
1788
  // This factors out the "act as if an end tag of tag name 'select' had been
@@ -1819,14 +1839,14 @@ static bool is_special_node(const GumboNode* node) {
1819
1839
  // specified qualified name. If the elements closed are in the set handled by
1820
1840
  // generate_implied_end_tags, this is normal operation. Otherwise, a parse
1821
1841
  // error is recorded. This function does not return a value.
1822
- static bool implicitly_close_tags (
1842
+ static void implicitly_close_tags (
1823
1843
  GumboParser* parser,
1824
1844
  GumboToken* token,
1825
1845
  GumboNamespaceEnum target_ns,
1826
1846
  GumboTag target
1827
1847
  ) {
1828
- bool result = true;
1829
- generate_implied_end_tags(parser, target);
1848
+ assert(target != GUMBO_TAG_UNKNOWN);
1849
+ generate_implied_end_tags(parser, target, NULL);
1830
1850
  if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1831
1851
  parser_add_parse_error(parser, token);
1832
1852
  while (
@@ -1834,30 +1854,27 @@ static bool implicitly_close_tags (
1834
1854
  ) {
1835
1855
  pop_current_node(parser);
1836
1856
  }
1837
- result = false;
1838
1857
  }
1839
1858
  assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1840
1859
  pop_current_node(parser);
1841
- return result;
1842
1860
  }
1843
1861
 
1844
1862
  // If the stack of open elements has a <p> tag in button scope, this acts as if
1845
1863
  // a </p> tag was encountered, implicitly closing tags. Nothing is returned.
1846
1864
  // This is a convenience function because this particular
1847
1865
  // clause appears several times in the spec.
1848
- static bool maybe_implicitly_close_p_tag (
1866
+ static void maybe_implicitly_close_p_tag (
1849
1867
  GumboParser* parser,
1850
1868
  GumboToken* token
1851
1869
  ) {
1852
1870
  if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1853
- return implicitly_close_tags (
1871
+ implicitly_close_tags (
1854
1872
  parser,
1855
1873
  token,
1856
1874
  GUMBO_NAMESPACE_HTML,
1857
1875
  GUMBO_TAG_P
1858
1876
  );
1859
1877
  }
1860
- return true;
1861
1878
  }
1862
1879
 
1863
1880
  // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
@@ -1868,7 +1885,7 @@ static void maybe_implicitly_close_list_tag (
1868
1885
  bool is_li
1869
1886
  ) {
1870
1887
  GumboParserState* state = parser->_parser_state;
1871
- state->_frameset_ok = false;
1888
+ set_frameset_not_ok(parser);
1872
1889
  for (int i = state->_open_elements.length; --i >= 0;) {
1873
1890
  const GumboNode* node = state->_open_elements.data[i];
1874
1891
  bool is_list_tag = is_li
@@ -1884,6 +1901,7 @@ static void maybe_implicitly_close_list_tag (
1884
1901
  );
1885
1902
  return;
1886
1903
  }
1904
+
1887
1905
  if (
1888
1906
  is_special_node(node)
1889
1907
  && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
@@ -2009,40 +2027,19 @@ static void adjust_mathml_attributes(GumboToken* token) {
2009
2027
  attr->name = gumbo_strdup("definitionURL");
2010
2028
  }
2011
2029
 
2012
- static bool doctype_matches (
2013
- const GumboTokenDocType* doctype,
2014
- const GumboStringPiece* public_id,
2015
- const GumboStringPiece* system_id,
2016
- bool allow_missing_system_id
2017
- ) {
2018
- return
2019
- !strcmp(doctype->public_identifier, public_id->data)
2020
- && (allow_missing_system_id || doctype->has_system_identifier)
2021
- && !strcmp(doctype->system_identifier, system_id->data);
2022
- }
2023
-
2024
- static bool maybe_add_doctype_error (
2030
+ static void maybe_add_doctype_error (
2025
2031
  GumboParser* parser,
2026
2032
  const GumboToken* token
2027
2033
  ) {
2028
2034
  const GumboTokenDocType* doctype = &token->v.doc_type;
2029
- bool html_doctype = !strcmp(doctype->name, "html");
2030
- if ((!html_doctype || doctype->has_public_identifier ||
2031
- (doctype->has_system_identifier &&
2032
- !strcmp(
2033
- doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
2034
- !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
2035
- &kSystemIdRecHtml4_0, true) ||
2036
- doctype_matches(doctype, &kPublicIdHtml4_01,
2037
- &kSystemIdHtml4, true) ||
2038
- doctype_matches(doctype, &kPublicIdXhtml1_0,
2039
- &kSystemIdXhtmlStrict1_1, false) ||
2040
- doctype_matches(doctype, &kPublicIdXhtml1_1,
2041
- &kSystemIdXhtml1_1, false)))) {
2035
+ if (
2036
+ strcmp(doctype->name, "html")
2037
+ || doctype->has_public_identifier
2038
+ || (doctype->has_system_identifier
2039
+ && strcmp(doctype->system_identifier, "about:legacy-compat"))
2040
+ ) {
2042
2041
  parser_add_parse_error(parser, token);
2043
- return false;
2044
2042
  }
2045
- return true;
2046
2043
  }
2047
2044
 
2048
2045
  static void remove_from_parent(GumboNode* node) {
@@ -2067,39 +2064,115 @@ static void remove_from_parent(GumboNode* node) {
2067
2064
  }
2068
2065
  }
2069
2066
 
2067
+ // This is here to clean up memory when the spec says "Ignore current token."
2068
+ static void ignore_token(GumboParser* parser) {
2069
+ GumboToken* token = parser->_parser_state->_current_token;
2070
+ // Ownership of the token's internal buffers is normally transferred to the
2071
+ // element, but if no element is emitted (as happens in non-verbatim-mode
2072
+ // when a token is ignored), we need to free it here to prevent a memory
2073
+ // leak.
2074
+ gumbo_token_destroy(token);
2075
+ #ifndef NDEBUG
2076
+ if (token->type == GUMBO_TOKEN_START_TAG) {
2077
+ // Mark this sentinel so the assertion in the main loop knows it's been
2078
+ // destroyed.
2079
+ token->v.start_tag.attributes = kGumboEmptyVector;
2080
+ token->v.start_tag.name = NULL;
2081
+ }
2082
+ #endif
2083
+ }
2084
+
2085
+ // The token is usually an end tag; however, the adoption agency algorithm may
2086
+ // invoke this for an 'a' or 'nobr' start tag.
2087
+ // Records any parse error directly on the parser; there is no return value.
2088
+ static void in_body_any_other_end_tag(GumboParser* parser, GumboToken* token)
2089
+ {
2090
+ GumboParserState* state = parser->_parser_state;
2091
+ GumboTag tag;
2092
+ const char* tagname;
2093
+
2094
+ if (token->type == GUMBO_TOKEN_END_TAG) {
2095
+ tag = token->v.end_tag.tag;
2096
+ tagname = token->v.end_tag.name;
2097
+ } else {
2098
+ assert(token->type == GUMBO_TOKEN_START_TAG);
2099
+ tag = token->v.start_tag.tag;
2100
+ tagname = token->v.start_tag.name;
2101
+ }
2102
+
2103
+ assert(state->_open_elements.length > 0);
2104
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2105
+ // Walk up the stack of open elements until we find one that either:
2106
+ // a) Matches the tag name we saw
2107
+ // b) Is in the "special" category.
2108
+ // If we see a), implicitly close everything up to and including it. If we
2109
+ // see b), then record a parse error, don't close anything (except the
2110
+ // implied end tags) and ignore the end tag token.
2111
+ for (int i = state->_open_elements.length; --i >= 0;) {
2112
+ const GumboNode* node = state->_open_elements.data[i];
2113
+ if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, tagname)) {
2114
+ generate_implied_end_tags(parser, tag, tagname);
2115
+ // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example of an error.
2116
+ // foo is the "current node" but sarcasm is node.
2117
+ // XXX: Write a test for this.
2118
+ if (node != get_current_node(parser)) {
2119
+ parser_add_parse_error(parser, token);
2120
+ }
2121
+ while (node != pop_current_node(parser))
2122
+ ; // Pop everything.
2123
+ return;
2124
+ } else if (is_special_node(node)) {
2125
+ parser_add_parse_error(parser, token);
2126
+ ignore_token(parser);
2127
+ return;
2128
+ }
2129
+ }
2130
+ // <html> is in the special category, so we should never get here.
2131
+ assert(0 && "unreachable");
2132
+ }
2133
+
2070
2134
  // https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
2071
2135
  // Also described in the "in body" handling for end formatting tags.
2072
- static bool adoption_agency_algorithm (
2073
- GumboParser* parser,
2074
- GumboToken* token,
2075
- GumboTag subject
2076
- ) {
2136
+ // Parse errors are recorded directly on the parser; there is no return value.
2137
+ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2138
+ {
2077
2139
  GumboParserState* state = parser->_parser_state;
2078
2140
  gumbo_debug("Entering adoption agency algorithm.\n");
2079
2141
  // Step 1.
2142
+ GumboTag subject;
2143
+ if (token->type == GUMBO_TOKEN_START_TAG) {
2144
+ subject = token->v.start_tag.tag;
2145
+ } else {
2146
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2147
+ subject = token->v.end_tag.tag;
2148
+ }
2149
+ assert(subject != GUMBO_TAG_UNKNOWN);
2150
+
2151
+ // Step 2.
2080
2152
  GumboNode* current_node = get_current_node(parser);
2081
2153
  if (
2082
- current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
2083
- && current_node->v.element.tag == subject
2154
+ node_html_tag_is(current_node, subject)
2084
2155
  && -1 == gumbo_vector_index_of (
2085
2156
  &state->_active_formatting_elements,
2086
2157
  current_node
2087
2158
  )
2088
2159
  ) {
2089
2160
  pop_current_node(parser);
2090
- return false;
2161
+ return;
2091
2162
  }
2092
- // Steps 2-4 & 20:
2163
+
2164
+ // Steps 3-5 & 21:
2093
2165
  for (unsigned int i = 0; i < 8; ++i) {
2094
- // Step 5.
2166
+ // Step 6.
2095
2167
  GumboNode* formatting_node = NULL;
2096
2168
  int formatting_node_in_open_elements = -1;
2097
2169
  for (int j = state->_active_formatting_elements.length; --j >= 0;) {
2098
2170
  GumboNode* current_node = state->_active_formatting_elements.data[j];
2099
2171
  if (current_node == &kActiveFormattingScopeMarker) {
2100
2172
  gumbo_debug("Broke on scope marker; aborting.\n");
2101
- // Last scope marker; abort the algorithm.
2102
- return false;
2173
+ // Last scope marker; abort the algorithm and handle according to "any
2174
+ // other end tag" (below).
2175
+ break;
2103
2176
  }
2104
2177
  if (node_html_tag_is(current_node, subject)) {
2105
2178
  // Found it.
@@ -2121,10 +2194,11 @@ static bool adoption_agency_algorithm (
2121
2194
  // "any other end tag" clause (which may potentially add a parse error,
2122
2195
  // but not always).
2123
2196
  gumbo_debug("No active formatting elements; aborting.\n");
2124
- return false;
2197
+ in_body_any_other_end_tag(parser, token);
2198
+ return;
2125
2199
  }
2126
2200
 
2127
- // Step 6
2201
+ // Step 7
2128
2202
  if (formatting_node_in_open_elements == -1) {
2129
2203
  gumbo_debug("Formatting node not on stack of open elements.\n");
2130
2204
  parser_add_parse_error(parser, token);
@@ -2132,25 +2206,24 @@ static bool adoption_agency_algorithm (
2132
2206
  formatting_node,
2133
2207
  &state->_active_formatting_elements
2134
2208
  );
2135
- return false;
2209
+ return;
2136
2210
  }
2137
2211
 
2138
- // Step 7
2212
+ // Step 8
2139
2213
  if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
2140
2214
  parser_add_parse_error(parser, token);
2141
2215
  gumbo_debug("Element not in scope.\n");
2142
- return false;
2216
+ return;
2143
2217
  }
2144
2218
 
2145
- // Step 8
2146
- if (formatting_node != get_current_node(parser)) {
2219
+ // Step 9
2220
+ if (formatting_node != get_current_node(parser))
2147
2221
  parser_add_parse_error(parser, token); // But continue onwards.
2148
- }
2149
2222
  assert(formatting_node);
2150
2223
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
2151
2224
  assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
2152
2225
 
2153
- // Step 9 & 10
2226
+ // Step 10
2154
2227
  GumboNode* furthest_block = NULL;
2155
2228
  for (
2156
2229
  unsigned int j = formatting_node_in_open_elements;
@@ -2160,32 +2233,27 @@ static bool adoption_agency_algorithm (
2160
2233
  assert(j > 0);
2161
2234
  GumboNode* current = state->_open_elements.data[j];
2162
2235
  if (is_special_node(current)) {
2163
- // Step 9.
2164
2236
  furthest_block = current;
2165
2237
  break;
2166
2238
  }
2167
2239
  }
2240
+ // Step 11.
2168
2241
  if (!furthest_block) {
2169
- // Step 10.
2170
- while (get_current_node(parser) != formatting_node) {
2171
- pop_current_node(parser);
2172
- }
2173
- // And the formatting element itself.
2174
- pop_current_node(parser);
2242
+ while (pop_current_node(parser) != formatting_node)
2243
+ ;
2175
2244
  gumbo_vector_remove (
2176
2245
  formatting_node,
2177
2246
  &state->_active_formatting_elements
2178
2247
  );
2179
- return false;
2248
+ return;
2180
2249
  }
2181
2250
  assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
2182
- assert(furthest_block);
2183
2251
 
2184
- // Step 11.
2252
+ // Step 12.
2185
2253
  // Elements may be moved and reparented by this algorithm, so
2186
2254
  // common_ancestor is not necessarily the same as formatting_node->parent.
2187
2255
  GumboNode* common_ancestor = state->_open_elements.data [
2188
- gumbo_vector_index_of(&state->_open_elements, formatting_node) - 1
2256
+ formatting_node_in_open_elements - 1
2189
2257
  ];
2190
2258
  gumbo_debug (
2191
2259
  "Common ancestor tag = %s, furthest block tag = %s.\n",
@@ -2193,24 +2261,24 @@ static bool adoption_agency_algorithm (
2193
2261
  gumbo_normalized_tagname(furthest_block->v.element.tag)
2194
2262
  );
2195
2263
 
2196
- // Step 12.
2264
+ // Step 13.
2197
2265
  int bookmark = 1 + gumbo_vector_index_of (
2198
2266
  &state->_active_formatting_elements,
2199
2267
  formatting_node
2200
2268
  );
2201
2269
  gumbo_debug("Bookmark at %d.\n", bookmark);
2202
- // Step 13.
2270
+ // Step 14.
2203
2271
  GumboNode* node = furthest_block;
2204
2272
  GumboNode* last_node = furthest_block;
2205
2273
  // Must be stored explicitly, in case node is removed from the stack of open
2206
- // elements, to handle step 9.4.
2274
+ // elements, to handle step 14.3.
2207
2275
  int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
2208
2276
  assert(saved_node_index > 0);
2209
- // Step 13.1.
2277
+ // Step 14.1.
2210
2278
  for (int j = 0;;) {
2211
- // Step 13.2.
2279
+ // Step 14.2.
2212
2280
  ++j;
2213
- // Step 13.3.
2281
+ // Step 14.3.
2214
2282
  int node_index = gumbo_vector_index_of(&state->_open_elements, node);
2215
2283
  gumbo_debug (
2216
2284
  "Current index: %d, last index: %d.\n",
@@ -2225,16 +2293,16 @@ static bool adoption_agency_algorithm (
2225
2293
  assert((unsigned int) node_index < state->_open_elements.capacity);
2226
2294
  node = state->_open_elements.data[node_index];
2227
2295
  assert(node->parent);
2296
+ // Step 14.4.
2228
2297
  if (node == formatting_node) {
2229
- // Step 13.4.
2230
2298
  break;
2231
2299
  }
2232
2300
  int formatting_index = gumbo_vector_index_of (
2233
2301
  &state->_active_formatting_elements,
2234
2302
  node
2235
2303
  );
2304
+ // Step 14.5.
2236
2305
  if (j > 3 && formatting_index != -1) {
2237
- // Step 13.5.
2238
2306
  gumbo_debug("Removing formatting element at %d.\n", formatting_index);
2239
2307
  gumbo_vector_remove_at (
2240
2308
  formatting_index,
@@ -2249,11 +2317,11 @@ static bool adoption_agency_algorithm (
2249
2317
  continue;
2250
2318
  }
2251
2319
  if (formatting_index == -1) {
2252
- // Step 13.6.
2320
+ // Step 14.6.
2253
2321
  gumbo_vector_remove_at(node_index, &state->_open_elements);
2254
2322
  continue;
2255
2323
  }
2256
- // Step 13.7.
2324
+ // Step 14.7.
2257
2325
  // "common ancestor as the intended parent" doesn't actually mean insert
2258
2326
  // it into the common ancestor; that happens below.
2259
2327
  node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
@@ -2261,21 +2329,21 @@ static bool adoption_agency_algorithm (
2261
2329
  state->_active_formatting_elements.data[formatting_index] = node;
2262
2330
  assert(node_index >= 0);
2263
2331
  state->_open_elements.data[node_index] = node;
2264
- // Step 13.8.
2332
+ // Step 14.8.
2265
2333
  if (last_node == furthest_block) {
2266
2334
  bookmark = formatting_index + 1;
2267
2335
  gumbo_debug("Bookmark moved to %d.\n", bookmark);
2268
2336
  assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2269
2337
  }
2270
- // Step 13.9.
2338
+ // Step 14.9.
2271
2339
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2272
2340
  remove_from_parent(last_node);
2273
2341
  append_node(node, last_node);
2274
- // Step 13.10.
2342
+ // Step 14.10.
2275
2343
  last_node = node;
2276
- } // Step 13.11.
2344
+ } // Step 14.11.
2277
2345
 
2278
- // Step 14.
2346
+ // Step 15.
2279
2347
  gumbo_debug (
2280
2348
  "Removing %s node from parent ",
2281
2349
  gumbo_normalized_tagname(last_node->v.element.tag)
@@ -2292,14 +2360,14 @@ static bool adoption_agency_algorithm (
2292
2360
  );
2293
2361
  insert_node(last_node, location);
2294
2362
 
2295
- // Step 15.
2363
+ // Step 16.
2296
2364
  GumboNode* new_formatting_node = clone_node (
2297
2365
  formatting_node,
2298
2366
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
2299
2367
  );
2300
2368
  formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2301
2369
 
2302
- // Step 16. Instead of appending nodes one-by-one, we swap the children
2370
+ // Step 17. Instead of appending nodes one-by-one, we swap the children
2303
2371
  // vector of furthest_block with the empty children of new_formatting_node,
2304
2372
  // reducing memory traffic and allocations. We still have to reset their
2305
2373
  // parent pointers, though.
@@ -2313,10 +2381,10 @@ static bool adoption_agency_algorithm (
2313
2381
  child->parent = new_formatting_node;
2314
2382
  }
2315
2383
 
2316
- // Step 17.
2384
+ // Step 18.
2317
2385
  append_node(furthest_block, new_formatting_node);
2318
2386
 
2319
- // Step 18.
2387
+ // Step 19.
2320
2388
  // If the formatting node was before the bookmark, it may shift over all
2321
2389
  // indices after it, so we need to explicitly find the index and possibly
2322
2390
  // adjust the bookmark.
@@ -2344,7 +2412,7 @@ static bool adoption_agency_algorithm (
2344
2412
  &state->_active_formatting_elements
2345
2413
  );
2346
2414
 
2347
- // Step 19.
2415
+ // Step 20.
2348
2416
  gumbo_vector_remove(formatting_node, &state->_open_elements);
2349
2417
  int insert_at = 1 + gumbo_vector_index_of (
2350
2418
  &state->_open_elements,
@@ -2357,26 +2425,7 @@ static bool adoption_agency_algorithm (
2357
2425
  insert_at,
2358
2426
  &state->_open_elements
2359
2427
  );
2360
- } // Step 20.
2361
- return true;
2362
- }
2363
-
2364
- // This is here to clean up memory when the spec says "Ignore current token."
2365
- static void ignore_token(GumboParser* parser) {
2366
- GumboToken* token = parser->_parser_state->_current_token;
2367
- // Ownership of the token's internal buffers are normally transferred to the
2368
- // element, but if no element is emitted (as happens in non-verbatim-mode
2369
- // when a token is ignored), we need to free it here to prevent a memory
2370
- // leak.
2371
- gumbo_token_destroy(token);
2372
- #ifndef NDEBUG
2373
- if (token->type == GUMBO_TOKEN_START_TAG) {
2374
- // Mark this sentinel so the assertion in the main loop knows it's been
2375
- // destroyed.
2376
- token->v.start_tag.attributes = kGumboEmptyVector;
2377
- token->v.start_tag.name = NULL;
2378
- }
2379
- #endif
2428
+ } // Step 21.
2380
2429
  }
2381
2430
 
2382
2431
  // https://html.spec.whatwg.org/multipage/parsing.html#the-end
@@ -2401,125 +2450,139 @@ static void finish_parsing(GumboParser* parser) {
2401
2450
  ; // Pop them all.
2402
2451
  }
2403
2452
 
2404
- static bool handle_initial(GumboParser* parser, GumboToken* token) {
2453
+ static void handle_initial(GumboParser* parser, GumboToken* token) {
2405
2454
  GumboDocument* document = &get_document_node(parser)->v.document;
2406
2455
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2407
2456
  ignore_token(parser);
2408
- return true;
2409
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2457
+ return;
2458
+ }
2459
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2410
2460
  append_comment_node(parser, get_document_node(parser), token);
2411
- return true;
2412
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2461
+ return;
2462
+ }
2463
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2413
2464
  document->has_doctype = true;
2414
2465
  document->name = token->v.doc_type.name;
2415
2466
  document->public_identifier = token->v.doc_type.public_identifier;
2416
2467
  document->system_identifier = token->v.doc_type.system_identifier;
2417
2468
  document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2418
2469
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2419
- return maybe_add_doctype_error(parser, token);
2470
+ maybe_add_doctype_error(parser, token);
2471
+ return;
2420
2472
  }
2421
2473
  parser_add_parse_error(parser, token);
2422
2474
  document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2423
2475
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2424
2476
  parser->_parser_state->_reprocess_current_token = true;
2425
- return true;
2426
2477
  }
2427
2478
 
2428
2479
  // https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
2429
- static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2480
+ static void handle_before_html(GumboParser* parser, GumboToken* token) {
2430
2481
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2431
2482
  parser_add_parse_error(parser, token);
2432
2483
  ignore_token(parser);
2433
- return false;
2434
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2484
+ return;
2485
+ }
2486
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2435
2487
  append_comment_node(parser, get_document_node(parser), token);
2436
- return true;
2437
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2488
+ return;
2489
+ }
2490
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2438
2491
  ignore_token(parser);
2439
- return true;
2440
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2492
+ return;
2493
+ }
2494
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2441
2495
  GumboNode* html_node = insert_element_from_token(parser, token);
2442
2496
  parser->_output->root = html_node;
2443
2497
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2444
- return true;
2445
- } else if (
2498
+ return;
2499
+ }
2500
+ if (
2446
2501
  token->type == GUMBO_TOKEN_END_TAG
2447
2502
  && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2448
2503
  ) {
2449
2504
  parser_add_parse_error(parser, token);
2450
2505
  ignore_token(parser);
2451
- return false;
2452
- } else {
2453
- GumboNode* html_node = insert_element_of_tag_type (
2454
- parser,
2455
- GUMBO_TAG_HTML,
2456
- GUMBO_INSERTION_IMPLIED
2457
- );
2458
- assert(html_node);
2459
- parser->_output->root = html_node;
2460
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2461
- parser->_parser_state->_reprocess_current_token = true;
2462
- return true;
2506
+ return;
2463
2507
  }
2508
+ GumboNode* html_node = insert_element_of_tag_type (
2509
+ parser,
2510
+ GUMBO_TAG_HTML,
2511
+ GUMBO_INSERTION_IMPLIED
2512
+ );
2513
+ assert(html_node);
2514
+ parser->_output->root = html_node;
2515
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2516
+ parser->_parser_state->_reprocess_current_token = true;
2464
2517
  }
2465
2518
 
2519
+ // Forward declarations because of mutual dependencies.
2520
+ static void handle_token(GumboParser* parser, GumboToken* token);
2521
+ static void handle_in_body(GumboParser* parser, GumboToken* token);
2522
+ static void handle_in_template(GumboParser* parser, GumboToken* token);
2523
+
2466
2524
  // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
2467
- static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2468
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2469
- parser_add_parse_error(parser, token);
2525
+ static void handle_before_head(GumboParser* parser, GumboToken* token) {
2526
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2470
2527
  ignore_token(parser);
2471
- return false;
2472
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2528
+ return;
2529
+ }
2530
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2473
2531
  append_comment_node(parser, get_current_node(parser), token);
2474
- return true;
2475
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2532
+ return;
2533
+ }
2534
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2535
+ parser_add_parse_error(parser, token);
2476
2536
  ignore_token(parser);
2477
- return true;
2478
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2537
+ return;
2538
+ }
2539
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2540
+ handle_in_body(parser, token);
2541
+ return;
2542
+ }
2543
+ if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2479
2544
  GumboNode* node = insert_element_from_token(parser, token);
2480
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2481
2545
  parser->_parser_state->_head_element = node;
2482
- return true;
2483
- } else if (
2546
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2547
+ return;
2548
+ }
2549
+ if (
2484
2550
  token->type == GUMBO_TOKEN_END_TAG
2485
- && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2551
+ && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2486
2552
  ) {
2487
2553
  parser_add_parse_error(parser, token);
2488
2554
  ignore_token(parser);
2489
- return false;
2490
- } else {
2491
- GumboNode* node = insert_element_of_tag_type (
2492
- parser,
2493
- GUMBO_TAG_HEAD,
2494
- GUMBO_INSERTION_IMPLIED
2495
- );
2496
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2497
- parser->_parser_state->_head_element = node;
2498
- parser->_parser_state->_reprocess_current_token = true;
2499
- return true;
2555
+ return;
2500
2556
  }
2557
+ GumboNode* node = insert_element_of_tag_type (
2558
+ parser,
2559
+ GUMBO_TAG_HEAD,
2560
+ GUMBO_INSERTION_IMPLIED
2561
+ );
2562
+ parser->_parser_state->_head_element = node;
2563
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2564
+ parser->_parser_state->_reprocess_current_token = true;
2501
2565
  }
2502
2566
 
2503
- // Forward declarations because of mutual dependencies.
2504
- static bool handle_token(GumboParser* parser, GumboToken* token);
2505
- static bool handle_in_body(GumboParser* parser, GumboToken* token);
2506
- static bool handle_in_template(GumboParser* parser, GumboToken* token);
2507
-
2508
2567
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
2509
- static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2568
+ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2510
2569
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2511
2570
  insert_text_token(parser, token);
2512
- return true;
2513
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2571
+ return;
2572
+ }
2573
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2574
+ append_comment_node(parser, get_current_node(parser), token);
2575
+ return;
2576
+ }
2577
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2514
2578
  parser_add_parse_error(parser, token);
2515
2579
  ignore_token(parser);
2516
- return false;
2517
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2518
- append_comment_node(parser, get_current_node(parser), token);
2519
- return true;
2520
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2580
+ return;
2581
+ }
2582
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2521
2583
  return handle_in_body(parser, token);
2522
- } else if (
2584
+ }
2585
+ if (
2523
2586
  tag_in(token, kStartTag, &(const TagSet) {
2524
2587
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
2525
2588
  })
@@ -2527,8 +2590,9 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2527
2590
  insert_element_from_token(parser, token);
2528
2591
  pop_current_node(parser);
2529
2592
  acknowledge_self_closing_tag(parser);
2530
- return true;
2531
- } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2593
+ return;
2594
+ }
2595
+ if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2532
2596
  insert_element_from_token(parser, token);
2533
2597
  pop_current_node(parser);
2534
2598
  acknowledge_self_closing_tag(parser);
@@ -2536,90 +2600,98 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2536
2600
  // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2537
2601
  // should specifically look for that string in the document and re-encode it
2538
2602
  // before passing to Gumbo.
2539
- return true;
2540
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2603
+ return;
2604
+ }
2605
+ if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2541
2606
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2542
- return true;
2543
- } else if (
2607
+ return;
2608
+ }
2609
+ if (
2544
2610
  tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2545
2611
  ) {
2546
2612
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2547
- return true;
2548
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2613
+ return;
2614
+ }
2615
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2549
2616
  insert_element_from_token(parser, token);
2550
2617
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2551
- return true;
2552
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2553
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2554
- return true;
2555
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2618
+ return;
2619
+ }
2620
+ if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2621
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
2622
+ return;
2623
+ }
2624
+ if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2556
2625
  GumboNode* head = pop_current_node(parser);
2557
2626
  UNUSED_IF_NDEBUG(head);
2558
2627
  assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2559
2628
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2560
- return true;
2561
- } else if (
2629
+ return;
2630
+ }
2631
+ if (
2562
2632
  tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2563
2633
  ) {
2564
2634
  pop_current_node(parser);
2565
2635
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2566
2636
  parser->_parser_state->_reprocess_current_token = true;
2567
- return true;
2568
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2637
+ return;
2638
+ }
2639
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2569
2640
  insert_element_from_token(parser, token);
2570
2641
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
2571
- parser->_parser_state->_frameset_ok = false;
2642
+ set_frameset_not_ok(parser);
2572
2643
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2573
2644
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2574
- return true;
2575
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2645
+ return;
2646
+ }
2647
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2576
2648
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2577
2649
  parser_add_parse_error(parser, token);
2578
2650
  ignore_token(parser);
2579
- return false;
2651
+ return;
2580
2652
  }
2581
2653
  generate_all_implied_end_tags_thoroughly(parser);
2582
- bool success = true;
2583
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2654
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE))
2584
2655
  parser_add_parse_error(parser, token);
2585
- success = false;
2586
- }
2587
2656
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2588
2657
  ;
2589
2658
  clear_active_formatting_elements(parser);
2590
2659
  pop_template_insertion_mode(parser);
2591
2660
  reset_insertion_mode_appropriately(parser);
2592
- return success;
2593
- } else if (
2661
+ return;
2662
+ }
2663
+ if (
2594
2664
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2595
2665
  || (token->type == GUMBO_TOKEN_END_TAG)
2596
2666
  ) {
2597
2667
  parser_add_parse_error(parser, token);
2598
2668
  ignore_token(parser);
2599
- return false;
2600
- } else {
2601
- pop_current_node(parser);
2602
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2603
- parser->_parser_state->_reprocess_current_token = true;
2604
- return true;
2669
+ return;
2605
2670
  }
2606
- return true;
2671
+ pop_current_node(parser);
2672
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2673
+ parser->_parser_state->_reprocess_current_token = true;
2674
+ return;
2607
2675
  }
2608
2676
 
2609
2677
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
2610
- static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2678
+ static void handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2611
2679
  if (token->type == GUMBO_TOKEN_DOCTYPE) {
2612
2680
  parser_add_parse_error(parser, token);
2613
- return false;
2614
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2615
- return handle_in_body(parser, token);
2616
- } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2681
+ return;
2682
+ }
2683
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2684
+ handle_in_body(parser, token);
2685
+ return;
2686
+ }
2687
+ if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2617
2688
  const GumboNode* node = pop_current_node(parser);
2618
2689
  assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2619
2690
  UNUSED_IF_NDEBUG(node);
2620
2691
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2621
- return true;
2622
- } else if (
2692
+ return;
2693
+ }
2694
+ if (
2623
2695
  token->type == GUMBO_TOKEN_WHITESPACE
2624
2696
  || token->type == GUMBO_TOKEN_COMMENT
2625
2697
  || tag_in (token, kStartTag, &(const TagSet) {
@@ -2627,8 +2699,10 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2627
2699
  TAG(META), TAG(NOFRAMES), TAG(STYLE)
2628
2700
  })
2629
2701
  ) {
2630
- return handle_in_head(parser, token);
2631
- } else if (
2702
+ handle_in_head(parser, token);
2703
+ return;
2704
+ }
2705
+ if (
2632
2706
  tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
2633
2707
  || (
2634
2708
  token->type == GUMBO_TOKEN_END_TAG
@@ -2637,43 +2711,48 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2637
2711
  ) {
2638
2712
  parser_add_parse_error(parser, token);
2639
2713
  ignore_token(parser);
2640
- return false;
2641
- } else {
2642
- parser_add_parse_error(parser, token);
2643
- const GumboNode* node = pop_current_node(parser);
2644
- assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2645
- UNUSED_IF_NDEBUG(node);
2646
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2647
- parser->_parser_state->_reprocess_current_token = true;
2648
- return false;
2714
+ return;
2649
2715
  }
2716
+ parser_add_parse_error(parser, token);
2717
+ const GumboNode* node = pop_current_node(parser);
2718
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2719
+ UNUSED_IF_NDEBUG(node);
2720
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2721
+ parser->_parser_state->_reprocess_current_token = true;
2650
2722
  }
2651
2723
 
2652
2724
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
2653
- static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2725
+ static void handle_after_head(GumboParser* parser, GumboToken* token) {
2654
2726
  GumboParserState* state = parser->_parser_state;
2655
2727
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
2656
2728
  insert_text_token(parser, token);
2657
- return true;
2658
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2729
+ return;
2730
+ }
2731
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2732
+ append_comment_node(parser, get_current_node(parser), token);
2733
+ return;
2734
+ }
2735
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2659
2736
  parser_add_parse_error(parser, token);
2660
2737
  ignore_token(parser);
2661
- return false;
2662
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2663
- append_comment_node(parser, get_current_node(parser), token);
2664
- return true;
2665
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2666
- return handle_in_body(parser, token);
2667
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2738
+ return;
2739
+ }
2740
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2741
+ handle_in_body(parser, token);
2742
+ return;
2743
+ }
2744
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2668
2745
  insert_element_from_token(parser, token);
2669
- state->_frameset_ok = false;
2746
+ set_frameset_not_ok(parser);
2670
2747
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2671
- return true;
2672
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2748
+ return;
2749
+ }
2750
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2673
2751
  insert_element_from_token(parser, token);
2674
2752
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2675
- return true;
2676
- } else if (
2753
+ return;
2754
+ }
2755
+ if (
2677
2756
  tag_in(token, kStartTag, &(const TagSet) {
2678
2757
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
2679
2758
  TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
@@ -2685,12 +2764,15 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2685
2764
  // pending character tokens that should be attached to the root.
2686
2765
  maybe_flush_text_node_buffer(parser);
2687
2766
  gumbo_vector_add(state->_head_element, &state->_open_elements);
2688
- bool result = handle_in_head(parser, token);
2767
+ handle_in_head(parser, token);
2689
2768
  gumbo_vector_remove(state->_head_element, &state->_open_elements);
2690
- return result;
2691
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2692
- return handle_in_head(parser, token);
2693
- } else if (
2769
+ return;
2770
+ }
2771
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2772
+ handle_in_head(parser, token);
2773
+ return;
2774
+ }
2775
+ if (
2694
2776
  tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2695
2777
  || (
2696
2778
  token->type == GUMBO_TOKEN_END_TAG
@@ -2699,53 +2781,57 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2699
2781
  ) {
2700
2782
  parser_add_parse_error(parser, token);
2701
2783
  ignore_token(parser);
2702
- return false;
2703
- } else {
2704
- insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2705
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2706
- state->_reprocess_current_token = true;
2707
- return true;
2784
+ return;
2708
2785
  }
2786
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2787
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2788
+ state->_reprocess_current_token = true;
2709
2789
  }
2710
2790
 
2711
2791
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
2712
- static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2792
+ static void handle_in_body(GumboParser* parser, GumboToken* token) {
2713
2793
  GumboParserState* state = parser->_parser_state;
2714
2794
  assert(state->_open_elements.length > 0);
2715
2795
  if (token->type == GUMBO_TOKEN_NULL) {
2716
2796
  parser_add_parse_error(parser, token);
2717
2797
  ignore_token(parser);
2718
- return false;
2719
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2798
+ return;
2799
+ }
2800
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2720
2801
  reconstruct_active_formatting_elements(parser);
2721
2802
  insert_text_token(parser, token);
2722
- return true;
2723
- } else if (
2803
+ return;
2804
+ }
2805
+ if (
2724
2806
  token->type == GUMBO_TOKEN_CHARACTER
2725
2807
  || token->type == GUMBO_TOKEN_CDATA
2726
2808
  ) {
2727
2809
  reconstruct_active_formatting_elements(parser);
2728
2810
  insert_text_token(parser, token);
2729
2811
  set_frameset_not_ok(parser);
2730
- return true;
2731
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2812
+ return;
2813
+ }
2814
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2732
2815
  append_comment_node(parser, get_current_node(parser), token);
2733
- return true;
2734
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2816
+ return;
2817
+ }
2818
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2735
2819
  parser_add_parse_error(parser, token);
2736
2820
  ignore_token(parser);
2737
- return false;
2738
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2821
+ return;
2822
+ }
2823
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2739
2824
  parser_add_parse_error(parser, token);
2740
2825
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2741
2826
  ignore_token(parser);
2742
- return false;
2827
+ return;
2743
2828
  }
2744
2829
  assert(parser->_output->root != NULL);
2745
2830
  assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2746
2831
  merge_attributes(token, parser->_output->root);
2747
- return false;
2748
- } else if (
2832
+ return;
2833
+ }
2834
+ if (
2749
2835
  tag_in(token, kStartTag, &(const TagSet) {
2750
2836
  TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2751
2837
  TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
@@ -2753,8 +2839,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2753
2839
  })
2754
2840
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
2755
2841
  ) {
2756
- return handle_in_head(parser, token);
2757
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2842
+ handle_in_head(parser, token);
2843
+ return;
2844
+ }
2845
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2758
2846
  parser_add_parse_error(parser, token);
2759
2847
  if (
2760
2848
  state->_open_elements.length < 2
@@ -2762,12 +2850,13 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2762
2850
  || has_open_element(parser, GUMBO_TAG_TEMPLATE)
2763
2851
  ) {
2764
2852
  ignore_token(parser);
2765
- return false;
2853
+ } else {
2854
+ set_frameset_not_ok(parser);
2855
+ merge_attributes(token, state->_open_elements.data[1]);
2766
2856
  }
2767
- state->_frameset_ok = false;
2768
- merge_attributes(token, state->_open_elements.data[1]);
2769
- return false;
2770
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2857
+ return;
2858
+ }
2859
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2771
2860
  parser_add_parse_error(parser, token);
2772
2861
  if (
2773
2862
  state->_open_elements.length < 2
@@ -2775,7 +2864,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2775
2864
  || !state->_frameset_ok
2776
2865
  ) {
2777
2866
  ignore_token(parser);
2778
- return false;
2867
+ return;
2779
2868
  }
2780
2869
  // Save the body node for later removal.
2781
2870
  GumboNode* body_node = state->_open_elements.data[1];
@@ -2807,80 +2896,74 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2807
2896
  // Insert the <frameset>, and switch the insertion mode.
2808
2897
  insert_element_from_token(parser, token);
2809
2898
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2810
- return true;
2811
- } else if (token->type == GUMBO_TOKEN_EOF) {
2812
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2813
- if (
2814
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2815
- TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT),
2816
- TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2817
- })
2818
- ) {
2819
- parser_add_parse_error(parser, token);
2820
- }
2821
- }
2899
+ return;
2900
+ }
2901
+ if (token->type == GUMBO_TOKEN_EOF) {
2822
2902
  if (get_current_template_insertion_mode(parser) !=
2823
2903
  GUMBO_INSERTION_MODE_INITIAL) {
2824
- return handle_in_template(parser, token);
2904
+ handle_in_template(parser, token);
2905
+ return;
2825
2906
  }
2826
- return true;
2827
- } else if (tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML)})) {
2907
+ if (stack_contains_nonclosable_element(parser))
2908
+ parser_add_parse_error(parser, token);
2909
+ return;
2910
+ }
2911
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
2828
2912
  if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2829
2913
  parser_add_parse_error(parser, token);
2830
2914
  ignore_token(parser);
2831
- return false;
2832
- }
2833
- bool success = true;
2834
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2835
- if (
2836
- !node_tag_in_set(state->_open_elements.data[i], &(const TagSet) {
2837
- TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
2838
- TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
2839
- TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML)
2840
- })
2841
- ) {
2842
- parser_add_parse_error(parser, token);
2843
- success = false;
2844
- break;
2845
- }
2915
+ return;
2846
2916
  }
2917
+ if (stack_contains_nonclosable_element(parser))
2918
+ parser_add_parse_error(parser, token);
2919
+ GumboNode* body = state->_open_elements.data[1];
2920
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2921
+ record_end_of_element(state->_current_token, &body->v.element);
2847
2922
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2848
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2849
- parser->_parser_state->_reprocess_current_token = true;
2850
- } else {
2851
- GumboNode* body = state->_open_elements.data[1];
2852
- assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2853
- record_end_of_element(state->_current_token, &body->v.element);
2923
+ return;
2924
+ }
2925
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2926
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2927
+ parser_add_parse_error(parser, token);
2928
+ ignore_token(parser);
2929
+ return;
2854
2930
  }
2855
- return success;
2856
- } else if (
2931
+ if (stack_contains_nonclosable_element(parser))
2932
+ parser_add_parse_error(parser, token);
2933
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2934
+ parser->_parser_state->_reprocess_current_token = true;
2935
+ return;
2936
+ }
2937
+ if (
2857
2938
  tag_in(token, kStartTag, &(const TagSet) {
2858
2939
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
2859
2940
  TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2860
2941
  TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2861
- TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2942
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2862
2943
  TAG(SUMMARY), TAG(UL)
2863
2944
  })
2864
2945
  ) {
2865
- bool result = maybe_implicitly_close_p_tag(parser, token);
2946
+ maybe_implicitly_close_p_tag(parser, token);
2866
2947
  insert_element_from_token(parser, token);
2867
- return result;
2868
- } else if (tag_in(token, kStartTag, &heading_tags)) {
2869
- bool result = maybe_implicitly_close_p_tag(parser, token);
2948
+ return;
2949
+ }
2950
+ if (tag_in(token, kStartTag, &heading_tags)) {
2951
+ maybe_implicitly_close_p_tag(parser, token);
2870
2952
  if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
2871
2953
  parser_add_parse_error(parser, token);
2872
2954
  pop_current_node(parser);
2873
- result = false;
2874
2955
  }
2875
2956
  insert_element_from_token(parser, token);
2876
- return result;
2877
- } else if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2878
- bool result = maybe_implicitly_close_p_tag(parser, token);
2957
+ return;
2958
+ }
2959
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2960
+ maybe_implicitly_close_p_tag(parser, token);
2879
2961
  insert_element_from_token(parser, token);
2880
2962
  state->_ignore_next_linefeed = true;
2881
- state->_frameset_ok = false;
2882
- return result;
2883
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2963
+ set_frameset_not_ok(parser);
2964
+ return;
2965
+ }
2966
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2884
2967
  if (
2885
2968
  state->_form_element != NULL
2886
2969
  && !has_open_element(parser, GUMBO_TAG_TEMPLATE)
@@ -2888,46 +2971,48 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2888
2971
  gumbo_debug("Ignoring nested form.\n");
2889
2972
  parser_add_parse_error(parser, token);
2890
2973
  ignore_token(parser);
2891
- return false;
2974
+ return;
2892
2975
  }
2893
- bool result = maybe_implicitly_close_p_tag(parser, token);
2976
+ maybe_implicitly_close_p_tag(parser, token);
2894
2977
  GumboNode* form_element = insert_element_from_token(parser, token);
2895
2978
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2896
2979
  state->_form_element = form_element;
2897
2980
  }
2898
- return result;
2899
- } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2981
+ return;
2982
+ }
2983
+ if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2900
2984
  maybe_implicitly_close_list_tag(parser, token, true);
2901
- bool result = maybe_implicitly_close_p_tag(parser, token);
2985
+ maybe_implicitly_close_p_tag(parser, token);
2902
2986
  insert_element_from_token(parser, token);
2903
- return result;
2904
- } else if (tag_in(token, kStartTag, &dd_dt_tags)) {
2987
+ return;
2988
+ }
2989
+ if (tag_in(token, kStartTag, &dd_dt_tags)) {
2905
2990
  maybe_implicitly_close_list_tag(parser, token, false);
2906
- bool result = maybe_implicitly_close_p_tag(parser, token);
2991
+ maybe_implicitly_close_p_tag(parser, token);
2907
2992
  insert_element_from_token(parser, token);
2908
- return result;
2909
- } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2910
- bool result = maybe_implicitly_close_p_tag(parser, token);
2993
+ return;
2994
+ }
2995
+ if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2996
+ maybe_implicitly_close_p_tag(parser, token);
2911
2997
  insert_element_from_token(parser, token);
2912
2998
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2913
- return result;
2914
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2999
+ return;
3000
+ }
3001
+ if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2915
3002
  if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2916
3003
  parser_add_parse_error(parser, token);
2917
- implicitly_close_tags (
2918
- parser,
2919
- token,
2920
- GUMBO_NAMESPACE_HTML,
2921
- GUMBO_TAG_BUTTON
2922
- );
2923
- state->_reprocess_current_token = true;
2924
- return false;
3004
+ // We don't want to use implicitly_close_tags here because it may add an
3005
+ // error and we've already added the only error the standard specifies.
3006
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3007
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
3008
+ ;
2925
3009
  }
2926
3010
  reconstruct_active_formatting_elements(parser);
2927
3011
  insert_element_from_token(parser, token);
2928
- state->_frameset_ok = false;
2929
- return true;
2930
- } else if (
3012
+ set_frameset_not_ok(parser);
3013
+ return;
3014
+ }
3015
+ if (
2931
3016
  tag_in(token, kEndTag, &(const TagSet) {
2932
3017
  TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
2933
3018
  TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
@@ -2940,33 +3025,29 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2940
3025
  if (!has_an_element_in_scope(parser, tag)) {
2941
3026
  parser_add_parse_error(parser, token);
2942
3027
  ignore_token(parser);
2943
- return false;
3028
+ return;
2944
3029
  }
2945
- implicitly_close_tags (
3030
+ return implicitly_close_tags (
2946
3031
  parser,
2947
3032
  token,
2948
3033
  GUMBO_NAMESPACE_HTML,
2949
3034
  token->v.end_tag.tag
2950
3035
  );
2951
- return true;
2952
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
3036
+ }
3037
+ if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2953
3038
  if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2954
3039
  if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2955
3040
  parser_add_parse_error(parser, token);
2956
3041
  ignore_token(parser);
2957
- return false;
3042
+ return;
2958
3043
  }
2959
- bool success = true;
2960
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2961
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
3044
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3045
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM))
2962
3046
  parser_add_parse_error(parser, token);
2963
- return false;
2964
- }
2965
3047
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2966
3048
  ;
2967
- return success;
3049
+ return;
2968
3050
  } else {
2969
- bool result = true;
2970
3051
  GumboNode* node = state->_form_element;
2971
3052
  assert(!node || node->type == GUMBO_NODE_ELEMENT);
2972
3053
  state->_form_element = NULL;
@@ -2974,25 +3055,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2974
3055
  gumbo_debug("Closing an unopened form.\n");
2975
3056
  parser_add_parse_error(parser, token);
2976
3057
  ignore_token(parser);
2977
- return false;
3058
+ return;
2978
3059
  }
2979
3060
  // This differs from implicitly_close_tags because we remove *only* the
2980
3061
  // <form> element; other nodes are left in scope.
2981
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2982
- if (get_current_node(parser) != node) {
3062
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3063
+ if (get_current_node(parser) != node)
2983
3064
  parser_add_parse_error(parser, token);
2984
- result = false;
2985
- } else {
3065
+ else
2986
3066
  record_end_of_element(token, &node->v.element);
2987
- }
2988
3067
 
2989
3068
  GumboVector* open_elements = &state->_open_elements;
2990
3069
  int index = gumbo_vector_index_of(open_elements, node);
2991
3070
  assert(index >= 0);
2992
3071
  gumbo_vector_remove_at(index, open_elements);
2993
- return result;
3072
+ return;
2994
3073
  }
2995
- } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3074
+ }
3075
+ if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2996
3076
  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2997
3077
  parser_add_parse_error(parser, token);
2998
3078
  // reconstruct_active_formatting_elements(parser);
@@ -3001,42 +3081,45 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3001
3081
  GUMBO_TAG_P,
3002
3082
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3003
3083
  );
3004
- state->_reprocess_current_token = true;
3005
- return false;
3006
3084
  }
3007
- return implicitly_close_tags (
3085
+ implicitly_close_tags (
3008
3086
  parser,
3009
3087
  token,
3010
3088
  GUMBO_NAMESPACE_HTML,
3011
3089
  GUMBO_TAG_P
3012
3090
  );
3013
- } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3091
+ return;
3092
+ }
3093
+ if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3014
3094
  if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
3015
3095
  parser_add_parse_error(parser, token);
3016
3096
  ignore_token(parser);
3017
- return false;
3097
+ return;
3018
3098
  }
3019
- return implicitly_close_tags (
3099
+ implicitly_close_tags (
3020
3100
  parser,
3021
3101
  token,
3022
3102
  GUMBO_NAMESPACE_HTML,
3023
3103
  GUMBO_TAG_LI
3024
3104
  );
3025
- } else if (tag_in(token, kEndTag, &dd_dt_tags)) {
3026
- assert(token->type == GUMBO_TOKEN_END_TAG);
3105
+ return;
3106
+ }
3107
+ if (tag_in(token, kEndTag, &dd_dt_tags)) {
3027
3108
  GumboTag token_tag = token->v.end_tag.tag;
3028
3109
  if (!has_an_element_in_scope(parser, token_tag)) {
3029
3110
  parser_add_parse_error(parser, token);
3030
3111
  ignore_token(parser);
3031
- return false;
3112
+ return;
3032
3113
  }
3033
- return implicitly_close_tags (
3114
+ implicitly_close_tags (
3034
3115
  parser,
3035
3116
  token,
3036
3117
  GUMBO_NAMESPACE_HTML,
3037
3118
  token_tag
3038
3119
  );
3039
- } else if (tag_in(token, kEndTag, &heading_tags)) {
3120
+ return;
3121
+ }
3122
+ if (tag_in(token, kEndTag, &heading_tags)) {
3040
3123
  if (
3041
3124
  !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
3042
3125
  GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
@@ -3046,31 +3129,29 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3046
3129
  // No heading open; ignore the token entirely.
3047
3130
  parser_add_parse_error(parser, token);
3048
3131
  ignore_token(parser);
3049
- return false;
3050
- } else {
3051
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3052
- const GumboNode* current_node = get_current_node(parser);
3053
- bool success = node_html_tag_is(current_node, token->v.end_tag.tag);
3054
- if (!success) {
3055
- // There're children of the heading currently open; close them below and
3056
- // record a parse error.
3057
- // TODO(jdtang): Add a way to distinguish this error case from the one
3058
- // above.
3059
- parser_add_parse_error(parser, token);
3060
- }
3061
- do {
3062
- current_node = pop_current_node(parser);
3063
- } while (!node_tag_in_set(current_node, &heading_tags));
3064
- return success;
3132
+ return;
3133
+ }
3134
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3135
+ const GumboNode* current_node = get_current_node(parser);
3136
+ if (!node_html_tag_is(current_node, token->v.end_tag.tag)) {
3137
+ // There're children of the heading currently open; close them below and
3138
+ // record a parse error.
3139
+ // TODO(jdtang): Add a way to distinguish this error case from the one
3140
+ // above.
3141
+ parser_add_parse_error(parser, token);
3065
3142
  }
3066
- } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3067
- bool success = true;
3143
+ do {
3144
+ current_node = pop_current_node(parser);
3145
+ } while (!node_tag_in_set(current_node, &heading_tags));
3146
+ return;
3147
+ }
3148
+ if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3068
3149
  int last_a;
3069
3150
  int has_matching_a = find_last_anchor_index(parser, &last_a);
3070
3151
  if (has_matching_a) {
3071
3152
  assert(has_matching_a == 1);
3072
3153
  parser_add_parse_error(parser, token);
3073
- adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
3154
+ (void)adoption_agency_algorithm(parser, token);
3074
3155
  // The adoption agency algorithm usually removes all instances of <a>
3075
3156
  // from the list of active formatting elements, but in case it doesn't,
3076
3157
  // we're supposed to do this. (The conditions where it might not are
@@ -3082,12 +3163,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3082
3163
  );
3083
3164
  gumbo_vector_remove(last_element, &state->_open_elements);
3084
3165
  }
3085
- success = false;
3086
3166
  }
3087
3167
  reconstruct_active_formatting_elements(parser);
3088
3168
  add_formatting_element(parser, insert_element_from_token(parser, token));
3089
- return success;
3090
- } else if (
3169
+ return;
3170
+ }
3171
+ if (
3091
3172
  tag_in(token, kStartTag, &(const TagSet) {
3092
3173
  TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
3093
3174
  TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
@@ -3095,48 +3176,52 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3095
3176
  ) {
3096
3177
  reconstruct_active_formatting_elements(parser);
3097
3178
  add_formatting_element(parser, insert_element_from_token(parser, token));
3098
- return true;
3099
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3100
- bool result = true;
3179
+ return;
3180
+ }
3181
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3101
3182
  reconstruct_active_formatting_elements(parser);
3102
3183
  if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
3103
- result = false;
3104
3184
  parser_add_parse_error(parser, token);
3105
- adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
3185
+ adoption_agency_algorithm(parser, token);
3106
3186
  reconstruct_active_formatting_elements(parser);
3107
3187
  }
3108
3188
  insert_element_from_token(parser, token);
3109
3189
  add_formatting_element(parser, get_current_node(parser));
3110
- return result;
3111
- } else if (
3190
+ return;
3191
+ }
3192
+ if (
3112
3193
  tag_in(token, kEndTag, &(const TagSet) {
3113
3194
  TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
3114
3195
  TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
3115
3196
  TAG(U)
3116
3197
  })
3117
3198
  ) {
3118
- return adoption_agency_algorithm(parser, token, token->v.end_tag.tag);
3119
- } else if (
3199
+ adoption_agency_algorithm(parser, token);
3200
+ return;
3201
+ }
3202
+ if (
3120
3203
  tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3121
3204
  ) {
3122
3205
  reconstruct_active_formatting_elements(parser);
3123
3206
  insert_element_from_token(parser, token);
3124
3207
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3125
3208
  set_frameset_not_ok(parser);
3126
- return true;
3127
- } else if (
3209
+ return;
3210
+ }
3211
+ if (
3128
3212
  tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3129
3213
  ) {
3130
3214
  GumboTag token_tag = token->v.end_tag.tag;
3131
- if (!has_an_element_in_table_scope(parser, token_tag)) {
3215
+ if (!has_an_element_in_scope(parser, token_tag)) {
3132
3216
  parser_add_parse_error(parser, token);
3133
3217
  ignore_token(parser);
3134
- return false;
3218
+ return;
3135
3219
  }
3136
3220
  implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3137
3221
  clear_active_formatting_elements(parser);
3138
- return true;
3139
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3222
+ return;
3223
+ }
3224
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3140
3225
  if (
3141
3226
  get_document_node(parser)->v.document.doc_type_quirks_mode
3142
3227
  != GUMBO_DOCTYPE_QUIRKS
@@ -3146,75 +3231,89 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3146
3231
  insert_element_from_token(parser, token);
3147
3232
  set_frameset_not_ok(parser);
3148
3233
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3149
- return true;
3150
- } else if (
3151
- tag_in(token, kStartTag, &(const TagSet) {
3234
+ return;
3235
+ }
3236
+ if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3237
+ parser_add_parse_error(parser, token);
3238
+ reconstruct_active_formatting_elements(parser);
3239
+ insert_element_of_tag_type (
3240
+ parser,
3241
+ GUMBO_TAG_BR,
3242
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3243
+ );
3244
+ pop_current_node(parser);
3245
+ acknowledge_self_closing_tag(parser);
3246
+ set_frameset_not_ok(parser);
3247
+ return;
3248
+ }
3249
+ if (
3250
+ tag_in(token, kStartTag, &(const TagSet) {
3152
3251
  TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
3153
3252
  TAG(WBR)
3154
3253
  })
3155
3254
  ) {
3156
- bool success = true;
3157
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3158
- success = false;
3255
+ bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
3256
+ if (is_image) {
3159
3257
  parser_add_parse_error(parser, token);
3160
3258
  token->v.start_tag.tag = GUMBO_TAG_IMG;
3161
3259
  }
3162
3260
  reconstruct_active_formatting_elements(parser);
3163
3261
  GumboNode* node = insert_element_from_token(parser, token);
3164
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
3165
- success = false;
3166
- parser_add_parse_error(parser, token);
3167
- node->v.element.tag = GUMBO_TAG_IMG;
3262
+ if (is_image)
3168
3263
  node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
3169
- }
3170
3264
  pop_current_node(parser);
3171
3265
  acknowledge_self_closing_tag(parser);
3172
3266
  set_frameset_not_ok(parser);
3173
- return success;
3174
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3175
- if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
3176
- // Must be before the element is inserted, as that takes ownership of the
3177
- // token's attribute vector.
3178
- set_frameset_not_ok(parser);
3179
- }
3267
+ return;
3268
+ }
3269
+ if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3180
3270
  reconstruct_active_formatting_elements(parser);
3181
- insert_element_from_token(parser, token);
3271
+ GumboNode *input = insert_element_from_token(parser, token);
3182
3272
  pop_current_node(parser);
3183
3273
  acknowledge_self_closing_tag(parser);
3184
- return true;
3185
- } else if (
3274
+ if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
3275
+ set_frameset_not_ok(parser);
3276
+ return;
3277
+ }
3278
+ if (
3186
3279
  tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
3187
3280
  ) {
3188
3281
  insert_element_from_token(parser, token);
3189
3282
  pop_current_node(parser);
3190
3283
  acknowledge_self_closing_tag(parser);
3191
- return true;
3192
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3193
- bool result = maybe_implicitly_close_p_tag(parser, token);
3284
+ return;
3285
+ }
3286
+ if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3287
+ maybe_implicitly_close_p_tag(parser, token);
3194
3288
  insert_element_from_token(parser, token);
3195
3289
  pop_current_node(parser);
3196
3290
  acknowledge_self_closing_tag(parser);
3197
3291
  set_frameset_not_ok(parser);
3198
- return result;
3199
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3292
+ return;
3293
+ }
3294
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3200
3295
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
3201
3296
  parser->_parser_state->_ignore_next_linefeed = true;
3202
3297
  set_frameset_not_ok(parser);
3203
- return true;
3204
- } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3205
- bool result = maybe_implicitly_close_p_tag(parser, token);
3298
+ return;
3299
+ }
3300
+ if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3301
+ maybe_implicitly_close_p_tag(parser, token);
3206
3302
  reconstruct_active_formatting_elements(parser);
3207
3303
  set_frameset_not_ok(parser);
3208
3304
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3209
- return result;
3210
- } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3305
+ return;
3306
+ }
3307
+ if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3211
3308
  set_frameset_not_ok(parser);
3212
3309
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3213
- return true;
3214
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3310
+ return;
3311
+ }
3312
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3215
3313
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3216
- return true;
3217
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3314
+ return;
3315
+ }
3316
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3218
3317
  reconstruct_active_formatting_elements(parser);
3219
3318
  insert_element_from_token(parser, token);
3220
3319
  set_frameset_not_ok(parser);
@@ -3230,50 +3329,40 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3230
3329
  } else {
3231
3330
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
3232
3331
  }
3233
- return true;
3234
- } else if (
3235
- tag_in(token, kStartTag, &(const TagSet){TAG(OPTION), TAG(OPTGROUP)})
3332
+ return;
3333
+ }
3334
+ if (
3335
+ tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
3236
3336
  ) {
3237
3337
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3238
3338
  pop_current_node(parser);
3239
3339
  }
3240
3340
  reconstruct_active_formatting_elements(parser);
3241
3341
  insert_element_from_token(parser, token);
3242
- return true;
3243
- } else if (
3244
- tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})
3245
- ) {
3246
- bool success = true;
3247
- GumboTag exception = tag_in(token, kStartTag, &(const TagSet){TAG(RT), TAG(RP)})
3248
- ? GUMBO_TAG_RTC
3249
- : GUMBO_TAG_LAST
3250
- ;
3342
+ return;
3343
+ }
3344
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
3251
3345
  if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3252
- generate_implied_end_tags(parser, exception);
3346
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3347
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY))
3348
+ parser_add_parse_error(parser, token);
3253
3349
  }
3254
- if (
3255
- !node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)
3256
- && !(
3257
- exception == GUMBO_TAG_LAST
3258
- || node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC)
3259
- )
3260
- ) {
3261
- parser_add_parse_error(parser, token);
3262
- success = false;
3350
+ insert_element_from_token(parser, token);
3351
+ return;
3352
+ }
3353
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
3354
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3355
+ generate_implied_end_tags(parser, GUMBO_TAG_RTC, NULL);
3356
+ GumboNode* current = get_current_node(parser);
3357
+ if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
3358
+ !node_html_tag_is(current, GUMBO_TAG_RTC)) {
3359
+ parser_add_parse_error(parser, token);
3360
+ }
3263
3361
  }
3264
3362
  insert_element_from_token(parser, token);
3265
- return success;
3266
- } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3267
- parser_add_parse_error(parser, token);
3268
- reconstruct_active_formatting_elements(parser);
3269
- insert_element_of_tag_type (
3270
- parser,
3271
- GUMBO_TAG_BR,
3272
- GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3273
- );
3274
- pop_current_node(parser);
3275
- return false;
3276
- } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3363
+ return;
3364
+ }
3365
+ if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3277
3366
  reconstruct_active_formatting_elements(parser);
3278
3367
  adjust_mathml_attributes(token);
3279
3368
  adjust_foreign_attributes(token);
@@ -3282,8 +3371,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3282
3371
  pop_current_node(parser);
3283
3372
  acknowledge_self_closing_tag(parser);
3284
3373
  }
3285
- return true;
3286
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3374
+ return;
3375
+ }
3376
+ if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3287
3377
  reconstruct_active_formatting_elements(parser);
3288
3378
  adjust_svg_attributes(token);
3289
3379
  adjust_foreign_attributes(token);
@@ -3292,8 +3382,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3292
3382
  pop_current_node(parser);
3293
3383
  acknowledge_self_closing_tag(parser);
3294
3384
  }
3295
- return true;
3296
- } else if (
3385
+ return;
3386
+ }
3387
+ if (
3297
3388
  tag_in(token, kStartTag, &(const TagSet) {
3298
3389
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
3299
3390
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3301,82 +3392,50 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
3301
3392
  ) {
3302
3393
  parser_add_parse_error(parser, token);
3303
3394
  ignore_token(parser);
3304
- return false;
3305
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3395
+ return;
3396
+ }
3397
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3306
3398
  reconstruct_active_formatting_elements(parser);
3307
3399
  insert_element_from_token(parser, token);
3308
- return true;
3309
- } else {
3310
- assert(token->type == GUMBO_TOKEN_END_TAG);
3311
- GumboTag end_tag = token->v.end_tag.tag;
3312
- const char *end_tagname = token->v.end_tag.name;
3313
- assert(state->_open_elements.length > 0);
3314
- assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3315
- // Walk up the stack of open elements until we find one that either:
3316
- // a) Matches the tag name we saw
3317
- // b) Is in the "special" category.
3318
- // If we see a), implicitly close everything up to and including it. If we
3319
- // see b), then record a parse error, don't close anything (except the
3320
- // implied end tags) and ignore the end tag token.
3321
- for (int i = state->_open_elements.length; --i >= 0;) {
3322
- const GumboNode* node = state->_open_elements.data[i];
3323
- if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, end_tag, end_tagname)) {
3324
- generate_implied_end_tags(parser, end_tag);
3325
- // TODO(jdtang): Do I need to add a parse error here? The condition in
3326
- // the spec seems like it's the inverse of the loop condition above, and
3327
- // so would never fire.
3328
- // sfc: Yes, an error is needed here.
3329
- // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example.
3330
- // foo is the "current node" but sarcasm is node.
3331
- // XXX: Write a test for this.
3332
- if (node != get_current_node(parser))
3333
- parser_add_parse_error(parser, token);
3334
- while (node != pop_current_node(parser))
3335
- ; // Pop everything.
3336
- return true;
3337
- } else if (is_special_node(node)) {
3338
- parser_add_parse_error(parser, token);
3339
- ignore_token(parser);
3340
- return false;
3341
- }
3342
- }
3343
- // <html> is in the special category, so we should never get here.
3344
- assert(0);
3345
- return false;
3400
+ return;
3346
3401
  }
3402
+ in_body_any_other_end_tag(parser, token);
3347
3403
  }
3348
3404
 
3349
3405
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
3350
- static bool handle_text(GumboParser* parser, GumboToken* token) {
3406
+ static void handle_text(GumboParser* parser, GumboToken* token) {
3351
3407
  if (
3352
3408
  token->type == GUMBO_TOKEN_CHARACTER
3353
3409
  || token->type == GUMBO_TOKEN_WHITESPACE
3354
3410
  ) {
3355
3411
  insert_text_token(parser, token);
3356
- } else {
3357
- // We provide only bare-bones script handling that doesn't involve any of
3358
- // the parser-pause/already-started/script-nesting flags or re-entrant
3359
- // invocations of the tokenizer. Because the intended usage of this library
3360
- // is mostly for templating, refactoring, and static-analysis libraries, we
3361
- // provide the script body as a text-node child of the <script> element.
3362
- // This behavior doesn't support document.write of partial HTML elements,
3363
- // but should be adequate for almost all other scripting support.
3364
- if (token->type == GUMBO_TOKEN_EOF) {
3365
- parser_add_parse_error(parser, token);
3366
- parser->_parser_state->_reprocess_current_token = true;
3367
- }
3368
- pop_current_node(parser);
3369
- set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3412
+ return;
3370
3413
  }
3371
- return true;
3414
+ // We provide only bare-bones script handling that doesn't involve any of
3415
+ // the parser-pause/already-started/script-nesting flags or re-entrant
3416
+ // invocations of the tokenizer. Because the intended usage of this library
3417
+ // is mostly for templating, refactoring, and static-analysis libraries, we
3418
+ // provide the script body as a text-node child of the <script> element.
3419
+ // This behavior doesn't support document.write of partial HTML elements,
3420
+ // but should be adequate for almost all other scripting support.
3421
+ if (token->type == GUMBO_TOKEN_EOF) {
3422
+ parser_add_parse_error(parser, token);
3423
+ parser->_parser_state->_reprocess_current_token = true;
3424
+ }
3425
+ pop_current_node(parser);
3426
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3372
3427
  }
3373
3428
 
3374
3429
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
3375
- static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3430
+ static void handle_in_table(GumboParser* parser, GumboToken* token) {
3376
3431
  GumboParserState* state = parser->_parser_state;
3377
3432
  if (
3378
- token->type == GUMBO_TOKEN_CHARACTER
3379
- || token->type == GUMBO_TOKEN_WHITESPACE
3433
+ (token->type == GUMBO_TOKEN_CHARACTER
3434
+ || token->type == GUMBO_TOKEN_WHITESPACE
3435
+ || token->type == GUMBO_TOKEN_NULL)
3436
+ && node_tag_in_set(get_current_node(parser), &(const TagSet) {
3437
+ TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3438
+ })
3380
3439
  ) {
3381
3440
  // The "pending table character tokens" list described in the spec is
3382
3441
  // nothing more than the TextNodeBufferState. We accumulate text tokens as
@@ -3384,71 +3443,87 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3384
3443
  // we set _foster_parent_insertions if there're non-whitespace characters in
3385
3444
  // the buffer.
3386
3445
  assert(state->_text_node._buffer.length == 0);
3446
+ assert(state->_table_character_tokens.length == 0);
3387
3447
  state->_original_insertion_mode = state->_insertion_mode;
3388
3448
  state->_reprocess_current_token = true;
3389
3449
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3390
- return true;
3391
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3450
+ return;
3451
+ }
3452
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3453
+ append_comment_node(parser, get_current_node(parser), token);
3454
+ return;
3455
+ }
3456
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3392
3457
  parser_add_parse_error(parser, token);
3393
3458
  ignore_token(parser);
3394
- return false;
3395
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3396
- append_comment_node(parser, get_current_node(parser), token);
3397
- return true;
3398
- } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3459
+ return;
3460
+ }
3461
+ if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3399
3462
  clear_stack_to_table_context(parser);
3400
3463
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3401
3464
  insert_element_from_token(parser, token);
3402
3465
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3403
- return true;
3404
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3466
+ return;
3467
+ }
3468
+ if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3405
3469
  clear_stack_to_table_context(parser);
3406
3470
  insert_element_from_token(parser, token);
3407
3471
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3408
- return true;
3409
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3472
+ return;
3473
+ }
3474
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3410
3475
  clear_stack_to_table_context(parser);
3411
3476
  insert_element_of_tag_type (
3412
3477
  parser,
3413
3478
  GUMBO_TAG_COLGROUP,
3414
3479
  GUMBO_INSERTION_IMPLIED
3415
3480
  );
3416
- parser->_parser_state->_reprocess_current_token = true;
3481
+ state->_reprocess_current_token = true;
3417
3482
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3418
- return true;
3419
- } else if (
3483
+ return;
3484
+ }
3485
+ if (
3420
3486
  tag_in(token, kStartTag, &(const TagSet) {
3421
- TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD), TAG(TH), TAG(TR)
3487
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3422
3488
  })
3423
3489
  ) {
3424
3490
  clear_stack_to_table_context(parser);
3491
+ insert_element_from_token(parser, token);
3425
3492
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3426
- if (tag_in(token, kStartTag, &(const TagSet){TAG(TD), TAG(TH), TAG(TR)})) {
3427
- insert_element_of_tag_type (
3428
- parser,
3429
- GUMBO_TAG_TBODY,
3430
- GUMBO_INSERTION_IMPLIED
3431
- );
3432
- state->_reprocess_current_token = true;
3433
- } else {
3434
- insert_element_from_token(parser, token);
3435
- }
3436
- return true;
3437
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3493
+ return;
3494
+ }
3495
+ if (
3496
+ tag_in(token, kStartTag, &(const TagSet) {
3497
+ TAG(TD), TAG(TH), TAG(TR)
3498
+ })
3499
+ ) {
3500
+ clear_stack_to_table_context(parser);
3501
+ insert_element_of_tag_type (
3502
+ parser,
3503
+ GUMBO_TAG_TBODY,
3504
+ GUMBO_INSERTION_IMPLIED
3505
+ );
3506
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3507
+ state->_reprocess_current_token = true;
3508
+ return;
3509
+ }
3510
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3438
3511
  parser_add_parse_error(parser, token);
3439
3512
  if (close_table(parser)) {
3440
- parser->_parser_state->_reprocess_current_token = true;
3513
+ state->_reprocess_current_token = true;
3441
3514
  } else {
3442
3515
  ignore_token(parser);
3443
3516
  }
3444
- return false;
3445
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3517
+ return;
3518
+ }
3519
+ if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3446
3520
  if (!close_table(parser)) {
3447
3521
  parser_add_parse_error(parser, token);
3448
- return false;
3522
+ return;
3449
3523
  }
3450
- return true;
3451
- } else if (
3524
+ return;
3525
+ }
3526
+ if (
3452
3527
  tag_in(token, kEndTag, &(const TagSet) {
3453
3528
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3454
3529
  TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3456,102 +3531,103 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3456
3531
  ) {
3457
3532
  parser_add_parse_error(parser, token);
3458
3533
  ignore_token(parser);
3459
- return false;
3460
- } else if (
3534
+ return;
3535
+ }
3536
+ if (
3461
3537
  tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
3462
3538
  || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
3463
3539
  ) {
3464
- return handle_in_head(parser, token);
3465
- } else if (
3540
+ handle_in_head(parser, token);
3541
+ return;
3542
+ }
3543
+ if (
3466
3544
  tag_is(token, kStartTag, GUMBO_TAG_INPUT)
3467
3545
  && attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
3468
3546
  ) {
3469
3547
  parser_add_parse_error(parser, token);
3470
3548
  insert_element_from_token(parser, token);
3471
3549
  pop_current_node(parser);
3472
- return false;
3473
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3550
+ acknowledge_self_closing_tag(parser);
3551
+ return;
3552
+ }
3553
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3474
3554
  parser_add_parse_error(parser, token);
3475
3555
  if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3476
3556
  ignore_token(parser);
3477
- return false;
3557
+ return;
3478
3558
  }
3479
3559
  state->_form_element = insert_element_from_token(parser, token);
3480
3560
  pop_current_node(parser);
3481
- return false;
3482
- } else if (token->type == GUMBO_TOKEN_EOF) {
3483
- return handle_in_body(parser, token);
3484
- } else {
3485
- parser_add_parse_error(parser, token);
3486
- state->_foster_parent_insertions = true;
3487
- bool result = handle_in_body(parser, token);
3488
- state->_foster_parent_insertions = false;
3489
- return result;
3561
+ return;
3562
+ }
3563
+ if (token->type == GUMBO_TOKEN_EOF) {
3564
+ handle_in_body(parser, token);
3565
+ return;
3490
3566
  }
3567
+ // foster-parenting-start-tag or foster-parenting-end-tag error
3568
+ parser_add_parse_error(parser, token);
3569
+ state->_foster_parent_insertions = true;
3570
+ handle_in_body(parser, token);
3571
+ state->_foster_parent_insertions = false;
3491
3572
  }
3492
3573
 
3493
3574
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
3494
- static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3575
+ static void handle_in_table_text(GumboParser* parser, GumboToken* token) {
3495
3576
  if (token->type == GUMBO_TOKEN_NULL) {
3496
3577
  parser_add_parse_error(parser, token);
3497
3578
  ignore_token(parser);
3498
- return false;
3499
- } else if (
3500
- token->type == GUMBO_TOKEN_CHARACTER
3501
- || token->type == GUMBO_TOKEN_WHITESPACE
3502
- ) {
3579
+ return;
3580
+ }
3581
+ GumboParserState* state = parser->_parser_state;
3582
+ // Non-whitespace tokens will cause parse errors later.
3583
+ // It's not entirely clear from the spec how this is supposed to work.
3584
+ // https://github.com/whatwg/html/issues/4046
3585
+ if (token->type == GUMBO_TOKEN_WHITESPACE
3586
+ || token->type == GUMBO_TOKEN_CHARACTER) {
3503
3587
  insert_text_token(parser, token);
3504
- return true;
3505
- } else {
3506
- GumboParserState* state = parser->_parser_state;
3507
- GumboStringBuffer* buffer = &state->_text_node._buffer;
3508
- const char* data = buffer->data;
3509
- // Note that TextNodeBuffer may contain UTF-8 characters, but the
3510
- // presence of any one byte that is not whitespace means we flip
3511
- // the flag, so this loop is still valid.
3588
+ gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
3589
+ return;
3590
+ }
3591
+
3592
+ GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
3593
+ if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
3594
+ // Each character in buffer is an error. Unfortunately, that means we need
3595
+ // to emit a bunch of errors at the appropriate locations.
3512
3596
  for (size_t i = 0, n = buffer->length; i < n; ++i) {
3513
- switch (data[i]) {
3514
- case '\t':
3515
- case '\n':
3516
- case '\f':
3517
- case '\r':
3518
- case ' ':
3519
- continue;
3520
- default:
3521
- state->_foster_parent_insertions = true;
3522
- reconstruct_active_formatting_elements(parser);
3523
- goto loopbreak;
3524
- }
3597
+ GumboToken tok;
3598
+ gumbo_character_token_buffer_get(buffer, i, &tok);
3599
+ // foster-parenting-character error
3600
+ parser_add_parse_error(parser, &tok);
3525
3601
  }
3526
- loopbreak:
3527
- maybe_flush_text_node_buffer(parser);
3528
- state->_foster_parent_insertions = false;
3529
- state->_reprocess_current_token = true;
3530
- state->_insertion_mode = state->_original_insertion_mode;
3531
- return true;
3602
+ state->_foster_parent_insertions = true;
3603
+ set_frameset_not_ok(parser);
3604
+ reconstruct_active_formatting_elements(parser);
3532
3605
  }
3606
+ maybe_flush_text_node_buffer(parser);
3607
+ gumbo_character_token_buffer_clear(buffer);
3608
+ state->_foster_parent_insertions = false;
3609
+ state->_reprocess_current_token = true;
3610
+ state->_insertion_mode = state->_original_insertion_mode;
3533
3611
  }
3534
3612
 
3535
3613
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
3536
- static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3614
+ static void handle_in_caption(GumboParser* parser, GumboToken* token) {
3537
3615
  if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3538
3616
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3539
3617
  parser_add_parse_error(parser, token);
3540
3618
  ignore_token(parser);
3541
- return false;
3542
- } else {
3543
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3544
- bool result = true;
3545
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3546
- parser_add_parse_error(parser, token);
3547
- }
3548
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3549
- ;
3550
- clear_active_formatting_elements(parser);
3551
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3552
- return result;
3619
+ return;
3553
3620
  }
3554
- } else if (
3621
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3622
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
3623
+ parser_add_parse_error(parser, token);
3624
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3625
+ ;
3626
+ clear_active_formatting_elements(parser);
3627
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3628
+ return;
3629
+ }
3630
+ if (
3555
3631
  tag_in(token, kStartTag, &(const TagSet) {
3556
3632
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3557
3633
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3561,15 +3637,19 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3561
3637
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3562
3638
  parser_add_parse_error(parser, token);
3563
3639
  ignore_token(parser);
3564
- return false;
3640
+ return;
3565
3641
  }
3642
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3643
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
3644
+ parser_add_parse_error(parser, token);
3566
3645
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3567
3646
  ;
3568
3647
  clear_active_formatting_elements(parser);
3569
3648
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3570
3649
  parser->_parser_state->_reprocess_current_token = true;
3571
- return true;
3572
- } else if (
3650
+ return;
3651
+ }
3652
+ if (
3573
3653
  tag_in(token, kEndTag, &(const TagSet) {
3574
3654
  TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
3575
3655
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3577,91 +3657,102 @@ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3577
3657
  ) {
3578
3658
  parser_add_parse_error(parser, token);
3579
3659
  ignore_token(parser);
3580
- return false;
3581
- } else {
3582
- return handle_in_body(parser, token);
3660
+ return;
3583
3661
  }
3662
+ handle_in_body(parser, token);
3584
3663
  }
3585
3664
 
3586
3665
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
3587
- static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3666
+ static void handle_in_column_group(GumboParser* parser, GumboToken* token) {
3588
3667
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
3589
3668
  insert_text_token(parser, token);
3590
- return true;
3591
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3669
+ return;
3670
+ }
3671
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3672
+ append_comment_node(parser, get_current_node(parser), token);
3673
+ return;
3674
+ }
3675
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3592
3676
  parser_add_parse_error(parser, token);
3593
3677
  ignore_token(parser);
3594
- return false;
3595
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3596
- append_comment_node(parser, get_current_node(parser), token);
3597
- return true;
3598
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3599
- return handle_in_body(parser, token);
3600
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3678
+ return;
3679
+ }
3680
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3681
+ handle_in_body(parser, token);
3682
+ return;
3683
+ }
3684
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3601
3685
  insert_element_from_token(parser, token);
3602
3686
  pop_current_node(parser);
3603
3687
  acknowledge_self_closing_tag(parser);
3604
- return true;
3605
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3688
+ return;
3689
+ }
3690
+ if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3606
3691
  if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3607
3692
  parser_add_parse_error(parser, token);
3608
3693
  ignore_token(parser);
3609
- return false;
3694
+ return;
3610
3695
  }
3611
3696
  pop_current_node(parser);
3612
3697
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3613
- return false;
3614
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3698
+ return;
3699
+ }
3700
+ if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3615
3701
  parser_add_parse_error(parser, token);
3616
3702
  ignore_token(parser);
3617
- return false;
3618
- } else if (
3703
+ return;
3704
+ }
3705
+ if (
3619
3706
  tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
3620
3707
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3621
3708
  ) {
3622
- return handle_in_head(parser, token);
3623
- } else if (token->type == GUMBO_TOKEN_EOF) {
3624
- return handle_in_body(parser, token);
3625
- } else {
3626
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3627
- parser_add_parse_error(parser, token);
3628
- ignore_token(parser);
3629
- return false;
3630
- }
3631
- pop_current_node(parser);
3632
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3633
- parser->_parser_state->_reprocess_current_token = true;
3634
- return true;
3709
+ handle_in_head(parser, token);
3710
+ return;
3711
+ }
3712
+ if (token->type == GUMBO_TOKEN_EOF) {
3713
+ handle_in_body(parser, token);
3714
+ return;
3635
3715
  }
3716
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3717
+ parser_add_parse_error(parser, token);
3718
+ ignore_token(parser);
3719
+ return;
3720
+ }
3721
+ pop_current_node(parser);
3722
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3723
+ parser->_parser_state->_reprocess_current_token = true;
3636
3724
  }
3637
3725
 
3638
3726
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
3639
- static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3727
+ static void handle_in_table_body(GumboParser* parser, GumboToken* token) {
3640
3728
  if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3641
3729
  clear_stack_to_table_body_context(parser);
3642
3730
  insert_element_from_token(parser, token);
3643
3731
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3644
- return true;
3645
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
3732
+ return;
3733
+ }
3734
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3646
3735
  parser_add_parse_error(parser, token);
3647
3736
  clear_stack_to_table_body_context(parser);
3648
3737
  insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3649
- parser->_parser_state->_reprocess_current_token = true;
3650
3738
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3651
- return false;
3652
- } else if (
3739
+ parser->_parser_state->_reprocess_current_token = true;
3740
+ return;
3741
+ }
3742
+ if (
3653
3743
  tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3654
3744
  ) {
3655
3745
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3656
3746
  parser_add_parse_error(parser, token);
3657
3747
  ignore_token(parser);
3658
- return false;
3748
+ return;
3659
3749
  }
3660
3750
  clear_stack_to_table_body_context(parser);
3661
3751
  pop_current_node(parser);
3662
3752
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3663
- return true;
3664
- } else if (
3753
+ return;
3754
+ }
3755
+ if (
3665
3756
  tag_in(token, kStartTag, &(const TagSet) {
3666
3757
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3667
3758
  TAG(THEAD)
@@ -3677,47 +3768,48 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3677
3768
  ) {
3678
3769
  parser_add_parse_error(parser, token);
3679
3770
  ignore_token(parser);
3680
- return false;
3771
+ return;
3681
3772
  }
3682
3773
  clear_stack_to_table_body_context(parser);
3683
3774
  pop_current_node(parser);
3684
3775
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3685
3776
  parser->_parser_state->_reprocess_current_token = true;
3686
- return true;
3687
- } else if (
3777
+ return;
3778
+ }
3779
+ if (
3688
3780
  tag_in(token, kEndTag, &(const TagSet) {
3689
- TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR), TAG(COLGROUP),
3690
- TAG(HTML), TAG(TD), TAG(TH)
3781
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
3782
+ TAG(TH), TAG(TR)
3691
3783
  })
3692
3784
  ) {
3693
3785
  parser_add_parse_error(parser, token);
3694
3786
  ignore_token(parser);
3695
- return false;
3696
- } else {
3697
- return handle_in_table(parser, token);
3787
+ return;
3698
3788
  }
3789
+ handle_in_table(parser, token);
3699
3790
  }
3700
3791
 
3701
3792
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
3702
- static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3793
+ static void handle_in_row(GumboParser* parser, GumboToken* token) {
3703
3794
  if (tag_in(token, kStartTag, &td_th_tags)) {
3704
3795
  clear_stack_to_table_row_context(parser);
3705
3796
  insert_element_from_token(parser, token);
3706
3797
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3707
3798
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3708
- return true;
3709
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3799
+ return;
3800
+ }
3801
+ if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3710
3802
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3711
3803
  parser_add_parse_error(parser, token);
3712
3804
  ignore_token(parser);
3713
- return false;
3714
- } else {
3715
- clear_stack_to_table_row_context(parser);
3716
- pop_current_node(parser);
3717
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3718
- return true;
3805
+ return;
3719
3806
  }
3720
- } else if (
3807
+ clear_stack_to_table_row_context(parser);
3808
+ pop_current_node(parser);
3809
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3810
+ return;
3811
+ }
3812
+ if (
3721
3813
  tag_in(token, kStartTag, &(const TagSet) {
3722
3814
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3723
3815
  TAG(THEAD), TAG(TR)
@@ -3727,32 +3819,33 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3727
3819
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3728
3820
  parser_add_parse_error(parser, token);
3729
3821
  ignore_token(parser);
3730
- return false;
3731
- } else {
3732
- clear_stack_to_table_row_context(parser);
3733
- pop_current_node(parser);
3734
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3735
- parser->_parser_state->_reprocess_current_token = true;
3736
- return true;
3822
+ return;
3737
3823
  }
3738
- } else if (
3824
+ clear_stack_to_table_row_context(parser);
3825
+ pop_current_node(parser);
3826
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3827
+ parser->_parser_state->_reprocess_current_token = true;
3828
+ return;
3829
+ }
3830
+ if (
3739
3831
  tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3740
3832
  ) {
3741
- if (
3742
- !has_an_element_in_table_scope(parser, token->v.end_tag.tag)
3743
- || !has_an_element_in_table_scope(parser, GUMBO_TAG_TR)
3744
- ) {
3833
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3745
3834
  parser_add_parse_error(parser, token);
3746
3835
  ignore_token(parser);
3747
- return false;
3748
- } else {
3749
- clear_stack_to_table_row_context(parser);
3750
- pop_current_node(parser);
3751
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3752
- parser->_parser_state->_reprocess_current_token = true;
3753
- return true;
3836
+ return;
3837
+ }
3838
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3839
+ ignore_token(parser);
3840
+ return;
3754
3841
  }
3755
- } else if (
3842
+ clear_stack_to_table_row_context(parser);
3843
+ pop_current_node(parser);
3844
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3845
+ parser->_parser_state->_reprocess_current_token = true;
3846
+ return;
3847
+ }
3848
+ if (
3756
3849
  tag_in(token, kEndTag, &(const TagSet) {
3757
3850
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3758
3851
  TAG(TD), TAG(TH)
@@ -3760,23 +3853,24 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3760
3853
  ) {
3761
3854
  parser_add_parse_error(parser, token);
3762
3855
  ignore_token(parser);
3763
- return false;
3764
- } else {
3765
- return handle_in_table(parser, token);
3856
+ return;
3766
3857
  }
3858
+ handle_in_table(parser, token);
3767
3859
  }
3768
3860
 
3769
3861
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
3770
- static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3862
+ static void handle_in_cell(GumboParser* parser, GumboToken* token) {
3771
3863
  if (tag_in(token, kEndTag, &td_th_tags)) {
3772
3864
  GumboTag token_tag = token->v.end_tag.tag;
3773
3865
  if (!has_an_element_in_table_scope(parser, token_tag)) {
3774
3866
  parser_add_parse_error(parser, token);
3775
3867
  ignore_token(parser);
3776
- return false;
3868
+ return;
3777
3869
  }
3778
- return close_table_cell(parser, token, token_tag);
3779
- } else if (
3870
+ close_table_cell(parser, token, token_tag);
3871
+ return;
3872
+ }
3873
+ if (
3780
3874
  tag_in(token, kStartTag, &(const TagSet) {
3781
3875
  TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3782
3876
  TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
@@ -3790,19 +3884,22 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3790
3884
  gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3791
3885
  parser_add_parse_error(parser, token);
3792
3886
  ignore_token(parser);
3793
- return false;
3887
+ return;
3794
3888
  }
3795
3889
  parser->_parser_state->_reprocess_current_token = true;
3796
- return close_current_cell(parser, token);
3797
- } else if (
3890
+ close_current_cell(parser, token);
3891
+ return;
3892
+ }
3893
+ if (
3798
3894
  tag_in(token, kEndTag, &(const TagSet) {
3799
3895
  TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
3800
3896
  })
3801
3897
  ) {
3802
3898
  parser_add_parse_error(parser, token);
3803
3899
  ignore_token(parser);
3804
- return false;
3805
- } else if (
3900
+ return;
3901
+ }
3902
+ if (
3806
3903
  tag_in(token, kEndTag, &(const TagSet) {
3807
3904
  TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3808
3905
  })
@@ -3810,43 +3907,50 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3810
3907
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3811
3908
  parser_add_parse_error(parser, token);
3812
3909
  ignore_token(parser);
3813
- return false;
3910
+ return;
3814
3911
  }
3815
3912
  parser->_parser_state->_reprocess_current_token = true;
3816
- return close_current_cell(parser, token);
3817
- } else {
3818
- return handle_in_body(parser, token);
3913
+ close_current_cell(parser, token);
3914
+ return;
3819
3915
  }
3916
+ handle_in_body(parser, token);
3820
3917
  }
3821
3918
 
3822
3919
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
3823
- static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3920
+ static void handle_in_select(GumboParser* parser, GumboToken* token) {
3824
3921
  if (token->type == GUMBO_TOKEN_NULL) {
3825
3922
  parser_add_parse_error(parser, token);
3826
3923
  ignore_token(parser);
3827
- return false;
3828
- } else if (
3924
+ return;
3925
+ }
3926
+ if (
3829
3927
  token->type == GUMBO_TOKEN_CHARACTER
3830
3928
  || token->type == GUMBO_TOKEN_WHITESPACE
3831
3929
  ) {
3832
3930
  insert_text_token(parser, token);
3833
- return true;
3834
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3931
+ return;
3932
+ }
3933
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3934
+ append_comment_node(parser, get_current_node(parser), token);
3935
+ return;
3936
+ }
3937
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3835
3938
  parser_add_parse_error(parser, token);
3836
3939
  ignore_token(parser);
3837
- return false;
3838
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3839
- append_comment_node(parser, get_current_node(parser), token);
3840
- return true;
3841
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3842
- return handle_in_body(parser, token);
3843
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3940
+ return;
3941
+ }
3942
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3943
+ handle_in_body(parser, token);
3944
+ return;
3945
+ }
3946
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3844
3947
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3845
3948
  pop_current_node(parser);
3846
3949
  }
3847
3950
  insert_element_from_token(parser, token);
3848
- return true;
3849
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3951
+ return;
3952
+ }
3953
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3850
3954
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3851
3955
  pop_current_node(parser);
3852
3956
  }
@@ -3854,8 +3958,9 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3854
3958
  pop_current_node(parser);
3855
3959
  }
3856
3960
  insert_element_from_token(parser, token);
3857
- return true;
3858
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3961
+ return;
3962
+ }
3963
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3859
3964
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
3860
3965
  if (
3861
3966
  node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
@@ -3868,37 +3973,39 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3868
3973
  }
3869
3974
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3870
3975
  pop_current_node(parser);
3871
- return true;
3872
- } else {
3873
- parser_add_parse_error(parser, token);
3874
- ignore_token(parser);
3875
- return false;
3976
+ return;
3876
3977
  }
3877
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3978
+ parser_add_parse_error(parser, token);
3979
+ ignore_token(parser);
3980
+ return;
3981
+ }
3982
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3878
3983
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3879
3984
  pop_current_node(parser);
3880
- return true;
3881
- } else {
3882
- parser_add_parse_error(parser, token);
3883
- ignore_token(parser);
3884
- return false;
3985
+ return;
3885
3986
  }
3886
- } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3987
+ parser_add_parse_error(parser, token);
3988
+ ignore_token(parser);
3989
+ return;
3990
+ }
3991
+ if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3887
3992
  if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3888
3993
  parser_add_parse_error(parser, token);
3889
3994
  ignore_token(parser);
3890
- return false;
3995
+ return;
3891
3996
  }
3892
3997
  close_current_select(parser);
3893
- return true;
3894
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3998
+ return;
3999
+ }
4000
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3895
4001
  parser_add_parse_error(parser, token);
3896
4002
  ignore_token(parser);
3897
4003
  if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3898
4004
  close_current_select(parser);
3899
4005
  }
3900
- return false;
3901
- } else if (
4006
+ return;
4007
+ }
4008
+ if (
3902
4009
  tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
3903
4010
  ) {
3904
4011
  parser_add_parse_error(parser, token);
@@ -3908,23 +4015,25 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3908
4015
  close_current_select(parser);
3909
4016
  parser->_parser_state->_reprocess_current_token = true;
3910
4017
  }
3911
- return false;
3912
- } else if (
4018
+ return;
4019
+ }
4020
+ if (
3913
4021
  tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
3914
4022
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3915
4023
  ) {
3916
- return handle_in_head(parser, token);
3917
- } else if (token->type == GUMBO_TOKEN_EOF) {
3918
- return handle_in_body(parser, token);
3919
- } else {
3920
- parser_add_parse_error(parser, token);
3921
- ignore_token(parser);
3922
- return false;
4024
+ handle_in_head(parser, token);
4025
+ return;
3923
4026
  }
4027
+ if (token->type == GUMBO_TOKEN_EOF) {
4028
+ handle_in_body(parser, token);
4029
+ return;
4030
+ }
4031
+ parser_add_parse_error(parser, token);
4032
+ ignore_token(parser);
3924
4033
  }
3925
4034
 
3926
4035
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
3927
- static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
4036
+ static void handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3928
4037
  static const TagSet tags = {
3929
4038
  TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
3930
4039
  TAG(TR), TAG(TD), TAG(TH)
@@ -3933,27 +4042,23 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3933
4042
  parser_add_parse_error(parser, token);
3934
4043
  close_current_select(parser);
3935
4044
  parser->_parser_state->_reprocess_current_token = true;
3936
- return false;
3937
- } else if (tag_in(token, kEndTag, &tags)) {
4045
+ return;
4046
+ }
4047
+ if (tag_in(token, kEndTag, &tags)) {
3938
4048
  parser_add_parse_error(parser, token);
3939
4049
  if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3940
4050
  ignore_token(parser);
3941
- return false;
3942
- } else {
3943
- close_current_select(parser);
3944
- // close_current_select already does the
3945
- // reset_insertion_mode_appropriately
3946
- // reset_insertion_mode_appropriately(parser);
3947
- parser->_parser_state->_reprocess_current_token = true;
3948
- return false;
4051
+ return;
3949
4052
  }
3950
- } else {
3951
- return handle_in_select(parser, token);
4053
+ close_current_select(parser);
4054
+ parser->_parser_state->_reprocess_current_token = true;
4055
+ return;
3952
4056
  }
4057
+ handle_in_select(parser, token);
3953
4058
  }
3954
4059
 
3955
4060
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
3956
- static bool handle_in_template(GumboParser* parser, GumboToken* token) {
4061
+ static void handle_in_template(GumboParser* parser, GumboToken* token) {
3957
4062
  GumboParserState* state = parser->_parser_state;
3958
4063
  switch (token->type) {
3959
4064
  case GUMBO_TOKEN_WHITESPACE:
@@ -3961,7 +4066,8 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3961
4066
  case GUMBO_TOKEN_COMMENT:
3962
4067
  case GUMBO_TOKEN_NULL:
3963
4068
  case GUMBO_TOKEN_DOCTYPE:
3964
- return handle_in_body(parser, token);
4069
+ handle_in_body(parser, token);
4070
+ return;
3965
4071
  default:
3966
4072
  break;
3967
4073
  }
@@ -3972,8 +4078,10 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3972
4078
  })
3973
4079
  || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3974
4080
  ) {
3975
- return handle_in_head(parser, token);
3976
- } else if (
4081
+ handle_in_head(parser, token);
4082
+ return;
4083
+ }
4084
+ if (
3977
4085
  tag_in(token, kStartTag, &(const TagSet) {
3978
4086
  TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3979
4087
  })
@@ -3982,39 +4090,45 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3982
4090
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3983
4091
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3984
4092
  state->_reprocess_current_token = true;
3985
- return true;
3986
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
4093
+ return;
4094
+ }
4095
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3987
4096
  pop_template_insertion_mode(parser);
3988
4097
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3989
4098
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3990
4099
  state->_reprocess_current_token = true;
3991
- return true;
3992
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
4100
+ return;
4101
+ }
4102
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3993
4103
  pop_template_insertion_mode(parser);
3994
4104
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3995
4105
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3996
4106
  state->_reprocess_current_token = true;
3997
- return true;
3998
- } else if (tag_in(token, kStartTag, &td_th_tags)) {
4107
+ return;
4108
+ }
4109
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3999
4110
  pop_template_insertion_mode(parser);
4000
4111
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4001
4112
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4002
4113
  state->_reprocess_current_token = true;
4003
- return true;
4004
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
4114
+ return;
4115
+ }
4116
+ if (token->type == GUMBO_TOKEN_START_TAG) {
4005
4117
  pop_template_insertion_mode(parser);
4006
4118
  push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4007
4119
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4008
4120
  state->_reprocess_current_token = true;
4009
- return true;
4010
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
4121
+ return;
4122
+ }
4123
+ if (token->type == GUMBO_TOKEN_END_TAG) {
4011
4124
  parser_add_parse_error(parser, token);
4012
4125
  ignore_token(parser);
4013
- return false;
4014
- } else if (token->type == GUMBO_TOKEN_EOF) {
4126
+ return;
4127
+ }
4128
+ if (token->type == GUMBO_TOKEN_EOF) {
4015
4129
  if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
4016
4130
  // Stop parsing.
4017
- return true;
4131
+ return;
4018
4132
  }
4019
4133
  parser_add_parse_error(parser, token);
4020
4134
  while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
@@ -4023,35 +4137,41 @@ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
4023
4137
  pop_template_insertion_mode(parser);
4024
4138
  reset_insertion_mode_appropriately(parser);
4025
4139
  state->_reprocess_current_token = true;
4026
- return false;
4027
- } else {
4028
- assert(0);
4029
- return false;
4140
+ return;
4030
4141
  }
4142
+ assert(0 && "unreachable");
4031
4143
  }
4032
4144
 
4033
4145
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
4034
- static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4146
+ static void handle_after_body(GumboParser* parser, GumboToken* token) {
4035
4147
  if (
4036
4148
  token->type == GUMBO_TOKEN_WHITESPACE
4037
4149
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4038
4150
  ) {
4039
- return handle_in_body(parser, token);
4040
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4151
+ handle_in_body(parser, token);
4152
+ return;
4153
+ }
4154
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4041
4155
  GumboNode* html_node = parser->_output->root;
4042
4156
  assert(html_node != NULL);
4043
4157
  append_comment_node(parser, html_node, token);
4044
- return true;
4045
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4158
+ return;
4159
+ }
4160
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4046
4161
  parser_add_parse_error(parser, token);
4047
4162
  ignore_token(parser);
4048
- return false;
4049
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4163
+ return;
4164
+ }
4165
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4166
+ handle_in_body(parser, token);
4167
+ return;
4168
+ }
4169
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4050
4170
  /* fragment case: ignore the closing HTML token */
4051
4171
  if (is_fragment_parser(parser)) {
4052
4172
  parser_add_parse_error(parser, token);
4053
4173
  ignore_token(parser);
4054
- return false;
4174
+ return;
4055
4175
  }
4056
4176
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
4057
4177
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
@@ -4060,39 +4180,44 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
4060
4180
  parser->_parser_state->_current_token,
4061
4181
  &html->v.element
4062
4182
  );
4063
- return true;
4064
- } else if (token->type == GUMBO_TOKEN_EOF) {
4065
- return true;
4066
- } else {
4067
- parser_add_parse_error(parser, token);
4068
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4069
- parser->_parser_state->_reprocess_current_token = true;
4070
- return false;
4183
+ return;
4184
+ }
4185
+ if (token->type == GUMBO_TOKEN_EOF) {
4186
+ return;
4071
4187
  }
4188
+ parser_add_parse_error(parser, token);
4189
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4190
+ parser->_parser_state->_reprocess_current_token = true;
4072
4191
  }
4073
4192
 
4074
4193
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
4075
- static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4194
+ static void handle_in_frameset(GumboParser* parser, GumboToken* token) {
4076
4195
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4077
4196
  insert_text_token(parser, token);
4078
- return true;
4079
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4197
+ return;
4198
+ }
4199
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4080
4200
  append_comment_node(parser, get_current_node(parser), token);
4081
- return true;
4082
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4201
+ return;
4202
+ }
4203
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4083
4204
  parser_add_parse_error(parser, token);
4084
4205
  ignore_token(parser);
4085
- return false;
4086
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4087
- return handle_in_body(parser, token);
4088
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4206
+ return;
4207
+ }
4208
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4209
+ handle_in_body(parser, token);
4210
+ return;
4211
+ }
4212
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4089
4213
  insert_element_from_token(parser, token);
4090
- return true;
4091
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4214
+ return;
4215
+ }
4216
+ if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4092
4217
  if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4093
4218
  parser_add_parse_error(parser, token);
4094
4219
  ignore_token(parser);
4095
- return false;
4220
+ return;
4096
4221
  }
4097
4222
  pop_current_node(parser);
4098
4223
  if (
@@ -4101,42 +4226,47 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
4101
4226
  ) {
4102
4227
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
4103
4228
  }
4104
- return true;
4105
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4229
+ return;
4230
+ }
4231
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4106
4232
  insert_element_from_token(parser, token);
4107
4233
  pop_current_node(parser);
4108
4234
  acknowledge_self_closing_tag(parser);
4109
- return true;
4110
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4111
- return handle_in_head(parser, token);
4112
- } else if (token->type == GUMBO_TOKEN_EOF) {
4113
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4235
+ return;
4236
+ }
4237
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4238
+ handle_in_head(parser, token);
4239
+ return;
4240
+ }
4241
+ if (token->type == GUMBO_TOKEN_EOF) {
4242
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML))
4114
4243
  parser_add_parse_error(parser, token);
4115
- return false;
4116
- }
4117
- return true;
4118
- } else {
4119
- parser_add_parse_error(parser, token);
4120
- ignore_token(parser);
4121
- return false;
4244
+ return;
4122
4245
  }
4246
+ parser_add_parse_error(parser, token);
4247
+ ignore_token(parser);
4123
4248
  }
4124
4249
 
4125
4250
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
4126
- static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4251
+ static void handle_after_frameset(GumboParser* parser, GumboToken* token) {
4127
4252
  if (token->type == GUMBO_TOKEN_WHITESPACE) {
4128
4253
  insert_text_token(parser, token);
4129
- return true;
4130
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
4254
+ return;
4255
+ }
4256
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4131
4257
  append_comment_node(parser, get_current_node(parser), token);
4132
- return true;
4133
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
4258
+ return;
4259
+ }
4260
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4134
4261
  parser_add_parse_error(parser, token);
4135
4262
  ignore_token(parser);
4136
- return false;
4137
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4138
- return handle_in_body(parser, token);
4139
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4263
+ return;
4264
+ }
4265
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4266
+ handle_in_body(parser, token);
4267
+ return;
4268
+ }
4269
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4140
4270
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
4141
4271
  assert(node_html_tag_is(html, GUMBO_TAG_HTML));
4142
4272
  record_end_of_element (
@@ -4144,67 +4274,71 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
4144
4274
  &html->v.element
4145
4275
  );
4146
4276
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
4147
- return true;
4148
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4277
+ return;
4278
+ }
4279
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4149
4280
  return handle_in_head(parser, token);
4150
- } else if (token->type == GUMBO_TOKEN_EOF) {
4151
- return true;
4152
- } else {
4153
- parser_add_parse_error(parser, token);
4154
- ignore_token(parser);
4155
- return false;
4156
4281
  }
4282
+ if (token->type == GUMBO_TOKEN_EOF) {
4283
+ return;
4284
+ }
4285
+ parser_add_parse_error(parser, token);
4286
+ ignore_token(parser);
4157
4287
  }
4158
4288
 
4159
4289
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
4160
- static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
4290
+ static void handle_after_after_body(GumboParser* parser, GumboToken* token) {
4161
4291
  if (token->type == GUMBO_TOKEN_COMMENT) {
4162
4292
  append_comment_node(parser, get_document_node(parser), token);
4163
- return true;
4164
- } else if (
4293
+ return;
4294
+ }
4295
+ if (
4165
4296
  token->type == GUMBO_TOKEN_DOCTYPE
4166
4297
  || token->type == GUMBO_TOKEN_WHITESPACE
4167
4298
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4168
4299
  ) {
4169
- return handle_in_body(parser, token);
4170
- } else if (token->type == GUMBO_TOKEN_EOF) {
4171
- return true;
4172
- } else {
4173
- parser_add_parse_error(parser, token);
4174
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4175
- parser->_parser_state->_reprocess_current_token = true;
4176
- return false;
4300
+ handle_in_body(parser, token);
4301
+ return;
4177
4302
  }
4303
+ if (token->type == GUMBO_TOKEN_EOF) {
4304
+ return;
4305
+ }
4306
+ parser_add_parse_error(parser, token);
4307
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4308
+ parser->_parser_state->_reprocess_current_token = true;
4178
4309
  }
4179
4310
 
4180
4311
  // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
4181
- static bool handle_after_after_frameset (
4312
+ static void handle_after_after_frameset (
4182
4313
  GumboParser* parser,
4183
4314
  GumboToken* token
4184
4315
  ) {
4185
4316
  if (token->type == GUMBO_TOKEN_COMMENT) {
4186
4317
  append_comment_node(parser, get_document_node(parser), token);
4187
- return true;
4188
- } else if (
4318
+ return;
4319
+ }
4320
+ if (
4189
4321
  token->type == GUMBO_TOKEN_DOCTYPE
4190
4322
  || token->type == GUMBO_TOKEN_WHITESPACE
4191
4323
  || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4192
4324
  ) {
4193
- return handle_in_body(parser, token);
4194
- } else if (token->type == GUMBO_TOKEN_EOF) {
4195
- return true;
4196
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4197
- return handle_in_head(parser, token);
4198
- } else {
4199
- parser_add_parse_error(parser, token);
4200
- ignore_token(parser);
4201
- return false;
4325
+ handle_in_body(parser, token);
4326
+ return;
4202
4327
  }
4328
+ if (token->type == GUMBO_TOKEN_EOF) {
4329
+ return;
4330
+ }
4331
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4332
+ handle_in_head(parser, token);
4333
+ return;
4334
+ }
4335
+ parser_add_parse_error(parser, token);
4336
+ ignore_token(parser);
4203
4337
  }
4204
4338
 
4205
4339
  // Function pointers for each insertion mode.
4206
4340
  // Keep in sync with insertion_mode.h.
4207
- typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
4341
+ typedef void (*TokenHandler)(GumboParser* parser, GumboToken* token);
4208
4342
  static const TokenHandler kTokenHandlers[] = {
4209
4343
  handle_initial,
4210
4344
  handle_before_html,
@@ -4231,36 +4365,36 @@ static const TokenHandler kTokenHandlers[] = {
4231
4365
  handle_after_after_frameset
4232
4366
  };
4233
4367
 
4234
- static bool handle_html_content(GumboParser* parser, GumboToken* token) {
4368
+ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4235
4369
  const GumboInsertionMode mode = parser->_parser_state->_insertion_mode;
4236
4370
  const TokenHandler handler = kTokenHandlers[mode];
4237
- return handler(parser, token);
4371
+ handler(parser, token);
4238
4372
  }
4239
4373
 
4240
4374
  // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4241
- static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4375
+ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4242
4376
  gumbo_debug("Handling foreign content");
4243
4377
  switch (token->type) {
4244
4378
  case GUMBO_TOKEN_NULL:
4245
4379
  parser_add_parse_error(parser, token);
4246
4380
  token->v.character = kUtf8ReplacementChar;
4247
4381
  insert_text_token(parser, token);
4248
- return false;
4382
+ return;
4249
4383
  case GUMBO_TOKEN_WHITESPACE:
4250
4384
  insert_text_token(parser, token);
4251
- return true;
4385
+ return;
4252
4386
  case GUMBO_TOKEN_CDATA:
4253
4387
  case GUMBO_TOKEN_CHARACTER:
4254
4388
  insert_text_token(parser, token);
4255
4389
  set_frameset_not_ok(parser);
4256
- return true;
4390
+ return;
4257
4391
  case GUMBO_TOKEN_COMMENT:
4258
4392
  append_comment_node(parser, get_current_node(parser), token);
4259
- return true;
4393
+ return;
4260
4394
  case GUMBO_TOKEN_DOCTYPE:
4261
4395
  parser_add_parse_error(parser, token);
4262
4396
  ignore_token(parser);
4263
- return false;
4397
+ return;
4264
4398
  default:
4265
4399
  // Fall through to the if-statements below.
4266
4400
  break;
@@ -4304,10 +4438,9 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4304
4438
  )
4305
4439
  );
4306
4440
  parser->_parser_state->_reprocess_current_token = true;
4307
- return false;
4441
+ return;
4308
4442
  }
4309
-
4310
- assert(token->type == GUMBO_TOKEN_START_TAG);
4443
+ // This is a start tag so the next if's then branch will be taken.
4311
4444
  }
4312
4445
 
4313
4446
  if (token->type == GUMBO_TOKEN_START_TAG) {
@@ -4326,63 +4459,59 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4326
4459
  pop_current_node(parser);
4327
4460
  acknowledge_self_closing_tag(parser);
4328
4461
  }
4329
- return true;
4462
+ return;
4330
4463
  // </script> tags are handled like any other end tag, putting the script's
4331
4464
  // text into a text node child and closing the current node.
4332
- } else {
4333
- assert(token->type == GUMBO_TOKEN_END_TAG);
4334
- GumboNode* node = get_current_node(parser);
4335
- GumboTag tag = token->v.end_tag.tag;
4336
- const char* name = token->v.end_tag.name;
4337
- assert(node != NULL);
4465
+ }
4466
+ assert(token->type == GUMBO_TOKEN_END_TAG);
4467
+ GumboNode* node = get_current_node(parser);
4468
+ GumboTag tag = token->v.end_tag.tag;
4469
+ const char* name = token->v.end_tag.name;
4470
+ assert(node != NULL);
4338
4471
 
4339
- bool is_success = true;
4340
- if (!node_tagname_is(node, tag, name)) {
4341
- parser_add_parse_error(parser, token);
4342
- is_success = false;
4343
- }
4344
- int i = parser->_parser_state->_open_elements.length;
4345
- for (--i; i > 0;) {
4346
- // Here we move up the stack until we find an HTML element (in which
4347
- // case we do nothing) or we find the element that we're about to
4348
- // close (in which case we pop everything we've seen until that
4349
- // point.)
4350
- gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4351
- if (node_tagname_is(node, tag, name)) {
4352
- gumbo_debug("Matches.\n");
4353
- while (node != pop_current_node(parser)) {
4354
- // Pop all the nodes below the current one. Node is guaranteed to
4355
- // be an element on the stack of open elements (set below), so
4356
- // this loop is guaranteed to terminate.
4357
- }
4358
- return is_success;
4359
- }
4360
- --i;
4361
- node = parser->_parser_state->_open_elements.data[i];
4362
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4363
- // The loop continues only in foreign namespaces.
4364
- break;
4472
+ if (!node_tagname_is(node, tag, name))
4473
+ parser_add_parse_error(parser, token);
4474
+ int i = parser->_parser_state->_open_elements.length;
4475
+ for (--i; i > 0;) {
4476
+ // Here we move up the stack until we find an HTML element (in which
4477
+ // case we do nothing) or we find the element that we're about to
4478
+ // close (in which case we pop everything we've seen until that
4479
+ // point.)
4480
+ gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4481
+ if (node_tagname_is(node, tag, name)) {
4482
+ gumbo_debug("Matches.\n");
4483
+ while (node != pop_current_node(parser)) {
4484
+ // Pop all the nodes below the current one. Node is guaranteed to
4485
+ // be an element on the stack of open elements (set below), so
4486
+ // this loop is guaranteed to terminate.
4365
4487
  }
4488
+ return;
4489
+ }
4490
+ --i;
4491
+ node = parser->_parser_state->_open_elements.data[i];
4492
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4493
+ // The loop continues only in foreign namespaces.
4494
+ break;
4366
4495
  }
4367
- assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4368
- if (i == 0)
4369
- return is_success;
4370
- // We can't call handle_token directly because the current node is still in
4371
- // a foreign namespace, so it would re-enter this and result in infinite
4372
- // recursion.
4373
- return handle_html_content(parser, token) && is_success;
4374
4496
  }
4497
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4498
+ if (i == 0)
4499
+ return;
4500
+ // We can't call handle_token directly because the current node is still in
4501
+ // a foriegn namespace, so it would re-enter this and result in infinite
4502
+ // recursion.
4503
+ handle_html_content(parser, token);
4375
4504
  }
4376
4505
 
4377
4506
  // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
4378
- static bool handle_token(GumboParser* parser, GumboToken* token) {
4507
+ static void handle_token(GumboParser* parser, GumboToken* token) {
4379
4508
  if (
4380
4509
  parser->_parser_state->_ignore_next_linefeed
4381
4510
  && token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n'
4382
4511
  ) {
4383
4512
  parser->_parser_state->_ignore_next_linefeed = false;
4384
4513
  ignore_token(parser);
4385
- return true;
4514
+ return;
4386
4515
  }
4387
4516
  // This needs to be reset both here and in the conditional above to catch both
4388
4517
  // the case where the next token is not whitespace (so we don't ignore
@@ -4424,9 +4553,9 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
4424
4553
  token->type == GUMBO_TOKEN_NULL ||
4425
4554
  token->type == GUMBO_TOKEN_WHITESPACE)) ||
4426
4555
  token->type == GUMBO_TOKEN_EOF) {
4427
- return handle_html_content(parser, token);
4556
+ handle_html_content(parser, token);
4428
4557
  } else {
4429
- return handle_in_foreign_content(parser, token);
4558
+ handle_in_foreign_content(parser, token);
4430
4559
  }
4431
4560
  }
4432
4561
 
@@ -4517,7 +4646,7 @@ static void fragment_parser_init (
4517
4646
  break;
4518
4647
 
4519
4648
  case GUMBO_TAG_SCRIPT:
4520
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4649
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
4521
4650
  break;
4522
4651
 
4523
4652
  case GUMBO_TAG_NOSCRIPT:
@@ -4554,7 +4683,7 @@ static void fragment_parser_init (
4554
4683
  // 11.
4555
4684
  if (ctx_has_form_ancestor
4556
4685
  || (ctx_tag == GUMBO_TAG_FORM
4557
- && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4686
+ && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4558
4687
  static const GumboNode form_ancestor = {
4559
4688
  .type = GUMBO_NODE_ELEMENT,
4560
4689
  .parent = NULL,
@@ -4613,19 +4742,18 @@ GumboOutput* gumbo_parse_with_options (
4613
4742
 
4614
4743
  const unsigned int max_tree_depth = options->max_tree_depth;
4615
4744
  GumboToken token;
4616
- bool has_error = false;
4617
4745
 
4618
4746
  do {
4619
4747
  if (state->_reprocess_current_token) {
4620
4748
  state->_reprocess_current_token = false;
4621
4749
  } else {
4622
- GumboNode* current_node = get_current_node(&parser);
4623
- gumbo_tokenizer_set_is_current_node_foreign (
4750
+ GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
4751
+ gumbo_tokenizer_set_is_adjusted_current_node_foreign (
4624
4752
  &parser,
4625
- current_node &&
4626
- current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4753
+ adjusted_current_node &&
4754
+ adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4627
4755
  );
4628
- has_error = !gumbo_lex(&parser, &token) || has_error;
4756
+ gumbo_lex(&parser, &token);
4629
4757
  }
4630
4758
 
4631
4759
  const char* token_type = "text";
@@ -4649,17 +4777,17 @@ GumboOutput* gumbo_parse_with_options (
4649
4777
  break;
4650
4778
  }
4651
4779
  gumbo_debug (
4652
- "Handling %s token @%zu:%zu in state %u.\n",
4780
+ "Handling %s token @%lu:%lu in state %u.\n",
4653
4781
  (char*) token_type,
4654
- token.position.line,
4655
- token.position.column,
4782
+ (unsigned long)token.position.line,
4783
+ (unsigned long)token.position.column,
4656
4784
  state->_insertion_mode
4657
4785
  );
4658
4786
 
4659
4787
  state->_current_token = &token;
4660
4788
  state->_self_closing_flag_acknowledged = false;
4661
4789
 
4662
- has_error = !handle_token(&parser, &token) || has_error;
4790
+ handle_token(&parser, &token);
4663
4791
 
4664
4792
  // Check for memory leaks when ownership is transferred from start tag
4665
4793
  // tokens to nodes.
@@ -4671,19 +4799,25 @@ GumboOutput* gumbo_parse_with_options (
4671
4799
  );
4672
4800
 
4673
4801
  if (!state->_reprocess_current_token) {
4802
+ // If we're done with the token, check for unacknowledged self-closing
4803
+ // flags on start tags.
4674
4804
  if (token.type == GUMBO_TOKEN_START_TAG &&
4675
4805
  token.v.start_tag.is_self_closing &&
4676
4806
  !state->_self_closing_flag_acknowledged) {
4677
- GumboError* error = parser_add_parse_error(&parser, &token);
4678
- if (error)
4679
- error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4807
+ GumboError* error = gumbo_add_error(&parser);
4808
+ if (error) {
4809
+ // This is essentially a tokenizer error that's only caught during
4810
+ // tree construction.
4811
+ error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
4812
+ error->original_text = token.original_text;
4813
+ error->position = token.position;
4814
+ }
4680
4815
  }
4816
+ // Make sure we free the end tag's name since it doesn't get transferred
4817
+ // to a token.
4681
4818
  if (token.type == GUMBO_TOKEN_END_TAG &&
4682
- token.v.end_tag.is_self_closing) {
4683
- GumboError* error = parser_add_parse_error(&parser, &token);
4684
- if (error)
4685
- error->type = GUMBO_ERR_SELF_CLOSING_END_TAG;
4686
- }
4819
+ token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4820
+ gumbo_free(token.v.end_tag.name);
4687
4821
  }
4688
4822
 
4689
4823
  if (unlikely(state->_open_elements.length > max_tree_depth)) {
@@ -4697,7 +4831,7 @@ GumboOutput* gumbo_parse_with_options (
4697
4831
 
4698
4832
  } while (
4699
4833
  (token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token)
4700
- && !(options->stop_on_first_error && has_error)
4834
+ && !(options->stop_on_first_error && parser._output->document_error)
4701
4835
  );
4702
4836
 
4703
4837
  finish_parsing(&parser);
@@ -4725,6 +4859,8 @@ const char* gumbo_status_to_string(GumboOutputStatus status) {
4725
4859
  return "OK";
4726
4860
  case GUMBO_STATUS_OUT_OF_MEMORY:
4727
4861
  return "System allocator returned NULL during parsing";
4862
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
4863
+ return "Attributes per element limit exceeded";
4728
4864
  case GUMBO_STATUS_TREE_TOO_DEEP:
4729
4865
  return "Document tree depth limit exceeded";
4730
4866
  default: