nokogumbo 2.0.0.pre.alpha → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -79,7 +79,7 @@ void gumbo_string_buffer_append_codepoint (
79
79
  }
80
80
 
81
81
  void gumbo_string_buffer_append_string (
82
- GumboStringPiece* str,
82
+ const GumboStringPiece* str,
83
83
  GumboStringBuffer* output
84
84
  ) {
85
85
  maybe_resize_string_buffer(str->length, output);
@@ -47,7 +47,7 @@ void gumbo_string_buffer_append_codepoint (
47
47
 
48
48
  // Appends a string onto the end of the GumboStringBuffer.
49
49
  void gumbo_string_buffer_append_string (
50
- GumboStringPiece* str,
50
+ const GumboStringPiece* str,
51
51
  GumboStringBuffer* output
52
52
  );
53
53
 
@@ -0,0 +1,79 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #include <assert.h>
18
+
19
+ #include "ascii.h"
20
+ #include "token_buffer.h"
21
+ #include "tokenizer.h"
22
+ #include "util.h"
23
+
24
+ struct GumboInternalCharacterToken {
25
+ GumboSourcePosition position;
26
+ GumboStringPiece original_text;
27
+ int c;
28
+ };
29
+
30
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
31
+ buffer->data = NULL;
32
+ buffer->length = 0;
33
+ buffer->capacity = 0;
34
+ }
35
+
36
+ void gumbo_character_token_buffer_append (
37
+ const GumboToken* token,
38
+ GumboCharacterTokenBuffer* buffer
39
+ ) {
40
+ assert(token->type == GUMBO_TOKEN_WHITESPACE
41
+ || token->type == GUMBO_TOKEN_CHARACTER);
42
+ if (buffer->length == buffer->capacity) {
43
+ if (buffer->capacity == 0)
44
+ buffer->capacity = 10;
45
+ else
46
+ buffer->capacity *= 2;
47
+ size_t bytes = sizeof(*buffer->data) * buffer->capacity;
48
+ buffer->data = gumbo_realloc(buffer->data, bytes);
49
+ }
50
+ size_t index = buffer->length++;
51
+ buffer->data[index].position = token->position;
52
+ buffer->data[index].original_text = token->original_text;
53
+ buffer->data[index].c = token->v.character;
54
+ }
55
+
56
+ void gumbo_character_token_buffer_get (
57
+ const GumboCharacterTokenBuffer* buffer,
58
+ size_t index,
59
+ struct GumboInternalToken* output
60
+ ) {
61
+ assert(index < buffer->length);
62
+ int c = buffer->data[index].c;
63
+ output->type = gumbo_ascii_isspace(c)?
64
+ GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
65
+ output->position = buffer->data[index].position;
66
+ output->original_text = buffer->data[index].original_text;
67
+ output->v.character = c;
68
+ }
69
+
70
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
71
+ buffer->length = 0;
72
+ }
73
+
74
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
75
+ gumbo_free(buffer->data);
76
+ buffer->data = NULL;
77
+ buffer->length = 0;
78
+ buffer->capacity = 0;
79
+ }
@@ -0,0 +1,71 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #ifndef GUMBO_TOKEN_BUFFER_H
18
+ #define GUMBO_TOKEN_BUFFER_H
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalCharacterToken;
30
+ struct GumboInternalToken;
31
+
32
+ // A struct representing a growable sequence of character (and whitespace)
33
+ // tokens.
34
+ typedef struct {
35
+ // A pointer to the start of the sequence.
36
+ struct GumboInternalCharacterToken* data;
37
+
38
+ // The length of the sequence.
39
+ size_t length;
40
+
41
+ // The capacity of the buffer.
42
+ size_t capacity;
43
+ } GumboCharacterTokenBuffer;
44
+
45
+ // Initializes a new GumboCharacterTokenBuffer.
46
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
47
+
48
+ // Appends a character (or whitespace) token.
49
+ void gumbo_character_token_buffer_append (
50
+ const struct GumboInternalToken* token,
51
+ GumboCharacterTokenBuffer* buffer
52
+ );
53
+
54
+ void gumbo_character_token_buffer_get (
55
+ const GumboCharacterTokenBuffer* buffer,
56
+ size_t index,
57
+ struct GumboInternalToken* output
58
+ );
59
+
60
+ // Reinitialize this token buffer. This clears it by setting length=0. It
61
+ // does not zero out the buffer itself.
62
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
63
+
64
+ // Deallocates this GumboCharacterTokenBuffer.
65
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
66
+
67
+ #ifdef __cplusplus
68
+ }
69
+ #endif
70
+
71
+ #endif // GUMBO_TOKEN_BUFFER_H
@@ -1,5 +1,7 @@
1
1
  /*
2
2
  Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
3
5
 
4
6
  Licensed under the Apache License, Version 2.0 (the "License");
5
7
  you may not use this file except in compliance with the License.
@@ -60,15 +62,18 @@
60
62
  #include "util.h"
61
63
  #include "vector.h"
62
64
 
63
- // Compared against _script_data_buffer to determine if we're in
65
+ // Compared against _temporary_buffer to determine if we're in
64
66
  // double-escaped script mode.
65
67
  static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
66
68
 
67
- // An enum for the return value of each individual state.
69
+ // An enum for the return value of each individual state. Each of the emit_*
70
+ // functions should return EMIT_TOKEN and should be called as
71
+ // return emit_foo(parser, ..., output);
72
+ // Each of the handle_*_state functions that do not return emit_* should
73
+ // instead return CONTINUE to indicate to gumbo_lex to continue lexing.
68
74
  typedef enum {
69
- RETURN_ERROR, // Return false (error) from the tokenizer.
70
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
71
- NEXT_CHAR // Proceed to the next character and continue lexing.
75
+ EMIT_TOKEN,
76
+ CONTINUE,
72
77
  } StateResult;
73
78
 
74
79
  // This is a struct containing state necessary to build up a tag token,
@@ -103,12 +108,6 @@ typedef struct GumboInternalTagState {
103
108
  // the attribute value, but shouldn't overwrite the existing value.
104
109
  bool _drop_next_attr_value;
105
110
 
106
- // The state that caused the tokenizer to switch into a character reference in
107
- // attribute value state. This is used to set the additional allowed
108
- // character, and is switched back to on completion. Initialized as the
109
- // tokenizer enters the character reference state.
110
- GumboTokenizerEnum _attr_value_state;
111
-
112
111
  // The last start tag to have been emitted by the tokenizer. This is
113
112
  // necessary to check for appropriate end tags.
114
113
  GumboTag _last_start_tag;
@@ -133,15 +132,19 @@ typedef struct GumboInternalTokenizerState {
133
132
  // "Reconsume the current input character in..."
134
133
  bool _reconsume_current_input;
135
134
 
136
- // A flag indicating whether the current node is a foreign element. This is
137
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
138
- // markup declaration state.
139
- bool _is_current_node_foreign;
135
+ // A flag indicating whether the adjusted current node is a foreign element.
136
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
137
+ // checked in the markup declaration state.
138
+ bool _is_adjusted_current_node_foreign;
140
139
 
141
140
  // A flag indicating whether the tokenizer is in a CDATA section. If so, then
142
141
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
143
142
  bool _is_in_cdata;
144
143
 
144
+ // A flag indicating whether the tokenizer has seen a parse error since the
145
+ // last token was emitted.
146
+ bool _parse_error;
147
+
145
148
  // Certain states (notably character references) may emit two character tokens
146
149
  // at once, but the contract for lex() fills in only one token at a time. The
147
150
  // extra character is buffered here, and then this is checked on entry to
@@ -159,27 +162,24 @@ typedef struct GumboInternalTokenizerState {
159
162
 
160
163
  // A temporary buffer to accumulate characters, as described by the "temporary
161
164
  // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
162
- // way: we record the specific character to go into the buffer, which may
163
- // sometimes be a lowercased version of the actual input character. However,
164
- // we *also* use utf8iterator_mark() to record the position at tag start.
165
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
166
- // to the start of it, and then increment it for each call to the tokenizer.
167
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
168
- // input stream, so that tokens emitted by emit_char have the correct position
169
- // and original text.
165
+ // way: In situations where the spec calls for inserting characters into the
166
+ // temporary buffer that exactly match the input in order to emit them as
167
+ // character tokens, we don't actually do it.
168
+ // Instead, we mark the input and reset the input to it using set_mark() and
169
+ // emit_from_mark(). We do use the temporary buffer for other purposes such as
170
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
170
171
  GumboStringBuffer _temporary_buffer;
171
172
 
172
- // The current cursor position we're emitting from within
173
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
174
- const char* _temporary_buffer_emit;
173
+ // The position to resume normal operation after we start emitting from the
174
+ // mark. NULL whenever we're not emitting from the mark.
175
+ const char* _resume_pos;
175
176
 
176
- // The temporary buffer is also used by the spec to check whether we should
177
- // enter the script data double escaped state, but we can't use the same
178
- // buffer for both because we have to flush out "<s" as emits while still
179
- // maintaining the context that will eventually become "script". This is a
180
- // separate buffer that's used in place of the temporary buffer for states
181
- // that may enter the script data double escape start state.
182
- GumboStringBuffer _script_data_buffer;
177
+ // The character reference state uses a return state to return to the state
178
+ // it was invoked from.
179
+ GumboTokenizerEnum _return_state;
180
+
181
+ // Numeric character reference.
182
+ uint32_t _character_reference_code;
183
183
 
184
184
  // Pointer to the beginning of the current token in the original buffer; used
185
185
  // to record the original text.
@@ -201,123 +201,69 @@ typedef struct GumboInternalTokenizerState {
201
201
  Utf8Iterator _input;
202
202
  } GumboTokenizerState;
203
203
 
204
- // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
204
+ // Adds a parse error to the parser's error struct.
205
205
  static void tokenizer_add_parse_error (
206
206
  GumboParser* parser,
207
207
  GumboErrorType type
208
208
  ) {
209
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
210
+ tokenizer->_parse_error = true;
209
211
  GumboError* error = gumbo_add_error(parser);
210
212
  if (!error) {
211
213
  return;
212
214
  }
215
+ const Utf8Iterator* input = &tokenizer->_input;
216
+ utf8iterator_get_position(input, &error->position);
217
+ error->original_text.data = utf8iterator_get_char_pointer(input);
218
+ error->original_text.length = utf8iterator_get_width(input);
219
+ error->type = type;
220
+ error->v.tokenizer.state = tokenizer->_state;
221
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
222
+ }
223
+
224
+ // Adds an error pointing at the start of the character reference.
225
+ static void tokenizer_add_char_ref_error (
226
+ struct GumboInternalParser* parser,
227
+ GumboErrorType type,
228
+ int codepoint
229
+ ) {
213
230
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
214
- utf8iterator_get_position(&tokenizer->_input, &error->position);
215
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
231
+ tokenizer->_parse_error = true;
232
+ GumboError* error = gumbo_add_error(parser);
233
+ if (!error)
234
+ return;
235
+ Utf8Iterator* input = &tokenizer->_input;
216
236
  error->type = type;
217
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
218
- switch (tokenizer->_state) {
219
- case GUMBO_LEX_DATA:
220
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
221
- break;
222
- case GUMBO_LEX_CHAR_REF_IN_DATA:
223
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
224
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
225
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
226
- break;
227
- case GUMBO_LEX_RCDATA:
228
- case GUMBO_LEX_RCDATA_LT:
229
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
230
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
231
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
232
- break;
233
- case GUMBO_LEX_RAWTEXT:
234
- case GUMBO_LEX_RAWTEXT_LT:
235
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
236
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
237
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
238
- break;
239
- case GUMBO_LEX_PLAINTEXT:
240
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
241
- break;
242
- case GUMBO_LEX_SCRIPT:
243
- case GUMBO_LEX_SCRIPT_LT:
244
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
245
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
246
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
247
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
248
- case GUMBO_LEX_SCRIPT_ESCAPED:
249
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
250
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
251
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
252
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
253
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
254
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
255
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
256
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
257
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
258
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
259
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
260
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
261
- break;
262
- case GUMBO_LEX_TAG_OPEN:
263
- case GUMBO_LEX_END_TAG_OPEN:
264
- case GUMBO_LEX_TAG_NAME:
265
- case GUMBO_LEX_BEFORE_ATTR_NAME:
266
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
267
- break;
268
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
269
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
270
- break;
271
- case GUMBO_LEX_ATTR_NAME:
272
- case GUMBO_LEX_AFTER_ATTR_NAME:
273
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
274
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
275
- break;
276
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
277
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
278
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
279
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
280
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
281
- break;
282
- case GUMBO_LEX_BOGUS_COMMENT:
283
- case GUMBO_LEX_COMMENT_START:
284
- case GUMBO_LEX_COMMENT_START_DASH:
285
- case GUMBO_LEX_COMMENT:
286
- case GUMBO_LEX_COMMENT_END_DASH:
287
- case GUMBO_LEX_COMMENT_END:
288
- case GUMBO_LEX_COMMENT_END_BANG:
289
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
290
- break;
291
- case GUMBO_LEX_MARKUP_DECLARATION:
292
- case GUMBO_LEX_DOCTYPE:
293
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
294
- case GUMBO_LEX_DOCTYPE_NAME:
295
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
296
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
297
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
298
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
299
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
300
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
301
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
302
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
303
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
304
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
305
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
306
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
307
- case GUMBO_LEX_BOGUS_DOCTYPE:
308
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
309
- break;
310
- case GUMBO_LEX_CDATA:
311
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
312
- break;
313
- }
237
+ error->position = utf8iterator_get_mark_position(input);
238
+ const char* mark = utf8iterator_get_mark_pointer(input);
239
+ error->original_text.data = mark;
240
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
241
+ error->v.tokenizer.state = tokenizer->_state;
242
+ error->v.tokenizer.codepoint = codepoint;
243
+ }
244
+
245
+ // Adds an error pointing at the start of the token.
246
+ static void tokenizer_add_token_parse_error (
247
+ GumboParser* parser,
248
+ GumboErrorType type
249
+ ) {
250
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
251
+ tokenizer->_parse_error = true;
252
+ GumboError* error = gumbo_add_error(parser);
253
+ if (!error)
254
+ return;
255
+ Utf8Iterator* input = &tokenizer->_input;
256
+ error->type = type;
257
+ error->position = tokenizer->_token_start_pos;
258
+ error->original_text.data = tokenizer->_token_start;
259
+ error->original_text.length =
260
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
261
+ error->v.tokenizer.state = tokenizer->_state;
262
+ error->v.tokenizer.codepoint = 0;
314
263
  }
315
264
 
316
265
  static bool is_alpha(int c) {
317
- // We don't use the ISO C isalpha() function here because it depends
318
- // on the current locale, whereas the behavior in the HTML5 spec is
319
- // locale-independent.
320
- return ((unsigned) c | 32) - 'a' < 26;
266
+ return gumbo_ascii_isalpha(c);
321
267
  }
322
268
 
323
269
  static int ensure_lowercase(int c) {
@@ -347,24 +293,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
347
293
  }
348
294
 
349
295
  // Clears the temporary buffer.
350
- // Because this needs to reset the utf8iterator_mark to the beginning of the
351
- // text that will eventually be emitted, it needs to be called a couple of
352
- // states before the spec says "Set the temporary buffer to the empty string".
353
- // In general, this should be called whenever there's a transition to a
354
- // "less-than sign state". The initial < and possibly / then need to be
355
- // appended to the temporary buffer, their presence needs to be accounted for in
356
- // states that compare the temporary buffer against a literal value, and
357
- // spec stanzas that say "emit a < and / character token along with a character
358
- // token for each character in the temporary buffer" need to be adjusted to
359
- // account for the presence of the < and / inside the temporary buffer.
360
296
  static void clear_temporary_buffer(GumboParser* parser) {
361
297
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
362
- assert(!tokenizer->_temporary_buffer_emit);
363
- utf8iterator_mark(&tokenizer->_input);
364
298
  gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
365
- // The temporary buffer and script data buffer are the same object in the
366
- // spec, so the script data buffer should be cleared as well.
367
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
368
299
  }
369
300
 
370
301
  // Appends a codepoint to the temporary buffer.
@@ -378,25 +309,20 @@ static void append_char_to_temporary_buffer (
378
309
  );
379
310
  }
380
311
 
381
- #ifndef NDEBUG
382
- static bool temporary_buffer_equals__ (
383
- const GumboParser* parser,
384
- const char* text,
385
- size_t text_len
312
+ static void append_string_to_temporary_buffer (
313
+ GumboParser* parser,
314
+ const GumboStringPiece* str
386
315
  ) {
387
- const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
- return
389
- text_len == buf->length
390
- && memcmp(buf->data, text, text_len) == 0;
316
+ gumbo_string_buffer_append_string (
317
+ str,
318
+ &parser->_tokenizer_state->_temporary_buffer
319
+ );
391
320
  }
392
321
 
393
- #define temporary_buffer_equals(parser, text) \
394
- temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
322
 
396
323
  static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
324
  return parser->_tokenizer_state->_temporary_buffer.length == 0;
398
325
  }
399
- #endif
400
326
 
401
327
  static void doc_type_state_init(GumboParser* parser) {
402
328
  GumboTokenDocType* doc_type_state =
@@ -493,56 +419,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
493
419
  }
494
420
 
495
421
  // Writes a single specified character to the output token.
496
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
422
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
497
423
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
498
424
  output->v.character = c;
499
425
  finish_token(parser, output);
426
+ return EMIT_TOKEN;
500
427
  }
501
428
 
502
429
  // Writes a replacement character token and records a parse error.
503
- // Always returns RETURN_ERROR, per gumbo_lex return value.
430
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
504
431
  static StateResult emit_replacement_char(
505
432
  GumboParser* parser, GumboToken* output) {
506
433
  // In all cases, this is because of a null byte in the input stream.
507
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
434
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
508
435
  emit_char(parser, kUtf8ReplacementChar, output);
509
- return RETURN_ERROR;
436
+ return EMIT_TOKEN;
510
437
  }
511
438
 
512
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
439
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
513
440
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
514
- emit_char(parser, -1, output);
515
- return RETURN_SUCCESS;
516
- }
517
-
518
- // Writes the current input character out as a character token.
519
- // Always returns RETURN_SUCCESS.
520
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
521
- emit_char(
522
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
523
- return RETURN_SUCCESS;
441
+ return emit_char(parser, -1, output);
524
442
  }
525
443
 
526
444
  // Writes out a doctype token, copying it from the tokenizer state.
527
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
445
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
528
446
  output->type = GUMBO_TOKEN_DOCTYPE;
529
447
  output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
530
448
  finish_token(parser, output);
531
449
  doc_type_state_init(parser);
450
+ return EMIT_TOKEN;
532
451
  }
533
452
 
534
453
  // Clears the tag name and, in debug builds, sets the attribute vector data to NULL so
535
454
  // it can be asserted on tag creation, verifying that there are no memory leaks.
536
455
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
456
  UNUSED_IF_NDEBUG(tag_state);
538
- #ifndef NDEBUG
539
457
  tag_state->_name = NULL;
458
+ #ifndef NDEBUG
540
459
  tag_state->_attributes = kGumboEmptyVector;
541
460
  #endif
542
461
  }
543
462
 
544
463
  // Writes out the current tag as a start or end tag token.
545
- // Always returns RETURN_SUCCESS.
464
+ // Always returns EMIT_TOKEN.
546
465
  static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
547
466
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
548
467
  if (tag_state->_is_start_tag) {
@@ -559,7 +478,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
559
478
  output->type = GUMBO_TOKEN_END_TAG;
560
479
  output->v.end_tag.tag = tag_state->_tag;
561
480
  output->v.end_tag.name = tag_state->_name;
562
- output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
481
+ if (tag_state->_is_self_closing)
482
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
483
+ if (tag_state->_attributes.length > 0)
484
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
563
485
  // In end tags, ownership of the attributes vector is not transferred to the
564
486
  // token, but it's still initialized as normal, so it must be manually
565
487
  // deallocated. There may also be attributes to destroy, in certain broken
@@ -582,7 +504,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
582
504
  assert(output->original_text.length >= 2);
583
505
  assert(output->original_text.data[0] == '<');
584
506
  assert(output->original_text.data[output->original_text.length - 1] == '>');
585
- return RETURN_SUCCESS;
507
+ return EMIT_TOKEN;
586
508
  }
587
509
 
588
510
  // In some states, we speculatively start a tag, but don't know whether it'll be
@@ -600,90 +522,59 @@ static void abandon_current_tag(GumboParser* parser) {
600
522
  gumbo_debug("Abandoning current tag.\n");
601
523
  }
602
524
 
603
- // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
605
- // error occurred, RETURN_SUCCESS otherwise.
606
- static StateResult emit_char_ref (
607
- GumboParser* parser,
608
- int additional_allowed_char,
609
- bool UNUSED_ARG(is_in_attribute),
610
- GumboToken* output
611
- ) {
612
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
- OneOrTwoCodepoints char_ref;
614
- bool status = gumbo_consume_char_ref (
615
- parser,
616
- &tokenizer->_input,
617
- additional_allowed_char,
618
- false,
619
- &char_ref
620
- );
621
- if (char_ref.first != kGumboNoChar) {
622
- // gumbo_consume_char_ref ends with the iterator pointing at the next
623
- // character, so we need to be sure not advance it again before
624
- // reading the next token.
625
- tokenizer->_reconsume_current_input = true;
626
- emit_char(parser, char_ref.first, output);
627
- tokenizer->_buffered_emit_char = char_ref.second;
628
- } else {
629
- emit_char(parser, '&', output);
630
- }
631
- return status ? RETURN_SUCCESS : RETURN_ERROR;
632
- }
633
-
634
525
  // Emits a comment token. Comments use the temporary buffer to accumulate their
635
526
  // data, and then it's copied over and released to the 'text' field of the
636
- // GumboToken union. Always returns RETURN_SUCCESS.
527
+ // GumboToken union. Always returns EMIT_TOKEN.
637
528
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
638
529
  output->type = GUMBO_TOKEN_COMMENT;
639
530
  finish_temporary_buffer(parser, &output->v.text);
640
531
  finish_token(parser, output);
641
- return RETURN_SUCCESS;
532
+ return EMIT_TOKEN;
642
533
  }
643
534
 
644
- // Checks to see we should be flushing accumulated characters in the temporary
645
- // buffer, and fills the output token with the next output character if so.
646
- // Returns true if a character has been emitted and the tokenizer should
647
- // immediately return, false if we're at the end of the temporary buffer and
648
- // should resume normal operation.
649
- static bool maybe_emit_from_temporary_buffer(
650
- GumboParser* parser, GumboToken* output) {
535
+ static void set_mark(GumboParser* parser) {
651
536
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
652
- const char* c = tokenizer->_temporary_buffer_emit;
653
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
537
+ utf8iterator_mark(&tokenizer->_input);
538
+ }
654
539
 
655
- if (!c || c >= buffer->data + buffer->length) {
656
- tokenizer->_temporary_buffer_emit = NULL;
657
- return false;
540
+ // Checks to see if we should be emitting characters from the mark, and fills the
541
+ // output token with the next output character if so.
542
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
543
+ // immediately return, CONTINUE if we should resume normal operation.
544
+ static StateResult maybe_emit_from_mark (
545
+ GumboParser* parser,
546
+ GumboToken* output
547
+ ) {
548
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
549
+ const char* pos = tokenizer->_resume_pos;
550
+
551
+ if (!pos)
552
+ return CONTINUE;
553
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
554
+ tokenizer->_resume_pos = NULL;
555
+ return CONTINUE;
658
556
  }
659
557
 
660
- assert(*c == utf8iterator_current(&tokenizer->_input));
661
- // emit_char also advances the input stream. We need to do some juggling of
662
- // the _reconsume_current_input flag to get the proper behavior when emitting
663
- // previous tokens. Basically, _reconsume_current_input should *never* be set
664
- // when emitting anything from the temporary buffer, since those characters
665
- // have already been advanced past. However, it should be preserved so that
666
- // when the *next* character is encountered again, the tokenizer knows not to
667
- // advance past it.
668
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
669
- tokenizer->_reconsume_current_input = false;
670
- emit_char(parser, *c, output);
671
- ++tokenizer->_temporary_buffer_emit;
672
- tokenizer->_reconsume_current_input = saved_reconsume_state;
673
- return true;
558
+ // emit_char advances the input stream. _reconsume_current_input should
559
+ // *never* be set when emitting from the mark since those characters have
560
+ // already been advanced past.
561
+ assert(!tokenizer->_reconsume_current_input);
562
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
674
563
  }
675
564
 
676
- // Sets up the tokenizer to begin flushing the temporary buffer.
677
- // This resets the input iterator stream to the start of the last tag, sets up
678
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
679
- // the first character in it. It returns true if a character was emitted, false
680
- // otherwise.
681
- static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
565
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
566
+ // including, the current code point. This resets the input iterator stream to
567
+ // the mark, sets up _resume_pos, and then emits the first character in it.
568
+ // Returns EMIT_TOKEN.
569
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
682
570
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
683
- assert(tokenizer->_temporary_buffer.data);
571
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
684
572
  utf8iterator_reset(&tokenizer->_input);
685
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
686
- return maybe_emit_from_temporary_buffer(parser, output);
573
+ // Now that we have reset the input, we need to advance through it.
574
+ tokenizer->_reconsume_current_input = false;
575
+ StateResult result = maybe_emit_from_mark(parser, output);
576
+ assert(result == EMIT_TOKEN);
577
+ return result;
687
578
  }
688
579
 
689
580
  // Appends a codepoint to the current tag buffer. If
@@ -703,6 +594,19 @@ static void append_char_to_tag_buffer (
703
594
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
704
595
  }
705
596
 
597
+ // Like above but append a string.
598
+ static void append_string_to_tag_buffer (
599
+ GumboParser* parser,
600
+ GumboStringPiece* str,
601
+ bool reinitilize_position_on_first
602
+ ) {
603
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
604
+ if (buffer->length == 0 && reinitilize_position_on_first) {
605
+ reset_tag_buffer_start_point(parser);
606
+ }
607
+ gumbo_string_buffer_append_string(str, buffer);
608
+ }
609
+
706
610
  // (Re-)initialize the tag buffer. This also resets the original_text pointer
707
611
  // and _start_pos field to point to the current position.
708
612
  static void initialize_tag_buffer(GumboParser* parser) {
@@ -713,6 +617,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
713
617
  reset_tag_buffer_start_point(parser);
714
618
  }
715
619
 
620
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
621
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
622
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
623
+ switch (tokenizer->_return_state) {
624
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
625
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
626
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
627
+ return true;
628
+ default:
629
+ return false;
630
+ }
631
+ }
632
+
633
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
634
+ // For each code point in the temporary buffer, add to the current attribute
635
+ // value if the character reference was consumed as part of an attribute or
636
+ // emit the code point as a character token.
637
+ static StateResult flush_code_points_consumed_as_character_reference (
638
+ GumboParser* parser,
639
+ GumboToken* output
640
+ ) {
641
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
642
+ if (character_reference_part_of_attribute(parser)) {
643
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
644
+ assert(start);
645
+ GumboStringPiece str = {
646
+ .data = start,
647
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
648
+ };
649
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
650
+ append_string_to_tag_buffer(parser, &str, unquoted);
651
+ return CONTINUE;
652
+ }
653
+ return emit_from_mark(parser, output);
654
+ }
655
+
656
+ // After a character reference has been successfully constructed, the standard
657
+ // says to set the temporary buffer equal to the empty string, append the code
658
+ // point(s) associated with the reference and flush code points consumed as a
659
+ // character reference.
660
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
661
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
662
+ // That doesn't work for us because we use the temporary buffer in lock step
663
+ // with the input for position and that would fail if we inserted a different
664
+ // number of code points. So duplicate a bit of the above logic.
665
+ static StateResult flush_char_ref (
666
+ GumboParser* parser,
667
+ int first,
668
+ int second,
669
+ GumboToken* output
670
+ ) {
671
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
672
+ if (character_reference_part_of_attribute(parser)) {
673
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
674
+ append_char_to_tag_buffer(parser, first, unquoted);
675
+ if (second != kGumboNoChar)
676
+ append_char_to_tag_buffer(parser, second, unquoted);
677
+ return CONTINUE;
678
+ }
679
+ tokenizer->_buffered_emit_char = second;
680
+ return emit_char(parser, first, output);
681
+ }
682
+
683
+
716
684
  // Initializes the tag_state to start a new tag, keeping track of the opening
717
685
  // positions and original text. Takes a boolean indicating whether this is a
718
686
  // start or end tag.
@@ -725,7 +693,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
725
693
  assert(is_alpha(c));
726
694
 
727
695
  initialize_tag_buffer(parser);
728
- gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
729
696
 
730
697
  assert(tag_state->_name == NULL);
731
698
  assert(tag_state->_attributes.data == NULL);
@@ -801,23 +768,20 @@ static void finish_tag_name(GumboParser* parser) {
801
768
  }
802
769
 
803
770
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
804
- static void add_duplicate_attr_error (
805
- GumboParser* parser,
806
- int original_index,
807
- int new_index
808
- ) {
771
+ static void add_duplicate_attr_error(GumboParser* parser) {
772
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
773
+ tokenizer->_parse_error = true;
809
774
  GumboError* error = gumbo_add_error(parser);
810
775
  if (!error) {
811
776
  return;
812
777
  }
813
778
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
779
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
815
780
  error->position = tag_state->_start_pos;
816
- error->original_text = tag_state->_original_text;
817
- error->v.duplicate_attr.original_index = original_index;
818
- error->v.duplicate_attr.new_index = new_index;
819
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
820
- reinitialize_tag_buffer(parser);
781
+ error->original_text.data = tag_state->_original_text;
782
+ error->original_text.length =
783
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
784
+ error->v.tokenizer.state = tokenizer->_state;
821
785
  }
822
786
 
823
787
  // Creates a new attribute in the current tag, copying the current tag buffer to
@@ -846,7 +810,8 @@ static bool finish_attribute_name(GumboParser* parser) {
846
810
  )
847
811
  ) {
848
812
  // Identical attribute; bail.
849
- add_duplicate_attr_error(parser, i, attributes->length);
813
+ add_duplicate_attr_error(parser);
814
+ reinitialize_tag_buffer(parser);
850
815
  tag_state->_drop_next_attr_value = true;
851
816
  return false;
852
817
  }
@@ -911,19 +876,21 @@ void gumbo_tokenizer_state_init (
911
876
  GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
912
877
  parser->_tokenizer_state = tokenizer;
913
878
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
879
+ tokenizer->_return_state = GUMBO_LEX_DATA;
880
+ tokenizer->_character_reference_code = 0;
914
881
  tokenizer->_reconsume_current_input = false;
915
- tokenizer->_is_current_node_foreign = false;
882
+ tokenizer->_is_adjusted_current_node_foreign = false;
916
883
  tokenizer->_is_in_cdata = false;
884
+ tokenizer->_parse_error = false;
917
885
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
886
  tokenizer->_tag_state._name = NULL;
919
887
 
920
888
  tokenizer->_buffered_emit_char = kGumboNoChar;
921
889
  gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
922
- tokenizer->_temporary_buffer_emit = NULL;
890
+ tokenizer->_resume_pos = NULL;
923
891
 
924
892
  mark_tag_state_as_empty(&tokenizer->_tag_state);
925
893
 
926
- gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
927
894
  tokenizer->_token_start = text;
928
895
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
929
896
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
@@ -936,7 +903,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
936
903
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
937
904
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
938
905
  gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
- gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
906
  assert(tokenizer->_tag_state._name == NULL);
941
907
  assert(tokenizer->_tag_state._attributes.data == NULL);
942
908
  gumbo_free(tokenizer);
@@ -946,17 +912,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
946
912
  parser->_tokenizer_state->_state = state;
947
913
  }
948
914
 
949
- void gumbo_tokenizer_set_is_current_node_foreign (
915
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
950
916
  GumboParser* parser,
951
917
  bool is_foreign
952
918
  ) {
953
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
919
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
954
920
  gumbo_debug (
955
921
  "Toggling is_current_node_foreign to %s.\n",
956
922
  is_foreign ? "true" : "false"
957
923
  );
958
924
  }
959
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
925
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
926
+ }
927
+
928
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
929
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
930
+ tokenizer->_reconsume_current_input = true;
931
+ tokenizer->_state = state;
960
932
  }
961
933
 
962
934
  // https://html.spec.whatwg.org/multipage/parsing.html#data-state
@@ -968,37 +940,24 @@ static StateResult handle_data_state (
968
940
  ) {
969
941
  switch (c) {
970
942
  case '&':
971
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
972
- // The char_ref machinery expects to be on the & so it can mark that
973
- // and return to it if the text isn't a char ref, so we need to
974
- // reconsume it.
975
- tokenizer->_reconsume_current_input = true;
976
- return NEXT_CHAR;
943
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
944
+ set_mark(parser);
945
+ tokenizer->_return_state = GUMBO_LEX_DATA;
946
+ return CONTINUE;
977
947
  case '<':
978
948
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
979
- clear_temporary_buffer(parser);
980
- append_char_to_temporary_buffer(parser, '<');
981
- return NEXT_CHAR;
949
+ set_mark(parser);
950
+ return CONTINUE;
982
951
  case '\0':
983
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
984
- emit_char(parser, c, output);
985
- return RETURN_ERROR;
952
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
953
+ return emit_char(parser, c, output);
954
+ case -1:
955
+ return emit_eof(parser, output);
986
956
  default:
987
- return emit_current_char(parser, output);
957
+ return emit_char(parser, c, output);
988
958
  }
989
959
  }
990
960
 
991
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
- static StateResult handle_char_ref_in_data_state (
993
- GumboParser* parser,
994
- GumboTokenizerState* UNUSED_ARG(tokenizer),
995
- int UNUSED_ARG(c),
996
- GumboToken* output
997
- ) {
998
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
999
- return emit_char_ref(parser, ' ', false, output);
1000
- }
1001
-
1002
961
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
962
  static StateResult handle_rcdata_state (
1004
963
  GumboParser* parser,
@@ -1008,34 +967,23 @@ static StateResult handle_rcdata_state (
1008
967
  ) {
1009
968
  switch (c) {
1010
969
  case '&':
1011
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
1012
- tokenizer->_reconsume_current_input = true;
1013
- return NEXT_CHAR;
970
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
971
+ set_mark(parser);
972
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
973
+ return CONTINUE;
1014
974
  case '<':
1015
975
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
1016
- clear_temporary_buffer(parser);
1017
- append_char_to_temporary_buffer(parser, '<');
1018
- return NEXT_CHAR;
976
+ set_mark(parser);
977
+ return CONTINUE;
1019
978
  case '\0':
1020
979
  return emit_replacement_char(parser, output);
1021
980
  case -1:
1022
981
  return emit_eof(parser, output);
1023
982
  default:
1024
- return emit_current_char(parser, output);
983
+ return emit_char(parser, c, output);
1025
984
  }
1026
985
  }
1027
986
 
1028
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
- static StateResult handle_char_ref_in_rcdata_state (
1030
- GumboParser* parser,
1031
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
- int UNUSED_ARG(c),
1033
- GumboToken* output
1034
- ) {
1035
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1036
- return emit_char_ref(parser, ' ', false, output);
1037
- }
1038
-
1039
987
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
988
  static StateResult handle_rawtext_state (
1041
989
  GumboParser* parser,
@@ -1046,20 +994,19 @@ static StateResult handle_rawtext_state (
1046
994
  switch (c) {
1047
995
  case '<':
1048
996
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
1049
- clear_temporary_buffer(parser);
1050
- append_char_to_temporary_buffer(parser, '<');
1051
- return NEXT_CHAR;
997
+ set_mark(parser);
998
+ return CONTINUE;
1052
999
  case '\0':
1053
1000
  return emit_replacement_char(parser, output);
1054
1001
  case -1:
1055
1002
  return emit_eof(parser, output);
1056
1003
  default:
1057
- return emit_current_char(parser, output);
1004
+ return emit_char(parser, c, output);
1058
1005
  }
1059
1006
  }
1060
1007
 
1061
1008
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
- static StateResult handle_script_state (
1009
+ static StateResult handle_script_data_state (
1063
1010
  GumboParser* parser,
1064
1011
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
1012
  int c,
@@ -1067,16 +1014,15 @@ static StateResult handle_script_state (
1067
1014
  ) {
1068
1015
  switch (c) {
1069
1016
  case '<':
1070
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
1071
- clear_temporary_buffer(parser);
1072
- append_char_to_temporary_buffer(parser, '<');
1073
- return NEXT_CHAR;
1017
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1018
+ set_mark(parser);
1019
+ return CONTINUE;
1074
1020
  case '\0':
1075
1021
  return emit_replacement_char(parser, output);
1076
1022
  case -1:
1077
1023
  return emit_eof(parser, output);
1078
1024
  default:
1079
- return emit_current_char(parser, output);
1025
+ return emit_char(parser, c, output);
1080
1026
  }
1081
1027
  }
1082
1028
 
@@ -1093,75 +1039,75 @@ static StateResult handle_plaintext_state (
1093
1039
  case -1:
1094
1040
  return emit_eof(parser, output);
1095
1041
  default:
1096
- return emit_current_char(parser, output);
1042
+ return emit_char(parser, c, output);
1097
1043
  }
1098
1044
  }
1099
1045
 
1100
1046
  // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
1047
  static StateResult handle_tag_open_state (
1102
1048
  GumboParser* parser,
1103
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1049
+ GumboTokenizerState* tokenizer,
1104
1050
  int c,
1105
1051
  GumboToken* output
1106
1052
  ) {
1107
- assert(temporary_buffer_equals(parser, "<"));
1108
1053
  switch (c) {
1109
1054
  case '!':
1110
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1055
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1111
1056
  clear_temporary_buffer(parser);
1112
- return NEXT_CHAR;
1057
+ return CONTINUE;
1113
1058
  case '/':
1114
1059
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1115
- append_char_to_temporary_buffer(parser, '/');
1116
- return NEXT_CHAR;
1060
+ return CONTINUE;
1117
1061
  case '?':
1118
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1062
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1119
1063
  clear_temporary_buffer(parser);
1120
- append_char_to_temporary_buffer(parser, '?');
1121
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1122
- return NEXT_CHAR;
1064
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1065
+ return CONTINUE;
1066
+ case -1:
1067
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1068
+ // Switch to data to emit EOF.
1069
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1070
+ return emit_from_mark(parser, output);
1123
1071
  default:
1124
1072
  if (is_alpha(c)) {
1125
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1073
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1126
1074
  start_new_tag(parser, true);
1127
- return NEXT_CHAR;
1128
- } else {
1129
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1130
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1131
- emit_temporary_buffer(parser, output);
1132
- return RETURN_ERROR;
1075
+ return CONTINUE;
1133
1076
  }
1077
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1078
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1079
+ return emit_from_mark(parser, output);
1134
1080
  }
1135
1081
  }
1136
1082
 
1137
1083
  // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
1084
  static StateResult handle_end_tag_open_state (
1139
1085
  GumboParser* parser,
1140
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1086
+ GumboTokenizerState* tokenizer,
1141
1087
  int c,
1142
1088
  GumboToken* output
1143
1089
  ) {
1144
- assert(temporary_buffer_equals(parser, "</"));
1145
1090
  switch (c) {
1146
1091
  case '>':
1147
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1092
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1148
1093
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1149
- return NEXT_CHAR;
1094
+ return CONTINUE;
1150
1095
  case -1:
1151
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1152
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1153
- return emit_temporary_buffer(parser, output);
1096
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1097
+ // Similar to the tag open state except we need to emit '<' and '/'
1098
+ // before the EOF.
1099
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1100
+ return emit_from_mark(parser, output);
1154
1101
  default:
1155
1102
  if (is_alpha(c)) {
1156
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1103
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1157
1104
  start_new_tag(parser, false);
1158
1105
  } else {
1159
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1160
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1161
1108
  clear_temporary_buffer(parser);
1162
- append_char_to_temporary_buffer(parser, c);
1163
1109
  }
1164
- return NEXT_CHAR;
1110
+ return CONTINUE;
1165
1111
  }
1166
1112
  }
1167
1113
 
@@ -1179,27 +1125,26 @@ static StateResult handle_tag_name_state (
1179
1125
  case ' ':
1180
1126
  finish_tag_name(parser);
1181
1127
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1182
- return NEXT_CHAR;
1128
+ return CONTINUE;
1183
1129
  case '/':
1184
1130
  finish_tag_name(parser);
1185
1131
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1186
- return NEXT_CHAR;
1132
+ return CONTINUE;
1187
1133
  case '>':
1188
1134
  finish_tag_name(parser);
1189
1135
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1190
1136
  return emit_current_tag(parser, output);
1191
1137
  case '\0':
1192
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1138
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1193
1139
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1194
- return NEXT_CHAR;
1140
+ return CONTINUE;
1195
1141
  case -1:
1196
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1142
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1197
1143
  abandon_current_tag(parser);
1198
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1199
- return NEXT_CHAR;
1144
+ return emit_eof(parser, output);
1200
1145
  default:
1201
1146
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1202
- return NEXT_CHAR;
1147
+ return CONTINUE;
1203
1148
  }
1204
1149
  }
1205
1150
 
@@ -1210,36 +1155,29 @@ static StateResult handle_rcdata_lt_state (
1210
1155
  int c,
1211
1156
  GumboToken* output
1212
1157
  ) {
1213
- assert(temporary_buffer_equals(parser, "<"));
1214
1158
  if (c == '/') {
1215
1159
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1216
- append_char_to_temporary_buffer(parser, '/');
1217
- return NEXT_CHAR;
1160
+ return CONTINUE;
1218
1161
  } else {
1219
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1220
- tokenizer->_reconsume_current_input = true;
1221
- return emit_temporary_buffer(parser, output);
1162
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1163
+ return emit_from_mark(parser, output);
1222
1164
  }
1223
1165
  }
1224
1166
 
1225
1167
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
1168
  static StateResult handle_rcdata_end_tag_open_state (
1227
1169
  GumboParser* parser,
1228
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1170
+ GumboTokenizerState* tokenizer,
1229
1171
  int c,
1230
1172
  GumboToken* output
1231
1173
  ) {
1232
- assert(temporary_buffer_equals(parser, "</"));
1233
1174
  if (is_alpha(c)) {
1234
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1175
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1235
1176
  start_new_tag(parser, false);
1236
- append_char_to_temporary_buffer(parser, c);
1237
- return NEXT_CHAR;
1238
- } else {
1239
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1240
- return emit_temporary_buffer(parser, output);
1177
+ return CONTINUE;
1241
1178
  }
1242
- return true;
1179
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1180
+ return emit_from_mark(parser, output);
1243
1181
  }
1244
1182
 
1245
1183
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
@@ -1250,33 +1188,39 @@ static StateResult handle_rcdata_end_tag_name_state (
1250
1188
  GumboToken* output
1251
1189
  ) {
1252
1190
  UNUSED_IF_NDEBUG(tokenizer);
1253
- assert(tokenizer->_temporary_buffer.length >= 2);
1254
1191
  if (is_alpha(c)) {
1255
1192
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1256
- append_char_to_temporary_buffer(parser, c);
1257
- return NEXT_CHAR;
1258
- } else if (is_appropriate_end_tag(parser)) {
1259
- switch (c) {
1260
- case '\t':
1261
- case '\n':
1262
- case '\f':
1263
- case ' ':
1264
- finish_tag_name(parser);
1265
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1266
- return NEXT_CHAR;
1267
- case '/':
1268
- finish_tag_name(parser);
1269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1270
- return NEXT_CHAR;
1271
- case '>':
1272
- finish_tag_name(parser);
1273
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1274
- return emit_current_tag(parser, output);
1193
+ return CONTINUE;
1194
+ }
1195
+ switch (c) {
1196
+ case '\t':
1197
+ case '\n':
1198
+ case '\f':
1199
+ case ' ':
1200
+ if (is_appropriate_end_tag(parser)) {
1201
+ finish_tag_name(parser);
1202
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1203
+ return CONTINUE;
1204
+ }
1205
+ break;
1206
+ case '/':
1207
+ if (is_appropriate_end_tag(parser)) {
1208
+ finish_tag_name(parser);
1209
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1210
+ return CONTINUE;
1211
+ }
1212
+ break;
1213
+ case '>':
1214
+ if (is_appropriate_end_tag(parser)) {
1215
+ finish_tag_name(parser);
1216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1217
+ return emit_current_tag(parser, output);
1275
1218
  }
1219
+ break;
1276
1220
  }
1277
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1278
1221
  abandon_current_tag(parser);
1279
- return emit_temporary_buffer(parser, output);
1222
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1223
+ return emit_from_mark(parser, output);
1280
1224
  }
1281
1225
 
1282
1226
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
@@ -1286,34 +1230,29 @@ static StateResult handle_rawtext_lt_state (
1286
1230
  int c,
1287
1231
  GumboToken* output
1288
1232
  ) {
1289
- assert(temporary_buffer_equals(parser, "<"));
1290
1233
  if (c == '/') {
1291
1234
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1292
- append_char_to_temporary_buffer(parser, '/');
1293
- return NEXT_CHAR;
1235
+ return CONTINUE;
1294
1236
  } else {
1295
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1296
- tokenizer->_reconsume_current_input = true;
1297
- return emit_temporary_buffer(parser, output);
1237
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1238
+ return emit_from_mark(parser, output);
1298
1239
  }
1299
1240
  }
1300
1241
 
1301
1242
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
1243
  static StateResult handle_rawtext_end_tag_open_state (
1303
1244
  GumboParser* parser,
1304
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1245
+ GumboTokenizerState* tokenizer,
1305
1246
  int c,
1306
1247
  GumboToken* output
1307
1248
  ) {
1308
- assert(temporary_buffer_equals(parser, "</"));
1309
1249
  if (is_alpha(c)) {
1310
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1250
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1311
1251
  start_new_tag(parser, false);
1312
- append_char_to_temporary_buffer(parser, c);
1313
- return NEXT_CHAR;
1252
+ return CONTINUE;
1314
1253
  } else {
1315
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1316
- return emit_temporary_buffer(parser, output);
1254
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1255
+ return emit_from_mark(parser, output);
1317
1256
  }
1318
1257
  }
1319
1258
 
@@ -1324,153 +1263,156 @@ static StateResult handle_rawtext_end_tag_name_state (
1324
1263
  int c,
1325
1264
  GumboToken* output
1326
1265
  ) {
1327
- assert(tokenizer->_temporary_buffer.length >= 2);
1328
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1329
- tokenizer->_tag_state._buffer.data);
1330
1266
  if (is_alpha(c)) {
1331
1267
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1332
- append_char_to_temporary_buffer(parser, c);
1333
- return NEXT_CHAR;
1334
- } else if (is_appropriate_end_tag(parser)) {
1335
- gumbo_debug("Is an appropriate end tag.\n");
1336
- switch (c) {
1337
- case '\t':
1338
- case '\n':
1339
- case '\f':
1340
- case ' ':
1341
- finish_tag_name(parser);
1342
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1343
- return NEXT_CHAR;
1344
- case '/':
1345
- finish_tag_name(parser);
1346
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1347
- return NEXT_CHAR;
1348
- case '>':
1349
- finish_tag_name(parser);
1350
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1351
- return emit_current_tag(parser, output);
1268
+ return CONTINUE;
1269
+ }
1270
+ switch (c) {
1271
+ case '\t':
1272
+ case '\n':
1273
+ case '\f':
1274
+ case ' ':
1275
+ if (is_appropriate_end_tag(parser)) {
1276
+ finish_tag_name(parser);
1277
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1278
+ return CONTINUE;
1279
+ }
1280
+ break;
1281
+ case '/':
1282
+ if (is_appropriate_end_tag(parser)) {
1283
+ finish_tag_name(parser);
1284
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1285
+ return CONTINUE;
1352
1286
  }
1287
+ break;
1288
+ case '>':
1289
+ if (is_appropriate_end_tag(parser)) {
1290
+ finish_tag_name(parser);
1291
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1292
+ return emit_current_tag(parser, output);
1293
+ }
1294
+ break;
1353
1295
  }
1354
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1355
1296
  abandon_current_tag(parser);
1356
- return emit_temporary_buffer(parser, output);
1297
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1298
+ return emit_from_mark(parser, output);
1357
1299
  }
1358
1300
 
1359
1301
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
- static StateResult handle_script_lt_state (
1302
+ static StateResult handle_script_data_lt_state (
1361
1303
  GumboParser* parser,
1362
1304
  GumboTokenizerState* tokenizer,
1363
1305
  int c,
1364
1306
  GumboToken* output
1365
1307
  ) {
1366
- assert(temporary_buffer_equals(parser, "<"));
1367
1308
  if (c == '/') {
1368
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1369
- append_char_to_temporary_buffer(parser, '/');
1370
- return NEXT_CHAR;
1371
- } else if (c == '!') {
1372
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1373
- append_char_to_temporary_buffer(parser, '!');
1374
- return emit_temporary_buffer(parser, output);
1375
- } else {
1376
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1377
- tokenizer->_reconsume_current_input = true;
1378
- return emit_temporary_buffer(parser, output);
1309
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1310
+ return CONTINUE;
1311
+ }
1312
+ if (c == '!') {
1313
+ // This is the only place we don't reconsume the input before emitting the
1314
+ // temporary buffer. Since the current position is stored and the current
1315
+ // character is not emitted, we need to advance the input and then
1316
+ // reconsume.
1317
+ utf8iterator_next(&tokenizer->_input);
1318
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1319
+ return emit_from_mark(parser, output);
1379
1320
  }
1321
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1322
+ return emit_from_mark(parser, output);
1380
1323
  }
1381
1324
 
1382
1325
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
- static StateResult handle_script_end_tag_open_state (
1326
+ static StateResult handle_script_data_end_tag_open_state (
1384
1327
  GumboParser* parser,
1385
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1328
+ GumboTokenizerState* tokenizer,
1386
1329
  int c,
1387
1330
  GumboToken* output
1388
1331
  ) {
1389
- assert(temporary_buffer_equals(parser, "</"));
1390
1332
  if (is_alpha(c)) {
1391
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1333
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1392
1334
  start_new_tag(parser, false);
1393
- append_char_to_temporary_buffer(parser, c);
1394
- return NEXT_CHAR;
1395
- } else {
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
- return emit_temporary_buffer(parser, output);
1335
+ return CONTINUE;
1398
1336
  }
1337
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1338
+ return emit_from_mark(parser, output);
1399
1339
  }
1400
1340
 
1401
1341
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
- static StateResult handle_script_end_tag_name_state (
1342
+ static StateResult handle_script_data_end_tag_name_state (
1403
1343
  GumboParser* parser,
1404
1344
  GumboTokenizerState* tokenizer,
1405
1345
  int c,
1406
1346
  GumboToken* output
1407
1347
  ) {
1408
- UNUSED_IF_NDEBUG(tokenizer);
1409
- assert(tokenizer->_temporary_buffer.length >= 2);
1410
1348
  if (is_alpha(c)) {
1411
1349
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1412
- append_char_to_temporary_buffer(parser, c);
1413
- return NEXT_CHAR;
1414
- } else if (is_appropriate_end_tag(parser)) {
1415
- switch (c) {
1416
- case '\t':
1417
- case '\n':
1418
- case '\f':
1419
- case ' ':
1420
- finish_tag_name(parser);
1421
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1422
- return NEXT_CHAR;
1423
- case '/':
1424
- finish_tag_name(parser);
1425
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1426
- return NEXT_CHAR;
1427
- case '>':
1428
- finish_tag_name(parser);
1429
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1430
- return emit_current_tag(parser, output);
1350
+ return CONTINUE;
1351
+ }
1352
+ switch (c) {
1353
+ case '\t':
1354
+ case '\n':
1355
+ case '\f':
1356
+ case ' ':
1357
+ if (is_appropriate_end_tag(parser)) {
1358
+ finish_tag_name(parser);
1359
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1360
+ return CONTINUE;
1361
+ }
1362
+ break;
1363
+ case '/':
1364
+ if (is_appropriate_end_tag(parser)) {
1365
+ finish_tag_name(parser);
1366
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1367
+ return CONTINUE;
1368
+ }
1369
+ break;
1370
+ case '>':
1371
+ if (is_appropriate_end_tag(parser)) {
1372
+ finish_tag_name(parser);
1373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1374
+ return emit_current_tag(parser, output);
1431
1375
  }
1376
+ break;
1432
1377
  }
1433
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1434
1378
  abandon_current_tag(parser);
1435
- return emit_temporary_buffer(parser, output);
1379
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1380
+ return emit_from_mark(parser, output);
1436
1381
  }
1437
1382
 
1438
1383
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
- static StateResult handle_script_escaped_start_state (
1384
+ static StateResult handle_script_data_escaped_start_state (
1440
1385
  GumboParser* parser,
1441
1386
  GumboTokenizerState* tokenizer,
1442
1387
  int c,
1443
1388
  GumboToken* output
1444
1389
  ) {
1445
1390
  if (c == '-') {
1446
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1447
- return emit_current_char(parser, output);
1448
- } else {
1449
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1450
- tokenizer->_reconsume_current_input = true;
1451
- return NEXT_CHAR;
1391
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1392
+ return emit_char(parser, c, output);
1452
1393
  }
1394
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1395
+ return CONTINUE;
1453
1396
  }
1454
1397
 
1455
1398
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
- static StateResult handle_script_escaped_start_dash_state (
1399
+ static StateResult handle_script_data_escaped_start_dash_state (
1457
1400
  GumboParser* parser,
1458
1401
  GumboTokenizerState* tokenizer,
1459
1402
  int c,
1460
1403
  GumboToken* output
1461
1404
  ) {
1462
1405
  if (c == '-') {
1463
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1464
- return emit_current_char(parser, output);
1406
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1407
+ return emit_char(parser, c, output);
1465
1408
  } else {
1466
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1467
- tokenizer->_reconsume_current_input = true;
1468
- return NEXT_CHAR;
1409
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1410
+ return CONTINUE;
1469
1411
  }
1470
1412
  }
1471
1413
 
1472
1414
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
- static StateResult handle_script_escaped_state (
1415
+ static StateResult handle_script_data_escaped_state (
1474
1416
  GumboParser* parser,
1475
1417
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
1418
  int c,
@@ -1478,25 +1420,25 @@ static StateResult handle_script_escaped_state (
1478
1420
  ) {
1479
1421
  switch (c) {
1480
1422
  case '-':
1481
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1482
- return emit_current_char(parser, output);
1423
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1424
+ return emit_char(parser, c, output);
1483
1425
  case '<':
1484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1426
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1485
1427
  clear_temporary_buffer(parser);
1486
- append_char_to_temporary_buffer(parser, c);
1487
- return NEXT_CHAR;
1428
+ set_mark(parser);
1429
+ return CONTINUE;
1488
1430
  case '\0':
1489
1431
  return emit_replacement_char(parser, output);
1490
1432
  case -1:
1491
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1433
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1492
1434
  return emit_eof(parser, output);
1493
1435
  default:
1494
- return emit_current_char(parser, output);
1436
+ return emit_char(parser, c, output);
1495
1437
  }
1496
1438
  }
1497
1439
 
1498
1440
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
- static StateResult handle_script_escaped_dash_state (
1441
+ static StateResult handle_script_data_escaped_dash_state (
1500
1442
  GumboParser* parser,
1501
1443
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
1444
  int c,
@@ -1504,28 +1446,27 @@ static StateResult handle_script_escaped_dash_state (
1504
1446
  ) {
1505
1447
  switch (c) {
1506
1448
  case '-':
1507
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1508
- return emit_current_char(parser, output);
1449
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1450
+ return emit_char(parser, c, output);
1509
1451
  case '<':
1510
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1452
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1511
1453
  clear_temporary_buffer(parser);
1512
- append_char_to_temporary_buffer(parser, c);
1513
- return NEXT_CHAR;
1454
+ set_mark(parser);
1455
+ return CONTINUE;
1514
1456
  case '\0':
1515
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1457
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1516
1458
  return emit_replacement_char(parser, output);
1517
1459
  case -1:
1518
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1519
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1520
- return NEXT_CHAR;
1460
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1461
+ return emit_eof(parser, output);
1521
1462
  default:
1522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1523
- return emit_current_char(parser, output);
1463
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1464
+ return emit_char(parser, c, output);
1524
1465
  }
1525
1466
  }
1526
1467
 
1527
1468
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
- static StateResult handle_script_escaped_dash_dash_state (
1469
+ static StateResult handle_script_data_escaped_dash_dash_state (
1529
1470
  GumboParser* parser,
1530
1471
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
1472
  int c,
@@ -1533,113 +1474,107 @@ static StateResult handle_script_escaped_dash_dash_state (
1533
1474
  ) {
1534
1475
  switch (c) {
1535
1476
  case '-':
1536
- return emit_current_char(parser, output);
1477
+ return emit_char(parser, c, output);
1537
1478
  case '<':
1538
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1479
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1539
1480
  clear_temporary_buffer(parser);
1540
- append_char_to_temporary_buffer(parser, c);
1541
- return NEXT_CHAR;
1481
+ set_mark(parser);
1482
+ return CONTINUE;
1542
1483
  case '>':
1543
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
- return emit_current_char(parser, output);
1484
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1485
+ return emit_char(parser, c, output);
1545
1486
  case '\0':
1546
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1487
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1547
1488
  return emit_replacement_char(parser, output);
1548
1489
  case -1:
1549
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
- return NEXT_CHAR;
1490
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1491
+ return emit_eof(parser, output);
1552
1492
  default:
1553
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1554
- return emit_current_char(parser, output);
1493
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1494
+ return emit_char(parser, c, output);
1555
1495
  }
1556
1496
  }
1557
1497
 
1558
1498
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
- static StateResult handle_script_escaped_lt_state (
1499
+ static StateResult handle_script_data_escaped_lt_state (
1560
1500
  GumboParser* parser,
1561
1501
  GumboTokenizerState* tokenizer,
1562
1502
  int c,
1563
1503
  GumboToken* output
1564
1504
  ) {
1565
- assert(temporary_buffer_equals(parser, "<"));
1566
- assert(!tokenizer->_script_data_buffer.length);
1505
+ assert(temporary_buffer_is_empty(parser));
1567
1506
  if (c == '/') {
1568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1569
- append_char_to_temporary_buffer(parser, c);
1570
- return NEXT_CHAR;
1571
- } else if (is_alpha(c)) {
1572
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1573
- append_char_to_temporary_buffer(parser, c);
1574
- gumbo_string_buffer_append_codepoint (
1575
- ensure_lowercase(c),
1576
- &tokenizer->_script_data_buffer
1577
- );
1578
- return emit_temporary_buffer(parser, output);
1579
- } else {
1580
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1581
- return emit_temporary_buffer(parser, output);
1507
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1508
+ return CONTINUE;
1509
+ }
1510
+ if (is_alpha(c)) {
1511
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1512
+ return emit_from_mark(parser, output);
1582
1513
  }
1514
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1515
+ return emit_from_mark(parser, output);
1583
1516
  }
1584
1517
 
1585
1518
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
- static StateResult handle_script_escaped_end_tag_open_state (
1519
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1587
1520
  GumboParser* parser,
1588
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1521
+ GumboTokenizerState* tokenizer,
1589
1522
  int c,
1590
1523
  GumboToken* output
1591
1524
  ) {
1592
- assert(temporary_buffer_equals(parser, "</"));
1593
1525
  if (is_alpha(c)) {
1594
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1526
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1595
1527
  start_new_tag(parser, false);
1596
- append_char_to_temporary_buffer(parser, c);
1597
- return NEXT_CHAR;
1598
- } else {
1599
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1600
- return emit_temporary_buffer(parser, output);
1528
+ return CONTINUE;
1601
1529
  }
1530
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1531
+ return emit_from_mark(parser, output);
1602
1532
  }
1603
1533
 
1604
1534
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
- static StateResult handle_script_escaped_end_tag_name_state (
1535
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1606
1536
  GumboParser* parser,
1607
1537
  GumboTokenizerState* tokenizer,
1608
1538
  int c,
1609
1539
  GumboToken* output
1610
1540
  ) {
1611
- UNUSED_IF_NDEBUG(tokenizer);
1612
- assert(tokenizer->_temporary_buffer.length >= 2);
1613
1541
  if (is_alpha(c)) {
1614
1542
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1615
- append_char_to_temporary_buffer(parser, c);
1616
- return NEXT_CHAR;
1617
- } else if (is_appropriate_end_tag(parser)) {
1618
- switch (c) {
1619
- case '\t':
1620
- case '\n':
1621
- case '\f':
1622
- case ' ':
1623
- finish_tag_name(parser);
1624
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1625
- return NEXT_CHAR;
1626
- case '/':
1627
- finish_tag_name(parser);
1628
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1629
- return NEXT_CHAR;
1630
- case '>':
1631
- finish_tag_name(parser);
1632
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1633
- return emit_current_tag(parser, output);
1543
+ return CONTINUE;
1544
+ }
1545
+ switch (c) {
1546
+ case '\t':
1547
+ case '\n':
1548
+ case '\f':
1549
+ case ' ':
1550
+ if (is_appropriate_end_tag(parser)) {
1551
+ finish_tag_name(parser);
1552
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1553
+ return CONTINUE;
1554
+ }
1555
+ break;
1556
+ case '/':
1557
+ if (is_appropriate_end_tag(parser)) {
1558
+ finish_tag_name(parser);
1559
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1560
+ return CONTINUE;
1561
+ }
1562
+ break;
1563
+ case '>':
1564
+ if (is_appropriate_end_tag(parser)) {
1565
+ finish_tag_name(parser);
1566
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1567
+ return emit_current_tag(parser, output);
1634
1568
  }
1569
+ break;
1635
1570
  }
1636
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1637
1571
  abandon_current_tag(parser);
1638
- return emit_temporary_buffer(parser, output);
1572
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1573
+ return emit_from_mark(parser, output);
1639
1574
  }
1640
1575
 
1641
1576
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
- static StateResult handle_script_double_escaped_start_state (
1577
+ static StateResult handle_script_data_double_escaped_start_state (
1643
1578
  GumboParser* parser,
1644
1579
  GumboTokenizerState* tokenizer,
1645
1580
  int c,
@@ -1656,29 +1591,23 @@ static StateResult handle_script_double_escaped_start_state (
1656
1591
  parser,
1657
1592
  gumbo_string_equals (
1658
1593
  &kScriptTag,
1659
- (GumboStringPiece*) &tokenizer->_script_data_buffer
1594
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1660
1595
  )
1661
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
- : GUMBO_LEX_SCRIPT_ESCAPED
1596
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1597
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1663
1598
  );
1664
- return emit_current_char(parser, output);
1665
- default:
1666
- if (is_alpha(c)) {
1667
- gumbo_string_buffer_append_codepoint (
1668
- ensure_lowercase(c),
1669
- &tokenizer->_script_data_buffer
1670
- );
1671
- return emit_current_char(parser, output);
1672
- } else {
1673
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1674
- tokenizer->_reconsume_current_input = true;
1675
- return NEXT_CHAR;
1676
- }
1599
+ return emit_char(parser, c, output);
1600
+ }
1601
+ if (is_alpha(c)) {
1602
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1603
+ return emit_char(parser, c, output);
1677
1604
  }
1605
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1606
+ return CONTINUE;
1678
1607
  }
1679
1608
 
1680
1609
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
- static StateResult handle_script_double_escaped_state (
1610
+ static StateResult handle_script_data_double_escaped_state (
1682
1611
  GumboParser* parser,
1683
1612
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
1613
  int c,
@@ -1686,24 +1615,23 @@ static StateResult handle_script_double_escaped_state (
1686
1615
  ) {
1687
1616
  switch (c) {
1688
1617
  case '-':
1689
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1690
- return emit_current_char(parser, output);
1618
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1619
+ return emit_char(parser, c, output);
1691
1620
  case '<':
1692
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1693
- return emit_current_char(parser, output);
1621
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1622
+ return emit_char(parser, c, output);
1694
1623
  case '\0':
1695
1624
  return emit_replacement_char(parser, output);
1696
1625
  case -1:
1697
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1698
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
- return NEXT_CHAR;
1626
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1627
+ return emit_eof(parser, output);
1700
1628
  default:
1701
- return emit_current_char(parser, output);
1629
+ return emit_char(parser, c, output);
1702
1630
  }
1703
1631
  }
1704
1632
 
1705
1633
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
- static StateResult handle_script_double_escaped_dash_state (
1634
+ static StateResult handle_script_data_double_escaped_dash_state (
1707
1635
  GumboParser* parser,
1708
1636
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
1637
  int c,
@@ -1712,26 +1640,25 @@ static StateResult handle_script_double_escaped_dash_state (
1712
1640
  switch (c) {
1713
1641
  case '-':
1714
1642
  gumbo_tokenizer_set_state(
1715
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1716
- return emit_current_char(parser, output);
1643
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1644
+ return emit_char(parser, c, output);
1717
1645
  case '<':
1718
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1719
- return emit_current_char(parser, output);
1646
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1647
+ return emit_char(parser, c, output);
1720
1648
  case '\0':
1721
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1649
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1722
1650
  return emit_replacement_char(parser, output);
1723
1651
  case -1:
1724
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1726
- return NEXT_CHAR;
1652
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1653
+ return emit_eof(parser, output);
1727
1654
  default:
1728
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1729
- return emit_current_char(parser, output);
1655
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1656
+ return emit_char(parser, c, output);
1730
1657
  }
1731
1658
  }
1732
1659
 
1733
1660
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
- static StateResult handle_script_double_escaped_dash_dash_state (
1661
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1735
1662
  GumboParser* parser,
1736
1663
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
1664
  int c,
@@ -1739,46 +1666,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
1739
1666
  ) {
1740
1667
  switch (c) {
1741
1668
  case '-':
1742
- return emit_current_char(parser, output);
1669
+ return emit_char(parser, c, output);
1743
1670
  case '<':
1744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1745
- return emit_current_char(parser, output);
1671
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1672
+ return emit_char(parser, c, output);
1746
1673
  case '>':
1747
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1748
- return emit_current_char(parser, output);
1674
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1675
+ return emit_char(parser, c, output);
1749
1676
  case '\0':
1750
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1677
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1751
1678
  return emit_replacement_char(parser, output);
1752
1679
  case -1:
1753
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1754
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1755
- return NEXT_CHAR;
1680
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1681
+ return emit_eof(parser, output);
1756
1682
  default:
1757
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1758
- return emit_current_char(parser, output);
1683
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1684
+ return emit_char(parser, c, output);
1759
1685
  }
1760
1686
  }
1761
1687
 
1762
1688
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
- static StateResult handle_script_double_escaped_lt_state (
1689
+ static StateResult handle_script_data_double_escaped_lt_state (
1764
1690
  GumboParser* parser,
1765
1691
  GumboTokenizerState* tokenizer,
1766
1692
  int c,
1767
1693
  GumboToken* output
1768
1694
  ) {
1769
1695
  if (c == '/') {
1770
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1771
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1772
- return emit_current_char(parser, output);
1696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1697
+ clear_temporary_buffer(parser);
1698
+ return emit_char(parser, c, output);
1773
1699
  } else {
1774
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1775
- tokenizer->_reconsume_current_input = true;
1776
- return NEXT_CHAR;
1700
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1701
+ return CONTINUE;
1777
1702
  }
1778
1703
  }
1779
1704
 
1780
1705
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
- static StateResult handle_script_double_escaped_end_state (
1706
+ static StateResult handle_script_data_double_escaped_end_state (
1782
1707
  GumboParser* parser,
1783
1708
  GumboTokenizerState* tokenizer,
1784
1709
  int c,
@@ -1793,29 +1718,23 @@ static StateResult handle_script_double_escaped_end_state (
1793
1718
  case '>':
1794
1719
  gumbo_tokenizer_set_state(
1795
1720
  parser, gumbo_string_equals(&kScriptTag,
1796
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1797
- ? GUMBO_LEX_SCRIPT_ESCAPED
1798
- : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1799
- return emit_current_char(parser, output);
1800
- default:
1801
- if (is_alpha(c)) {
1802
- gumbo_string_buffer_append_codepoint (
1803
- ensure_lowercase(c),
1804
- &tokenizer->_script_data_buffer
1805
- );
1806
- return emit_current_char(parser, output);
1807
- } else {
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1809
- tokenizer->_reconsume_current_input = true;
1810
- return NEXT_CHAR;
1811
- }
1721
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1722
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1723
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1724
+ return emit_char(parser, c, output);
1725
+ }
1726
+ if (is_alpha(c)) {
1727
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1728
+ return emit_char(parser, c, output);
1812
1729
  }
1730
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1731
+ return CONTINUE;
1813
1732
  }
1814
1733
 
1815
1734
  // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
1735
  static StateResult handle_before_attr_name_state (
1817
1736
  GumboParser* parser,
1818
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
+ GumboTokenizerState* tokenizer,
1819
1738
  int c,
1820
1739
  GumboToken* output
1821
1740
  ) {
@@ -1824,40 +1743,27 @@ static StateResult handle_before_attr_name_state (
1824
1743
  case '\n':
1825
1744
  case '\f':
1826
1745
  case ' ':
1827
- return NEXT_CHAR;
1746
+ return CONTINUE;
1828
1747
  case '/':
1829
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1830
- return NEXT_CHAR;
1831
1748
  case '>':
1832
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1833
- return emit_current_tag(parser, output);
1834
- case '\0':
1835
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1836
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1837
- append_char_to_temporary_buffer(parser, 0xfffd);
1838
- return NEXT_CHAR;
1839
1749
  case -1:
1840
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1841
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1842
- abandon_current_tag(parser);
1843
- return NEXT_CHAR;
1844
- case '"':
1845
- case '\'':
1846
- case '<':
1750
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1751
+ return CONTINUE;
1847
1752
  case '=':
1848
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1849
- // Fall through.
1850
- default:
1753
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1851
1754
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1852
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1853
- return NEXT_CHAR;
1755
+ append_char_to_tag_buffer(parser, c, true);
1756
+ return CONTINUE;
1757
+ default:
1758
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1759
+ return CONTINUE;
1854
1760
  }
1855
1761
  }
1856
1762
 
1857
1763
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
1764
  static StateResult handle_attr_name_state (
1859
1765
  GumboParser* parser,
1860
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1766
+ GumboTokenizerState* tokenizer,
1861
1767
  int c,
1862
1768
  GumboToken* output
1863
1769
  ) {
@@ -1866,45 +1772,35 @@ static StateResult handle_attr_name_state (
1866
1772
  case '\n':
1867
1773
  case '\f':
1868
1774
  case ' ':
1869
- finish_attribute_name(parser);
1870
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1871
- return NEXT_CHAR;
1872
1775
  case '/':
1776
+ case '>':
1777
+ case -1:
1873
1778
  finish_attribute_name(parser);
1874
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1875
- return NEXT_CHAR;
1779
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1780
+ return CONTINUE;
1876
1781
  case '=':
1877
1782
  finish_attribute_name(parser);
1878
1783
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1879
- return NEXT_CHAR;
1880
- case '>':
1881
- finish_attribute_name(parser);
1882
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
- return emit_current_tag(parser, output);
1784
+ return CONTINUE;
1884
1785
  case '\0':
1885
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1786
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1886
1787
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1887
- return NEXT_CHAR;
1888
- case -1:
1889
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
- abandon_current_tag(parser);
1891
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1892
- return NEXT_CHAR;
1788
+ return CONTINUE;
1893
1789
  case '"':
1894
1790
  case '\'':
1895
1791
  case '<':
1896
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1792
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1897
1793
  // Fall through.
1898
1794
  default:
1899
1795
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1900
- return NEXT_CHAR;
1796
+ return CONTINUE;
1901
1797
  }
1902
1798
  }
1903
1799
 
1904
1800
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
1801
  static StateResult handle_after_attr_name_state (
1906
1802
  GumboParser* parser,
1907
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1803
+ GumboTokenizerState* tokenizer,
1908
1804
  int c,
1909
1805
  GumboToken* output
1910
1806
  ) {
@@ -1913,35 +1809,23 @@ static StateResult handle_after_attr_name_state (
1913
1809
  case '\n':
1914
1810
  case '\f':
1915
1811
  case ' ':
1916
- return NEXT_CHAR;
1812
+ return CONTINUE;
1917
1813
  case '/':
1918
1814
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1919
- return NEXT_CHAR;
1815
+ return CONTINUE;
1920
1816
  case '=':
1921
1817
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1922
- return NEXT_CHAR;
1818
+ return CONTINUE;
1923
1819
  case '>':
1924
1820
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1925
1821
  return emit_current_tag(parser, output);
1926
- case '\0':
1927
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1928
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1929
- append_char_to_temporary_buffer(parser, 0xfffd);
1930
- return NEXT_CHAR;
1931
1822
  case -1:
1932
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1823
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1934
1824
  abandon_current_tag(parser);
1935
- return NEXT_CHAR;
1936
- case '"':
1937
- case '\'':
1938
- case '<':
1939
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1940
- // Fall through.
1825
+ return emit_eof(parser, output);
1941
1826
  default:
1942
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1943
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1944
- return NEXT_CHAR;
1827
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1828
+ return CONTINUE;
1945
1829
  }
1946
1830
  }
1947
1831
 
@@ -1957,45 +1841,22 @@ static StateResult handle_before_attr_value_state (
1957
1841
  case '\n':
1958
1842
  case '\f':
1959
1843
  case ' ':
1960
- return NEXT_CHAR;
1844
+ return CONTINUE;
1961
1845
  case '"':
1962
1846
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1963
1847
  reset_tag_buffer_start_point(parser);
1964
- return NEXT_CHAR;
1965
- case '&':
1966
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1967
- tokenizer->_reconsume_current_input = true;
1968
- return NEXT_CHAR;
1848
+ return CONTINUE;
1969
1849
  case '\'':
1970
1850
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1971
1851
  reset_tag_buffer_start_point(parser);
1972
- return NEXT_CHAR;
1973
- case '\0':
1974
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1975
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1976
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1977
- return NEXT_CHAR;
1978
- case -1:
1979
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1980
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1981
- abandon_current_tag(parser);
1982
- tokenizer->_reconsume_current_input = true;
1983
- return NEXT_CHAR;
1852
+ return CONTINUE;
1984
1853
  case '>':
1985
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1854
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1986
1855
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1987
- emit_current_tag(parser, output);
1988
- return RETURN_ERROR;
1989
- case '<':
1990
- case '=':
1991
- case '`':
1992
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1993
- // Fall through.
1994
- default:
1995
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1996
- append_char_to_tag_buffer(parser, c, true);
1997
- return NEXT_CHAR;
1856
+ return emit_current_tag(parser, output);
1998
1857
  }
1858
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1859
+ return CONTINUE;
1999
1860
  }
2000
1861
 
2001
1862
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
@@ -2003,30 +1864,28 @@ static StateResult handle_attr_value_double_quoted_state (
2003
1864
  GumboParser* parser,
2004
1865
  GumboTokenizerState* tokenizer,
2005
1866
  int c,
2006
- GumboToken* UNUSED_ARG(output)
1867
+ GumboToken* output
2007
1868
  ) {
2008
1869
  switch (c) {
2009
1870
  case '"':
2010
1871
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2011
- return NEXT_CHAR;
1872
+ return CONTINUE;
2012
1873
  case '&':
2013
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2014
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2015
- tokenizer->_reconsume_current_input = true;
2016
- return NEXT_CHAR;
1874
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1875
+ set_mark(parser);
1876
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1877
+ return CONTINUE;
2017
1878
  case '\0':
2018
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1879
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2019
1880
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2020
- return NEXT_CHAR;
1881
+ return CONTINUE;
2021
1882
  case -1:
2022
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2024
1884
  abandon_current_tag(parser);
2025
- tokenizer->_reconsume_current_input = true;
2026
- return NEXT_CHAR;
1885
+ return emit_eof(parser, output);
2027
1886
  default:
2028
1887
  append_char_to_tag_buffer(parser, c, false);
2029
- return NEXT_CHAR;
1888
+ return CONTINUE;
2030
1889
  }
2031
1890
  }
2032
1891
 
@@ -2035,30 +1894,28 @@ static StateResult handle_attr_value_single_quoted_state (
2035
1894
  GumboParser* parser,
2036
1895
  GumboTokenizerState* tokenizer,
2037
1896
  int c,
2038
- GumboToken* UNUSED_ARG(output)
1897
+ GumboToken* output
2039
1898
  ) {
2040
1899
  switch (c) {
2041
1900
  case '\'':
2042
1901
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2043
- return NEXT_CHAR;
1902
+ return CONTINUE;
2044
1903
  case '&':
2045
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2046
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2047
- tokenizer->_reconsume_current_input = true;
2048
- return NEXT_CHAR;
1904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1905
+ set_mark(parser);
1906
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1907
+ return CONTINUE;
2049
1908
  case '\0':
2050
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2051
1910
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2052
- return NEXT_CHAR;
1911
+ return CONTINUE;
2053
1912
  case -1:
2054
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
2055
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1913
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2056
1914
  abandon_current_tag(parser);
2057
- tokenizer->_reconsume_current_input = true;
2058
- return NEXT_CHAR;
1915
+ return emit_eof(parser, output);
2059
1916
  default:
2060
1917
  append_char_to_tag_buffer(parser, c, false);
2061
- return NEXT_CHAR;
1918
+ return CONTINUE;
2062
1919
  }
2063
1920
  }
2064
1921
 
@@ -2076,91 +1933,37 @@ static StateResult handle_attr_value_unquoted_state (
2076
1933
  case ' ':
2077
1934
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2078
1935
  finish_attribute_value(parser);
2079
- return NEXT_CHAR;
1936
+ return CONTINUE;
2080
1937
  case '&':
2081
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2082
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2083
- tokenizer->_reconsume_current_input = true;
2084
- return NEXT_CHAR;
1938
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1939
+ set_mark(parser);
1940
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1941
+ return CONTINUE;
2085
1942
  case '>':
2086
1943
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2087
1944
  finish_attribute_value(parser);
2088
1945
  return emit_current_tag(parser, output);
2089
1946
  case '\0':
2090
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1947
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2091
1948
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
2092
- return NEXT_CHAR;
1949
+ return CONTINUE;
2093
1950
  case -1:
2094
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
2095
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096
- tokenizer->_reconsume_current_input = true;
1951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2097
1952
  abandon_current_tag(parser);
2098
- return NEXT_CHAR;
2099
- case '<':
2100
- case '=':
1953
+ return emit_eof(parser, output);
2101
1954
  case '"':
2102
1955
  case '\'':
1956
+ case '<':
1957
+ case '=':
2103
1958
  case '`':
2104
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1959
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
2105
1960
  // Fall through.
2106
1961
  default:
2107
1962
  append_char_to_tag_buffer(parser, c, true);
2108
- return NEXT_CHAR;
1963
+ return CONTINUE;
2109
1964
  }
2110
1965
  }
2111
1966
 
2112
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
- static StateResult handle_char_ref_in_attr_value_state (
2114
- GumboParser* parser,
2115
- GumboTokenizerState* tokenizer,
2116
- int UNUSED_ARG(c),
2117
- GumboToken* UNUSED_ARG(output)
2118
- ) {
2119
- OneOrTwoCodepoints char_ref;
2120
- int allowed_char;
2121
- bool is_unquoted = false;
2122
- switch (tokenizer->_tag_state._attr_value_state) {
2123
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
2124
- allowed_char = '"';
2125
- break;
2126
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
2127
- allowed_char = '\'';
2128
- break;
2129
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
2130
- allowed_char = '>';
2131
- is_unquoted = true;
2132
- break;
2133
- default:
2134
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
2135
- // get that the assert(0) means this codepath will never happen.
2136
- allowed_char = ' ';
2137
- assert(0);
2138
- }
2139
-
2140
- // Ignore the status, since we don't have a convenient way of signalling that
2141
- // a parser error has occurred when the error occurs in the middle of a
2142
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2143
- // but that's a low priority fix.
2144
- gumbo_consume_char_ref (
2145
- parser,
2146
- &tokenizer->_input,
2147
- allowed_char,
2148
- true,
2149
- &char_ref
2150
- );
2151
- if (char_ref.first != kGumboNoChar) {
2152
- tokenizer->_reconsume_current_input = true;
2153
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
2154
- if (char_ref.second != kGumboNoChar) {
2155
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
2156
- }
2157
- } else {
2158
- append_char_to_tag_buffer(parser, '&', is_unquoted);
2159
- }
2160
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
2161
- return NEXT_CHAR;
2162
- }
2163
-
2164
1967
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
2165
1968
  static StateResult handle_after_attr_value_quoted_state (
2166
1969
  GumboParser* parser,
@@ -2175,24 +1978,21 @@ static StateResult handle_after_attr_value_quoted_state (
2175
1978
  case '\f':
2176
1979
  case ' ':
2177
1980
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2178
- return NEXT_CHAR;
1981
+ return CONTINUE;
2179
1982
  case '/':
2180
1983
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
2181
- return NEXT_CHAR;
1984
+ return CONTINUE;
2182
1985
  case '>':
2183
1986
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184
1987
  return emit_current_tag(parser, output);
2185
1988
  case -1:
2186
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
2187
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1989
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2188
1990
  abandon_current_tag(parser);
2189
- tokenizer->_reconsume_current_input = true;
2190
- return NEXT_CHAR;
1991
+ return emit_eof(parser, output);
2191
1992
  default:
2192
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
2193
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2194
- tokenizer->_reconsume_current_input = true;
2195
- return NEXT_CHAR;
1993
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1994
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1995
+ return CONTINUE;
2196
1996
  }
2197
1997
  }
2198
1998
 
@@ -2209,15 +2009,13 @@ static StateResult handle_self_closing_start_tag_state (
2209
2009
  tokenizer->_tag_state._is_self_closing = true;
2210
2010
  return emit_current_tag(parser, output);
2211
2011
  case -1:
2212
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
2213
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2012
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2214
2013
  abandon_current_tag(parser);
2215
- return NEXT_CHAR;
2014
+ return emit_eof(parser, output);
2216
2015
  default:
2217
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
2218
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2219
- tokenizer->_reconsume_current_input = true;
2220
- return NEXT_CHAR;
2016
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2017
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2018
+ return CONTINUE;
2221
2019
  }
2222
2020
  }
2223
2021
 
@@ -2228,21 +2026,27 @@ static StateResult handle_bogus_comment_state (
2228
2026
  int c,
2229
2027
  GumboToken* output
2230
2028
  ) {
2231
- while (c != '>' && c != -1) {
2232
- if (c == '\0') {
2233
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
- c = 0xFFFD;
2235
- }
2029
+ switch (c) {
2030
+ case '>':
2031
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2032
+ return emit_comment(parser, output);
2033
+ case -1:
2034
+ // We need to emit the comment and then the EOF, so reconsume in data
2035
+ // state.
2036
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2037
+ return emit_comment(parser, output);
2038
+ case '\0':
2039
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2040
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2041
+ return CONTINUE;
2042
+ default:
2236
2043
  append_char_to_temporary_buffer(parser, c);
2237
- utf8iterator_next(&tokenizer->_input);
2238
- c = utf8iterator_current(&tokenizer->_input);
2044
+ return CONTINUE;
2239
2045
  }
2240
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
- return emit_comment(parser, output);
2242
2046
  }
2243
2047
 
2244
2048
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
- static StateResult handle_markup_declaration_state (
2049
+ static StateResult handle_markup_declaration_open_state (
2246
2050
  GumboParser* parser,
2247
2051
  GumboTokenizerState* tokenizer,
2248
2052
  int UNUSED_ARG(c),
@@ -2253,21 +2057,21 @@ static StateResult handle_markup_declaration_state (
2253
2057
  &tokenizer->_input,
2254
2058
  "--",
2255
2059
  sizeof("--") - 1,
2256
- true
2060
+ /* case sensitive */ true
2257
2061
  )
2258
2062
  ) {
2259
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2260
- tokenizer->_reconsume_current_input = true;
2261
- } else if (
2063
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2064
+ return CONTINUE;
2065
+ }
2066
+ if (
2262
2067
  utf8iterator_maybe_consume_match (
2263
2068
  &tokenizer->_input,
2264
2069
  "DOCTYPE",
2265
2070
  sizeof("DOCTYPE") - 1,
2266
- false
2071
+ /* case sensitive */ false
2267
2072
  )
2268
2073
  ) {
2269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2270
- tokenizer->_reconsume_current_input = true;
2074
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2271
2075
  // If we get here, we know we'll eventually emit a doctype token, so now is
2272
2076
  // the time to initialize the doctype strings. (Not in doctype_state_init,
2273
2077
  // since then they'll leak if ownership never gets transferred to the
@@ -2275,24 +2079,35 @@ static StateResult handle_markup_declaration_state (
2275
2079
  tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
2080
  tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
2081
  tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
- } else if (
2279
- tokenizer->_is_current_node_foreign
2280
- && utf8iterator_maybe_consume_match (
2082
+ return CONTINUE;
2083
+ }
2084
+ if (
2085
+ utf8iterator_maybe_consume_match (
2281
2086
  &tokenizer->_input,
2282
2087
  "[CDATA[", sizeof("[CDATA[") - 1,
2283
- true
2088
+ /* case sensitive */ true
2284
2089
  )
2285
2090
  ) {
2286
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2287
- tokenizer->_is_in_cdata = true;
2288
- tokenizer->_reconsume_current_input = true;
2289
- } else {
2290
- tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2291
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2292
- tokenizer->_reconsume_current_input = true;
2293
- clear_temporary_buffer(parser);
2091
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2092
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2093
+ tokenizer->_is_in_cdata = true;
2094
+ // Start the token after the <![CDATA[.
2095
+ reset_token_start_point(tokenizer);
2096
+ } else {
2097
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2098
+ clear_temporary_buffer(parser);
2099
+ append_string_to_temporary_buffer (
2100
+ parser,
2101
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2102
+ );
2103
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2104
+ }
2105
+ return CONTINUE;
2294
2106
  }
2295
- return NEXT_CHAR;
2107
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2108
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2109
+ clear_temporary_buffer(parser);
2110
+ return CONTINUE;
2296
2111
  }
2297
2112
 
2298
2113
  // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
@@ -2305,26 +2120,14 @@ static StateResult handle_comment_start_state (
2305
2120
  switch (c) {
2306
2121
  case '-':
2307
2122
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2308
- return NEXT_CHAR;
2309
- case '\0':
2310
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2312
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2313
- return NEXT_CHAR;
2123
+ return CONTINUE;
2314
2124
  case '>':
2315
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2125
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2316
2126
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2317
- emit_comment(parser, output);
2318
- return RETURN_ERROR;
2319
- case -1:
2320
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2321
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2322
- emit_comment(parser, output);
2323
- return RETURN_ERROR;
2127
+ return emit_comment(parser, output);
2324
2128
  default:
2325
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2326
- append_char_to_temporary_buffer(parser, c);
2327
- return NEXT_CHAR;
2129
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2130
+ return CONTINUE;
2328
2131
  }
2329
2132
  }
2330
2133
 
@@ -2338,28 +2141,20 @@ static StateResult handle_comment_start_dash_state (
2338
2141
  switch (c) {
2339
2142
  case '-':
2340
2143
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2341
- return NEXT_CHAR;
2342
- case '\0':
2343
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2344
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2345
- append_char_to_temporary_buffer(parser, '-');
2346
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2347
- return NEXT_CHAR;
2144
+ return CONTINUE;
2348
2145
  case '>':
2349
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2146
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2350
2147
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2351
- emit_comment(parser, output);
2352
- return RETURN_ERROR;
2148
+ return emit_comment(parser, output);
2353
2149
  case -1:
2354
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
- emit_comment(parser, output);
2357
- return RETURN_ERROR;
2150
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2151
+ // Switch to data to emit the EOF next.
2152
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2153
+ return emit_comment(parser, output);
2358
2154
  default:
2359
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2155
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2360
2156
  append_char_to_temporary_buffer(parser, '-');
2361
- append_char_to_temporary_buffer(parser, c);
2362
- return NEXT_CHAR;
2157
+ return CONTINUE;
2363
2158
  }
2364
2159
  }
2365
2160
 
@@ -2371,21 +2166,99 @@ static StateResult handle_comment_state (
2371
2166
  GumboToken* output
2372
2167
  ) {
2373
2168
  switch (c) {
2169
+ case '<':
2170
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2171
+ append_char_to_temporary_buffer(parser, c);
2172
+ return CONTINUE;
2374
2173
  case '-':
2375
2174
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2376
- return NEXT_CHAR;
2175
+ return CONTINUE;
2377
2176
  case '\0':
2378
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2177
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2379
2178
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2380
- return NEXT_CHAR;
2179
+ return CONTINUE;
2381
2180
  case -1:
2382
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2383
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2384
- emit_comment(parser, output);
2385
- return RETURN_ERROR;
2181
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2182
+ // Switch to data to emit the EOF token next.
2183
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2184
+ return emit_comment(parser, output);
2386
2185
  default:
2387
2186
  append_char_to_temporary_buffer(parser, c);
2388
- return NEXT_CHAR;
2187
+ return CONTINUE;
2188
+ }
2189
+ }
2190
+
2191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2192
+ static StateResult handle_comment_lt_state (
2193
+ GumboParser* parser,
2194
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2195
+ int c,
2196
+ GumboToken* output
2197
+ ) {
2198
+ switch (c) {
2199
+ case '!':
2200
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2201
+ append_char_to_temporary_buffer(parser, c);
2202
+ return CONTINUE;
2203
+ case '<':
2204
+ append_char_to_temporary_buffer(parser, c);
2205
+ return CONTINUE;
2206
+ default:
2207
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2208
+ return CONTINUE;
2209
+ }
2210
+ }
2211
+
2212
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2213
+ static StateResult handle_comment_lt_bang_state (
2214
+ GumboParser* parser,
2215
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2216
+ int c,
2217
+ GumboToken* output
2218
+ ) {
2219
+ switch (c) {
2220
+ case '-':
2221
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2222
+ return CONTINUE;
2223
+ default:
2224
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2225
+ return CONTINUE;
2226
+ }
2227
+ }
2228
+
2229
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2230
+ static StateResult handle_comment_lt_bang_dash_state (
2231
+ GumboParser* parser,
2232
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2233
+ int c,
2234
+ GumboToken* output
2235
+ ) {
2236
+ switch (c) {
2237
+ case '-':
2238
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2239
+ return CONTINUE;
2240
+ default:
2241
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2242
+ return CONTINUE;
2243
+ }
2244
+ }
2245
+
2246
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2247
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2248
+ GumboParser* parser,
2249
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2250
+ int c,
2251
+ GumboToken* output
2252
+ ) {
2253
+ switch (c) {
2254
+ case '>':
2255
+ case -1:
2256
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2257
+ return CONTINUE;
2258
+ default:
2259
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2260
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2261
+ return CONTINUE;
2389
2262
  }
2390
2263
  }
2391
2264
 
@@ -2397,25 +2270,18 @@ static StateResult handle_comment_end_dash_state (
2397
2270
  GumboToken* output
2398
2271
  ) {
2399
2272
  switch (c) {
2400
- case '-':
2401
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2402
- return NEXT_CHAR;
2403
- case '\0':
2404
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2405
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2406
- append_char_to_temporary_buffer(parser, '-');
2407
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2408
- return NEXT_CHAR;
2409
- case -1:
2410
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2411
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2412
- emit_comment(parser, output);
2413
- return RETURN_ERROR;
2414
- default:
2415
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2416
- append_char_to_temporary_buffer(parser, '-');
2417
- append_char_to_temporary_buffer(parser, c);
2418
- return NEXT_CHAR;
2273
+ case '-':
2274
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2275
+ return CONTINUE;
2276
+ case -1:
2277
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2278
+ // Switch to data to emit EOF next.
2279
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2280
+ return emit_comment(parser, output);
2281
+ default:
2282
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2283
+ append_char_to_temporary_buffer(parser, '-');
2284
+ return CONTINUE;
2419
2285
  }
2420
2286
  }
2421
2287
 
@@ -2430,35 +2296,22 @@ static StateResult handle_comment_end_state (
2430
2296
  case '>':
2431
2297
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2432
2298
  return emit_comment(parser, output);
2433
- case '\0':
2434
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2435
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2436
- append_char_to_temporary_buffer(parser, '-');
2437
- append_char_to_temporary_buffer(parser, '-');
2438
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2439
- return NEXT_CHAR;
2440
2299
  case '!':
2441
- tokenizer_add_parse_error(
2442
- parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2443
2300
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2444
- return NEXT_CHAR;
2301
+ return CONTINUE;
2445
2302
  case '-':
2446
- tokenizer_add_parse_error(
2447
- parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2448
2303
  append_char_to_temporary_buffer(parser, '-');
2449
- return NEXT_CHAR;
2304
+ return CONTINUE;
2450
2305
  case -1:
2451
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2306
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2307
+ // Switch to data to emit EOF next.
2452
2308
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2453
- emit_comment(parser, output);
2454
- return RETURN_ERROR;
2309
+ return emit_comment(parser, output);
2455
2310
  default:
2456
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2457
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2311
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2458
2312
  append_char_to_temporary_buffer(parser, '-');
2459
2313
  append_char_to_temporary_buffer(parser, '-');
2460
- append_char_to_temporary_buffer(parser, c);
2461
- return NEXT_CHAR;
2314
+ return CONTINUE;
2462
2315
  }
2463
2316
  }
2464
2317
 
@@ -2475,30 +2328,22 @@ static StateResult handle_comment_end_bang_state (
2475
2328
  append_char_to_temporary_buffer(parser, '-');
2476
2329
  append_char_to_temporary_buffer(parser, '-');
2477
2330
  append_char_to_temporary_buffer(parser, '!');
2478
- return NEXT_CHAR;
2331
+ return CONTINUE;
2479
2332
  case '>':
2333
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2480
2334
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2481
2335
  return emit_comment(parser, output);
2482
- case '\0':
2483
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2485
- append_char_to_temporary_buffer(parser, '-');
2486
- append_char_to_temporary_buffer(parser, '-');
2487
- append_char_to_temporary_buffer(parser, '!');
2488
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2489
- return NEXT_CHAR;
2490
2336
  case -1:
2491
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2337
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2338
+ // Switch to data to emit EOF next.
2492
2339
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
- emit_comment(parser, output);
2494
- return RETURN_ERROR;
2340
+ return emit_comment(parser, output);
2495
2341
  default:
2496
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2342
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2497
2343
  append_char_to_temporary_buffer(parser, '-');
2498
2344
  append_char_to_temporary_buffer(parser, '-');
2499
2345
  append_char_to_temporary_buffer(parser, '!');
2500
- append_char_to_temporary_buffer(parser, c);
2501
- return NEXT_CHAR;
2346
+ return CONTINUE;
2502
2347
  }
2503
2348
  }
2504
2349
 
@@ -2509,26 +2354,27 @@ static StateResult handle_doctype_state (
2509
2354
  int c,
2510
2355
  GumboToken* output
2511
2356
  ) {
2512
- assert(!tokenizer->_temporary_buffer.length);
2357
+ assert(temporary_buffer_is_empty(parser));
2513
2358
  switch (c) {
2514
2359
  case '\t':
2515
2360
  case '\n':
2516
2361
  case '\f':
2517
2362
  case ' ':
2518
2363
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2519
- return NEXT_CHAR;
2364
+ return CONTINUE;
2365
+ case '>':
2366
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2367
+ return CONTINUE;
2520
2368
  case -1:
2521
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2369
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2523
2370
  tokenizer->_doc_type_state.force_quirks = true;
2524
- emit_doctype(parser, output);
2525
- return RETURN_ERROR;
2371
+ // Switch to data to emit EOF next.
2372
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2373
+ return emit_doctype(parser, output);
2526
2374
  default:
2527
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2528
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2529
- tokenizer->_reconsume_current_input = true;
2530
- tokenizer->_doc_type_state.force_quirks = true;
2531
- return NEXT_CHAR;
2375
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2376
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2377
+ return CONTINUE;
2532
2378
  }
2533
2379
  }
2534
2380
 
@@ -2544,30 +2390,27 @@ static StateResult handle_before_doctype_name_state (
2544
2390
  case '\n':
2545
2391
  case '\f':
2546
2392
  case ' ':
2547
- return NEXT_CHAR;
2393
+ return CONTINUE;
2548
2394
  case '\0':
2549
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2395
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2550
2396
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2551
- tokenizer->_doc_type_state.force_quirks = true;
2552
2397
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2553
- return NEXT_CHAR;
2398
+ return CONTINUE;
2554
2399
  case '>':
2555
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2400
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2556
2401
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2557
2402
  tokenizer->_doc_type_state.force_quirks = true;
2558
- emit_doctype(parser, output);
2559
- return RETURN_ERROR;
2403
+ return emit_doctype(parser, output);
2560
2404
  case -1:
2561
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2562
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2405
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2563
2406
  tokenizer->_doc_type_state.force_quirks = true;
2564
- emit_doctype(parser, output);
2565
- return RETURN_ERROR;
2407
+ // Switch to data to emit EOF next.
2408
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2409
+ return emit_doctype(parser, output);
2566
2410
  default:
2567
2411
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2568
- tokenizer->_doc_type_state.force_quirks = false;
2569
2412
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2570
- return NEXT_CHAR;
2413
+ return CONTINUE;
2571
2414
  }
2572
2415
  }
2573
2416
 
@@ -2586,30 +2429,26 @@ static StateResult handle_doctype_name_state (
2586
2429
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2587
2430
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2588
2431
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2589
- return NEXT_CHAR;
2432
+ return CONTINUE;
2590
2433
  case '>':
2591
2434
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
2435
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2593
2436
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2594
- emit_doctype(parser, output);
2595
- return RETURN_SUCCESS;
2437
+ return emit_doctype(parser, output);
2596
2438
  case '\0':
2597
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2439
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2598
2440
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2599
- return NEXT_CHAR;
2441
+ return CONTINUE;
2600
2442
  case -1:
2601
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2602
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2444
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2603
2445
  tokenizer->_doc_type_state.force_quirks = true;
2604
2446
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2605
2447
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2606
- emit_doctype(parser, output);
2607
- return RETURN_ERROR;
2448
+ return emit_doctype(parser, output);
2608
2449
  default:
2609
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2610
- tokenizer->_doc_type_state.force_quirks = false;
2611
2450
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2612
- return NEXT_CHAR;
2451
+ return CONTINUE;
2613
2452
  }
2614
2453
  }
2615
2454
 
@@ -2625,35 +2464,29 @@ static StateResult handle_after_doctype_name_state (
2625
2464
  case '\n':
2626
2465
  case '\f':
2627
2466
  case ' ':
2628
- return NEXT_CHAR;
2467
+ return CONTINUE;
2629
2468
  case '>':
2630
2469
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2631
- emit_doctype(parser, output);
2632
- return RETURN_SUCCESS;
2470
+ return emit_doctype(parser, output);
2633
2471
  case -1:
2634
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2472
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
2473
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636
2474
  tokenizer->_doc_type_state.force_quirks = true;
2637
- emit_doctype(parser, output);
2638
- return RETURN_ERROR;
2475
+ return emit_doctype(parser, output);
2639
2476
  default:
2640
2477
  if (utf8iterator_maybe_consume_match(
2641
2478
  &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2642
- gumbo_tokenizer_set_state(
2643
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2644
- tokenizer->_reconsume_current_input = true;
2479
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2645
2480
  } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2646
2481
  sizeof("SYSTEM") - 1, false)) {
2647
- gumbo_tokenizer_set_state(
2648
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2649
- tokenizer->_reconsume_current_input = true;
2482
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2650
2483
  } else {
2651
2484
  tokenizer_add_parse_error(
2652
- parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2653
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2485
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2486
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2654
2487
  tokenizer->_doc_type_state.force_quirks = true;
2655
2488
  }
2656
- return NEXT_CHAR;
2489
+ return CONTINUE;
2657
2490
  }
2658
2491
  }
2659
2492
 
@@ -2670,37 +2503,34 @@ static StateResult handle_after_doctype_public_keyword_state (
2670
2503
  case '\f':
2671
2504
  case ' ':
2672
2505
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2673
- return NEXT_CHAR;
2506
+ return CONTINUE;
2674
2507
  case '"':
2675
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2508
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2676
2509
  assert(temporary_buffer_is_empty(parser));
2677
2510
  gumbo_tokenizer_set_state(
2678
2511
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2679
- return NEXT_CHAR;
2512
+ return CONTINUE;
2680
2513
  case '\'':
2681
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2514
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2682
2515
  assert(temporary_buffer_is_empty(parser));
2683
2516
  gumbo_tokenizer_set_state(
2684
2517
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2685
- return NEXT_CHAR;
2518
+ return CONTINUE;
2686
2519
  case '>':
2687
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2520
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2688
2521
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
2522
  tokenizer->_doc_type_state.force_quirks = true;
2690
- emit_doctype(parser, output);
2691
- return RETURN_ERROR;
2523
+ return emit_doctype(parser, output);
2692
2524
  case -1:
2693
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2694
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2525
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2526
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2695
2527
  tokenizer->_doc_type_state.force_quirks = true;
2696
- emit_doctype(parser, output);
2697
- return RETURN_ERROR;
2528
+ return emit_doctype(parser, output);
2698
2529
  default:
2699
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2700
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2530
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2531
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2701
2532
  tokenizer->_doc_type_state.force_quirks = true;
2702
- emit_doctype(parser, output);
2703
- return RETURN_ERROR;
2533
+ return CONTINUE;
2704
2534
  }
2705
2535
  }
2706
2536
 
@@ -2716,35 +2546,32 @@ static StateResult handle_before_doctype_public_id_state (
2716
2546
  case '\n':
2717
2547
  case '\f':
2718
2548
  case ' ':
2719
- return NEXT_CHAR;
2549
+ return CONTINUE;
2720
2550
  case '"':
2721
2551
  assert(temporary_buffer_is_empty(parser));
2722
2552
  gumbo_tokenizer_set_state(
2723
2553
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2724
- return NEXT_CHAR;
2554
+ return CONTINUE;
2725
2555
  case '\'':
2726
2556
  assert(temporary_buffer_is_empty(parser));
2727
2557
  gumbo_tokenizer_set_state(
2728
2558
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2729
- return NEXT_CHAR;
2559
+ return CONTINUE;
2730
2560
  case '>':
2731
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2561
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2732
2562
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2733
2563
  tokenizer->_doc_type_state.force_quirks = true;
2734
- emit_doctype(parser, output);
2735
- return RETURN_ERROR;
2564
+ return emit_doctype(parser, output);
2736
2565
  case -1:
2737
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2738
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2566
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2567
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2739
2568
  tokenizer->_doc_type_state.force_quirks = true;
2740
- emit_doctype(parser, output);
2741
- return RETURN_ERROR;
2569
+ return emit_doctype(parser, output);
2742
2570
  default:
2743
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2571
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2572
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2745
2573
  tokenizer->_doc_type_state.force_quirks = true;
2746
- emit_doctype(parser, output);
2747
- return RETURN_ERROR;
2574
+ return CONTINUE;
2748
2575
  }
2749
2576
  }
2750
2577
 
@@ -2759,28 +2586,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
2759
2586
  case '"':
2760
2587
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2761
2588
  finish_doctype_public_id(parser);
2762
- return NEXT_CHAR;
2589
+ return CONTINUE;
2763
2590
  case '\0':
2764
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2591
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2765
2592
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2766
- return NEXT_CHAR;
2593
+ return CONTINUE;
2767
2594
  case '>':
2768
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2595
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2769
2596
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2770
2597
  tokenizer->_doc_type_state.force_quirks = true;
2771
2598
  finish_doctype_public_id(parser);
2772
- emit_doctype(parser, output);
2773
- return RETURN_ERROR;
2599
+ return emit_doctype(parser, output);
2774
2600
  case -1:
2775
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2601
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2602
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2777
2603
  tokenizer->_doc_type_state.force_quirks = true;
2778
2604
  finish_doctype_public_id(parser);
2779
- emit_doctype(parser, output);
2780
- return RETURN_ERROR;
2605
+ return emit_doctype(parser, output);
2781
2606
  default:
2782
2607
  append_char_to_temporary_buffer(parser, c);
2783
- return NEXT_CHAR;
2608
+ return CONTINUE;
2784
2609
  }
2785
2610
  }
2786
2611
 
@@ -2795,28 +2620,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
2795
2620
  case '\'':
2796
2621
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2797
2622
  finish_doctype_public_id(parser);
2798
- return NEXT_CHAR;
2623
+ return CONTINUE;
2799
2624
  case '\0':
2800
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2801
2626
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2802
- return NEXT_CHAR;
2627
+ return CONTINUE;
2803
2628
  case '>':
2804
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2629
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2805
2630
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
2631
  tokenizer->_doc_type_state.force_quirks = true;
2807
2632
  finish_doctype_public_id(parser);
2808
- emit_doctype(parser, output);
2809
- return RETURN_ERROR;
2633
+ return emit_doctype(parser, output);
2810
2634
  case -1:
2811
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2812
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2635
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2636
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2813
2637
  tokenizer->_doc_type_state.force_quirks = true;
2814
2638
  finish_doctype_public_id(parser);
2815
- emit_doctype(parser, output);
2816
- return RETURN_ERROR;
2639
+ return emit_doctype(parser, output);
2817
2640
  default:
2818
2641
  append_char_to_temporary_buffer(parser, c);
2819
- return NEXT_CHAR;
2642
+ return CONTINUE;
2820
2643
  }
2821
2644
  }
2822
2645
 
@@ -2834,35 +2657,38 @@ static StateResult handle_after_doctype_public_id_state (
2834
2657
  case ' ':
2835
2658
  gumbo_tokenizer_set_state(
2836
2659
  parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2837
- return NEXT_CHAR;
2660
+ return CONTINUE;
2838
2661
  case '>':
2839
2662
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2840
- emit_doctype(parser, output);
2841
- return RETURN_SUCCESS;
2663
+ return emit_doctype(parser, output);
2842
2664
  case '"':
2843
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2665
+ tokenizer_add_parse_error (
2666
+ parser,
2667
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2668
+ );
2844
2669
  assert(temporary_buffer_is_empty(parser));
2845
2670
  gumbo_tokenizer_set_state(
2846
2671
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2847
- return NEXT_CHAR;
2672
+ return CONTINUE;
2848
2673
  case '\'':
2849
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2674
+ tokenizer_add_parse_error (
2675
+ parser,
2676
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2677
+ );
2850
2678
  assert(temporary_buffer_is_empty(parser));
2851
2679
  gumbo_tokenizer_set_state(
2852
2680
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2853
- return NEXT_CHAR;
2681
+ return CONTINUE;
2854
2682
  case -1:
2855
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2856
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2857
- tokenizer->_reconsume_current_input = true;
2683
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2684
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2858
2685
  tokenizer->_doc_type_state.force_quirks = true;
2859
- emit_doctype(parser, output);
2860
- return RETURN_ERROR;
2686
+ return emit_doctype(parser, output);
2861
2687
  default:
2862
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2863
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2688
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2689
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2864
2690
  tokenizer->_doc_type_state.force_quirks = true;
2865
- return NEXT_CHAR;
2691
+ return CONTINUE;
2866
2692
  }
2867
2693
  }
2868
2694
 
@@ -2878,33 +2704,30 @@ static StateResult handle_between_doctype_public_system_id_state (
2878
2704
  case '\n':
2879
2705
  case '\f':
2880
2706
  case ' ':
2881
- return NEXT_CHAR;
2707
+ return CONTINUE;
2882
2708
  case '>':
2883
2709
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2884
- emit_doctype(parser, output);
2885
- return RETURN_SUCCESS;
2710
+ return emit_doctype(parser, output);
2886
2711
  case '"':
2887
2712
  assert(temporary_buffer_is_empty(parser));
2888
2713
  gumbo_tokenizer_set_state(
2889
2714
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2890
- return NEXT_CHAR;
2715
+ return CONTINUE;
2891
2716
  case '\'':
2892
2717
  assert(temporary_buffer_is_empty(parser));
2893
2718
  gumbo_tokenizer_set_state(
2894
2719
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2895
- return NEXT_CHAR;
2720
+ return CONTINUE;
2896
2721
  case -1:
2897
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2898
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2722
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2723
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2899
2724
  tokenizer->_doc_type_state.force_quirks = true;
2900
- emit_doctype(parser, output);
2901
- return RETURN_ERROR;
2725
+ return emit_doctype(parser, output);
2902
2726
  default:
2903
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2904
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2727
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2728
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2905
2729
  tokenizer->_doc_type_state.force_quirks = true;
2906
- emit_doctype(parser, output);
2907
- return RETURN_ERROR;
2730
+ return CONTINUE;
2908
2731
  }
2909
2732
  }
2910
2733
 
@@ -2921,36 +2744,34 @@ static StateResult handle_after_doctype_system_keyword_state (
2921
2744
  case '\f':
2922
2745
  case ' ':
2923
2746
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2924
- return NEXT_CHAR;
2747
+ return CONTINUE;
2925
2748
  case '"':
2926
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2749
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2927
2750
  assert(temporary_buffer_is_empty(parser));
2928
2751
  gumbo_tokenizer_set_state(
2929
2752
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2930
- return NEXT_CHAR;
2753
+ return CONTINUE;
2931
2754
  case '\'':
2932
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2755
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2933
2756
  assert(temporary_buffer_is_empty(parser));
2934
2757
  gumbo_tokenizer_set_state(
2935
2758
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2936
- return NEXT_CHAR;
2759
+ return CONTINUE;
2937
2760
  case '>':
2938
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2761
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2939
2762
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2940
2763
  tokenizer->_doc_type_state.force_quirks = true;
2941
- emit_doctype(parser, output);
2942
- return RETURN_ERROR;
2764
+ return emit_doctype(parser, output);
2943
2765
  case -1:
2944
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2945
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2766
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2767
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2946
2768
  tokenizer->_doc_type_state.force_quirks = true;
2947
- emit_doctype(parser, output);
2948
- return RETURN_ERROR;
2769
+ return emit_doctype(parser, output);
2949
2770
  default:
2950
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2951
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2771
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2772
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2952
2773
  tokenizer->_doc_type_state.force_quirks = true;
2953
- return NEXT_CHAR;
2774
+ return CONTINUE;
2954
2775
  }
2955
2776
  }
2956
2777
 
@@ -2966,34 +2787,32 @@ static StateResult handle_before_doctype_system_id_state (
2966
2787
  case '\n':
2967
2788
  case '\f':
2968
2789
  case ' ':
2969
- return NEXT_CHAR;
2790
+ return CONTINUE;
2970
2791
  case '"':
2971
2792
  assert(temporary_buffer_is_empty(parser));
2972
2793
  gumbo_tokenizer_set_state(
2973
2794
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2974
- return NEXT_CHAR;
2795
+ return CONTINUE;
2975
2796
  case '\'':
2976
2797
  assert(temporary_buffer_is_empty(parser));
2977
2798
  gumbo_tokenizer_set_state(
2978
2799
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2979
- return NEXT_CHAR;
2800
+ return CONTINUE;
2980
2801
  case '>':
2981
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2802
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2982
2803
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2983
2804
  tokenizer->_doc_type_state.force_quirks = true;
2984
- emit_doctype(parser, output);
2985
- return RETURN_ERROR;
2805
+ return emit_doctype(parser, output);
2986
2806
  case -1:
2987
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2988
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2807
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2808
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2989
2809
  tokenizer->_doc_type_state.force_quirks = true;
2990
- emit_doctype(parser, output);
2991
- return RETURN_ERROR;
2810
+ return emit_doctype(parser, output);
2992
2811
  default:
2993
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2994
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2812
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2813
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2995
2814
  tokenizer->_doc_type_state.force_quirks = true;
2996
- return NEXT_CHAR;
2815
+ return CONTINUE;
2997
2816
  }
2998
2817
  }
2999
2818
 
@@ -3008,28 +2827,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
3008
2827
  case '"':
3009
2828
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3010
2829
  finish_doctype_system_id(parser);
3011
- return NEXT_CHAR;
2830
+ return CONTINUE;
3012
2831
  case '\0':
3013
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2832
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3014
2833
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3015
- return NEXT_CHAR;
2834
+ return CONTINUE;
3016
2835
  case '>':
3017
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2836
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3018
2837
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3019
2838
  tokenizer->_doc_type_state.force_quirks = true;
3020
2839
  finish_doctype_system_id(parser);
3021
- emit_doctype(parser, output);
3022
- return RETURN_ERROR;
2840
+ return emit_doctype(parser, output);
3023
2841
  case -1:
3024
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3025
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2842
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2843
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3026
2844
  tokenizer->_doc_type_state.force_quirks = true;
3027
2845
  finish_doctype_system_id(parser);
3028
- emit_doctype(parser, output);
3029
- return RETURN_ERROR;
2846
+ return emit_doctype(parser, output);
3030
2847
  default:
3031
2848
  append_char_to_temporary_buffer(parser, c);
3032
- return NEXT_CHAR;
2849
+ return CONTINUE;
3033
2850
  }
3034
2851
  }
3035
2852
 
@@ -3044,28 +2861,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
3044
2861
  case '\'':
3045
2862
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3046
2863
  finish_doctype_system_id(parser);
3047
- return NEXT_CHAR;
2864
+ return CONTINUE;
3048
2865
  case '\0':
3049
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2866
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3050
2867
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3051
- return NEXT_CHAR;
2868
+ return CONTINUE;
3052
2869
  case '>':
3053
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2870
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3054
2871
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3055
2872
  tokenizer->_doc_type_state.force_quirks = true;
3056
2873
  finish_doctype_system_id(parser);
3057
- emit_doctype(parser, output);
3058
- return RETURN_ERROR;
2874
+ return emit_doctype(parser, output);
3059
2875
  case -1:
3060
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3061
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2876
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2877
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3062
2878
  tokenizer->_doc_type_state.force_quirks = true;
3063
2879
  finish_doctype_system_id(parser);
3064
- emit_doctype(parser, output);
3065
- return RETURN_ERROR;
2880
+ return emit_doctype(parser, output);
3066
2881
  default:
3067
2882
  append_char_to_temporary_buffer(parser, c);
3068
- return NEXT_CHAR;
2883
+ return CONTINUE;
3069
2884
  }
3070
2885
  }
3071
2886
 
@@ -3081,21 +2896,19 @@ static StateResult handle_after_doctype_system_id_state (
3081
2896
  case '\n':
3082
2897
  case '\f':
3083
2898
  case ' ':
3084
- return NEXT_CHAR;
2899
+ return CONTINUE;
3085
2900
  case '>':
3086
2901
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3087
- emit_doctype(parser, output);
3088
- return RETURN_SUCCESS;
2902
+ return emit_doctype(parser, output);
3089
2903
  case -1:
3090
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3091
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2904
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2905
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3092
2906
  tokenizer->_doc_type_state.force_quirks = true;
3093
- emit_doctype(parser, output);
3094
- return RETURN_ERROR;
2907
+ return emit_doctype(parser, output);
3095
2908
  default:
3096
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3097
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
3098
- return NEXT_CHAR;
2909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2910
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2911
+ return CONTINUE;
3099
2912
  }
3100
2913
  }
3101
2914
 
@@ -3106,33 +2919,370 @@ static StateResult handle_bogus_doctype_state (
3106
2919
  int c,
3107
2920
  GumboToken* output
3108
2921
  ) {
3109
- if (c == '>' || c == -1) {
2922
+ switch (c) {
2923
+ case '>':
3110
2924
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3111
- emit_doctype(parser, output);
3112
- return RETURN_ERROR;
2925
+ return emit_doctype(parser, output);
2926
+ case '\0':
2927
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2928
+ return CONTINUE;
2929
+ case -1:
2930
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2931
+ return emit_doctype(parser, output);
2932
+ default:
2933
+ return CONTINUE;
3113
2934
  }
3114
- return NEXT_CHAR;
3115
2935
  }
3116
2936
 
3117
2937
  // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
- static StateResult handle_cdata_state (
2938
+ static StateResult handle_cdata_section_state (
3119
2939
  GumboParser* parser,
3120
2940
  GumboTokenizerState* tokenizer,
3121
2941
  int c,
3122
2942
  GumboToken* output
3123
2943
  ) {
3124
- if (c == -1 || utf8iterator_maybe_consume_match(
3125
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
3126
- tokenizer->_reconsume_current_input = true;
2944
+ switch (c) {
2945
+ case ']':
2946
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2947
+ set_mark(parser);
2948
+ return CONTINUE;
2949
+ case -1:
2950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2951
+ return emit_eof(parser, output);
2952
+ default:
2953
+ return emit_char(parser, c, output);
2954
+ }
2955
+ }
2956
+
2957
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2958
+ static StateResult handle_cdata_section_bracket_state (
2959
+ GumboParser* parser,
2960
+ GumboTokenizerState* tokenizer,
2961
+ int c,
2962
+ GumboToken* output
2963
+ ) {
2964
+ switch (c) {
2965
+ case ']':
2966
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2967
+ return CONTINUE;
2968
+ default:
2969
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2970
+ // Emit the ].
2971
+ return emit_from_mark(parser, output);
2972
+ }
2973
+ }
2974
+
2975
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2976
+ static StateResult handle_cdata_section_end_state (
2977
+ GumboParser* parser,
2978
+ GumboTokenizerState* tokenizer,
2979
+ int c,
2980
+ GumboToken* output
2981
+ ) {
2982
+ switch (c) {
2983
+ case ']':
2984
+ {
2985
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2986
+ // of the three in a row we've seen. So let's emit one token from the
2987
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2988
+ // advance one). Next, let's clear the temporary buffer which will set the
2989
+ // mark to the middle of the three brackets. Finally, let's move to the
2990
+ // appropriate state.
2991
+ StateResult result = emit_from_mark(parser, output);
2992
+ tokenizer->_resume_pos = NULL;
2993
+ set_mark(parser);
2994
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2995
+ return result;
2996
+ }
2997
+ case '>':
2998
+ // We're done with CDATA so move past the >, reset the token start point
2999
+ // to point after the >, and then reconsume in the data state.
3000
+ utf8iterator_next(&tokenizer->_input);
3127
3001
  reset_token_start_point(tokenizer);
3128
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3002
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3129
3003
  tokenizer->_is_in_cdata = false;
3130
- return NEXT_CHAR;
3131
- } else {
3132
- return emit_current_char(parser, output);
3004
+ return CONTINUE;
3005
+ default:
3006
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3007
+ return emit_from_mark(parser, output);
3133
3008
  }
3134
3009
  }
3135
3010
 
3011
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3012
+ static StateResult handle_character_reference_state (
3013
+ GumboParser* parser,
3014
+ GumboTokenizerState* tokenizer,
3015
+ int c,
3016
+ GumboToken* output
3017
+ ) {
3018
+ if (gumbo_ascii_isalnum(c)) {
3019
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3020
+ return CONTINUE;
3021
+ }
3022
+ if (c == '#') {
3023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3024
+ return CONTINUE;
3025
+ }
3026
+ reconsume_in_state(parser, tokenizer->_return_state);
3027
+ return flush_code_points_consumed_as_character_reference(parser, output);
3028
+ }
3029
+
3030
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3031
+ static StateResult handle_named_character_reference_state (
3032
+ GumboParser* parser,
3033
+ GumboTokenizerState* tokenizer,
3034
+ int c,
3035
+ GumboToken* output
3036
+ ) {
3037
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3038
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3039
+ int code_point[2];
3040
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3041
+
3042
+ if (size > 0) {
3043
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3044
+ int next = utf8iterator_current(&tokenizer->_input);
3045
+ reconsume_in_state(parser, tokenizer->_return_state);
3046
+ if (character_reference_part_of_attribute(parser)
3047
+ && cur[size-1] != ';'
3048
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3049
+ GumboStringPiece str = { .data = cur, .length = size };
3050
+ append_string_to_temporary_buffer(parser, &str);
3051
+ return flush_code_points_consumed_as_character_reference(parser, output);
3052
+ }
3053
+ if (cur[size-1] != ';')
3054
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3055
+ reconsume_in_state(parser, tokenizer->_return_state);
3056
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3057
+ }
3058
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3059
+ return flush_code_points_consumed_as_character_reference(parser, output);
3060
+ }
3061
+
3062
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3063
+ static StateResult handle_ambiguous_ampersand_state (
3064
+ GumboParser* parser,
3065
+ GumboTokenizerState* tokenizer,
3066
+ int c,
3067
+ GumboToken* output
3068
+ ) {
3069
+ if (gumbo_ascii_isalnum(c)) {
3070
+ if (character_reference_part_of_attribute(parser)) {
3071
+ append_char_to_tag_buffer(parser, c, true);
3072
+ return CONTINUE;
3073
+ }
3074
+ return emit_char(parser, c, output);
3075
+ }
3076
+ if (c == ';') {
3077
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3078
+ // fall through
3079
+ }
3080
+ reconsume_in_state(parser, tokenizer->_return_state);
3081
+ return CONTINUE;
3082
+ }
3083
+
3084
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3085
+ static StateResult handle_numeric_character_reference_state (
3086
+ GumboParser* parser,
3087
+ GumboTokenizerState* tokenizer,
3088
+ int c,
3089
+ GumboToken* output
3090
+ ) {
3091
+ tokenizer->_character_reference_code = 0;
3092
+ switch (c) {
3093
+ case 'x':
3094
+ case 'X':
3095
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3096
+ return CONTINUE;
3097
+ default:
3098
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3099
+ return CONTINUE;
3100
+ }
3101
+ }
3102
+
3103
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
3104
+ static StateResult handle_hexadecimal_character_reference_start_state (
3105
+ GumboParser* parser,
3106
+ GumboTokenizerState* tokenizer,
3107
+ int c,
3108
+ GumboToken* output
3109
+ ) {
3110
+ if (gumbo_ascii_isxdigit(c)) {
3111
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3112
+ return CONTINUE;
3113
+ }
3114
+ tokenizer_add_char_ref_error (
3115
+ parser,
3116
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3117
+ -1
3118
+ );
3119
+ reconsume_in_state(parser, tokenizer->_return_state);
3120
+ return flush_code_points_consumed_as_character_reference(parser, output);
3121
+ }
3122
+
3123
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3124
+ static StateResult handle_decimal_character_reference_start_state (
3125
+ GumboParser* parser,
3126
+ GumboTokenizerState* tokenizer,
3127
+ int c,
3128
+ GumboToken* output
3129
+ ) {
3130
+ if (gumbo_ascii_isdigit(c)) {
3131
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3132
+ return CONTINUE;
3133
+ }
3134
+ tokenizer_add_char_ref_error (
3135
+ parser,
3136
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3137
+ -1
3138
+ );
3139
+ reconsume_in_state(parser, tokenizer->_return_state);
3140
+ return flush_code_points_consumed_as_character_reference(parser, output);
3141
+ }
3142
+
3143
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
3144
+ static StateResult handle_hexadecimal_character_reference_state (
3145
+ GumboParser* parser,
3146
+ GumboTokenizerState* tokenizer,
3147
+ int c,
3148
+ GumboToken* output
3149
+ ) {
3150
+ if (gumbo_ascii_isdigit(c)) {
3151
+ tokenizer->_character_reference_code =
3152
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3153
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3154
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3155
+ return CONTINUE;
3156
+ }
3157
+ if (gumbo_ascii_isupper_xdigit(c)) {
3158
+ tokenizer->_character_reference_code =
3159
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3160
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3161
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3162
+ return CONTINUE;
3163
+ }
3164
+ if (gumbo_ascii_islower_xdigit(c)) {
3165
+ tokenizer->_character_reference_code =
3166
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3167
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3168
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3169
+ return CONTINUE;
3170
+ }
3171
+ if (c == ';') {
3172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3173
+ return CONTINUE;
3174
+ }
3175
+ tokenizer_add_char_ref_error(
3176
+ parser,
3177
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3178
+ tokenizer->_character_reference_code
3179
+ );
3180
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3181
+ return CONTINUE;
3182
+ }
3183
+
3184
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3185
+ static StateResult handle_decimal_character_reference_state (
3186
+ GumboParser* parser,
3187
+ GumboTokenizerState* tokenizer,
3188
+ int c,
3189
+ GumboToken* output
3190
+ ) {
3191
+ if (gumbo_ascii_isdigit(c)) {
3192
+ tokenizer->_character_reference_code =
3193
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3194
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3195
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3196
+ return CONTINUE;
3197
+ }
3198
+ if (c == ';') {
3199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3200
+ return CONTINUE;
3201
+ }
3202
+ tokenizer_add_char_ref_error(
3203
+ parser,
3204
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3205
+ tokenizer->_character_reference_code
3206
+ );
3207
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3208
+ return CONTINUE;
3209
+ }
3210
+
3211
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3212
+ static StateResult handle_numeric_character_reference_end_state (
3213
+ GumboParser* parser,
3214
+ GumboTokenizerState* tokenizer,
3215
+ int c,
3216
+ GumboToken* output
3217
+ ) {
3218
+ c = tokenizer->_character_reference_code;
3219
+ if (c == 0) {
3220
+ tokenizer_add_char_ref_error(
3221
+ parser,
3222
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3223
+ c
3224
+ );
3225
+ c = kUtf8ReplacementChar;
3226
+ } else if (c > kUtf8MaxChar) {
3227
+ tokenizer_add_char_ref_error(
3228
+ parser,
3229
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3230
+ c
3231
+ );
3232
+ c = kUtf8ReplacementChar;
3233
+ } else if (utf8_is_surrogate(c)) {
3234
+ tokenizer_add_char_ref_error(
3235
+ parser,
3236
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3237
+ c
3238
+ );
3239
+ c = kUtf8ReplacementChar;
3240
+ } else if (utf8_is_noncharacter(c)) {
3241
+ tokenizer_add_char_ref_error(
3242
+ parser,
3243
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3244
+ c
3245
+ );
3246
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3247
+ tokenizer_add_char_ref_error(
3248
+ parser,
3249
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3250
+ c
3251
+ );
3252
+ switch (c) {
3253
+ case 0x80: c = 0x20AC; break;
3254
+ case 0x82: c = 0x201A; break;
3255
+ case 0x83: c = 0x0192; break;
3256
+ case 0x84: c = 0x201E; break;
3257
+ case 0x85: c = 0x2026; break;
3258
+ case 0x86: c = 0x2020; break;
3259
+ case 0x87: c = 0x2021; break;
3260
+ case 0x88: c = 0x02C6; break;
3261
+ case 0x89: c = 0x2030; break;
3262
+ case 0x8A: c = 0x0160; break;
3263
+ case 0x8B: c = 0x2039; break;
3264
+ case 0x8C: c = 0x0152; break;
3265
+ case 0x8E: c = 0x017D; break;
3266
+ case 0x91: c = 0x2018; break;
3267
+ case 0x92: c = 0x2019; break;
3268
+ case 0x93: c = 0x201C; break;
3269
+ case 0x94: c = 0x201D; break;
3270
+ case 0x95: c = 0x2022; break;
3271
+ case 0x96: c = 0x2013; break;
3272
+ case 0x97: c = 0x2014; break;
3273
+ case 0x98: c = 0x02DC; break;
3274
+ case 0x99: c = 0x2122; break;
3275
+ case 0x9A: c = 0x0161; break;
3276
+ case 0x9B: c = 0x203A; break;
3277
+ case 0x9C: c = 0x0153; break;
3278
+ case 0x9E: c = 0x017E; break;
3279
+ case 0x9F: c = 0x0178; break;
3280
+ }
3281
+ }
3282
+ reconsume_in_state(parser, tokenizer->_return_state);
3283
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3284
+ }
3285
+
3136
3286
  typedef StateResult (*GumboLexerStateFunction) (
3137
3287
  GumboParser* parser,
3138
3288
  GumboTokenizerState* tokenizer,
@@ -3141,74 +3291,86 @@ typedef StateResult (*GumboLexerStateFunction) (
3141
3291
  );
3142
3292
 
3143
3293
  static GumboLexerStateFunction dispatch_table[] = {
3144
- handle_data_state,
3145
- handle_char_ref_in_data_state,
3146
- handle_rcdata_state,
3147
- handle_char_ref_in_rcdata_state,
3148
- handle_rawtext_state,
3149
- handle_script_state,
3150
- handle_plaintext_state,
3151
- handle_tag_open_state,
3152
- handle_end_tag_open_state,
3153
- handle_tag_name_state,
3154
- handle_rcdata_lt_state,
3155
- handle_rcdata_end_tag_open_state,
3156
- handle_rcdata_end_tag_name_state,
3157
- handle_rawtext_lt_state,
3158
- handle_rawtext_end_tag_open_state,
3159
- handle_rawtext_end_tag_name_state,
3160
- handle_script_lt_state,
3161
- handle_script_end_tag_open_state,
3162
- handle_script_end_tag_name_state,
3163
- handle_script_escaped_start_state,
3164
- handle_script_escaped_start_dash_state,
3165
- handle_script_escaped_state,
3166
- handle_script_escaped_dash_state,
3167
- handle_script_escaped_dash_dash_state,
3168
- handle_script_escaped_lt_state,
3169
- handle_script_escaped_end_tag_open_state,
3170
- handle_script_escaped_end_tag_name_state,
3171
- handle_script_double_escaped_start_state,
3172
- handle_script_double_escaped_state,
3173
- handle_script_double_escaped_dash_state,
3174
- handle_script_double_escaped_dash_dash_state,
3175
- handle_script_double_escaped_lt_state,
3176
- handle_script_double_escaped_end_state,
3177
- handle_before_attr_name_state,
3178
- handle_attr_name_state,
3179
- handle_after_attr_name_state,
3180
- handle_before_attr_value_state,
3181
- handle_attr_value_double_quoted_state,
3182
- handle_attr_value_single_quoted_state,
3183
- handle_attr_value_unquoted_state,
3184
- handle_char_ref_in_attr_value_state,
3185
- handle_after_attr_value_quoted_state,
3186
- handle_self_closing_start_tag_state,
3187
- handle_bogus_comment_state,
3188
- handle_markup_declaration_state,
3189
- handle_comment_start_state,
3190
- handle_comment_start_dash_state,
3191
- handle_comment_state,
3192
- handle_comment_end_dash_state,
3193
- handle_comment_end_state,
3194
- handle_comment_end_bang_state,
3195
- handle_doctype_state,
3196
- handle_before_doctype_name_state,
3197
- handle_doctype_name_state,
3198
- handle_after_doctype_name_state,
3199
- handle_after_doctype_public_keyword_state,
3200
- handle_before_doctype_public_id_state,
3201
- handle_doctype_public_id_double_quoted_state,
3202
- handle_doctype_public_id_single_quoted_state,
3203
- handle_after_doctype_public_id_state,
3204
- handle_between_doctype_public_system_id_state,
3205
- handle_after_doctype_system_keyword_state,
3206
- handle_before_doctype_system_id_state,
3207
- handle_doctype_system_id_double_quoted_state,
3208
- handle_doctype_system_id_single_quoted_state,
3209
- handle_after_doctype_system_id_state,
3210
- handle_bogus_doctype_state,
3211
- handle_cdata_state
3294
+ [GUMBO_LEX_DATA] = handle_data_state,
3295
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3296
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3297
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3298
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3299
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3300
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3301
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3302
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3304
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3305
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3307
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3324
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3325
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3326
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3327
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3328
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3331
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3332
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3333
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3334
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3335
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3336
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3337
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3338
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3339
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3342
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3344
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3345
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3346
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3347
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3348
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3350
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3351
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3353
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3354
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3355
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3356
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3357
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3359
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3360
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3361
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3362
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3364
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3365
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3366
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3367
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3368
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3369
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3370
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3371
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3372
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3373
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3212
3374
  };
3213
3375
 
3214
3376
  bool gumbo_lex(GumboParser* parser, GumboToken* output) {
@@ -3239,12 +3401,14 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3239
3401
  return true;
3240
3402
  }
3241
3403
 
3242
- if (maybe_emit_from_temporary_buffer(parser, output)) {
3404
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3405
+ // Return no error.
3243
3406
  return true;
3244
3407
  }
3245
3408
 
3409
+ tokenizer->_parse_error = false;
3246
3410
  while (1) {
3247
- assert(!tokenizer->_temporary_buffer_emit);
3411
+ assert(!tokenizer->_resume_pos);
3248
3412
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3249
3413
  int c = utf8iterator_current(&tokenizer->_input);
3250
3414
  GumboTokenizerEnum state = tokenizer->_state;
@@ -3255,11 +3419,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3255
3419
  bool should_advance = !tokenizer->_reconsume_current_input;
3256
3420
  tokenizer->_reconsume_current_input = false;
3257
3421
 
3258
- if (result == RETURN_SUCCESS) {
3259
- return true;
3260
- } else if (result == RETURN_ERROR) {
3261
- return false;
3262
- }
3422
+ if (result == EMIT_TOKEN)
3423
+ return !tokenizer->_parse_error;
3263
3424
 
3264
3425
  if (should_advance) {
3265
3426
  utf8iterator_next(&tokenizer->_input);
@@ -3285,12 +3446,16 @@ void gumbo_token_destroy(GumboToken* token) {
3285
3446
  }
3286
3447
  }
3287
3448
  gumbo_free((void*) token->v.start_tag.attributes.data);
3288
- if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3449
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3289
3450
  gumbo_free(token->v.start_tag.name);
3451
+ token->v.start_tag.name = NULL;
3452
+ }
3290
3453
  return;
3291
3454
  case GUMBO_TOKEN_END_TAG:
3292
- if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3455
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3293
3456
  gumbo_free(token->v.end_tag.name);
3457
+ token->v.end_tag.name = NULL;
3458
+ }
3294
3459
  break;
3295
3460
  case GUMBO_TOKEN_COMMENT:
3296
3461
  gumbo_free((void*) token->v.text);