nokogumbo 2.0.0.pre.alpha → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,7 +79,7 @@ void gumbo_string_buffer_append_codepoint (
79
79
  }
80
80
 
81
81
  void gumbo_string_buffer_append_string (
82
- GumboStringPiece* str,
82
+ const GumboStringPiece* str,
83
83
  GumboStringBuffer* output
84
84
  ) {
85
85
  maybe_resize_string_buffer(str->length, output);
@@ -47,7 +47,7 @@ void gumbo_string_buffer_append_codepoint (
47
47
 
48
48
  // Appends a string onto the end of the GumboStringBuffer.
49
49
  void gumbo_string_buffer_append_string (
50
- GumboStringPiece* str,
50
+ const GumboStringPiece* str,
51
51
  GumboStringBuffer* output
52
52
  );
53
53
 
@@ -0,0 +1,79 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #include <assert.h>
18
+
19
+ #include "ascii.h"
20
+ #include "token_buffer.h"
21
+ #include "tokenizer.h"
22
+ #include "util.h"
23
+
24
+ struct GumboInternalCharacterToken {
25
+ GumboSourcePosition position;
26
+ GumboStringPiece original_text;
27
+ int c;
28
+ };
29
+
30
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
31
+ buffer->data = NULL;
32
+ buffer->length = 0;
33
+ buffer->capacity = 0;
34
+ }
35
+
36
+ void gumbo_character_token_buffer_append (
37
+ const GumboToken* token,
38
+ GumboCharacterTokenBuffer* buffer
39
+ ) {
40
+ assert(token->type == GUMBO_TOKEN_WHITESPACE
41
+ || token->type == GUMBO_TOKEN_CHARACTER);
42
+ if (buffer->length == buffer->capacity) {
43
+ if (buffer->capacity == 0)
44
+ buffer->capacity = 10;
45
+ else
46
+ buffer->capacity *= 2;
47
+ size_t bytes = sizeof(*buffer->data) * buffer->capacity;
48
+ buffer->data = gumbo_realloc(buffer->data, bytes);
49
+ }
50
+ size_t index = buffer->length++;
51
+ buffer->data[index].position = token->position;
52
+ buffer->data[index].original_text = token->original_text;
53
+ buffer->data[index].c = token->v.character;
54
+ }
55
+
56
+ void gumbo_character_token_buffer_get (
57
+ const GumboCharacterTokenBuffer* buffer,
58
+ size_t index,
59
+ struct GumboInternalToken* output
60
+ ) {
61
+ assert(index < buffer->length);
62
+ int c = buffer->data[index].c;
63
+ output->type = gumbo_ascii_isspace(c)?
64
+ GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
65
+ output->position = buffer->data[index].position;
66
+ output->original_text = buffer->data[index].original_text;
67
+ output->v.character = c;
68
+ }
69
+
70
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
71
+ buffer->length = 0;
72
+ }
73
+
74
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
75
+ gumbo_free(buffer->data);
76
+ buffer->data = NULL;
77
+ buffer->length = 0;
78
+ buffer->capacity = 0;
79
+ }
@@ -0,0 +1,71 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #ifndef GUMBO_TOKEN_BUFFER_H
18
+ #define GUMBO_TOKEN_BUFFER_H
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalCharacterToken;
30
+ struct GumboInternalToken;
31
+
32
+ // A struct representing a growable sequence of character (and whitespace)
33
+ // tokens.
34
+ typedef struct {
35
+ // A pointer to the start of the sequence.
36
+ struct GumboInternalCharacterToken* data;
37
+
38
+ // The length of the sequence.
39
+ size_t length;
40
+
41
+ // The capacity of the buffer.
42
+ size_t capacity;
43
+ } GumboCharacterTokenBuffer;
44
+
45
+ // Initializes a new GumboCharacterTokenBuffer.
46
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
47
+
48
+ // Appends a character (or whitespace) token.
49
+ void gumbo_character_token_buffer_append (
50
+ const struct GumboInternalToken* token,
51
+ GumboCharacterTokenBuffer* buffer
52
+ );
53
+
54
+ void gumbo_character_token_buffer_get (
55
+ const GumboCharacterTokenBuffer* buffer,
56
+ size_t index,
57
+ struct GumboInternalToken* output
58
+ );
59
+
60
+ // Reinitializes this character token buffer. This clears it by setting length=0. It
61
+ // does not zero out the buffer itself.
62
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
63
+
64
+ // Deallocates this GumboCharacterTokenBuffer.
65
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
66
+
67
+ #ifdef __cplusplus
68
+ }
69
+ #endif
70
+
71
+ #endif // GUMBO_TOKEN_BUFFER_H
@@ -1,5 +1,7 @@
1
1
  /*
2
2
  Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
3
5
 
4
6
  Licensed under the Apache License, Version 2.0 (the "License");
5
7
  you may not use this file except in compliance with the License.
@@ -60,15 +62,18 @@
60
62
  #include "util.h"
61
63
  #include "vector.h"
62
64
 
63
- // Compared against _script_data_buffer to determine if we're in
65
+ // Compared against _temporary_buffer to determine if we're in
64
66
  // double-escaped script mode.
65
67
  static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
66
68
 
67
- // An enum for the return value of each individual state.
69
+ // An enum for the return value of each individual state. Each of the emit_*
70
+ // functions should return EMIT_TOKEN and should be called as
71
+ // return emit_foo(parser, ..., output);
72
+ // Each of the handle_*_state functions that do not return emit_* should
73
+ // instead return CONTINUE to indicate to gumbo_lex to continue lexing.
68
74
  typedef enum {
69
- RETURN_ERROR, // Return false (error) from the tokenizer.
70
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
71
- NEXT_CHAR // Proceed to the next character and continue lexing.
75
+ EMIT_TOKEN,
76
+ CONTINUE,
72
77
  } StateResult;
73
78
 
74
79
  // This is a struct containing state necessary to build up a tag token,
@@ -103,12 +108,6 @@ typedef struct GumboInternalTagState {
103
108
  // the attribute value, but shouldn't overwrite the existing value.
104
109
  bool _drop_next_attr_value;
105
110
 
106
- // The state that caused the tokenizer to switch into a character reference in
107
- // attribute value state. This is used to set the additional allowed
108
- // character, and is switched back to on completion. Initialized as the
109
- // tokenizer enters the character reference state.
110
- GumboTokenizerEnum _attr_value_state;
111
-
112
111
  // The last start tag to have been emitted by the tokenizer. This is
113
112
  // necessary to check for appropriate end tags.
114
113
  GumboTag _last_start_tag;
@@ -133,15 +132,19 @@ typedef struct GumboInternalTokenizerState {
133
132
  // "Reconsume the current input character in..."
134
133
  bool _reconsume_current_input;
135
134
 
136
- // A flag indicating whether the current node is a foreign element. This is
137
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
138
- // markup declaration state.
139
- bool _is_current_node_foreign;
135
+ // A flag indicating whether the adjusted current node is a foreign element.
136
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
137
+ // checked in the markup declaration state.
138
+ bool _is_adjusted_current_node_foreign;
140
139
 
141
140
  // A flag indicating whether the tokenizer is in a CDATA section. If so, then
142
141
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
143
142
  bool _is_in_cdata;
144
143
 
144
+ // A flag indicating whether the tokenizer has seen a parse error since the
145
+ // last token was emitted.
146
+ bool _parse_error;
147
+
145
148
  // Certain states (notably character references) may emit two character tokens
146
149
  // at once, but the contract for lex() fills in only one token at a time. The
147
150
  // extra character is buffered here, and then this is checked on entry to
@@ -159,27 +162,24 @@ typedef struct GumboInternalTokenizerState {
159
162
 
160
163
  // A temporary buffer to accumulate characters, as described by the "temporary
161
164
  // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
162
- // way: we record the specific character to go into the buffer, which may
163
- // sometimes be a lowercased version of the actual input character. However,
164
- // we *also* use utf8iterator_mark() to record the position at tag start.
165
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
166
- // to the start of it, and then increment it for each call to the tokenizer.
167
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
168
- // input stream, so that tokens emitted by emit_char have the correct position
169
- // and original text.
165
+ // way: In situations where the spec calls for inserting characters into the
166
+ // temporary buffer that exactly match the input in order to emit them as
167
+ // character tokens, we don't actually do it.
168
+ // Instead, we mark the input and reset the input to it using set_mark() and
169
+ // emit_from_mark(). We do use the temporary buffer for other uses such as
170
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
170
171
  GumboStringBuffer _temporary_buffer;
171
172
 
172
- // The current cursor position we're emitting from within
173
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
174
- const char* _temporary_buffer_emit;
173
+ // The position to resume normal operation after we start emitting from the
174
+ // mark. NULL whenever we're not emitting from the mark.
175
+ const char* _resume_pos;
175
176
 
176
- // The temporary buffer is also used by the spec to check whether we should
177
- // enter the script data double escaped state, but we can't use the same
178
- // buffer for both because we have to flush out "<s" as emits while still
179
- // maintaining the context that will eventually become "script". This is a
180
- // separate buffer that's used in place of the temporary buffer for states
181
- // that may enter the script data double escape start state.
182
- GumboStringBuffer _script_data_buffer;
177
+ // The character reference state uses a return state to return to the state
178
+ // it was invoked from.
179
+ GumboTokenizerEnum _return_state;
180
+
181
+ // Numeric character reference.
182
+ uint32_t _character_reference_code;
183
183
 
184
184
  // Pointer to the beginning of the current token in the original buffer; used
185
185
  // to record the original text.
@@ -201,123 +201,69 @@ typedef struct GumboInternalTokenizerState {
201
201
  Utf8Iterator _input;
202
202
  } GumboTokenizerState;
203
203
 
204
- // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
204
+ // Adds a parse error to the parser's error struct.
205
205
  static void tokenizer_add_parse_error (
206
206
  GumboParser* parser,
207
207
  GumboErrorType type
208
208
  ) {
209
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
210
+ tokenizer->_parse_error = true;
209
211
  GumboError* error = gumbo_add_error(parser);
210
212
  if (!error) {
211
213
  return;
212
214
  }
215
+ const Utf8Iterator* input = &tokenizer->_input;
216
+ utf8iterator_get_position(input, &error->position);
217
+ error->original_text.data = utf8iterator_get_char_pointer(input);
218
+ error->original_text.length = utf8iterator_get_width(input);
219
+ error->type = type;
220
+ error->v.tokenizer.state = tokenizer->_state;
221
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
222
+ }
223
+
224
+ // Adds an error pointing at the start of the character reference.
225
+ static void tokenizer_add_char_ref_error (
226
+ struct GumboInternalParser* parser,
227
+ GumboErrorType type,
228
+ int codepoint
229
+ ) {
213
230
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
214
- utf8iterator_get_position(&tokenizer->_input, &error->position);
215
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
231
+ tokenizer->_parse_error = true;
232
+ GumboError* error = gumbo_add_error(parser);
233
+ if (!error)
234
+ return;
235
+ Utf8Iterator* input = &tokenizer->_input;
216
236
  error->type = type;
217
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
218
- switch (tokenizer->_state) {
219
- case GUMBO_LEX_DATA:
220
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
221
- break;
222
- case GUMBO_LEX_CHAR_REF_IN_DATA:
223
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
224
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
225
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
226
- break;
227
- case GUMBO_LEX_RCDATA:
228
- case GUMBO_LEX_RCDATA_LT:
229
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
230
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
231
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
232
- break;
233
- case GUMBO_LEX_RAWTEXT:
234
- case GUMBO_LEX_RAWTEXT_LT:
235
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
236
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
237
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
238
- break;
239
- case GUMBO_LEX_PLAINTEXT:
240
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
241
- break;
242
- case GUMBO_LEX_SCRIPT:
243
- case GUMBO_LEX_SCRIPT_LT:
244
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
245
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
246
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
247
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
248
- case GUMBO_LEX_SCRIPT_ESCAPED:
249
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
250
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
251
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
252
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
253
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
254
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
255
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
256
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
257
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
258
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
259
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
260
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
261
- break;
262
- case GUMBO_LEX_TAG_OPEN:
263
- case GUMBO_LEX_END_TAG_OPEN:
264
- case GUMBO_LEX_TAG_NAME:
265
- case GUMBO_LEX_BEFORE_ATTR_NAME:
266
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
267
- break;
268
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
269
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
270
- break;
271
- case GUMBO_LEX_ATTR_NAME:
272
- case GUMBO_LEX_AFTER_ATTR_NAME:
273
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
274
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
275
- break;
276
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
277
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
278
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
279
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
280
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
281
- break;
282
- case GUMBO_LEX_BOGUS_COMMENT:
283
- case GUMBO_LEX_COMMENT_START:
284
- case GUMBO_LEX_COMMENT_START_DASH:
285
- case GUMBO_LEX_COMMENT:
286
- case GUMBO_LEX_COMMENT_END_DASH:
287
- case GUMBO_LEX_COMMENT_END:
288
- case GUMBO_LEX_COMMENT_END_BANG:
289
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
290
- break;
291
- case GUMBO_LEX_MARKUP_DECLARATION:
292
- case GUMBO_LEX_DOCTYPE:
293
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
294
- case GUMBO_LEX_DOCTYPE_NAME:
295
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
296
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
297
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
298
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
299
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
300
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
301
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
302
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
303
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
304
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
305
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
306
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
307
- case GUMBO_LEX_BOGUS_DOCTYPE:
308
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
309
- break;
310
- case GUMBO_LEX_CDATA:
311
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
312
- break;
313
- }
237
+ error->position = utf8iterator_get_mark_position(input);
238
+ const char* mark = utf8iterator_get_mark_pointer(input);
239
+ error->original_text.data = mark;
240
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
241
+ error->v.tokenizer.state = tokenizer->_state;
242
+ error->v.tokenizer.codepoint = codepoint;
243
+ }
244
+
245
+ // Adds an error pointing at the start of the token.
246
+ static void tokenizer_add_token_parse_error (
247
+ GumboParser* parser,
248
+ GumboErrorType type
249
+ ) {
250
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
251
+ tokenizer->_parse_error = true;
252
+ GumboError* error = gumbo_add_error(parser);
253
+ if (!error)
254
+ return;
255
+ Utf8Iterator* input = &tokenizer->_input;
256
+ error->type = type;
257
+ error->position = tokenizer->_token_start_pos;
258
+ error->original_text.data = tokenizer->_token_start;
259
+ error->original_text.length =
260
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
261
+ error->v.tokenizer.state = tokenizer->_state;
262
+ error->v.tokenizer.codepoint = 0;
314
263
  }
315
264
 
316
265
  static bool is_alpha(int c) {
317
- // We don't use the ISO C isalpha() function here because it depends
318
- // on the current locale, whereas the behavior in the HTML5 spec is
319
- // locale-independent.
320
- return ((unsigned) c | 32) - 'a' < 26;
266
+ return gumbo_ascii_isalpha(c);
321
267
  }
322
268
 
323
269
  static int ensure_lowercase(int c) {
@@ -347,24 +293,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
347
293
  }
348
294
 
349
295
  // Starts recording characters in the temporary buffer.
350
- // Because this needs to reset the utf8iterator_mark to the beginning of the
351
- // text that will eventually be emitted, it needs to be called a couple of
352
- // states before the spec says "Set the temporary buffer to the empty string".
353
- // In general, this should be called whenever there's a transition to a
354
- // "less-than sign state". The initial < and possibly / then need to be
355
- // appended to the temporary buffer, their presence needs to be accounted for in
356
- // states that compare the temporary buffer against a literal value, and
357
- // spec stanzas that say "emit a < and / character token along with a character
358
- // token for each character in the temporary buffer" need to be adjusted to
359
- // account for the presence of the < and / inside the temporary buffer.
360
296
  static void clear_temporary_buffer(GumboParser* parser) {
361
297
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
362
- assert(!tokenizer->_temporary_buffer_emit);
363
- utf8iterator_mark(&tokenizer->_input);
364
298
  gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
365
- // The temporary buffer and script data buffer are the same object in the
366
- // spec, so the script data buffer should be cleared as well.
367
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
368
299
  }
369
300
 
370
301
  // Appends a codepoint to the temporary buffer.
@@ -378,25 +309,20 @@ static void append_char_to_temporary_buffer (
378
309
  );
379
310
  }
380
311
 
381
- #ifndef NDEBUG
382
- static bool temporary_buffer_equals__ (
383
- const GumboParser* parser,
384
- const char* text,
385
- size_t text_len
312
+ static void append_string_to_temporary_buffer (
313
+ GumboParser* parser,
314
+ const GumboStringPiece* str
386
315
  ) {
387
- const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
- return
389
- text_len == buf->length
390
- && memcmp(buf->data, text, text_len) == 0;
316
+ gumbo_string_buffer_append_string (
317
+ str,
318
+ &parser->_tokenizer_state->_temporary_buffer
319
+ );
391
320
  }
392
321
 
393
- #define temporary_buffer_equals(parser, text) \
394
- temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
322
 
396
323
  static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
324
  return parser->_tokenizer_state->_temporary_buffer.length == 0;
398
325
  }
399
- #endif
400
326
 
401
327
  static void doc_type_state_init(GumboParser* parser) {
402
328
  GumboTokenDocType* doc_type_state =
@@ -493,56 +419,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
493
419
  }
494
420
 
495
421
  // Writes a single specified character to the output token.
496
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
422
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
497
423
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
498
424
  output->v.character = c;
499
425
  finish_token(parser, output);
426
+ return EMIT_TOKEN;
500
427
  }
501
428
 
502
429
  // Writes a replacement character token and records a parse error.
503
- // Always returns RETURN_ERROR, per gumbo_lex return value.
430
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
504
431
  static StateResult emit_replacement_char(
505
432
  GumboParser* parser, GumboToken* output) {
506
433
  // In all cases, this is because of a null byte in the input stream.
507
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
434
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
508
435
  emit_char(parser, kUtf8ReplacementChar, output);
509
- return RETURN_ERROR;
436
+ return EMIT_TOKEN;
510
437
  }
511
438
 
512
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
439
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
513
440
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
514
- emit_char(parser, -1, output);
515
- return RETURN_SUCCESS;
516
- }
517
-
518
- // Writes the current input character out as a character token.
519
- // Always returns RETURN_SUCCESS.
520
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
521
- emit_char(
522
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
523
- return RETURN_SUCCESS;
441
+ return emit_char(parser, -1, output);
524
442
  }
525
443
 
526
444
  // Writes out a doctype token, copying it from the tokenizer state.
527
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
445
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
528
446
  output->type = GUMBO_TOKEN_DOCTYPE;
529
447
  output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
530
448
  finish_token(parser, output);
531
449
  doc_type_state_init(parser);
450
+ return EMIT_TOKEN;
532
451
  }
533
452
 
534
453
  // Resets the tag name and, in debug builds only, sets the attribute vector data to NULL so
535
454
  // it can be asserted on tag creation, verifying that there are no memory leaks.
536
455
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
456
  UNUSED_IF_NDEBUG(tag_state);
538
- #ifndef NDEBUG
539
457
  tag_state->_name = NULL;
458
+ #ifndef NDEBUG
540
459
  tag_state->_attributes = kGumboEmptyVector;
541
460
  #endif
542
461
  }
543
462
 
544
463
  // Writes out the current tag as a start or end tag token.
545
- // Always returns RETURN_SUCCESS.
464
+ // Always returns EMIT_TOKEN.
546
465
  static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
547
466
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
548
467
  if (tag_state->_is_start_tag) {
@@ -559,7 +478,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
559
478
  output->type = GUMBO_TOKEN_END_TAG;
560
479
  output->v.end_tag.tag = tag_state->_tag;
561
480
  output->v.end_tag.name = tag_state->_name;
562
- output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
481
+ if (tag_state->_is_self_closing)
482
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
483
+ if (tag_state->_attributes.length > 0)
484
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
563
485
  // In end tags, ownership of the attributes vector is not transferred to the
564
486
  // token, but it's still initialized as normal, so it must be manually
565
487
  // deallocated. There may also be attributes to destroy, in certain broken
@@ -582,7 +504,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
582
504
  assert(output->original_text.length >= 2);
583
505
  assert(output->original_text.data[0] == '<');
584
506
  assert(output->original_text.data[output->original_text.length - 1] == '>');
585
- return RETURN_SUCCESS;
507
+ return EMIT_TOKEN;
586
508
  }
587
509
 
588
510
  // In some states, we speculatively start a tag, but don't know whether it'll be
@@ -600,90 +522,59 @@ static void abandon_current_tag(GumboParser* parser) {
600
522
  gumbo_debug("Abandoning current tag.\n");
601
523
  }
602
524
 
603
- // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
605
- // error occurred, RETURN_SUCCESS otherwise.
606
- static StateResult emit_char_ref (
607
- GumboParser* parser,
608
- int additional_allowed_char,
609
- bool UNUSED_ARG(is_in_attribute),
610
- GumboToken* output
611
- ) {
612
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
- OneOrTwoCodepoints char_ref;
614
- bool status = gumbo_consume_char_ref (
615
- parser,
616
- &tokenizer->_input,
617
- additional_allowed_char,
618
- false,
619
- &char_ref
620
- );
621
- if (char_ref.first != kGumboNoChar) {
622
- // gumbo_consume_char_ref ends with the iterator pointing at the next
623
- // character, so we need to be sure not advance it again before
624
- // reading the next token.
625
- tokenizer->_reconsume_current_input = true;
626
- emit_char(parser, char_ref.first, output);
627
- tokenizer->_buffered_emit_char = char_ref.second;
628
- } else {
629
- emit_char(parser, '&', output);
630
- }
631
- return status ? RETURN_SUCCESS : RETURN_ERROR;
632
- }
633
-
634
525
  // Emits a comment token. Comments use the temporary buffer to accumulate their
635
526
  // data, and then it's copied over and released to the 'text' field of the
636
- // GumboToken union. Always returns RETURN_SUCCESS.
527
+ // GumboToken union. Always returns EMIT_TOKEN.
637
528
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
638
529
  output->type = GUMBO_TOKEN_COMMENT;
639
530
  finish_temporary_buffer(parser, &output->v.text);
640
531
  finish_token(parser, output);
641
- return RETURN_SUCCESS;
532
+ return EMIT_TOKEN;
642
533
  }
643
534
 
644
- // Checks to see we should be flushing accumulated characters in the temporary
645
- // buffer, and fills the output token with the next output character if so.
646
- // Returns true if a character has been emitted and the tokenizer should
647
- // immediately return, false if we're at the end of the temporary buffer and
648
- // should resume normal operation.
649
- static bool maybe_emit_from_temporary_buffer(
650
- GumboParser* parser, GumboToken* output) {
535
+ static void set_mark(GumboParser* parser) {
651
536
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
652
- const char* c = tokenizer->_temporary_buffer_emit;
653
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
537
+ utf8iterator_mark(&tokenizer->_input);
538
+ }
654
539
 
655
- if (!c || c >= buffer->data + buffer->length) {
656
- tokenizer->_temporary_buffer_emit = NULL;
657
- return false;
540
+ // Checks to see if we should be emitting characters from the mark, and fills the
541
+ // output token with the next output character if so.
542
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
543
+ // immediately return, CONTINUE if we should resume normal operation.
544
+ static StateResult maybe_emit_from_mark (
545
+ GumboParser* parser,
546
+ GumboToken* output
547
+ ) {
548
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
549
+ const char* pos = tokenizer->_resume_pos;
550
+
551
+ if (!pos)
552
+ return CONTINUE;
553
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
554
+ tokenizer->_resume_pos = NULL;
555
+ return CONTINUE;
658
556
  }
659
557
 
660
- assert(*c == utf8iterator_current(&tokenizer->_input));
661
- // emit_char also advances the input stream. We need to do some juggling of
662
- // the _reconsume_current_input flag to get the proper behavior when emitting
663
- // previous tokens. Basically, _reconsume_current_input should *never* be set
664
- // when emitting anything from the temporary buffer, since those characters
665
- // have already been advanced past. However, it should be preserved so that
666
- // when the *next* character is encountered again, the tokenizer knows not to
667
- // advance past it.
668
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
669
- tokenizer->_reconsume_current_input = false;
670
- emit_char(parser, *c, output);
671
- ++tokenizer->_temporary_buffer_emit;
672
- tokenizer->_reconsume_current_input = saved_reconsume_state;
673
- return true;
558
+ // emit_char advances the input stream. _reconsume_current_input should
559
+ // *never* be set when emitting from the mark since those characters have
560
+ // already been advanced past.
561
+ assert(!tokenizer->_reconsume_current_input);
562
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
674
563
  }
675
564
 
676
- // Sets up the tokenizer to begin flushing the temporary buffer.
677
- // This resets the input iterator stream to the start of the last tag, sets up
678
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
679
- // the first character in it. It returns true if a character was emitted, false
680
- // otherwise.
681
- static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
565
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
566
+ // including, the current code point. This resets the input iterator stream to
567
+ // the mark, sets up _resume_pos, and then emits the first character in it.
568
+ // Returns EMIT_TOKEN.
569
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
682
570
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
683
- assert(tokenizer->_temporary_buffer.data);
571
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
684
572
  utf8iterator_reset(&tokenizer->_input);
685
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
686
- return maybe_emit_from_temporary_buffer(parser, output);
573
+ // Now that we have reset the input, we need to advance through it.
574
+ tokenizer->_reconsume_current_input = false;
575
+ StateResult result = maybe_emit_from_mark(parser, output);
576
+ assert(result == EMIT_TOKEN);
577
+ return result;
687
578
  }
688
579
 
689
580
  // Appends a codepoint to the current tag buffer. If
@@ -703,6 +594,19 @@ static void append_char_to_tag_buffer (
703
594
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
704
595
  }
705
596
 
597
+ // Like append_char_to_tag_buffer above, but appends a whole string piece.
598
+ static void append_string_to_tag_buffer (
599
+ GumboParser* parser,
600
+ GumboStringPiece* str,
601
+ bool reinitilize_position_on_first
602
+ ) {
603
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
604
+ if (buffer->length == 0 && reinitilize_position_on_first) {
605
+ reset_tag_buffer_start_point(parser);
606
+ }
607
+ gumbo_string_buffer_append_string(str, buffer);
608
+ }
609
+
706
610
  // (Re-)initialize the tag buffer. This also resets the original_text pointer
707
611
  // and _start_pos field to point to the current position.
708
612
  static void initialize_tag_buffer(GumboParser* parser) {
@@ -713,6 +617,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
713
617
  reset_tag_buffer_start_point(parser);
714
618
  }
715
619
 
620
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
621
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
622
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
623
+ switch (tokenizer->_return_state) {
624
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
625
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
626
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
627
+ return true;
628
+ default:
629
+ return false;
630
+ }
631
+ }
632
+
633
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
634
+ // For each code point in the temporary buffer, add to the current attribute
635
+ // value if the character reference was consumed as part of an attribute or
636
+ // emit the code point as a character token.
637
+ static StateResult flush_code_points_consumed_as_character_reference (
638
+ GumboParser* parser,
639
+ GumboToken* output
640
+ ) {
641
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
642
+ if (character_reference_part_of_attribute(parser)) {
643
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
644
+ assert(start);
645
+ GumboStringPiece str = {
646
+ .data = start,
647
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
648
+ };
649
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
650
+ append_string_to_tag_buffer(parser, &str, unquoted);
651
+ return CONTINUE;
652
+ }
653
+ return emit_from_mark(parser, output);
654
+ }
655
+
656
+ // After a character reference has been successfully constructed, the standard
657
+ // says to set the temporary buffer equal to the empty string, append the code
658
+ // point(s) associated with the reference and flush code points consumed as a
659
+ // character reference.
660
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
661
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
662
+ // That doesn't work for us because we use the temporary buffer in lock step
663
+ // with the input for position and that would fail if we inserted a different
664
+ // number of code points. So duplicate a bit of the above logic.
665
+ static StateResult flush_char_ref (
666
+ GumboParser* parser,
667
+ int first,
668
+ int second,
669
+ GumboToken* output
670
+ ) {
671
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
672
+ if (character_reference_part_of_attribute(parser)) {
673
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
674
+ append_char_to_tag_buffer(parser, first, unquoted);
675
+ if (second != kGumboNoChar)
676
+ append_char_to_tag_buffer(parser, second, unquoted);
677
+ return CONTINUE;
678
+ }
679
+ tokenizer->_buffered_emit_char = second;
680
+ return emit_char(parser, first, output);
681
+ }
682
+
683
+
716
684
  // Initializes the tag_state to start a new tag, keeping track of the opening
717
685
  // positions and original text. Takes a boolean indicating whether this is a
718
686
  // start or end tag.
@@ -725,7 +693,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
725
693
  assert(is_alpha(c));
726
694
 
727
695
  initialize_tag_buffer(parser);
728
- gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
729
696
 
730
697
  assert(tag_state->_name == NULL);
731
698
  assert(tag_state->_attributes.data == NULL);
@@ -801,23 +768,20 @@ static void finish_tag_name(GumboParser* parser) {
801
768
  }
802
769
 
803
770
  // Adds a GUMBO_ERR_DUPLICATE_ATTRIBUTE parse error to the parser's error struct.
804
- static void add_duplicate_attr_error (
805
- GumboParser* parser,
806
- int original_index,
807
- int new_index
808
- ) {
771
+ static void add_duplicate_attr_error(GumboParser* parser) {
772
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
773
+ tokenizer->_parse_error = true;
809
774
  GumboError* error = gumbo_add_error(parser);
810
775
  if (!error) {
811
776
  return;
812
777
  }
813
778
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
779
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
815
780
  error->position = tag_state->_start_pos;
816
- error->original_text = tag_state->_original_text;
817
- error->v.duplicate_attr.original_index = original_index;
818
- error->v.duplicate_attr.new_index = new_index;
819
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
820
- reinitialize_tag_buffer(parser);
781
+ error->original_text.data = tag_state->_original_text;
782
+ error->original_text.length =
783
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
784
+ error->v.tokenizer.state = tokenizer->_state;
821
785
  }
822
786
 
823
787
  // Creates a new attribute in the current tag, copying the current tag buffer to
@@ -846,7 +810,8 @@ static bool finish_attribute_name(GumboParser* parser) {
846
810
  )
847
811
  ) {
848
812
  // Identical attribute; bail.
849
- add_duplicate_attr_error(parser, i, attributes->length);
813
+ add_duplicate_attr_error(parser);
814
+ reinitialize_tag_buffer(parser);
850
815
  tag_state->_drop_next_attr_value = true;
851
816
  return false;
852
817
  }
@@ -911,19 +876,21 @@ void gumbo_tokenizer_state_init (
911
876
  GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
912
877
  parser->_tokenizer_state = tokenizer;
913
878
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
879
+ tokenizer->_return_state = GUMBO_LEX_DATA;
880
+ tokenizer->_character_reference_code = 0;
914
881
  tokenizer->_reconsume_current_input = false;
915
- tokenizer->_is_current_node_foreign = false;
882
+ tokenizer->_is_adjusted_current_node_foreign = false;
916
883
  tokenizer->_is_in_cdata = false;
884
+ tokenizer->_parse_error = false;
917
885
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
886
  tokenizer->_tag_state._name = NULL;
919
887
 
920
888
  tokenizer->_buffered_emit_char = kGumboNoChar;
921
889
  gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
922
- tokenizer->_temporary_buffer_emit = NULL;
890
+ tokenizer->_resume_pos = NULL;
923
891
 
924
892
  mark_tag_state_as_empty(&tokenizer->_tag_state);
925
893
 
926
- gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
927
894
  tokenizer->_token_start = text;
928
895
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
929
896
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
@@ -936,7 +903,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
936
903
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
937
904
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
938
905
  gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
- gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
906
  assert(tokenizer->_tag_state._name == NULL);
941
907
  assert(tokenizer->_tag_state._attributes.data == NULL);
942
908
  gumbo_free(tokenizer);
@@ -946,17 +912,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
946
912
  parser->_tokenizer_state->_state = state;
947
913
  }
948
914
 
949
- void gumbo_tokenizer_set_is_current_node_foreign (
915
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
950
916
  GumboParser* parser,
951
917
  bool is_foreign
952
918
  ) {
953
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
919
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
954
920
  gumbo_debug (
955
921
  "Toggling is_current_node_foreign to %s.\n",
956
922
  is_foreign ? "true" : "false"
957
923
  );
958
924
  }
959
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
925
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
926
+ }
927
+
928
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
929
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
930
+ tokenizer->_reconsume_current_input = true;
931
+ tokenizer->_state = state;
960
932
  }
961
933
 
962
934
  // https://html.spec.whatwg.org/multipage/parsing.html#data-state
@@ -968,37 +940,24 @@ static StateResult handle_data_state (
968
940
  ) {
969
941
  switch (c) {
970
942
  case '&':
971
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
972
- // The char_ref machinery expects to be on the & so it can mark that
973
- // and return to it if the text isn't a char ref, so we need to
974
- // reconsume it.
975
- tokenizer->_reconsume_current_input = true;
976
- return NEXT_CHAR;
943
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
944
+ set_mark(parser);
945
+ tokenizer->_return_state = GUMBO_LEX_DATA;
946
+ return CONTINUE;
977
947
  case '<':
978
948
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
979
- clear_temporary_buffer(parser);
980
- append_char_to_temporary_buffer(parser, '<');
981
- return NEXT_CHAR;
949
+ set_mark(parser);
950
+ return CONTINUE;
982
951
  case '\0':
983
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
984
- emit_char(parser, c, output);
985
- return RETURN_ERROR;
952
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
953
+ return emit_char(parser, c, output);
954
+ case -1:
955
+ return emit_eof(parser, output);
986
956
  default:
987
- return emit_current_char(parser, output);
957
+ return emit_char(parser, c, output);
988
958
  }
989
959
  }
990
960
 
991
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
- static StateResult handle_char_ref_in_data_state (
993
- GumboParser* parser,
994
- GumboTokenizerState* UNUSED_ARG(tokenizer),
995
- int UNUSED_ARG(c),
996
- GumboToken* output
997
- ) {
998
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
999
- return emit_char_ref(parser, ' ', false, output);
1000
- }
1001
-
1002
961
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
962
  static StateResult handle_rcdata_state (
1004
963
  GumboParser* parser,
@@ -1008,34 +967,23 @@ static StateResult handle_rcdata_state (
1008
967
  ) {
1009
968
  switch (c) {
1010
969
  case '&':
1011
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
1012
- tokenizer->_reconsume_current_input = true;
1013
- return NEXT_CHAR;
970
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
971
+ set_mark(parser);
972
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
973
+ return CONTINUE;
1014
974
  case '<':
1015
975
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
1016
- clear_temporary_buffer(parser);
1017
- append_char_to_temporary_buffer(parser, '<');
1018
- return NEXT_CHAR;
976
+ set_mark(parser);
977
+ return CONTINUE;
1019
978
  case '\0':
1020
979
  return emit_replacement_char(parser, output);
1021
980
  case -1:
1022
981
  return emit_eof(parser, output);
1023
982
  default:
1024
- return emit_current_char(parser, output);
983
+ return emit_char(parser, c, output);
1025
984
  }
1026
985
  }
1027
986
 
1028
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
- static StateResult handle_char_ref_in_rcdata_state (
1030
- GumboParser* parser,
1031
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
- int UNUSED_ARG(c),
1033
- GumboToken* output
1034
- ) {
1035
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1036
- return emit_char_ref(parser, ' ', false, output);
1037
- }
1038
-
1039
987
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
988
  static StateResult handle_rawtext_state (
1041
989
  GumboParser* parser,
@@ -1046,20 +994,19 @@ static StateResult handle_rawtext_state (
1046
994
  switch (c) {
1047
995
  case '<':
1048
996
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
1049
- clear_temporary_buffer(parser);
1050
- append_char_to_temporary_buffer(parser, '<');
1051
- return NEXT_CHAR;
997
+ set_mark(parser);
998
+ return CONTINUE;
1052
999
  case '\0':
1053
1000
  return emit_replacement_char(parser, output);
1054
1001
  case -1:
1055
1002
  return emit_eof(parser, output);
1056
1003
  default:
1057
- return emit_current_char(parser, output);
1004
+ return emit_char(parser, c, output);
1058
1005
  }
1059
1006
  }
1060
1007
 
1061
1008
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
- static StateResult handle_script_state (
1009
+ static StateResult handle_script_data_state (
1063
1010
  GumboParser* parser,
1064
1011
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
1012
  int c,
@@ -1067,16 +1014,15 @@ static StateResult handle_script_state (
1067
1014
  ) {
1068
1015
  switch (c) {
1069
1016
  case '<':
1070
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
1071
- clear_temporary_buffer(parser);
1072
- append_char_to_temporary_buffer(parser, '<');
1073
- return NEXT_CHAR;
1017
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1018
+ set_mark(parser);
1019
+ return CONTINUE;
1074
1020
  case '\0':
1075
1021
  return emit_replacement_char(parser, output);
1076
1022
  case -1:
1077
1023
  return emit_eof(parser, output);
1078
1024
  default:
1079
- return emit_current_char(parser, output);
1025
+ return emit_char(parser, c, output);
1080
1026
  }
1081
1027
  }
1082
1028
 
@@ -1093,75 +1039,75 @@ static StateResult handle_plaintext_state (
1093
1039
  case -1:
1094
1040
  return emit_eof(parser, output);
1095
1041
  default:
1096
- return emit_current_char(parser, output);
1042
+ return emit_char(parser, c, output);
1097
1043
  }
1098
1044
  }
1099
1045
 
1100
1046
  // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
1047
  static StateResult handle_tag_open_state (
1102
1048
  GumboParser* parser,
1103
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1049
+ GumboTokenizerState* tokenizer,
1104
1050
  int c,
1105
1051
  GumboToken* output
1106
1052
  ) {
1107
- assert(temporary_buffer_equals(parser, "<"));
1108
1053
  switch (c) {
1109
1054
  case '!':
1110
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1055
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1111
1056
  clear_temporary_buffer(parser);
1112
- return NEXT_CHAR;
1057
+ return CONTINUE;
1113
1058
  case '/':
1114
1059
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1115
- append_char_to_temporary_buffer(parser, '/');
1116
- return NEXT_CHAR;
1060
+ return CONTINUE;
1117
1061
  case '?':
1118
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1062
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1119
1063
  clear_temporary_buffer(parser);
1120
- append_char_to_temporary_buffer(parser, '?');
1121
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1122
- return NEXT_CHAR;
1064
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1065
+ return CONTINUE;
1066
+ case -1:
1067
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1068
+ // Switch to data to emit EOF.
1069
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1070
+ return emit_from_mark(parser, output);
1123
1071
  default:
1124
1072
  if (is_alpha(c)) {
1125
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1073
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1126
1074
  start_new_tag(parser, true);
1127
- return NEXT_CHAR;
1128
- } else {
1129
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1130
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1131
- emit_temporary_buffer(parser, output);
1132
- return RETURN_ERROR;
1075
+ return CONTINUE;
1133
1076
  }
1077
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1078
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1079
+ return emit_from_mark(parser, output);
1134
1080
  }
1135
1081
  }
1136
1082
 
1137
1083
  // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
1084
  static StateResult handle_end_tag_open_state (
1139
1085
  GumboParser* parser,
1140
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1086
+ GumboTokenizerState* tokenizer,
1141
1087
  int c,
1142
1088
  GumboToken* output
1143
1089
  ) {
1144
- assert(temporary_buffer_equals(parser, "</"));
1145
1090
  switch (c) {
1146
1091
  case '>':
1147
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1092
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1148
1093
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1149
- return NEXT_CHAR;
1094
+ return CONTINUE;
1150
1095
  case -1:
1151
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1152
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1153
- return emit_temporary_buffer(parser, output);
1096
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1097
+ // Similar to the tag open state except we need to emit '<' and '/'
1098
+ // before the EOF.
1099
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1100
+ return emit_from_mark(parser, output);
1154
1101
  default:
1155
1102
  if (is_alpha(c)) {
1156
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1103
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1157
1104
  start_new_tag(parser, false);
1158
1105
  } else {
1159
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1160
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1161
1108
  clear_temporary_buffer(parser);
1162
- append_char_to_temporary_buffer(parser, c);
1163
1109
  }
1164
- return NEXT_CHAR;
1110
+ return CONTINUE;
1165
1111
  }
1166
1112
  }
1167
1113
 
@@ -1179,27 +1125,26 @@ static StateResult handle_tag_name_state (
1179
1125
  case ' ':
1180
1126
  finish_tag_name(parser);
1181
1127
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1182
- return NEXT_CHAR;
1128
+ return CONTINUE;
1183
1129
  case '/':
1184
1130
  finish_tag_name(parser);
1185
1131
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1186
- return NEXT_CHAR;
1132
+ return CONTINUE;
1187
1133
  case '>':
1188
1134
  finish_tag_name(parser);
1189
1135
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1190
1136
  return emit_current_tag(parser, output);
1191
1137
  case '\0':
1192
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1138
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1193
1139
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1194
- return NEXT_CHAR;
1140
+ return CONTINUE;
1195
1141
  case -1:
1196
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1142
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1197
1143
  abandon_current_tag(parser);
1198
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1199
- return NEXT_CHAR;
1144
+ return emit_eof(parser, output);
1200
1145
  default:
1201
1146
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1202
- return NEXT_CHAR;
1147
+ return CONTINUE;
1203
1148
  }
1204
1149
  }
1205
1150
 
@@ -1210,36 +1155,29 @@ static StateResult handle_rcdata_lt_state (
1210
1155
  int c,
1211
1156
  GumboToken* output
1212
1157
  ) {
1213
- assert(temporary_buffer_equals(parser, "<"));
1214
1158
  if (c == '/') {
1215
1159
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1216
- append_char_to_temporary_buffer(parser, '/');
1217
- return NEXT_CHAR;
1160
+ return CONTINUE;
1218
1161
  } else {
1219
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1220
- tokenizer->_reconsume_current_input = true;
1221
- return emit_temporary_buffer(parser, output);
1162
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1163
+ return emit_from_mark(parser, output);
1222
1164
  }
1223
1165
  }
1224
1166
 
1225
1167
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
1168
  static StateResult handle_rcdata_end_tag_open_state (
1227
1169
  GumboParser* parser,
1228
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1170
+ GumboTokenizerState* tokenizer,
1229
1171
  int c,
1230
1172
  GumboToken* output
1231
1173
  ) {
1232
- assert(temporary_buffer_equals(parser, "</"));
1233
1174
  if (is_alpha(c)) {
1234
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1175
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1235
1176
  start_new_tag(parser, false);
1236
- append_char_to_temporary_buffer(parser, c);
1237
- return NEXT_CHAR;
1238
- } else {
1239
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1240
- return emit_temporary_buffer(parser, output);
1177
+ return CONTINUE;
1241
1178
  }
1242
- return true;
1179
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1180
+ return emit_from_mark(parser, output);
1243
1181
  }
1244
1182
 
1245
1183
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
@@ -1250,33 +1188,39 @@ static StateResult handle_rcdata_end_tag_name_state (
1250
1188
  GumboToken* output
1251
1189
  ) {
1252
1190
  UNUSED_IF_NDEBUG(tokenizer);
1253
- assert(tokenizer->_temporary_buffer.length >= 2);
1254
1191
  if (is_alpha(c)) {
1255
1192
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1256
- append_char_to_temporary_buffer(parser, c);
1257
- return NEXT_CHAR;
1258
- } else if (is_appropriate_end_tag(parser)) {
1259
- switch (c) {
1260
- case '\t':
1261
- case '\n':
1262
- case '\f':
1263
- case ' ':
1264
- finish_tag_name(parser);
1265
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1266
- return NEXT_CHAR;
1267
- case '/':
1268
- finish_tag_name(parser);
1269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1270
- return NEXT_CHAR;
1271
- case '>':
1272
- finish_tag_name(parser);
1273
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1274
- return emit_current_tag(parser, output);
1193
+ return CONTINUE;
1194
+ }
1195
+ switch (c) {
1196
+ case '\t':
1197
+ case '\n':
1198
+ case '\f':
1199
+ case ' ':
1200
+ if (is_appropriate_end_tag(parser)) {
1201
+ finish_tag_name(parser);
1202
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1203
+ return CONTINUE;
1204
+ }
1205
+ break;
1206
+ case '/':
1207
+ if (is_appropriate_end_tag(parser)) {
1208
+ finish_tag_name(parser);
1209
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1210
+ return CONTINUE;
1211
+ }
1212
+ break;
1213
+ case '>':
1214
+ if (is_appropriate_end_tag(parser)) {
1215
+ finish_tag_name(parser);
1216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1217
+ return emit_current_tag(parser, output);
1275
1218
  }
1219
+ break;
1276
1220
  }
1277
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1278
1221
  abandon_current_tag(parser);
1279
- return emit_temporary_buffer(parser, output);
1222
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1223
+ return emit_from_mark(parser, output);
1280
1224
  }
1281
1225
 
1282
1226
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
@@ -1286,34 +1230,29 @@ static StateResult handle_rawtext_lt_state (
1286
1230
  int c,
1287
1231
  GumboToken* output
1288
1232
  ) {
1289
- assert(temporary_buffer_equals(parser, "<"));
1290
1233
  if (c == '/') {
1291
1234
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1292
- append_char_to_temporary_buffer(parser, '/');
1293
- return NEXT_CHAR;
1235
+ return CONTINUE;
1294
1236
  } else {
1295
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1296
- tokenizer->_reconsume_current_input = true;
1297
- return emit_temporary_buffer(parser, output);
1237
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1238
+ return emit_from_mark(parser, output);
1298
1239
  }
1299
1240
  }
1300
1241
 
1301
1242
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
1243
  static StateResult handle_rawtext_end_tag_open_state (
1303
1244
  GumboParser* parser,
1304
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1245
+ GumboTokenizerState* tokenizer,
1305
1246
  int c,
1306
1247
  GumboToken* output
1307
1248
  ) {
1308
- assert(temporary_buffer_equals(parser, "</"));
1309
1249
  if (is_alpha(c)) {
1310
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1250
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1311
1251
  start_new_tag(parser, false);
1312
- append_char_to_temporary_buffer(parser, c);
1313
- return NEXT_CHAR;
1252
+ return CONTINUE;
1314
1253
  } else {
1315
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1316
- return emit_temporary_buffer(parser, output);
1254
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1255
+ return emit_from_mark(parser, output);
1317
1256
  }
1318
1257
  }
1319
1258
 
@@ -1324,153 +1263,156 @@ static StateResult handle_rawtext_end_tag_name_state (
1324
1263
  int c,
1325
1264
  GumboToken* output
1326
1265
  ) {
1327
- assert(tokenizer->_temporary_buffer.length >= 2);
1328
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1329
- tokenizer->_tag_state._buffer.data);
1330
1266
  if (is_alpha(c)) {
1331
1267
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1332
- append_char_to_temporary_buffer(parser, c);
1333
- return NEXT_CHAR;
1334
- } else if (is_appropriate_end_tag(parser)) {
1335
- gumbo_debug("Is an appropriate end tag.\n");
1336
- switch (c) {
1337
- case '\t':
1338
- case '\n':
1339
- case '\f':
1340
- case ' ':
1341
- finish_tag_name(parser);
1342
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1343
- return NEXT_CHAR;
1344
- case '/':
1345
- finish_tag_name(parser);
1346
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1347
- return NEXT_CHAR;
1348
- case '>':
1349
- finish_tag_name(parser);
1350
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1351
- return emit_current_tag(parser, output);
1268
+ return CONTINUE;
1269
+ }
1270
+ switch (c) {
1271
+ case '\t':
1272
+ case '\n':
1273
+ case '\f':
1274
+ case ' ':
1275
+ if (is_appropriate_end_tag(parser)) {
1276
+ finish_tag_name(parser);
1277
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1278
+ return CONTINUE;
1279
+ }
1280
+ break;
1281
+ case '/':
1282
+ if (is_appropriate_end_tag(parser)) {
1283
+ finish_tag_name(parser);
1284
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1285
+ return CONTINUE;
1352
1286
  }
1287
+ break;
1288
+ case '>':
1289
+ if (is_appropriate_end_tag(parser)) {
1290
+ finish_tag_name(parser);
1291
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1292
+ return emit_current_tag(parser, output);
1293
+ }
1294
+ break;
1353
1295
  }
1354
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1355
1296
  abandon_current_tag(parser);
1356
- return emit_temporary_buffer(parser, output);
1297
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1298
+ return emit_from_mark(parser, output);
1357
1299
  }
1358
1300
 
1359
1301
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
- static StateResult handle_script_lt_state (
1302
+ static StateResult handle_script_data_lt_state (
1361
1303
  GumboParser* parser,
1362
1304
  GumboTokenizerState* tokenizer,
1363
1305
  int c,
1364
1306
  GumboToken* output
1365
1307
  ) {
1366
- assert(temporary_buffer_equals(parser, "<"));
1367
1308
  if (c == '/') {
1368
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1369
- append_char_to_temporary_buffer(parser, '/');
1370
- return NEXT_CHAR;
1371
- } else if (c == '!') {
1372
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1373
- append_char_to_temporary_buffer(parser, '!');
1374
- return emit_temporary_buffer(parser, output);
1375
- } else {
1376
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1377
- tokenizer->_reconsume_current_input = true;
1378
- return emit_temporary_buffer(parser, output);
1309
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1310
+ return CONTINUE;
1311
+ }
1312
+ if (c == '!') {
1313
+ // This is the only place we don't reconsume the input before emitting the
1314
+ // temporary buffer. Since the current position is stored and the current
1315
+ // character is not emitted, we need to advance the input and then
1316
+ // reconsume.
1317
+ utf8iterator_next(&tokenizer->_input);
1318
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1319
+ return emit_from_mark(parser, output);
1379
1320
  }
1321
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1322
+ return emit_from_mark(parser, output);
1380
1323
  }
1381
1324
 
1382
1325
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
- static StateResult handle_script_end_tag_open_state (
1326
+ static StateResult handle_script_data_end_tag_open_state (
1384
1327
  GumboParser* parser,
1385
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1328
+ GumboTokenizerState* tokenizer,
1386
1329
  int c,
1387
1330
  GumboToken* output
1388
1331
  ) {
1389
- assert(temporary_buffer_equals(parser, "</"));
1390
1332
  if (is_alpha(c)) {
1391
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1333
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1392
1334
  start_new_tag(parser, false);
1393
- append_char_to_temporary_buffer(parser, c);
1394
- return NEXT_CHAR;
1395
- } else {
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
- return emit_temporary_buffer(parser, output);
1335
+ return CONTINUE;
1398
1336
  }
1337
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1338
+ return emit_from_mark(parser, output);
1399
1339
  }
1400
1340
 
1401
1341
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
- static StateResult handle_script_end_tag_name_state (
1342
+ static StateResult handle_script_data_end_tag_name_state (
1403
1343
  GumboParser* parser,
1404
1344
  GumboTokenizerState* tokenizer,
1405
1345
  int c,
1406
1346
  GumboToken* output
1407
1347
  ) {
1408
- UNUSED_IF_NDEBUG(tokenizer);
1409
- assert(tokenizer->_temporary_buffer.length >= 2);
1410
1348
  if (is_alpha(c)) {
1411
1349
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1412
- append_char_to_temporary_buffer(parser, c);
1413
- return NEXT_CHAR;
1414
- } else if (is_appropriate_end_tag(parser)) {
1415
- switch (c) {
1416
- case '\t':
1417
- case '\n':
1418
- case '\f':
1419
- case ' ':
1420
- finish_tag_name(parser);
1421
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1422
- return NEXT_CHAR;
1423
- case '/':
1424
- finish_tag_name(parser);
1425
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1426
- return NEXT_CHAR;
1427
- case '>':
1428
- finish_tag_name(parser);
1429
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1430
- return emit_current_tag(parser, output);
1350
+ return CONTINUE;
1351
+ }
1352
+ switch (c) {
1353
+ case '\t':
1354
+ case '\n':
1355
+ case '\f':
1356
+ case ' ':
1357
+ if (is_appropriate_end_tag(parser)) {
1358
+ finish_tag_name(parser);
1359
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1360
+ return CONTINUE;
1361
+ }
1362
+ break;
1363
+ case '/':
1364
+ if (is_appropriate_end_tag(parser)) {
1365
+ finish_tag_name(parser);
1366
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1367
+ return CONTINUE;
1368
+ }
1369
+ break;
1370
+ case '>':
1371
+ if (is_appropriate_end_tag(parser)) {
1372
+ finish_tag_name(parser);
1373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1374
+ return emit_current_tag(parser, output);
1431
1375
  }
1376
+ break;
1432
1377
  }
1433
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1434
1378
  abandon_current_tag(parser);
1435
- return emit_temporary_buffer(parser, output);
1379
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1380
+ return emit_from_mark(parser, output);
1436
1381
  }
1437
1382
 
1438
1383
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
- static StateResult handle_script_escaped_start_state (
1384
+ static StateResult handle_script_data_escaped_start_state (
1440
1385
  GumboParser* parser,
1441
1386
  GumboTokenizerState* tokenizer,
1442
1387
  int c,
1443
1388
  GumboToken* output
1444
1389
  ) {
1445
1390
  if (c == '-') {
1446
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1447
- return emit_current_char(parser, output);
1448
- } else {
1449
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1450
- tokenizer->_reconsume_current_input = true;
1451
- return NEXT_CHAR;
1391
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1392
+ return emit_char(parser, c, output);
1452
1393
  }
1394
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1395
+ return CONTINUE;
1453
1396
  }
1454
1397
 
1455
1398
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
- static StateResult handle_script_escaped_start_dash_state (
1399
+ static StateResult handle_script_data_escaped_start_dash_state (
1457
1400
  GumboParser* parser,
1458
1401
  GumboTokenizerState* tokenizer,
1459
1402
  int c,
1460
1403
  GumboToken* output
1461
1404
  ) {
1462
1405
  if (c == '-') {
1463
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1464
- return emit_current_char(parser, output);
1406
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1407
+ return emit_char(parser, c, output);
1465
1408
  } else {
1466
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1467
- tokenizer->_reconsume_current_input = true;
1468
- return NEXT_CHAR;
1409
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1410
+ return CONTINUE;
1469
1411
  }
1470
1412
  }
1471
1413
 
1472
1414
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
- static StateResult handle_script_escaped_state (
1415
+ static StateResult handle_script_data_escaped_state (
1474
1416
  GumboParser* parser,
1475
1417
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
1418
  int c,
@@ -1478,25 +1420,25 @@ static StateResult handle_script_escaped_state (
1478
1420
  ) {
1479
1421
  switch (c) {
1480
1422
  case '-':
1481
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1482
- return emit_current_char(parser, output);
1423
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1424
+ return emit_char(parser, c, output);
1483
1425
  case '<':
1484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1426
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1485
1427
  clear_temporary_buffer(parser);
1486
- append_char_to_temporary_buffer(parser, c);
1487
- return NEXT_CHAR;
1428
+ set_mark(parser);
1429
+ return CONTINUE;
1488
1430
  case '\0':
1489
1431
  return emit_replacement_char(parser, output);
1490
1432
  case -1:
1491
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1433
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1492
1434
  return emit_eof(parser, output);
1493
1435
  default:
1494
- return emit_current_char(parser, output);
1436
+ return emit_char(parser, c, output);
1495
1437
  }
1496
1438
  }
1497
1439
 
1498
1440
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
- static StateResult handle_script_escaped_dash_state (
1441
+ static StateResult handle_script_data_escaped_dash_state (
1500
1442
  GumboParser* parser,
1501
1443
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
1444
  int c,
@@ -1504,28 +1446,27 @@ static StateResult handle_script_escaped_dash_state (
1504
1446
  ) {
1505
1447
  switch (c) {
1506
1448
  case '-':
1507
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1508
- return emit_current_char(parser, output);
1449
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1450
+ return emit_char(parser, c, output);
1509
1451
  case '<':
1510
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1452
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1511
1453
  clear_temporary_buffer(parser);
1512
- append_char_to_temporary_buffer(parser, c);
1513
- return NEXT_CHAR;
1454
+ set_mark(parser);
1455
+ return CONTINUE;
1514
1456
  case '\0':
1515
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1457
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1516
1458
  return emit_replacement_char(parser, output);
1517
1459
  case -1:
1518
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1519
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1520
- return NEXT_CHAR;
1460
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1461
+ return emit_eof(parser, output);
1521
1462
  default:
1522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1523
- return emit_current_char(parser, output);
1463
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1464
+ return emit_char(parser, c, output);
1524
1465
  }
1525
1466
  }
1526
1467
 
1527
1468
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
- static StateResult handle_script_escaped_dash_dash_state (
1469
+ static StateResult handle_script_data_escaped_dash_dash_state (
1529
1470
  GumboParser* parser,
1530
1471
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
1472
  int c,
@@ -1533,113 +1474,107 @@ static StateResult handle_script_escaped_dash_dash_state (
1533
1474
  ) {
1534
1475
  switch (c) {
1535
1476
  case '-':
1536
- return emit_current_char(parser, output);
1477
+ return emit_char(parser, c, output);
1537
1478
  case '<':
1538
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1479
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1539
1480
  clear_temporary_buffer(parser);
1540
- append_char_to_temporary_buffer(parser, c);
1541
- return NEXT_CHAR;
1481
+ set_mark(parser);
1482
+ return CONTINUE;
1542
1483
  case '>':
1543
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
- return emit_current_char(parser, output);
1484
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1485
+ return emit_char(parser, c, output);
1545
1486
  case '\0':
1546
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1487
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1547
1488
  return emit_replacement_char(parser, output);
1548
1489
  case -1:
1549
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
- return NEXT_CHAR;
1490
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1491
+ return emit_eof(parser, output);
1552
1492
  default:
1553
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1554
- return emit_current_char(parser, output);
1493
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1494
+ return emit_char(parser, c, output);
1555
1495
  }
1556
1496
  }
1557
1497
 
1558
1498
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
- static StateResult handle_script_escaped_lt_state (
1499
+ static StateResult handle_script_data_escaped_lt_state (
1560
1500
  GumboParser* parser,
1561
1501
  GumboTokenizerState* tokenizer,
1562
1502
  int c,
1563
1503
  GumboToken* output
1564
1504
  ) {
1565
- assert(temporary_buffer_equals(parser, "<"));
1566
- assert(!tokenizer->_script_data_buffer.length);
1505
+ assert(temporary_buffer_is_empty(parser));
1567
1506
  if (c == '/') {
1568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1569
- append_char_to_temporary_buffer(parser, c);
1570
- return NEXT_CHAR;
1571
- } else if (is_alpha(c)) {
1572
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1573
- append_char_to_temporary_buffer(parser, c);
1574
- gumbo_string_buffer_append_codepoint (
1575
- ensure_lowercase(c),
1576
- &tokenizer->_script_data_buffer
1577
- );
1578
- return emit_temporary_buffer(parser, output);
1579
- } else {
1580
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1581
- return emit_temporary_buffer(parser, output);
1507
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1508
+ return CONTINUE;
1509
+ }
1510
+ if (is_alpha(c)) {
1511
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1512
+ return emit_from_mark(parser, output);
1582
1513
  }
1514
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1515
+ return emit_from_mark(parser, output);
1583
1516
  }
1584
1517
 
1585
1518
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
- static StateResult handle_script_escaped_end_tag_open_state (
1519
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1587
1520
  GumboParser* parser,
1588
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1521
+ GumboTokenizerState* tokenizer,
1589
1522
  int c,
1590
1523
  GumboToken* output
1591
1524
  ) {
1592
- assert(temporary_buffer_equals(parser, "</"));
1593
1525
  if (is_alpha(c)) {
1594
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1526
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1595
1527
  start_new_tag(parser, false);
1596
- append_char_to_temporary_buffer(parser, c);
1597
- return NEXT_CHAR;
1598
- } else {
1599
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1600
- return emit_temporary_buffer(parser, output);
1528
+ return CONTINUE;
1601
1529
  }
1530
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1531
+ return emit_from_mark(parser, output);
1602
1532
  }
1603
1533
 
1604
1534
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
- static StateResult handle_script_escaped_end_tag_name_state (
1535
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1606
1536
  GumboParser* parser,
1607
1537
  GumboTokenizerState* tokenizer,
1608
1538
  int c,
1609
1539
  GumboToken* output
1610
1540
  ) {
1611
- UNUSED_IF_NDEBUG(tokenizer);
1612
- assert(tokenizer->_temporary_buffer.length >= 2);
1613
1541
  if (is_alpha(c)) {
1614
1542
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1615
- append_char_to_temporary_buffer(parser, c);
1616
- return NEXT_CHAR;
1617
- } else if (is_appropriate_end_tag(parser)) {
1618
- switch (c) {
1619
- case '\t':
1620
- case '\n':
1621
- case '\f':
1622
- case ' ':
1623
- finish_tag_name(parser);
1624
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1625
- return NEXT_CHAR;
1626
- case '/':
1627
- finish_tag_name(parser);
1628
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1629
- return NEXT_CHAR;
1630
- case '>':
1631
- finish_tag_name(parser);
1632
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1633
- return emit_current_tag(parser, output);
1543
+ return CONTINUE;
1544
+ }
1545
+ switch (c) {
1546
+ case '\t':
1547
+ case '\n':
1548
+ case '\f':
1549
+ case ' ':
1550
+ if (is_appropriate_end_tag(parser)) {
1551
+ finish_tag_name(parser);
1552
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1553
+ return CONTINUE;
1554
+ }
1555
+ break;
1556
+ case '/':
1557
+ if (is_appropriate_end_tag(parser)) {
1558
+ finish_tag_name(parser);
1559
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1560
+ return CONTINUE;
1561
+ }
1562
+ break;
1563
+ case '>':
1564
+ if (is_appropriate_end_tag(parser)) {
1565
+ finish_tag_name(parser);
1566
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1567
+ return emit_current_tag(parser, output);
1634
1568
  }
1569
+ break;
1635
1570
  }
1636
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1637
1571
  abandon_current_tag(parser);
1638
- return emit_temporary_buffer(parser, output);
1572
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1573
+ return emit_from_mark(parser, output);
1639
1574
  }
1640
1575
 
1641
1576
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
- static StateResult handle_script_double_escaped_start_state (
1577
+ static StateResult handle_script_data_double_escaped_start_state (
1643
1578
  GumboParser* parser,
1644
1579
  GumboTokenizerState* tokenizer,
1645
1580
  int c,
@@ -1656,29 +1591,23 @@ static StateResult handle_script_double_escaped_start_state (
1656
1591
  parser,
1657
1592
  gumbo_string_equals (
1658
1593
  &kScriptTag,
1659
- (GumboStringPiece*) &tokenizer->_script_data_buffer
1594
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1660
1595
  )
1661
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
- : GUMBO_LEX_SCRIPT_ESCAPED
1596
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1597
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1663
1598
  );
1664
- return emit_current_char(parser, output);
1665
- default:
1666
- if (is_alpha(c)) {
1667
- gumbo_string_buffer_append_codepoint (
1668
- ensure_lowercase(c),
1669
- &tokenizer->_script_data_buffer
1670
- );
1671
- return emit_current_char(parser, output);
1672
- } else {
1673
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1674
- tokenizer->_reconsume_current_input = true;
1675
- return NEXT_CHAR;
1676
- }
1599
+ return emit_char(parser, c, output);
1600
+ }
1601
+ if (is_alpha(c)) {
1602
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1603
+ return emit_char(parser, c, output);
1677
1604
  }
1605
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1606
+ return CONTINUE;
1678
1607
  }
1679
1608
 
1680
1609
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
- static StateResult handle_script_double_escaped_state (
1610
+ static StateResult handle_script_data_double_escaped_state (
1682
1611
  GumboParser* parser,
1683
1612
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
1613
  int c,
@@ -1686,24 +1615,23 @@ static StateResult handle_script_double_escaped_state (
1686
1615
  ) {
1687
1616
  switch (c) {
1688
1617
  case '-':
1689
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1690
- return emit_current_char(parser, output);
1618
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1619
+ return emit_char(parser, c, output);
1691
1620
  case '<':
1692
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1693
- return emit_current_char(parser, output);
1621
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1622
+ return emit_char(parser, c, output);
1694
1623
  case '\0':
1695
1624
  return emit_replacement_char(parser, output);
1696
1625
  case -1:
1697
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1698
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
- return NEXT_CHAR;
1626
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1627
+ return emit_eof(parser, output);
1700
1628
  default:
1701
- return emit_current_char(parser, output);
1629
+ return emit_char(parser, c, output);
1702
1630
  }
1703
1631
  }
1704
1632
 
1705
1633
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
- static StateResult handle_script_double_escaped_dash_state (
1634
+ static StateResult handle_script_data_double_escaped_dash_state (
1707
1635
  GumboParser* parser,
1708
1636
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
1637
  int c,
@@ -1712,26 +1640,25 @@ static StateResult handle_script_double_escaped_dash_state (
1712
1640
  switch (c) {
1713
1641
  case '-':
1714
1642
  gumbo_tokenizer_set_state(
1715
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1716
- return emit_current_char(parser, output);
1643
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1644
+ return emit_char(parser, c, output);
1717
1645
  case '<':
1718
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1719
- return emit_current_char(parser, output);
1646
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1647
+ return emit_char(parser, c, output);
1720
1648
  case '\0':
1721
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1649
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1722
1650
  return emit_replacement_char(parser, output);
1723
1651
  case -1:
1724
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1726
- return NEXT_CHAR;
1652
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1653
+ return emit_eof(parser, output);
1727
1654
  default:
1728
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1729
- return emit_current_char(parser, output);
1655
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1656
+ return emit_char(parser, c, output);
1730
1657
  }
1731
1658
  }
1732
1659
 
1733
1660
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
- static StateResult handle_script_double_escaped_dash_dash_state (
1661
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1735
1662
  GumboParser* parser,
1736
1663
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
1664
  int c,
@@ -1739,46 +1666,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
1739
1666
  ) {
1740
1667
  switch (c) {
1741
1668
  case '-':
1742
- return emit_current_char(parser, output);
1669
+ return emit_char(parser, c, output);
1743
1670
  case '<':
1744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1745
- return emit_current_char(parser, output);
1671
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1672
+ return emit_char(parser, c, output);
1746
1673
  case '>':
1747
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1748
- return emit_current_char(parser, output);
1674
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1675
+ return emit_char(parser, c, output);
1749
1676
  case '\0':
1750
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1677
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1751
1678
  return emit_replacement_char(parser, output);
1752
1679
  case -1:
1753
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1754
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1755
- return NEXT_CHAR;
1680
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1681
+ return emit_eof(parser, output);
1756
1682
  default:
1757
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1758
- return emit_current_char(parser, output);
1683
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1684
+ return emit_char(parser, c, output);
1759
1685
  }
1760
1686
  }
1761
1687
 
1762
1688
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
- static StateResult handle_script_double_escaped_lt_state (
1689
+ static StateResult handle_script_data_double_escaped_lt_state (
1764
1690
  GumboParser* parser,
1765
1691
  GumboTokenizerState* tokenizer,
1766
1692
  int c,
1767
1693
  GumboToken* output
1768
1694
  ) {
1769
1695
  if (c == '/') {
1770
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1771
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1772
- return emit_current_char(parser, output);
1696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1697
+ clear_temporary_buffer(parser);
1698
+ return emit_char(parser, c, output);
1773
1699
  } else {
1774
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1775
- tokenizer->_reconsume_current_input = true;
1776
- return NEXT_CHAR;
1700
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1701
+ return CONTINUE;
1777
1702
  }
1778
1703
  }
1779
1704
 
1780
1705
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
- static StateResult handle_script_double_escaped_end_state (
1706
+ static StateResult handle_script_data_double_escaped_end_state (
1782
1707
  GumboParser* parser,
1783
1708
  GumboTokenizerState* tokenizer,
1784
1709
  int c,
@@ -1793,29 +1718,23 @@ static StateResult handle_script_double_escaped_end_state (
1793
1718
  case '>':
1794
1719
  gumbo_tokenizer_set_state(
1795
1720
  parser, gumbo_string_equals(&kScriptTag,
1796
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1797
- ? GUMBO_LEX_SCRIPT_ESCAPED
1798
- : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1799
- return emit_current_char(parser, output);
1800
- default:
1801
- if (is_alpha(c)) {
1802
- gumbo_string_buffer_append_codepoint (
1803
- ensure_lowercase(c),
1804
- &tokenizer->_script_data_buffer
1805
- );
1806
- return emit_current_char(parser, output);
1807
- } else {
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1809
- tokenizer->_reconsume_current_input = true;
1810
- return NEXT_CHAR;
1811
- }
1721
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1722
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1723
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1724
+ return emit_char(parser, c, output);
1725
+ }
1726
+ if (is_alpha(c)) {
1727
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1728
+ return emit_char(parser, c, output);
1812
1729
  }
1730
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1731
+ return CONTINUE;
1813
1732
  }
1814
1733
 
1815
1734
  // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
1735
  static StateResult handle_before_attr_name_state (
1817
1736
  GumboParser* parser,
1818
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
+ GumboTokenizerState* tokenizer,
1819
1738
  int c,
1820
1739
  GumboToken* output
1821
1740
  ) {
@@ -1824,40 +1743,27 @@ static StateResult handle_before_attr_name_state (
1824
1743
  case '\n':
1825
1744
  case '\f':
1826
1745
  case ' ':
1827
- return NEXT_CHAR;
1746
+ return CONTINUE;
1828
1747
  case '/':
1829
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1830
- return NEXT_CHAR;
1831
1748
  case '>':
1832
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1833
- return emit_current_tag(parser, output);
1834
- case '\0':
1835
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1836
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1837
- append_char_to_temporary_buffer(parser, 0xfffd);
1838
- return NEXT_CHAR;
1839
1749
  case -1:
1840
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1841
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1842
- abandon_current_tag(parser);
1843
- return NEXT_CHAR;
1844
- case '"':
1845
- case '\'':
1846
- case '<':
1750
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1751
+ return CONTINUE;
1847
1752
  case '=':
1848
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1849
- // Fall through.
1850
- default:
1753
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1851
1754
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1852
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1853
- return NEXT_CHAR;
1755
+ append_char_to_tag_buffer(parser, c, true);
1756
+ return CONTINUE;
1757
+ default:
1758
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1759
+ return CONTINUE;
1854
1760
  }
1855
1761
  }
1856
1762
 
1857
1763
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
1764
  static StateResult handle_attr_name_state (
1859
1765
  GumboParser* parser,
1860
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1766
+ GumboTokenizerState* tokenizer,
1861
1767
  int c,
1862
1768
  GumboToken* output
1863
1769
  ) {
@@ -1866,45 +1772,35 @@ static StateResult handle_attr_name_state (
1866
1772
  case '\n':
1867
1773
  case '\f':
1868
1774
  case ' ':
1869
- finish_attribute_name(parser);
1870
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1871
- return NEXT_CHAR;
1872
1775
  case '/':
1776
+ case '>':
1777
+ case -1:
1873
1778
  finish_attribute_name(parser);
1874
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1875
- return NEXT_CHAR;
1779
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1780
+ return CONTINUE;
1876
1781
  case '=':
1877
1782
  finish_attribute_name(parser);
1878
1783
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1879
- return NEXT_CHAR;
1880
- case '>':
1881
- finish_attribute_name(parser);
1882
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
- return emit_current_tag(parser, output);
1784
+ return CONTINUE;
1884
1785
  case '\0':
1885
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1786
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1886
1787
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1887
- return NEXT_CHAR;
1888
- case -1:
1889
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
- abandon_current_tag(parser);
1891
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1892
- return NEXT_CHAR;
1788
+ return CONTINUE;
1893
1789
  case '"':
1894
1790
  case '\'':
1895
1791
  case '<':
1896
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1792
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1897
1793
  // Fall through.
1898
1794
  default:
1899
1795
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1900
- return NEXT_CHAR;
1796
+ return CONTINUE;
1901
1797
  }
1902
1798
  }
1903
1799
 
1904
1800
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
1801
  static StateResult handle_after_attr_name_state (
1906
1802
  GumboParser* parser,
1907
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1803
+ GumboTokenizerState* tokenizer,
1908
1804
  int c,
1909
1805
  GumboToken* output
1910
1806
  ) {
@@ -1913,35 +1809,23 @@ static StateResult handle_after_attr_name_state (
1913
1809
  case '\n':
1914
1810
  case '\f':
1915
1811
  case ' ':
1916
- return NEXT_CHAR;
1812
+ return CONTINUE;
1917
1813
  case '/':
1918
1814
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1919
- return NEXT_CHAR;
1815
+ return CONTINUE;
1920
1816
  case '=':
1921
1817
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1922
- return NEXT_CHAR;
1818
+ return CONTINUE;
1923
1819
  case '>':
1924
1820
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1925
1821
  return emit_current_tag(parser, output);
1926
- case '\0':
1927
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1928
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1929
- append_char_to_temporary_buffer(parser, 0xfffd);
1930
- return NEXT_CHAR;
1931
1822
  case -1:
1932
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1823
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1934
1824
  abandon_current_tag(parser);
1935
- return NEXT_CHAR;
1936
- case '"':
1937
- case '\'':
1938
- case '<':
1939
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1940
- // Fall through.
1825
+ return emit_eof(parser, output);
1941
1826
  default:
1942
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1943
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1944
- return NEXT_CHAR;
1827
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1828
+ return CONTINUE;
1945
1829
  }
1946
1830
  }
1947
1831
 
@@ -1957,45 +1841,22 @@ static StateResult handle_before_attr_value_state (
1957
1841
  case '\n':
1958
1842
  case '\f':
1959
1843
  case ' ':
1960
- return NEXT_CHAR;
1844
+ return CONTINUE;
1961
1845
  case '"':
1962
1846
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1963
1847
  reset_tag_buffer_start_point(parser);
1964
- return NEXT_CHAR;
1965
- case '&':
1966
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1967
- tokenizer->_reconsume_current_input = true;
1968
- return NEXT_CHAR;
1848
+ return CONTINUE;
1969
1849
  case '\'':
1970
1850
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1971
1851
  reset_tag_buffer_start_point(parser);
1972
- return NEXT_CHAR;
1973
- case '\0':
1974
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1975
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1976
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1977
- return NEXT_CHAR;
1978
- case -1:
1979
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1980
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1981
- abandon_current_tag(parser);
1982
- tokenizer->_reconsume_current_input = true;
1983
- return NEXT_CHAR;
1852
+ return CONTINUE;
1984
1853
  case '>':
1985
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1854
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1986
1855
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1987
- emit_current_tag(parser, output);
1988
- return RETURN_ERROR;
1989
- case '<':
1990
- case '=':
1991
- case '`':
1992
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1993
- // Fall through.
1994
- default:
1995
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1996
- append_char_to_tag_buffer(parser, c, true);
1997
- return NEXT_CHAR;
1856
+ return emit_current_tag(parser, output);
1998
1857
  }
1858
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1859
+ return CONTINUE;
1999
1860
  }
2000
1861
 
2001
1862
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
@@ -2003,30 +1864,28 @@ static StateResult handle_attr_value_double_quoted_state (
2003
1864
  GumboParser* parser,
2004
1865
  GumboTokenizerState* tokenizer,
2005
1866
  int c,
2006
- GumboToken* UNUSED_ARG(output)
1867
+ GumboToken* output
2007
1868
  ) {
2008
1869
  switch (c) {
2009
1870
  case '"':
2010
1871
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2011
- return NEXT_CHAR;
1872
+ return CONTINUE;
2012
1873
  case '&':
2013
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2014
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2015
- tokenizer->_reconsume_current_input = true;
2016
- return NEXT_CHAR;
1874
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1875
+ set_mark(parser);
1876
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1877
+ return CONTINUE;
2017
1878
  case '\0':
2018
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1879
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2019
1880
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2020
- return NEXT_CHAR;
1881
+ return CONTINUE;
2021
1882
  case -1:
2022
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2024
1884
  abandon_current_tag(parser);
2025
- tokenizer->_reconsume_current_input = true;
2026
- return NEXT_CHAR;
1885
+ return emit_eof(parser, output);
2027
1886
  default:
2028
1887
  append_char_to_tag_buffer(parser, c, false);
2029
- return NEXT_CHAR;
1888
+ return CONTINUE;
2030
1889
  }
2031
1890
  }
2032
1891
 
@@ -2035,30 +1894,28 @@ static StateResult handle_attr_value_single_quoted_state (
2035
1894
  GumboParser* parser,
2036
1895
  GumboTokenizerState* tokenizer,
2037
1896
  int c,
2038
- GumboToken* UNUSED_ARG(output)
1897
+ GumboToken* output
2039
1898
  ) {
2040
1899
  switch (c) {
2041
1900
  case '\'':
2042
1901
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2043
- return NEXT_CHAR;
1902
+ return CONTINUE;
2044
1903
  case '&':
2045
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2046
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2047
- tokenizer->_reconsume_current_input = true;
2048
- return NEXT_CHAR;
1904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1905
+ set_mark(parser);
1906
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1907
+ return CONTINUE;
2049
1908
  case '\0':
2050
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2051
1910
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2052
- return NEXT_CHAR;
1911
+ return CONTINUE;
2053
1912
  case -1:
2054
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
2055
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1913
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2056
1914
  abandon_current_tag(parser);
2057
- tokenizer->_reconsume_current_input = true;
2058
- return NEXT_CHAR;
1915
+ return emit_eof(parser, output);
2059
1916
  default:
2060
1917
  append_char_to_tag_buffer(parser, c, false);
2061
- return NEXT_CHAR;
1918
+ return CONTINUE;
2062
1919
  }
2063
1920
  }
2064
1921
 
@@ -2076,91 +1933,37 @@ static StateResult handle_attr_value_unquoted_state (
2076
1933
  case ' ':
2077
1934
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2078
1935
  finish_attribute_value(parser);
2079
- return NEXT_CHAR;
1936
+ return CONTINUE;
2080
1937
  case '&':
2081
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2082
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2083
- tokenizer->_reconsume_current_input = true;
2084
- return NEXT_CHAR;
1938
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1939
+ set_mark(parser);
1940
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1941
+ return CONTINUE;
2085
1942
  case '>':
2086
1943
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2087
1944
  finish_attribute_value(parser);
2088
1945
  return emit_current_tag(parser, output);
2089
1946
  case '\0':
2090
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1947
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2091
1948
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
2092
- return NEXT_CHAR;
1949
+ return CONTINUE;
2093
1950
  case -1:
2094
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
2095
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096
- tokenizer->_reconsume_current_input = true;
1951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2097
1952
  abandon_current_tag(parser);
2098
- return NEXT_CHAR;
2099
- case '<':
2100
- case '=':
1953
+ return emit_eof(parser, output);
2101
1954
  case '"':
2102
1955
  case '\'':
1956
+ case '<':
1957
+ case '=':
2103
1958
  case '`':
2104
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1959
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
2105
1960
  // Fall through.
2106
1961
  default:
2107
1962
  append_char_to_tag_buffer(parser, c, true);
2108
- return NEXT_CHAR;
1963
+ return CONTINUE;
2109
1964
  }
2110
1965
  }
2111
1966
 
2112
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
- static StateResult handle_char_ref_in_attr_value_state (
2114
- GumboParser* parser,
2115
- GumboTokenizerState* tokenizer,
2116
- int UNUSED_ARG(c),
2117
- GumboToken* UNUSED_ARG(output)
2118
- ) {
2119
- OneOrTwoCodepoints char_ref;
2120
- int allowed_char;
2121
- bool is_unquoted = false;
2122
- switch (tokenizer->_tag_state._attr_value_state) {
2123
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
2124
- allowed_char = '"';
2125
- break;
2126
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
2127
- allowed_char = '\'';
2128
- break;
2129
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
2130
- allowed_char = '>';
2131
- is_unquoted = true;
2132
- break;
2133
- default:
2134
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
2135
- // get that the assert(0) means this codepath will never happen.
2136
- allowed_char = ' ';
2137
- assert(0);
2138
- }
2139
-
2140
- // Ignore the status, since we don't have a convenient way of signalling that
2141
- // a parser error has occurred when the error occurs in the middle of a
2142
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2143
- // but that's a low priority fix.
2144
- gumbo_consume_char_ref (
2145
- parser,
2146
- &tokenizer->_input,
2147
- allowed_char,
2148
- true,
2149
- &char_ref
2150
- );
2151
- if (char_ref.first != kGumboNoChar) {
2152
- tokenizer->_reconsume_current_input = true;
2153
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
2154
- if (char_ref.second != kGumboNoChar) {
2155
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
2156
- }
2157
- } else {
2158
- append_char_to_tag_buffer(parser, '&', is_unquoted);
2159
- }
2160
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
2161
- return NEXT_CHAR;
2162
- }
2163
-
2164
1967
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
2165
1968
  static StateResult handle_after_attr_value_quoted_state (
2166
1969
  GumboParser* parser,
@@ -2175,24 +1978,21 @@ static StateResult handle_after_attr_value_quoted_state (
2175
1978
  case '\f':
2176
1979
  case ' ':
2177
1980
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2178
- return NEXT_CHAR;
1981
+ return CONTINUE;
2179
1982
  case '/':
2180
1983
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
2181
- return NEXT_CHAR;
1984
+ return CONTINUE;
2182
1985
  case '>':
2183
1986
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184
1987
  return emit_current_tag(parser, output);
2185
1988
  case -1:
2186
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
2187
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1989
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2188
1990
  abandon_current_tag(parser);
2189
- tokenizer->_reconsume_current_input = true;
2190
- return NEXT_CHAR;
1991
+ return emit_eof(parser, output);
2191
1992
  default:
2192
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
2193
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2194
- tokenizer->_reconsume_current_input = true;
2195
- return NEXT_CHAR;
1993
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1994
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1995
+ return CONTINUE;
2196
1996
  }
2197
1997
  }
2198
1998
 
@@ -2209,15 +2009,13 @@ static StateResult handle_self_closing_start_tag_state (
2209
2009
  tokenizer->_tag_state._is_self_closing = true;
2210
2010
  return emit_current_tag(parser, output);
2211
2011
  case -1:
2212
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
2213
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2012
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2214
2013
  abandon_current_tag(parser);
2215
- return NEXT_CHAR;
2014
+ return emit_eof(parser, output);
2216
2015
  default:
2217
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
2218
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2219
- tokenizer->_reconsume_current_input = true;
2220
- return NEXT_CHAR;
2016
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2017
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2018
+ return CONTINUE;
2221
2019
  }
2222
2020
  }
2223
2021
 
@@ -2228,21 +2026,27 @@ static StateResult handle_bogus_comment_state (
2228
2026
  int c,
2229
2027
  GumboToken* output
2230
2028
  ) {
2231
- while (c != '>' && c != -1) {
2232
- if (c == '\0') {
2233
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
- c = 0xFFFD;
2235
- }
2029
+ switch (c) {
2030
+ case '>':
2031
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2032
+ return emit_comment(parser, output);
2033
+ case -1:
2034
+ // We need to emit the comment and then the EOF, so reconsume in data
2035
+ // state.
2036
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2037
+ return emit_comment(parser, output);
2038
+ case '\0':
2039
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2040
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2041
+ return CONTINUE;
2042
+ default:
2236
2043
  append_char_to_temporary_buffer(parser, c);
2237
- utf8iterator_next(&tokenizer->_input);
2238
- c = utf8iterator_current(&tokenizer->_input);
2044
+ return CONTINUE;
2239
2045
  }
2240
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
- return emit_comment(parser, output);
2242
2046
  }
2243
2047
 
2244
2048
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
- static StateResult handle_markup_declaration_state (
2049
+ static StateResult handle_markup_declaration_open_state (
2246
2050
  GumboParser* parser,
2247
2051
  GumboTokenizerState* tokenizer,
2248
2052
  int UNUSED_ARG(c),
@@ -2253,21 +2057,21 @@ static StateResult handle_markup_declaration_state (
2253
2057
  &tokenizer->_input,
2254
2058
  "--",
2255
2059
  sizeof("--") - 1,
2256
- true
2060
+ /* case sensitive */ true
2257
2061
  )
2258
2062
  ) {
2259
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2260
- tokenizer->_reconsume_current_input = true;
2261
- } else if (
2063
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2064
+ return CONTINUE;
2065
+ }
2066
+ if (
2262
2067
  utf8iterator_maybe_consume_match (
2263
2068
  &tokenizer->_input,
2264
2069
  "DOCTYPE",
2265
2070
  sizeof("DOCTYPE") - 1,
2266
- false
2071
+ /* case sensitive */ false
2267
2072
  )
2268
2073
  ) {
2269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2270
- tokenizer->_reconsume_current_input = true;
2074
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2271
2075
  // If we get here, we know we'll eventually emit a doctype token, so now is
2272
2076
  // the time to initialize the doctype strings. (Not in doctype_state_init,
2273
2077
  // since then they'll leak if ownership never gets transferred to the
@@ -2275,24 +2079,35 @@ static StateResult handle_markup_declaration_state (
2275
2079
  tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
2080
  tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
2081
  tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
- } else if (
2279
- tokenizer->_is_current_node_foreign
2280
- && utf8iterator_maybe_consume_match (
2082
+ return CONTINUE;
2083
+ }
2084
+ if (
2085
+ utf8iterator_maybe_consume_match (
2281
2086
  &tokenizer->_input,
2282
2087
  "[CDATA[", sizeof("[CDATA[") - 1,
2283
- true
2088
+ /* case sensitive */ true
2284
2089
  )
2285
2090
  ) {
2286
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2287
- tokenizer->_is_in_cdata = true;
2288
- tokenizer->_reconsume_current_input = true;
2289
- } else {
2290
- tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2291
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2292
- tokenizer->_reconsume_current_input = true;
2293
- clear_temporary_buffer(parser);
2091
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2092
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2093
+ tokenizer->_is_in_cdata = true;
2094
+ // Start the token after the <![CDATA[.
2095
+ reset_token_start_point(tokenizer);
2096
+ } else {
2097
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2098
+ clear_temporary_buffer(parser);
2099
+ append_string_to_temporary_buffer (
2100
+ parser,
2101
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2102
+ );
2103
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2104
+ }
2105
+ return CONTINUE;
2294
2106
  }
2295
- return NEXT_CHAR;
2107
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2108
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2109
+ clear_temporary_buffer(parser);
2110
+ return CONTINUE;
2296
2111
  }
2297
2112
 
2298
2113
  // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
@@ -2305,26 +2120,14 @@ static StateResult handle_comment_start_state (
2305
2120
  switch (c) {
2306
2121
  case '-':
2307
2122
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2308
- return NEXT_CHAR;
2309
- case '\0':
2310
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2312
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2313
- return NEXT_CHAR;
2123
+ return CONTINUE;
2314
2124
  case '>':
2315
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2125
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2316
2126
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2317
- emit_comment(parser, output);
2318
- return RETURN_ERROR;
2319
- case -1:
2320
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2321
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2322
- emit_comment(parser, output);
2323
- return RETURN_ERROR;
2127
+ return emit_comment(parser, output);
2324
2128
  default:
2325
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2326
- append_char_to_temporary_buffer(parser, c);
2327
- return NEXT_CHAR;
2129
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2130
+ return CONTINUE;
2328
2131
  }
2329
2132
  }
2330
2133
 
@@ -2338,28 +2141,20 @@ static StateResult handle_comment_start_dash_state (
2338
2141
  switch (c) {
2339
2142
  case '-':
2340
2143
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2341
- return NEXT_CHAR;
2342
- case '\0':
2343
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2344
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2345
- append_char_to_temporary_buffer(parser, '-');
2346
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2347
- return NEXT_CHAR;
2144
+ return CONTINUE;
2348
2145
  case '>':
2349
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2146
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2350
2147
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2351
- emit_comment(parser, output);
2352
- return RETURN_ERROR;
2148
+ return emit_comment(parser, output);
2353
2149
  case -1:
2354
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
- emit_comment(parser, output);
2357
- return RETURN_ERROR;
2150
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2151
+ // Switch to data to emit the EOF next.
2152
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2153
+ return emit_comment(parser, output);
2358
2154
  default:
2359
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2155
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2360
2156
  append_char_to_temporary_buffer(parser, '-');
2361
- append_char_to_temporary_buffer(parser, c);
2362
- return NEXT_CHAR;
2157
+ return CONTINUE;
2363
2158
  }
2364
2159
  }
2365
2160
 
@@ -2371,21 +2166,99 @@ static StateResult handle_comment_state (
2371
2166
  GumboToken* output
2372
2167
  ) {
2373
2168
  switch (c) {
2169
+ case '<':
2170
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2171
+ append_char_to_temporary_buffer(parser, c);
2172
+ return CONTINUE;
2374
2173
  case '-':
2375
2174
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2376
- return NEXT_CHAR;
2175
+ return CONTINUE;
2377
2176
  case '\0':
2378
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2177
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2379
2178
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2380
- return NEXT_CHAR;
2179
+ return CONTINUE;
2381
2180
  case -1:
2382
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2383
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2384
- emit_comment(parser, output);
2385
- return RETURN_ERROR;
2181
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2182
+ // Switch to data to emit the EOF token next.
2183
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2184
+ return emit_comment(parser, output);
2386
2185
  default:
2387
2186
  append_char_to_temporary_buffer(parser, c);
2388
- return NEXT_CHAR;
2187
+ return CONTINUE;
2188
+ }
2189
+ }
2190
+
2191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2192
+ static StateResult handle_comment_lt_state (
2193
+ GumboParser* parser,
2194
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2195
+ int c,
2196
+ GumboToken* output
2197
+ ) {
2198
+ switch (c) {
2199
+ case '!':
2200
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2201
+ append_char_to_temporary_buffer(parser, c);
2202
+ return CONTINUE;
2203
+ case '<':
2204
+ append_char_to_temporary_buffer(parser, c);
2205
+ return CONTINUE;
2206
+ default:
2207
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2208
+ return CONTINUE;
2209
+ }
2210
+ }
2211
+
2212
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2213
+ static StateResult handle_comment_lt_bang_state (
2214
+ GumboParser* parser,
2215
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2216
+ int c,
2217
+ GumboToken* output
2218
+ ) {
2219
+ switch (c) {
2220
+ case '-':
2221
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2222
+ return CONTINUE;
2223
+ default:
2224
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2225
+ return CONTINUE;
2226
+ }
2227
+ }
2228
+
2229
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2230
+ static StateResult handle_comment_lt_bang_dash_state (
2231
+ GumboParser* parser,
2232
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2233
+ int c,
2234
+ GumboToken* output
2235
+ ) {
2236
+ switch (c) {
2237
+ case '-':
2238
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2239
+ return CONTINUE;
2240
+ default:
2241
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2242
+ return CONTINUE;
2243
+ }
2244
+ }
2245
+
2246
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2247
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2248
+ GumboParser* parser,
2249
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2250
+ int c,
2251
+ GumboToken* output
2252
+ ) {
2253
+ switch (c) {
2254
+ case '>':
2255
+ case -1:
2256
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2257
+ return CONTINUE;
2258
+ default:
2259
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2260
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2261
+ return CONTINUE;
2389
2262
  }
2390
2263
  }
2391
2264
 
@@ -2397,25 +2270,18 @@ static StateResult handle_comment_end_dash_state (
2397
2270
  GumboToken* output
2398
2271
  ) {
2399
2272
  switch (c) {
2400
- case '-':
2401
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2402
- return NEXT_CHAR;
2403
- case '\0':
2404
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2405
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2406
- append_char_to_temporary_buffer(parser, '-');
2407
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2408
- return NEXT_CHAR;
2409
- case -1:
2410
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2411
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2412
- emit_comment(parser, output);
2413
- return RETURN_ERROR;
2414
- default:
2415
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2416
- append_char_to_temporary_buffer(parser, '-');
2417
- append_char_to_temporary_buffer(parser, c);
2418
- return NEXT_CHAR;
2273
+ case '-':
2274
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2275
+ return CONTINUE;
2276
+ case -1:
2277
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2278
+ // Switch to data to emit EOF next.
2279
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2280
+ return emit_comment(parser, output);
2281
+ default:
2282
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2283
+ append_char_to_temporary_buffer(parser, '-');
2284
+ return CONTINUE;
2419
2285
  }
2420
2286
  }
2421
2287
 
@@ -2430,35 +2296,22 @@ static StateResult handle_comment_end_state (
2430
2296
  case '>':
2431
2297
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2432
2298
  return emit_comment(parser, output);
2433
- case '\0':
2434
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2435
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2436
- append_char_to_temporary_buffer(parser, '-');
2437
- append_char_to_temporary_buffer(parser, '-');
2438
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2439
- return NEXT_CHAR;
2440
2299
  case '!':
2441
- tokenizer_add_parse_error(
2442
- parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2443
2300
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2444
- return NEXT_CHAR;
2301
+ return CONTINUE;
2445
2302
  case '-':
2446
- tokenizer_add_parse_error(
2447
- parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2448
2303
  append_char_to_temporary_buffer(parser, '-');
2449
- return NEXT_CHAR;
2304
+ return CONTINUE;
2450
2305
  case -1:
2451
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2306
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2307
+ // Switch to data to emit EOF next.
2452
2308
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2453
- emit_comment(parser, output);
2454
- return RETURN_ERROR;
2309
+ return emit_comment(parser, output);
2455
2310
  default:
2456
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2457
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2311
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2458
2312
  append_char_to_temporary_buffer(parser, '-');
2459
2313
  append_char_to_temporary_buffer(parser, '-');
2460
- append_char_to_temporary_buffer(parser, c);
2461
- return NEXT_CHAR;
2314
+ return CONTINUE;
2462
2315
  }
2463
2316
  }
2464
2317
 
@@ -2475,30 +2328,22 @@ static StateResult handle_comment_end_bang_state (
2475
2328
  append_char_to_temporary_buffer(parser, '-');
2476
2329
  append_char_to_temporary_buffer(parser, '-');
2477
2330
  append_char_to_temporary_buffer(parser, '!');
2478
- return NEXT_CHAR;
2331
+ return CONTINUE;
2479
2332
  case '>':
2333
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2480
2334
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2481
2335
  return emit_comment(parser, output);
2482
- case '\0':
2483
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2485
- append_char_to_temporary_buffer(parser, '-');
2486
- append_char_to_temporary_buffer(parser, '-');
2487
- append_char_to_temporary_buffer(parser, '!');
2488
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2489
- return NEXT_CHAR;
2490
2336
  case -1:
2491
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2337
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2338
+ // Switch to data to emit EOF next.
2492
2339
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
- emit_comment(parser, output);
2494
- return RETURN_ERROR;
2340
+ return emit_comment(parser, output);
2495
2341
  default:
2496
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2342
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2497
2343
  append_char_to_temporary_buffer(parser, '-');
2498
2344
  append_char_to_temporary_buffer(parser, '-');
2499
2345
  append_char_to_temporary_buffer(parser, '!');
2500
- append_char_to_temporary_buffer(parser, c);
2501
- return NEXT_CHAR;
2346
+ return CONTINUE;
2502
2347
  }
2503
2348
  }
2504
2349
 
@@ -2509,26 +2354,27 @@ static StateResult handle_doctype_state (
2509
2354
  int c,
2510
2355
  GumboToken* output
2511
2356
  ) {
2512
- assert(!tokenizer->_temporary_buffer.length);
2357
+ assert(temporary_buffer_is_empty(parser));
2513
2358
  switch (c) {
2514
2359
  case '\t':
2515
2360
  case '\n':
2516
2361
  case '\f':
2517
2362
  case ' ':
2518
2363
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2519
- return NEXT_CHAR;
2364
+ return CONTINUE;
2365
+ case '>':
2366
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2367
+ return CONTINUE;
2520
2368
  case -1:
2521
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2369
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2523
2370
  tokenizer->_doc_type_state.force_quirks = true;
2524
- emit_doctype(parser, output);
2525
- return RETURN_ERROR;
2371
+ // Switch to data to emit EOF next.
2372
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2373
+ return emit_doctype(parser, output);
2526
2374
  default:
2527
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2528
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2529
- tokenizer->_reconsume_current_input = true;
2530
- tokenizer->_doc_type_state.force_quirks = true;
2531
- return NEXT_CHAR;
2375
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2376
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2377
+ return CONTINUE;
2532
2378
  }
2533
2379
  }
2534
2380
 
@@ -2544,30 +2390,27 @@ static StateResult handle_before_doctype_name_state (
2544
2390
  case '\n':
2545
2391
  case '\f':
2546
2392
  case ' ':
2547
- return NEXT_CHAR;
2393
+ return CONTINUE;
2548
2394
  case '\0':
2549
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2395
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2550
2396
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2551
- tokenizer->_doc_type_state.force_quirks = true;
2552
2397
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2553
- return NEXT_CHAR;
2398
+ return CONTINUE;
2554
2399
  case '>':
2555
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2400
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2556
2401
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2557
2402
  tokenizer->_doc_type_state.force_quirks = true;
2558
- emit_doctype(parser, output);
2559
- return RETURN_ERROR;
2403
+ return emit_doctype(parser, output);
2560
2404
  case -1:
2561
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2562
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2405
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2563
2406
  tokenizer->_doc_type_state.force_quirks = true;
2564
- emit_doctype(parser, output);
2565
- return RETURN_ERROR;
2407
+ // Switch to data to emit EOF next.
2408
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2409
+ return emit_doctype(parser, output);
2566
2410
  default:
2567
2411
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2568
- tokenizer->_doc_type_state.force_quirks = false;
2569
2412
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2570
- return NEXT_CHAR;
2413
+ return CONTINUE;
2571
2414
  }
2572
2415
  }
2573
2416
 
@@ -2586,30 +2429,26 @@ static StateResult handle_doctype_name_state (
2586
2429
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2587
2430
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2588
2431
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2589
- return NEXT_CHAR;
2432
+ return CONTINUE;
2590
2433
  case '>':
2591
2434
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
2435
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2593
2436
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2594
- emit_doctype(parser, output);
2595
- return RETURN_SUCCESS;
2437
+ return emit_doctype(parser, output);
2596
2438
  case '\0':
2597
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2439
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2598
2440
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2599
- return NEXT_CHAR;
2441
+ return CONTINUE;
2600
2442
  case -1:
2601
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2602
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2444
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2603
2445
  tokenizer->_doc_type_state.force_quirks = true;
2604
2446
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2605
2447
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2606
- emit_doctype(parser, output);
2607
- return RETURN_ERROR;
2448
+ return emit_doctype(parser, output);
2608
2449
  default:
2609
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2610
- tokenizer->_doc_type_state.force_quirks = false;
2611
2450
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2612
- return NEXT_CHAR;
2451
+ return CONTINUE;
2613
2452
  }
2614
2453
  }
2615
2454
 
@@ -2625,35 +2464,29 @@ static StateResult handle_after_doctype_name_state (
2625
2464
  case '\n':
2626
2465
  case '\f':
2627
2466
  case ' ':
2628
- return NEXT_CHAR;
2467
+ return CONTINUE;
2629
2468
  case '>':
2630
2469
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2631
- emit_doctype(parser, output);
2632
- return RETURN_SUCCESS;
2470
+ return emit_doctype(parser, output);
2633
2471
  case -1:
2634
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2472
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
2473
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636
2474
  tokenizer->_doc_type_state.force_quirks = true;
2637
- emit_doctype(parser, output);
2638
- return RETURN_ERROR;
2475
+ return emit_doctype(parser, output);
2639
2476
  default:
2640
2477
  if (utf8iterator_maybe_consume_match(
2641
2478
  &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2642
- gumbo_tokenizer_set_state(
2643
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2644
- tokenizer->_reconsume_current_input = true;
2479
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2645
2480
  } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2646
2481
  sizeof("SYSTEM") - 1, false)) {
2647
- gumbo_tokenizer_set_state(
2648
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2649
- tokenizer->_reconsume_current_input = true;
2482
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2650
2483
  } else {
2651
2484
  tokenizer_add_parse_error(
2652
- parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2653
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2485
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2486
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2654
2487
  tokenizer->_doc_type_state.force_quirks = true;
2655
2488
  }
2656
- return NEXT_CHAR;
2489
+ return CONTINUE;
2657
2490
  }
2658
2491
  }
2659
2492
 
@@ -2670,37 +2503,34 @@ static StateResult handle_after_doctype_public_keyword_state (
2670
2503
  case '\f':
2671
2504
  case ' ':
2672
2505
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2673
- return NEXT_CHAR;
2506
+ return CONTINUE;
2674
2507
  case '"':
2675
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2508
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2676
2509
  assert(temporary_buffer_is_empty(parser));
2677
2510
  gumbo_tokenizer_set_state(
2678
2511
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2679
- return NEXT_CHAR;
2512
+ return CONTINUE;
2680
2513
  case '\'':
2681
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2514
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2682
2515
  assert(temporary_buffer_is_empty(parser));
2683
2516
  gumbo_tokenizer_set_state(
2684
2517
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2685
- return NEXT_CHAR;
2518
+ return CONTINUE;
2686
2519
  case '>':
2687
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2520
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2688
2521
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
2522
  tokenizer->_doc_type_state.force_quirks = true;
2690
- emit_doctype(parser, output);
2691
- return RETURN_ERROR;
2523
+ return emit_doctype(parser, output);
2692
2524
  case -1:
2693
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2694
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2525
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2526
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2695
2527
  tokenizer->_doc_type_state.force_quirks = true;
2696
- emit_doctype(parser, output);
2697
- return RETURN_ERROR;
2528
+ return emit_doctype(parser, output);
2698
2529
  default:
2699
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2700
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2530
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2531
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2701
2532
  tokenizer->_doc_type_state.force_quirks = true;
2702
- emit_doctype(parser, output);
2703
- return RETURN_ERROR;
2533
+ return CONTINUE;
2704
2534
  }
2705
2535
  }
2706
2536
 
@@ -2716,35 +2546,32 @@ static StateResult handle_before_doctype_public_id_state (
2716
2546
  case '\n':
2717
2547
  case '\f':
2718
2548
  case ' ':
2719
- return NEXT_CHAR;
2549
+ return CONTINUE;
2720
2550
  case '"':
2721
2551
  assert(temporary_buffer_is_empty(parser));
2722
2552
  gumbo_tokenizer_set_state(
2723
2553
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2724
- return NEXT_CHAR;
2554
+ return CONTINUE;
2725
2555
  case '\'':
2726
2556
  assert(temporary_buffer_is_empty(parser));
2727
2557
  gumbo_tokenizer_set_state(
2728
2558
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2729
- return NEXT_CHAR;
2559
+ return CONTINUE;
2730
2560
  case '>':
2731
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2561
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2732
2562
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2733
2563
  tokenizer->_doc_type_state.force_quirks = true;
2734
- emit_doctype(parser, output);
2735
- return RETURN_ERROR;
2564
+ return emit_doctype(parser, output);
2736
2565
  case -1:
2737
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2738
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2566
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2567
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2739
2568
  tokenizer->_doc_type_state.force_quirks = true;
2740
- emit_doctype(parser, output);
2741
- return RETURN_ERROR;
2569
+ return emit_doctype(parser, output);
2742
2570
  default:
2743
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2571
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2572
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2745
2573
  tokenizer->_doc_type_state.force_quirks = true;
2746
- emit_doctype(parser, output);
2747
- return RETURN_ERROR;
2574
+ return CONTINUE;
2748
2575
  }
2749
2576
  }
2750
2577
 
@@ -2759,28 +2586,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
2759
2586
  case '"':
2760
2587
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2761
2588
  finish_doctype_public_id(parser);
2762
- return NEXT_CHAR;
2589
+ return CONTINUE;
2763
2590
  case '\0':
2764
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2591
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2765
2592
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2766
- return NEXT_CHAR;
2593
+ return CONTINUE;
2767
2594
  case '>':
2768
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2595
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2769
2596
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2770
2597
  tokenizer->_doc_type_state.force_quirks = true;
2771
2598
  finish_doctype_public_id(parser);
2772
- emit_doctype(parser, output);
2773
- return RETURN_ERROR;
2599
+ return emit_doctype(parser, output);
2774
2600
  case -1:
2775
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2601
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2602
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2777
2603
  tokenizer->_doc_type_state.force_quirks = true;
2778
2604
  finish_doctype_public_id(parser);
2779
- emit_doctype(parser, output);
2780
- return RETURN_ERROR;
2605
+ return emit_doctype(parser, output);
2781
2606
  default:
2782
2607
  append_char_to_temporary_buffer(parser, c);
2783
- return NEXT_CHAR;
2608
+ return CONTINUE;
2784
2609
  }
2785
2610
  }
2786
2611
 
@@ -2795,28 +2620,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
2795
2620
  case '\'':
2796
2621
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2797
2622
  finish_doctype_public_id(parser);
2798
- return NEXT_CHAR;
2623
+ return CONTINUE;
2799
2624
  case '\0':
2800
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2801
2626
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2802
- return NEXT_CHAR;
2627
+ return CONTINUE;
2803
2628
  case '>':
2804
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2629
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2805
2630
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
2631
  tokenizer->_doc_type_state.force_quirks = true;
2807
2632
  finish_doctype_public_id(parser);
2808
- emit_doctype(parser, output);
2809
- return RETURN_ERROR;
2633
+ return emit_doctype(parser, output);
2810
2634
  case -1:
2811
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2812
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2635
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2636
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2813
2637
  tokenizer->_doc_type_state.force_quirks = true;
2814
2638
  finish_doctype_public_id(parser);
2815
- emit_doctype(parser, output);
2816
- return RETURN_ERROR;
2639
+ return emit_doctype(parser, output);
2817
2640
  default:
2818
2641
  append_char_to_temporary_buffer(parser, c);
2819
- return NEXT_CHAR;
2642
+ return CONTINUE;
2820
2643
  }
2821
2644
  }
2822
2645
 
@@ -2834,35 +2657,38 @@ static StateResult handle_after_doctype_public_id_state (
2834
2657
  case ' ':
2835
2658
  gumbo_tokenizer_set_state(
2836
2659
  parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2837
- return NEXT_CHAR;
2660
+ return CONTINUE;
2838
2661
  case '>':
2839
2662
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2840
- emit_doctype(parser, output);
2841
- return RETURN_SUCCESS;
2663
+ return emit_doctype(parser, output);
2842
2664
  case '"':
2843
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2665
+ tokenizer_add_parse_error (
2666
+ parser,
2667
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2668
+ );
2844
2669
  assert(temporary_buffer_is_empty(parser));
2845
2670
  gumbo_tokenizer_set_state(
2846
2671
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2847
- return NEXT_CHAR;
2672
+ return CONTINUE;
2848
2673
  case '\'':
2849
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2674
+ tokenizer_add_parse_error (
2675
+ parser,
2676
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2677
+ );
2850
2678
  assert(temporary_buffer_is_empty(parser));
2851
2679
  gumbo_tokenizer_set_state(
2852
2680
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2853
- return NEXT_CHAR;
2681
+ return CONTINUE;
2854
2682
  case -1:
2855
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2856
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2857
- tokenizer->_reconsume_current_input = true;
2683
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2684
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2858
2685
  tokenizer->_doc_type_state.force_quirks = true;
2859
- emit_doctype(parser, output);
2860
- return RETURN_ERROR;
2686
+ return emit_doctype(parser, output);
2861
2687
  default:
2862
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2863
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2688
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2689
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2864
2690
  tokenizer->_doc_type_state.force_quirks = true;
2865
- return NEXT_CHAR;
2691
+ return CONTINUE;
2866
2692
  }
2867
2693
  }
2868
2694
 
@@ -2878,33 +2704,30 @@ static StateResult handle_between_doctype_public_system_id_state (
2878
2704
  case '\n':
2879
2705
  case '\f':
2880
2706
  case ' ':
2881
- return NEXT_CHAR;
2707
+ return CONTINUE;
2882
2708
  case '>':
2883
2709
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2884
- emit_doctype(parser, output);
2885
- return RETURN_SUCCESS;
2710
+ return emit_doctype(parser, output);
2886
2711
  case '"':
2887
2712
  assert(temporary_buffer_is_empty(parser));
2888
2713
  gumbo_tokenizer_set_state(
2889
2714
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2890
- return NEXT_CHAR;
2715
+ return CONTINUE;
2891
2716
  case '\'':
2892
2717
  assert(temporary_buffer_is_empty(parser));
2893
2718
  gumbo_tokenizer_set_state(
2894
2719
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2895
- return NEXT_CHAR;
2720
+ return CONTINUE;
2896
2721
  case -1:
2897
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2898
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2722
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2723
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2899
2724
  tokenizer->_doc_type_state.force_quirks = true;
2900
- emit_doctype(parser, output);
2901
- return RETURN_ERROR;
2725
+ return emit_doctype(parser, output);
2902
2726
  default:
2903
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2904
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2727
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2728
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2905
2729
  tokenizer->_doc_type_state.force_quirks = true;
2906
- emit_doctype(parser, output);
2907
- return RETURN_ERROR;
2730
+ return CONTINUE;
2908
2731
  }
2909
2732
  }
2910
2733
 
@@ -2921,36 +2744,34 @@ static StateResult handle_after_doctype_system_keyword_state (
2921
2744
  case '\f':
2922
2745
  case ' ':
2923
2746
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2924
- return NEXT_CHAR;
2747
+ return CONTINUE;
2925
2748
  case '"':
2926
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2749
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2927
2750
  assert(temporary_buffer_is_empty(parser));
2928
2751
  gumbo_tokenizer_set_state(
2929
2752
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2930
- return NEXT_CHAR;
2753
+ return CONTINUE;
2931
2754
  case '\'':
2932
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2755
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2933
2756
  assert(temporary_buffer_is_empty(parser));
2934
2757
  gumbo_tokenizer_set_state(
2935
2758
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2936
- return NEXT_CHAR;
2759
+ return CONTINUE;
2937
2760
  case '>':
2938
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2761
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2939
2762
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2940
2763
  tokenizer->_doc_type_state.force_quirks = true;
2941
- emit_doctype(parser, output);
2942
- return RETURN_ERROR;
2764
+ return emit_doctype(parser, output);
2943
2765
  case -1:
2944
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2945
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2766
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2767
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2946
2768
  tokenizer->_doc_type_state.force_quirks = true;
2947
- emit_doctype(parser, output);
2948
- return RETURN_ERROR;
2769
+ return emit_doctype(parser, output);
2949
2770
  default:
2950
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2951
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2771
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2772
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2952
2773
  tokenizer->_doc_type_state.force_quirks = true;
2953
- return NEXT_CHAR;
2774
+ return CONTINUE;
2954
2775
  }
2955
2776
  }
2956
2777
 
@@ -2966,34 +2787,32 @@ static StateResult handle_before_doctype_system_id_state (
2966
2787
  case '\n':
2967
2788
  case '\f':
2968
2789
  case ' ':
2969
- return NEXT_CHAR;
2790
+ return CONTINUE;
2970
2791
  case '"':
2971
2792
  assert(temporary_buffer_is_empty(parser));
2972
2793
  gumbo_tokenizer_set_state(
2973
2794
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2974
- return NEXT_CHAR;
2795
+ return CONTINUE;
2975
2796
  case '\'':
2976
2797
  assert(temporary_buffer_is_empty(parser));
2977
2798
  gumbo_tokenizer_set_state(
2978
2799
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2979
- return NEXT_CHAR;
2800
+ return CONTINUE;
2980
2801
  case '>':
2981
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2802
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2982
2803
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2983
2804
  tokenizer->_doc_type_state.force_quirks = true;
2984
- emit_doctype(parser, output);
2985
- return RETURN_ERROR;
2805
+ return emit_doctype(parser, output);
2986
2806
  case -1:
2987
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2988
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2807
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2808
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2989
2809
  tokenizer->_doc_type_state.force_quirks = true;
2990
- emit_doctype(parser, output);
2991
- return RETURN_ERROR;
2810
+ return emit_doctype(parser, output);
2992
2811
  default:
2993
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2994
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2812
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2813
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2995
2814
  tokenizer->_doc_type_state.force_quirks = true;
2996
- return NEXT_CHAR;
2815
+ return CONTINUE;
2997
2816
  }
2998
2817
  }
2999
2818
 
@@ -3008,28 +2827,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
3008
2827
  case '"':
3009
2828
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3010
2829
  finish_doctype_system_id(parser);
3011
- return NEXT_CHAR;
2830
+ return CONTINUE;
3012
2831
  case '\0':
3013
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2832
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3014
2833
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3015
- return NEXT_CHAR;
2834
+ return CONTINUE;
3016
2835
  case '>':
3017
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2836
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3018
2837
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3019
2838
  tokenizer->_doc_type_state.force_quirks = true;
3020
2839
  finish_doctype_system_id(parser);
3021
- emit_doctype(parser, output);
3022
- return RETURN_ERROR;
2840
+ return emit_doctype(parser, output);
3023
2841
  case -1:
3024
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3025
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2842
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2843
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3026
2844
  tokenizer->_doc_type_state.force_quirks = true;
3027
2845
  finish_doctype_system_id(parser);
3028
- emit_doctype(parser, output);
3029
- return RETURN_ERROR;
2846
+ return emit_doctype(parser, output);
3030
2847
  default:
3031
2848
  append_char_to_temporary_buffer(parser, c);
3032
- return NEXT_CHAR;
2849
+ return CONTINUE;
3033
2850
  }
3034
2851
  }
3035
2852
 
@@ -3044,28 +2861,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
3044
2861
  case '\'':
3045
2862
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3046
2863
  finish_doctype_system_id(parser);
3047
- return NEXT_CHAR;
2864
+ return CONTINUE;
3048
2865
  case '\0':
3049
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2866
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3050
2867
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3051
- return NEXT_CHAR;
2868
+ return CONTINUE;
3052
2869
  case '>':
3053
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2870
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3054
2871
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3055
2872
  tokenizer->_doc_type_state.force_quirks = true;
3056
2873
  finish_doctype_system_id(parser);
3057
- emit_doctype(parser, output);
3058
- return RETURN_ERROR;
2874
+ return emit_doctype(parser, output);
3059
2875
  case -1:
3060
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3061
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2876
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2877
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3062
2878
  tokenizer->_doc_type_state.force_quirks = true;
3063
2879
  finish_doctype_system_id(parser);
3064
- emit_doctype(parser, output);
3065
- return RETURN_ERROR;
2880
+ return emit_doctype(parser, output);
3066
2881
  default:
3067
2882
  append_char_to_temporary_buffer(parser, c);
3068
- return NEXT_CHAR;
2883
+ return CONTINUE;
3069
2884
  }
3070
2885
  }
3071
2886
 
@@ -3081,21 +2896,19 @@ static StateResult handle_after_doctype_system_id_state (
3081
2896
  case '\n':
3082
2897
  case '\f':
3083
2898
  case ' ':
3084
- return NEXT_CHAR;
2899
+ return CONTINUE;
3085
2900
  case '>':
3086
2901
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3087
- emit_doctype(parser, output);
3088
- return RETURN_SUCCESS;
2902
+ return emit_doctype(parser, output);
3089
2903
  case -1:
3090
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3091
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2904
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2905
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3092
2906
  tokenizer->_doc_type_state.force_quirks = true;
3093
- emit_doctype(parser, output);
3094
- return RETURN_ERROR;
2907
+ return emit_doctype(parser, output);
3095
2908
  default:
3096
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3097
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
3098
- return NEXT_CHAR;
2909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2910
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2911
+ return CONTINUE;
3099
2912
  }
3100
2913
  }
3101
2914
 
@@ -3106,33 +2919,370 @@ static StateResult handle_bogus_doctype_state (
3106
2919
  int c,
3107
2920
  GumboToken* output
3108
2921
  ) {
3109
- if (c == '>' || c == -1) {
2922
+ switch (c) {
2923
+ case '>':
3110
2924
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3111
- emit_doctype(parser, output);
3112
- return RETURN_ERROR;
2925
+ return emit_doctype(parser, output);
2926
+ case '\0':
2927
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2928
+ return CONTINUE;
2929
+ case -1:
2930
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2931
+ return emit_doctype(parser, output);
2932
+ default:
2933
+ return CONTINUE;
3113
2934
  }
3114
- return NEXT_CHAR;
3115
2935
  }
3116
2936
 
3117
2937
  // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
- static StateResult handle_cdata_state (
2938
+ static StateResult handle_cdata_section_state (
3119
2939
  GumboParser* parser,
3120
2940
  GumboTokenizerState* tokenizer,
3121
2941
  int c,
3122
2942
  GumboToken* output
3123
2943
  ) {
3124
- if (c == -1 || utf8iterator_maybe_consume_match(
3125
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
3126
- tokenizer->_reconsume_current_input = true;
2944
+ switch (c) {
2945
+ case ']':
2946
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2947
+ set_mark(parser);
2948
+ return CONTINUE;
2949
+ case -1:
2950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2951
+ return emit_eof(parser, output);
2952
+ default:
2953
+ return emit_char(parser, c, output);
2954
+ }
2955
+ }
2956
+
2957
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2958
+ static StateResult handle_cdata_section_bracket_state (
2959
+ GumboParser* parser,
2960
+ GumboTokenizerState* tokenizer,
2961
+ int c,
2962
+ GumboToken* output
2963
+ ) {
2964
+ switch (c) {
2965
+ case ']':
2966
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2967
+ return CONTINUE;
2968
+ default:
2969
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2970
+ // Emit the ].
2971
+ return emit_from_mark(parser, output);
2972
+ }
2973
+ }
2974
+
2975
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2976
+ static StateResult handle_cdata_section_end_state (
2977
+ GumboParser* parser,
2978
+ GumboTokenizerState* tokenizer,
2979
+ int c,
2980
+ GumboToken* output
2981
+ ) {
2982
+ switch (c) {
2983
+ case ']':
2984
+ {
2985
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2986
+ // of the three in a row we've seen. So let's emit one token from the
2987
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2988
+ // advance one). Next, let's clear the temporary buffer which will set the
2989
+ // mark to the middle of the three brackets. Finally, let's move to the
2990
+ // appropriate state.
2991
+ StateResult result = emit_from_mark(parser, output);
2992
+ tokenizer->_resume_pos = NULL;
2993
+ set_mark(parser);
2994
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2995
+ return result;
2996
+ }
2997
+ case '>':
2998
+ // We're done with CDATA so move past the >, reset the token start point
2999
+ // to point after the >, and then reconsume in the data state.
3000
+ utf8iterator_next(&tokenizer->_input);
3127
3001
  reset_token_start_point(tokenizer);
3128
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3002
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3129
3003
  tokenizer->_is_in_cdata = false;
3130
- return NEXT_CHAR;
3131
- } else {
3132
- return emit_current_char(parser, output);
3004
+ return CONTINUE;
3005
+ default:
3006
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3007
+ return emit_from_mark(parser, output);
3133
3008
  }
3134
3009
  }
3135
3010
 
3011
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3012
+ static StateResult handle_character_reference_state (
3013
+ GumboParser* parser,
3014
+ GumboTokenizerState* tokenizer,
3015
+ int c,
3016
+ GumboToken* output
3017
+ ) {
3018
+ if (gumbo_ascii_isalnum(c)) {
3019
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3020
+ return CONTINUE;
3021
+ }
3022
+ if (c == '#') {
3023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3024
+ return CONTINUE;
3025
+ }
3026
+ reconsume_in_state(parser, tokenizer->_return_state);
3027
+ return flush_code_points_consumed_as_character_reference(parser, output);
3028
+ }
3029
+
3030
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3031
+ static StateResult handle_named_character_reference_state (
3032
+ GumboParser* parser,
3033
+ GumboTokenizerState* tokenizer,
3034
+ int c,
3035
+ GumboToken* output
3036
+ ) {
3037
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3038
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3039
+ int code_point[2];
3040
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3041
+
3042
+ if (size > 0) {
3043
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3044
+ int next = utf8iterator_current(&tokenizer->_input);
3045
+ reconsume_in_state(parser, tokenizer->_return_state);
3046
+ if (character_reference_part_of_attribute(parser)
3047
+ && cur[size-1] != ';'
3048
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3049
+ GumboStringPiece str = { .data = cur, .length = size };
3050
+ append_string_to_temporary_buffer(parser, &str);
3051
+ return flush_code_points_consumed_as_character_reference(parser, output);
3052
+ }
3053
+ if (cur[size-1] != ';')
3054
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3055
+ reconsume_in_state(parser, tokenizer->_return_state);
3056
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3057
+ }
3058
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3059
+ return flush_code_points_consumed_as_character_reference(parser, output);
3060
+ }
3061
+
3062
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3063
+ static StateResult handle_ambiguous_ampersand_state (
3064
+ GumboParser* parser,
3065
+ GumboTokenizerState* tokenizer,
3066
+ int c,
3067
+ GumboToken* output
3068
+ ) {
3069
+ if (gumbo_ascii_isalnum(c)) {
3070
+ if (character_reference_part_of_attribute(parser)) {
3071
+ append_char_to_tag_buffer(parser, c, true);
3072
+ return CONTINUE;
3073
+ }
3074
+ return emit_char(parser, c, output);
3075
+ }
3076
+ if (c == ';') {
3077
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3078
+ // fall through
3079
+ }
3080
+ reconsume_in_state(parser, tokenizer->_return_state);
3081
+ return CONTINUE;
3082
+ }
3083
+
3084
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3085
+ static StateResult handle_numeric_character_reference_state (
3086
+ GumboParser* parser,
3087
+ GumboTokenizerState* tokenizer,
3088
+ int c,
3089
+ GumboToken* output
3090
+ ) {
3091
+ tokenizer->_character_reference_code = 0;
3092
+ switch (c) {
3093
+ case 'x':
3094
+ case 'X':
3095
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3096
+ return CONTINUE;
3097
+ default:
3098
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3099
+ return CONTINUE;
3100
+ }
3101
+ }
3102
+
3103
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
3104
+ static StateResult handle_hexadecimal_character_reference_start_state (
3105
+ GumboParser* parser,
3106
+ GumboTokenizerState* tokenizer,
3107
+ int c,
3108
+ GumboToken* output
3109
+ ) {
3110
+ if (gumbo_ascii_isxdigit(c)) {
3111
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3112
+ return CONTINUE;
3113
+ }
3114
+ tokenizer_add_char_ref_error (
3115
+ parser,
3116
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3117
+ -1
3118
+ );
3119
+ reconsume_in_state(parser, tokenizer->_return_state);
3120
+ return flush_code_points_consumed_as_character_reference(parser, output);
3121
+ }
3122
+
3123
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3124
+ static StateResult handle_decimal_character_reference_start_state (
3125
+ GumboParser* parser,
3126
+ GumboTokenizerState* tokenizer,
3127
+ int c,
3128
+ GumboToken* output
3129
+ ) {
3130
+ if (gumbo_ascii_isdigit(c)) {
3131
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3132
+ return CONTINUE;
3133
+ }
3134
+ tokenizer_add_char_ref_error (
3135
+ parser,
3136
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3137
+ -1
3138
+ );
3139
+ reconsume_in_state(parser, tokenizer->_return_state);
3140
+ return flush_code_points_consumed_as_character_reference(parser, output);
3141
+ }
3142
+
3143
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
3144
+ static StateResult handle_hexadecimal_character_reference_state (
3145
+ GumboParser* parser,
3146
+ GumboTokenizerState* tokenizer,
3147
+ int c,
3148
+ GumboToken* output
3149
+ ) {
3150
+ if (gumbo_ascii_isdigit(c)) {
3151
+ tokenizer->_character_reference_code =
3152
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3153
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3154
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3155
+ return CONTINUE;
3156
+ }
3157
+ if (gumbo_ascii_isupper_xdigit(c)) {
3158
+ tokenizer->_character_reference_code =
3159
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3160
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3161
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3162
+ return CONTINUE;
3163
+ }
3164
+ if (gumbo_ascii_islower_xdigit(c)) {
3165
+ tokenizer->_character_reference_code =
3166
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3167
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3168
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3169
+ return CONTINUE;
3170
+ }
3171
+ if (c == ';') {
3172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3173
+ return CONTINUE;
3174
+ }
3175
+ tokenizer_add_char_ref_error(
3176
+ parser,
3177
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3178
+ tokenizer->_character_reference_code
3179
+ );
3180
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3181
+ return CONTINUE;
3182
+ }
3183
+
3184
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3185
+ static StateResult handle_decimal_character_reference_state (
3186
+ GumboParser* parser,
3187
+ GumboTokenizerState* tokenizer,
3188
+ int c,
3189
+ GumboToken* output
3190
+ ) {
3191
+ if (gumbo_ascii_isdigit(c)) {
3192
+ tokenizer->_character_reference_code =
3193
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3194
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3195
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3196
+ return CONTINUE;
3197
+ }
3198
+ if (c == ';') {
3199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3200
+ return CONTINUE;
3201
+ }
3202
+ tokenizer_add_char_ref_error(
3203
+ parser,
3204
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3205
+ tokenizer->_character_reference_code
3206
+ );
3207
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3208
+ return CONTINUE;
3209
+ }
3210
+
3211
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3212
+ static StateResult handle_numeric_character_reference_end_state (
3213
+ GumboParser* parser,
3214
+ GumboTokenizerState* tokenizer,
3215
+ int c,
3216
+ GumboToken* output
3217
+ ) {
3218
+ c = tokenizer->_character_reference_code;
3219
+ if (c == 0) {
3220
+ tokenizer_add_char_ref_error(
3221
+ parser,
3222
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3223
+ c
3224
+ );
3225
+ c = kUtf8ReplacementChar;
3226
+ } else if (c > kUtf8MaxChar) {
3227
+ tokenizer_add_char_ref_error(
3228
+ parser,
3229
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3230
+ c
3231
+ );
3232
+ c = kUtf8ReplacementChar;
3233
+ } else if (utf8_is_surrogate(c)) {
3234
+ tokenizer_add_char_ref_error(
3235
+ parser,
3236
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3237
+ c
3238
+ );
3239
+ c = kUtf8ReplacementChar;
3240
+ } else if (utf8_is_noncharacter(c)) {
3241
+ tokenizer_add_char_ref_error(
3242
+ parser,
3243
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3244
+ c
3245
+ );
3246
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3247
+ tokenizer_add_char_ref_error(
3248
+ parser,
3249
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3250
+ c
3251
+ );
3252
+ switch (c) {
3253
+ case 0x80: c = 0x20AC; break;
3254
+ case 0x82: c = 0x201A; break;
3255
+ case 0x83: c = 0x0192; break;
3256
+ case 0x84: c = 0x201E; break;
3257
+ case 0x85: c = 0x2026; break;
3258
+ case 0x86: c = 0x2020; break;
3259
+ case 0x87: c = 0x2021; break;
3260
+ case 0x88: c = 0x02C6; break;
3261
+ case 0x89: c = 0x2030; break;
3262
+ case 0x8A: c = 0x0160; break;
3263
+ case 0x8B: c = 0x2039; break;
3264
+ case 0x8C: c = 0x0152; break;
3265
+ case 0x8E: c = 0x017D; break;
3266
+ case 0x91: c = 0x2018; break;
3267
+ case 0x92: c = 0x2019; break;
3268
+ case 0x93: c = 0x201C; break;
3269
+ case 0x94: c = 0x201D; break;
3270
+ case 0x95: c = 0x2022; break;
3271
+ case 0x96: c = 0x2013; break;
3272
+ case 0x97: c = 0x2014; break;
3273
+ case 0x98: c = 0x02DC; break;
3274
+ case 0x99: c = 0x2122; break;
3275
+ case 0x9A: c = 0x0161; break;
3276
+ case 0x9B: c = 0x203A; break;
3277
+ case 0x9C: c = 0x0153; break;
3278
+ case 0x9E: c = 0x017E; break;
3279
+ case 0x9F: c = 0x0178; break;
3280
+ }
3281
+ }
3282
+ reconsume_in_state(parser, tokenizer->_return_state);
3283
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3284
+ }
3285
+
3136
3286
  typedef StateResult (*GumboLexerStateFunction) (
3137
3287
  GumboParser* parser,
3138
3288
  GumboTokenizerState* tokenizer,
@@ -3141,74 +3291,86 @@ typedef StateResult (*GumboLexerStateFunction) (
3141
3291
  );
3142
3292
 
3143
3293
  static GumboLexerStateFunction dispatch_table[] = {
3144
- handle_data_state,
3145
- handle_char_ref_in_data_state,
3146
- handle_rcdata_state,
3147
- handle_char_ref_in_rcdata_state,
3148
- handle_rawtext_state,
3149
- handle_script_state,
3150
- handle_plaintext_state,
3151
- handle_tag_open_state,
3152
- handle_end_tag_open_state,
3153
- handle_tag_name_state,
3154
- handle_rcdata_lt_state,
3155
- handle_rcdata_end_tag_open_state,
3156
- handle_rcdata_end_tag_name_state,
3157
- handle_rawtext_lt_state,
3158
- handle_rawtext_end_tag_open_state,
3159
- handle_rawtext_end_tag_name_state,
3160
- handle_script_lt_state,
3161
- handle_script_end_tag_open_state,
3162
- handle_script_end_tag_name_state,
3163
- handle_script_escaped_start_state,
3164
- handle_script_escaped_start_dash_state,
3165
- handle_script_escaped_state,
3166
- handle_script_escaped_dash_state,
3167
- handle_script_escaped_dash_dash_state,
3168
- handle_script_escaped_lt_state,
3169
- handle_script_escaped_end_tag_open_state,
3170
- handle_script_escaped_end_tag_name_state,
3171
- handle_script_double_escaped_start_state,
3172
- handle_script_double_escaped_state,
3173
- handle_script_double_escaped_dash_state,
3174
- handle_script_double_escaped_dash_dash_state,
3175
- handle_script_double_escaped_lt_state,
3176
- handle_script_double_escaped_end_state,
3177
- handle_before_attr_name_state,
3178
- handle_attr_name_state,
3179
- handle_after_attr_name_state,
3180
- handle_before_attr_value_state,
3181
- handle_attr_value_double_quoted_state,
3182
- handle_attr_value_single_quoted_state,
3183
- handle_attr_value_unquoted_state,
3184
- handle_char_ref_in_attr_value_state,
3185
- handle_after_attr_value_quoted_state,
3186
- handle_self_closing_start_tag_state,
3187
- handle_bogus_comment_state,
3188
- handle_markup_declaration_state,
3189
- handle_comment_start_state,
3190
- handle_comment_start_dash_state,
3191
- handle_comment_state,
3192
- handle_comment_end_dash_state,
3193
- handle_comment_end_state,
3194
- handle_comment_end_bang_state,
3195
- handle_doctype_state,
3196
- handle_before_doctype_name_state,
3197
- handle_doctype_name_state,
3198
- handle_after_doctype_name_state,
3199
- handle_after_doctype_public_keyword_state,
3200
- handle_before_doctype_public_id_state,
3201
- handle_doctype_public_id_double_quoted_state,
3202
- handle_doctype_public_id_single_quoted_state,
3203
- handle_after_doctype_public_id_state,
3204
- handle_between_doctype_public_system_id_state,
3205
- handle_after_doctype_system_keyword_state,
3206
- handle_before_doctype_system_id_state,
3207
- handle_doctype_system_id_double_quoted_state,
3208
- handle_doctype_system_id_single_quoted_state,
3209
- handle_after_doctype_system_id_state,
3210
- handle_bogus_doctype_state,
3211
- handle_cdata_state
3294
+ [GUMBO_LEX_DATA] = handle_data_state,
3295
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3296
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3297
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3298
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3299
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3300
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3301
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3302
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3304
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3305
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3307
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3324
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3325
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3326
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3327
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3328
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3331
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3332
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3333
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3334
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3335
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3336
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3337
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3338
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3339
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3342
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3344
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3345
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3346
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3347
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3348
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3350
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3351
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3353
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3354
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3355
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3356
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3357
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3359
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3360
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3361
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3362
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3364
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3365
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3366
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3367
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3368
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3369
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3370
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3371
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3372
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3373
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3212
3374
  };
3213
3375
 
3214
3376
  bool gumbo_lex(GumboParser* parser, GumboToken* output) {
@@ -3239,12 +3401,14 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3239
3401
  return true;
3240
3402
  }
3241
3403
 
3242
- if (maybe_emit_from_temporary_buffer(parser, output)) {
3404
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3405
+ // Return no error.
3243
3406
  return true;
3244
3407
  }
3245
3408
 
3409
+ tokenizer->_parse_error = false;
3246
3410
  while (1) {
3247
- assert(!tokenizer->_temporary_buffer_emit);
3411
+ assert(!tokenizer->_resume_pos);
3248
3412
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3249
3413
  int c = utf8iterator_current(&tokenizer->_input);
3250
3414
  GumboTokenizerEnum state = tokenizer->_state;
@@ -3255,11 +3419,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3255
3419
  bool should_advance = !tokenizer->_reconsume_current_input;
3256
3420
  tokenizer->_reconsume_current_input = false;
3257
3421
 
3258
- if (result == RETURN_SUCCESS) {
3259
- return true;
3260
- } else if (result == RETURN_ERROR) {
3261
- return false;
3262
- }
3422
+ if (result == EMIT_TOKEN)
3423
+ return !tokenizer->_parse_error;
3263
3424
 
3264
3425
  if (should_advance) {
3265
3426
  utf8iterator_next(&tokenizer->_input);
@@ -3285,12 +3446,16 @@ void gumbo_token_destroy(GumboToken* token) {
3285
3446
  }
3286
3447
  }
3287
3448
  gumbo_free((void*) token->v.start_tag.attributes.data);
3288
- if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3449
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3289
3450
  gumbo_free(token->v.start_tag.name);
3451
+ token->v.start_tag.name = NULL;
3452
+ }
3290
3453
  return;
3291
3454
  case GUMBO_TOKEN_END_TAG:
3292
- if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3455
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3293
3456
  gumbo_free(token->v.end_tag.name);
3457
+ token->v.end_tag.name = NULL;
3458
+ }
3294
3459
  break;
3295
3460
  case GUMBO_TOKEN_COMMENT:
3296
3461
  gumbo_free((void*) token->v.text);