nokogumbo 2.0.0.pre.alpha → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -79,7 +79,7 @@ void gumbo_string_buffer_append_codepoint (
79
79
  }
80
80
 
81
81
  void gumbo_string_buffer_append_string (
82
- GumboStringPiece* str,
82
+ const GumboStringPiece* str,
83
83
  GumboStringBuffer* output
84
84
  ) {
85
85
  maybe_resize_string_buffer(str->length, output);
@@ -47,7 +47,7 @@ void gumbo_string_buffer_append_codepoint (
47
47
 
48
48
  // Appends a string onto the end of the GumboStringBuffer.
49
49
  void gumbo_string_buffer_append_string (
50
- GumboStringPiece* str,
50
+ const GumboStringPiece* str,
51
51
  GumboStringBuffer* output
52
52
  );
53
53
 
@@ -0,0 +1,79 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #include <assert.h>
18
+
19
+ #include "ascii.h"
20
+ #include "token_buffer.h"
21
+ #include "tokenizer.h"
22
+ #include "util.h"
23
+
24
+ struct GumboInternalCharacterToken {
25
+ GumboSourcePosition position;
26
+ GumboStringPiece original_text;
27
+ int c;
28
+ };
29
+
30
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
31
+ buffer->data = NULL;
32
+ buffer->length = 0;
33
+ buffer->capacity = 0;
34
+ }
35
+
36
+ void gumbo_character_token_buffer_append (
37
+ const GumboToken* token,
38
+ GumboCharacterTokenBuffer* buffer
39
+ ) {
40
+ assert(token->type == GUMBO_TOKEN_WHITESPACE
41
+ || token->type == GUMBO_TOKEN_CHARACTER);
42
+ if (buffer->length == buffer->capacity) {
43
+ if (buffer->capacity == 0)
44
+ buffer->capacity = 10;
45
+ else
46
+ buffer->capacity *= 2;
47
+ size_t bytes = sizeof(*buffer->data) * buffer->capacity;
48
+ buffer->data = gumbo_realloc(buffer->data, bytes);
49
+ }
50
+ size_t index = buffer->length++;
51
+ buffer->data[index].position = token->position;
52
+ buffer->data[index].original_text = token->original_text;
53
+ buffer->data[index].c = token->v.character;
54
+ }
55
+
56
+ void gumbo_character_token_buffer_get (
57
+ const GumboCharacterTokenBuffer* buffer,
58
+ size_t index,
59
+ struct GumboInternalToken* output
60
+ ) {
61
+ assert(index < buffer->length);
62
+ int c = buffer->data[index].c;
63
+ output->type = gumbo_ascii_isspace(c)?
64
+ GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
65
+ output->position = buffer->data[index].position;
66
+ output->original_text = buffer->data[index].original_text;
67
+ output->v.character = c;
68
+ }
69
+
70
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
71
+ buffer->length = 0;
72
+ }
73
+
74
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
75
+ gumbo_free(buffer->data);
76
+ buffer->data = NULL;
77
+ buffer->length = 0;
78
+ buffer->capacity = 0;
79
+ }
@@ -0,0 +1,71 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #ifndef GUMBO_TOKEN_BUFFER_H
18
+ #define GUMBO_TOKEN_BUFFER_H
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalCharacterToken;
30
+ struct GumboInternalToken;
31
+
32
+ // A struct representing a growable sequence of character (and whitespace)
33
+ // tokens.
34
+ typedef struct {
35
+ // A pointer to the start of the sequence.
36
+ struct GumboInternalCharacterToken* data;
37
+
38
+ // The length of the sequence.
39
+ size_t length;
40
+
41
+ // The capacity of the buffer.
42
+ size_t capacity;
43
+ } GumboCharacterTokenBuffer;
44
+
45
+ // Initializes a new GumboCharacterTokenBuffer.
46
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
47
+
48
+ // Appends a character (or whitespace) token.
49
+ void gumbo_character_token_buffer_append (
50
+ const struct GumboInternalToken* token,
51
+ GumboCharacterTokenBuffer* buffer
52
+ );
53
+
54
+ void gumbo_character_token_buffer_get (
55
+ const GumboCharacterTokenBuffer* buffer,
56
+ size_t index,
57
+ struct GumboInternalToken* output
58
+ );
59
+
60
+ // Reinitialize this character token buffer. This clears it by setting
61
+ // length=0. It does not zero out the buffer itself.
62
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
63
+
64
+ // Deallocates this GumboCharacterTokenBuffer.
65
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
66
+
67
+ #ifdef __cplusplus
68
+ }
69
+ #endif
70
+
71
+ #endif // GUMBO_TOKEN_BUFFER_H
@@ -1,5 +1,7 @@
1
1
  /*
2
2
  Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
3
5
 
4
6
  Licensed under the Apache License, Version 2.0 (the "License");
5
7
  you may not use this file except in compliance with the License.
@@ -18,10 +20,7 @@
18
20
  Coding conventions specific to this file:
19
21
 
20
22
  1. Functions that fill in a token should be named emit_*, and should be
21
- followed immediately by a return from the tokenizer (true if no error
22
- occurred, false if an error occurred). Sometimes the emit functions
23
- themselves return a boolean so that they can be combined with the return
24
- statement; in this case, they should match this convention.
23
+ followed immediately by a return from the tokenizer.
25
24
  2. Functions that shuffle data from temporaries to final API structures
26
25
  should be named finish_*, and be called just before the tokenizer exits the
27
26
  state that accumulates the temporary.
@@ -60,15 +59,18 @@
60
59
  #include "util.h"
61
60
  #include "vector.h"
62
61
 
63
- // Compared against _script_data_buffer to determine if we're in
62
+ // Compared against _temporary_buffer to determine if we're in
64
63
  // double-escaped script mode.
65
64
  static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
66
65
 
67
- // An enum for the return value of each individual state.
66
+ // An enum for the return value of each individual state. Each of the emit_*
67
+ // functions should return EMIT_TOKEN and should be called as
68
+ // return emit_foo(parser, ..., output);
69
+ // Each of the handle_*_state functions that do not return emit_* should
70
+ // instead return CONTINUE to indicate to gumbo_lex to continue lexing.
68
71
  typedef enum {
69
- RETURN_ERROR, // Return false (error) from the tokenizer.
70
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
71
- NEXT_CHAR // Proceed to the next character and continue lexing.
72
+ EMIT_TOKEN,
73
+ CONTINUE,
72
74
  } StateResult;
73
75
 
74
76
  // This is a struct containing state necessary to build up a tag token,
@@ -103,12 +105,6 @@ typedef struct GumboInternalTagState {
103
105
  // the attribute value, but shouldn't overwrite the existing value.
104
106
  bool _drop_next_attr_value;
105
107
 
106
- // The state that caused the tokenizer to switch into a character reference in
107
- // attribute value state. This is used to set the additional allowed
108
- // character, and is switched back to on completion. Initialized as the
109
- // tokenizer enters the character reference state.
110
- GumboTokenizerEnum _attr_value_state;
111
-
112
108
  // The last start tag to have been emitted by the tokenizer. This is
113
109
  // necessary to check for appropriate end tags.
114
110
  GumboTag _last_start_tag;
@@ -133,10 +129,10 @@ typedef struct GumboInternalTokenizerState {
133
129
  // "Reconsume the current input character in..."
134
130
  bool _reconsume_current_input;
135
131
 
136
- // A flag indicating whether the current node is a foreign element. This is
137
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
138
- // markup declaration state.
139
- bool _is_current_node_foreign;
132
+ // A flag indicating whether the adjusted current node is a foreign element.
133
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
134
+ // checked in the markup declaration state.
135
+ bool _is_adjusted_current_node_foreign;
140
136
 
141
137
  // A flag indicating whether the tokenizer is in a CDATA section. If so, then
142
138
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
@@ -159,27 +155,24 @@ typedef struct GumboInternalTokenizerState {
159
155
 
160
156
  // A temporary buffer to accumulate characters, as described by the "temporary
161
157
  // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
162
- // way: we record the specific character to go into the buffer, which may
163
- // sometimes be a lowercased version of the actual input character. However,
164
- // we *also* use utf8iterator_mark() to record the position at tag start.
165
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
166
- // to the start of it, and then increment it for each call to the tokenizer.
167
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
168
- // input stream, so that tokens emitted by emit_char have the correct position
169
- // and original text.
158
+ // way: In situations where the spec calls for inserting characters into the
159
+ // temporary buffer that exactly match the input in order to emit them as
160
+ // character tokens, we don't actually do it.
161
+ // Instead, we mark the input with set_mark() and later reset the input to that
162
+ // mark with emit_from_mark(). We do use the temporary buffer for other uses such as
163
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
170
164
  GumboStringBuffer _temporary_buffer;
171
165
 
172
- // The current cursor position we're emitting from within
173
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
174
- const char* _temporary_buffer_emit;
166
+ // The position to resume normal operation after we start emitting from the
167
+ // mark. NULL whenever we're not emitting from the mark.
168
+ const char* _resume_pos;
169
+
170
+ // The character reference state uses a return state to return to the state
171
+ // it was invoked from.
172
+ GumboTokenizerEnum _return_state;
175
173
 
176
- // The temporary buffer is also used by the spec to check whether we should
177
- // enter the script data double escaped state, but we can't use the same
178
- // buffer for both because we have to flush out "<s" as emits while still
179
- // maintaining the context that will eventually become "script". This is a
180
- // separate buffer that's used in place of the temporary buffer for states
181
- // that may enter the script data double escape start state.
182
- GumboStringBuffer _script_data_buffer;
174
+ // Numeric character reference.
175
+ uint32_t _character_reference_code;
183
176
 
184
177
  // Pointer to the beginning of the current token in the original buffer; used
185
178
  // to record the original text.
@@ -201,123 +194,66 @@ typedef struct GumboInternalTokenizerState {
201
194
  Utf8Iterator _input;
202
195
  } GumboTokenizerState;
203
196
 
204
- // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
197
+ // Adds a parse error to the parser's error struct.
205
198
  static void tokenizer_add_parse_error (
206
199
  GumboParser* parser,
207
200
  GumboErrorType type
208
201
  ) {
202
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209
203
  GumboError* error = gumbo_add_error(parser);
210
204
  if (!error) {
211
205
  return;
212
206
  }
207
+ const Utf8Iterator* input = &tokenizer->_input;
208
+ utf8iterator_get_position(input, &error->position);
209
+ error->original_text.data = utf8iterator_get_char_pointer(input);
210
+ error->original_text.length = utf8iterator_get_width(input);
211
+ error->type = type;
212
+ error->v.tokenizer.state = tokenizer->_state;
213
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
214
+ }
215
+
216
+ // Adds an error pointing at the start of the character reference.
217
+ static void tokenizer_add_char_ref_error (
218
+ struct GumboInternalParser* parser,
219
+ GumboErrorType type,
220
+ int codepoint
221
+ ) {
213
222
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
214
- utf8iterator_get_position(&tokenizer->_input, &error->position);
215
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
223
+ GumboError* error = gumbo_add_error(parser);
224
+ if (!error)
225
+ return;
226
+ Utf8Iterator* input = &tokenizer->_input;
216
227
  error->type = type;
217
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
218
- switch (tokenizer->_state) {
219
- case GUMBO_LEX_DATA:
220
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
221
- break;
222
- case GUMBO_LEX_CHAR_REF_IN_DATA:
223
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
224
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
225
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
226
- break;
227
- case GUMBO_LEX_RCDATA:
228
- case GUMBO_LEX_RCDATA_LT:
229
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
230
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
231
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
232
- break;
233
- case GUMBO_LEX_RAWTEXT:
234
- case GUMBO_LEX_RAWTEXT_LT:
235
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
236
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
237
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
238
- break;
239
- case GUMBO_LEX_PLAINTEXT:
240
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
241
- break;
242
- case GUMBO_LEX_SCRIPT:
243
- case GUMBO_LEX_SCRIPT_LT:
244
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
245
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
246
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
247
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
248
- case GUMBO_LEX_SCRIPT_ESCAPED:
249
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
250
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
251
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
252
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
253
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
254
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
255
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
256
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
257
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
258
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
259
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
260
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
261
- break;
262
- case GUMBO_LEX_TAG_OPEN:
263
- case GUMBO_LEX_END_TAG_OPEN:
264
- case GUMBO_LEX_TAG_NAME:
265
- case GUMBO_LEX_BEFORE_ATTR_NAME:
266
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
267
- break;
268
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
269
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
270
- break;
271
- case GUMBO_LEX_ATTR_NAME:
272
- case GUMBO_LEX_AFTER_ATTR_NAME:
273
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
274
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
275
- break;
276
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
277
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
278
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
279
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
280
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
281
- break;
282
- case GUMBO_LEX_BOGUS_COMMENT:
283
- case GUMBO_LEX_COMMENT_START:
284
- case GUMBO_LEX_COMMENT_START_DASH:
285
- case GUMBO_LEX_COMMENT:
286
- case GUMBO_LEX_COMMENT_END_DASH:
287
- case GUMBO_LEX_COMMENT_END:
288
- case GUMBO_LEX_COMMENT_END_BANG:
289
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
290
- break;
291
- case GUMBO_LEX_MARKUP_DECLARATION:
292
- case GUMBO_LEX_DOCTYPE:
293
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
294
- case GUMBO_LEX_DOCTYPE_NAME:
295
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
296
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
297
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
298
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
299
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
300
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
301
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
302
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
303
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
304
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
305
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
306
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
307
- case GUMBO_LEX_BOGUS_DOCTYPE:
308
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
309
- break;
310
- case GUMBO_LEX_CDATA:
311
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
312
- break;
313
- }
228
+ error->position = utf8iterator_get_mark_position(input);
229
+ const char* mark = utf8iterator_get_mark_pointer(input);
230
+ error->original_text.data = mark;
231
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
232
+ error->v.tokenizer.state = tokenizer->_state;
233
+ error->v.tokenizer.codepoint = codepoint;
234
+ }
235
+
236
+ // Adds an error pointing at the start of the token.
237
+ static void tokenizer_add_token_parse_error (
238
+ GumboParser* parser,
239
+ GumboErrorType type
240
+ ) {
241
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
242
+ GumboError* error = gumbo_add_error(parser);
243
+ if (!error)
244
+ return;
245
+ Utf8Iterator* input = &tokenizer->_input;
246
+ error->type = type;
247
+ error->position = tokenizer->_token_start_pos;
248
+ error->original_text.data = tokenizer->_token_start;
249
+ error->original_text.length =
250
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
251
+ error->v.tokenizer.state = tokenizer->_state;
252
+ error->v.tokenizer.codepoint = 0;
314
253
  }
315
254
 
316
255
  static bool is_alpha(int c) {
317
- // We don't use the ISO C isalpha() function here because it depends
318
- // on the current locale, whereas the behavior in the HTML5 spec is
319
- // locale-independent.
320
- return ((unsigned) c | 32) - 'a' < 26;
256
+ return gumbo_ascii_isalpha(c);
321
257
  }
322
258
 
323
259
  static int ensure_lowercase(int c) {
@@ -347,24 +283,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
347
283
  }
348
284
 
349
285
  // Starts recording characters in the temporary buffer.
350
- // Because this needs to reset the utf8iterator_mark to the beginning of the
351
- // text that will eventually be emitted, it needs to be called a couple of
352
- // states before the spec says "Set the temporary buffer to the empty string".
353
- // In general, this should be called whenever there's a transition to a
354
- // "less-than sign state". The initial < and possibly / then need to be
355
- // appended to the temporary buffer, their presence needs to be accounted for in
356
- // states that compare the temporary buffer against a literal value, and
357
- // spec stanzas that say "emit a < and / character token along with a character
358
- // token for each character in the temporary buffer" need to be adjusted to
359
- // account for the presence of the < and / inside the temporary buffer.
360
286
  static void clear_temporary_buffer(GumboParser* parser) {
361
287
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
362
- assert(!tokenizer->_temporary_buffer_emit);
363
- utf8iterator_mark(&tokenizer->_input);
364
288
  gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
365
- // The temporary buffer and script data buffer are the same object in the
366
- // spec, so the script data buffer should be cleared as well.
367
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
368
289
  }
369
290
 
370
291
  // Appends a codepoint to the temporary buffer.
@@ -378,25 +299,20 @@ static void append_char_to_temporary_buffer (
378
299
  );
379
300
  }
380
301
 
381
- #ifndef NDEBUG
382
- static bool temporary_buffer_equals__ (
383
- const GumboParser* parser,
384
- const char* text,
385
- size_t text_len
302
+ static void append_string_to_temporary_buffer (
303
+ GumboParser* parser,
304
+ const GumboStringPiece* str
386
305
  ) {
387
- const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
- return
389
- text_len == buf->length
390
- && memcmp(buf->data, text, text_len) == 0;
306
+ gumbo_string_buffer_append_string (
307
+ str,
308
+ &parser->_tokenizer_state->_temporary_buffer
309
+ );
391
310
  }
392
311
 
393
- #define temporary_buffer_equals(parser, text) \
394
- temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
312
 
396
313
  static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
314
  return parser->_tokenizer_state->_temporary_buffer.length == 0;
398
315
  }
399
- #endif
400
316
 
401
317
  static void doc_type_state_init(GumboParser* parser) {
402
318
  GumboTokenDocType* doc_type_state =
@@ -493,56 +409,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
493
409
  }
494
410
 
495
411
  // Writes a single specified character to the output token.
496
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
412
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
497
413
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
498
414
  output->v.character = c;
499
415
  finish_token(parser, output);
416
+ return EMIT_TOKEN;
500
417
  }
501
418
 
502
419
  // Writes a replacement character token and records a parse error.
503
- // Always returns RETURN_ERROR, per gumbo_lex return value.
420
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
504
421
  static StateResult emit_replacement_char(
505
422
  GumboParser* parser, GumboToken* output) {
506
423
  // In all cases, this is because of a null byte in the input stream.
507
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
424
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
508
425
  emit_char(parser, kUtf8ReplacementChar, output);
509
- return RETURN_ERROR;
426
+ return EMIT_TOKEN;
510
427
  }
511
428
 
512
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
429
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
513
430
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
514
- emit_char(parser, -1, output);
515
- return RETURN_SUCCESS;
516
- }
517
-
518
- // Writes the current input character out as a character token.
519
- // Always returns RETURN_SUCCESS.
520
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
521
- emit_char(
522
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
523
- return RETURN_SUCCESS;
431
+ return emit_char(parser, -1, output);
524
432
  }
525
433
 
526
434
  // Writes out a doctype token, copying it from the tokenizer state.
527
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
435
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
528
436
  output->type = GUMBO_TOKEN_DOCTYPE;
529
437
  output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
530
438
  finish_token(parser, output);
531
439
  doc_type_state_init(parser);
440
+ return EMIT_TOKEN;
532
441
  }
533
442
 
534
443
  // Clears the tag name and, in debug builds only, resets the attribute vector so
535
444
  // it can be asserted on tag creation, verifying that there are no memory leaks.
536
445
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
446
  UNUSED_IF_NDEBUG(tag_state);
538
- #ifndef NDEBUG
539
447
  tag_state->_name = NULL;
448
+ #ifndef NDEBUG
540
449
  tag_state->_attributes = kGumboEmptyVector;
541
450
  #endif
542
451
  }
543
452
 
544
453
  // Writes out the current tag as a start or end tag token.
545
- // Always returns RETURN_SUCCESS.
454
+ // Always returns EMIT_TOKEN.
546
455
  static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
547
456
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
548
457
  if (tag_state->_is_start_tag) {
@@ -559,7 +468,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
559
468
  output->type = GUMBO_TOKEN_END_TAG;
560
469
  output->v.end_tag.tag = tag_state->_tag;
561
470
  output->v.end_tag.name = tag_state->_name;
562
- output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
471
+ if (tag_state->_is_self_closing)
472
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
473
+ if (tag_state->_attributes.length > 0)
474
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
563
475
  // In end tags, ownership of the attributes vector is not transferred to the
564
476
  // token, but it's still initialized as normal, so it must be manually
565
477
  // deallocated. There may also be attributes to destroy, in certain broken
@@ -582,7 +494,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
582
494
  assert(output->original_text.length >= 2);
583
495
  assert(output->original_text.data[0] == '<');
584
496
  assert(output->original_text.data[output->original_text.length - 1] == '>');
585
- return RETURN_SUCCESS;
497
+ return EMIT_TOKEN;
586
498
  }
587
499
 
588
500
  // In some states, we speculatively start a tag, but don't know whether it'll be
@@ -600,90 +512,59 @@ static void abandon_current_tag(GumboParser* parser) {
600
512
  gumbo_debug("Abandoning current tag.\n");
601
513
  }
602
514
 
603
- // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
605
- // error occurred, RETURN_SUCCESS otherwise.
606
- static StateResult emit_char_ref (
607
- GumboParser* parser,
608
- int additional_allowed_char,
609
- bool UNUSED_ARG(is_in_attribute),
610
- GumboToken* output
611
- ) {
612
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
- OneOrTwoCodepoints char_ref;
614
- bool status = gumbo_consume_char_ref (
615
- parser,
616
- &tokenizer->_input,
617
- additional_allowed_char,
618
- false,
619
- &char_ref
620
- );
621
- if (char_ref.first != kGumboNoChar) {
622
- // gumbo_consume_char_ref ends with the iterator pointing at the next
623
- // character, so we need to be sure not advance it again before
624
- // reading the next token.
625
- tokenizer->_reconsume_current_input = true;
626
- emit_char(parser, char_ref.first, output);
627
- tokenizer->_buffered_emit_char = char_ref.second;
628
- } else {
629
- emit_char(parser, '&', output);
630
- }
631
- return status ? RETURN_SUCCESS : RETURN_ERROR;
632
- }
633
-
634
515
  // Emits a comment token. Comments use the temporary buffer to accumulate their
635
516
  // data, and then it's copied over and released to the 'text' field of the
636
- // GumboToken union. Always returns RETURN_SUCCESS.
517
+ // GumboToken union. Always returns EMIT_TOKEN.
637
518
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
638
519
  output->type = GUMBO_TOKEN_COMMENT;
639
520
  finish_temporary_buffer(parser, &output->v.text);
640
521
  finish_token(parser, output);
641
- return RETURN_SUCCESS;
522
+ return EMIT_TOKEN;
642
523
  }
643
524
 
644
- // Checks to see we should be flushing accumulated characters in the temporary
645
- // buffer, and fills the output token with the next output character if so.
646
- // Returns true if a character has been emitted and the tokenizer should
647
- // immediately return, false if we're at the end of the temporary buffer and
648
- // should resume normal operation.
649
- static bool maybe_emit_from_temporary_buffer(
650
- GumboParser* parser, GumboToken* output) {
525
+ static void set_mark(GumboParser* parser) {
526
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
527
+ utf8iterator_mark(&tokenizer->_input);
528
+ }
529
+
530
+ // Checks to see if we should be emitting characters from the mark, and fills the
531
+ // output token with the next output character if so.
532
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
533
+ // immediately return, CONTINUE if we should resume normal operation.
534
+ static StateResult maybe_emit_from_mark (
535
+ GumboParser* parser,
536
+ GumboToken* output
537
+ ) {
651
538
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
652
- const char* c = tokenizer->_temporary_buffer_emit;
653
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
539
+ const char* pos = tokenizer->_resume_pos;
654
540
 
655
- if (!c || c >= buffer->data + buffer->length) {
656
- tokenizer->_temporary_buffer_emit = NULL;
657
- return false;
541
+ if (!pos)
542
+ return CONTINUE;
543
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
544
+ tokenizer->_resume_pos = NULL;
545
+ return CONTINUE;
658
546
  }
659
547
 
660
- assert(*c == utf8iterator_current(&tokenizer->_input));
661
- // emit_char also advances the input stream. We need to do some juggling of
662
- // the _reconsume_current_input flag to get the proper behavior when emitting
663
- // previous tokens. Basically, _reconsume_current_input should *never* be set
664
- // when emitting anything from the temporary buffer, since those characters
665
- // have already been advanced past. However, it should be preserved so that
666
- // when the *next* character is encountered again, the tokenizer knows not to
667
- // advance past it.
668
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
669
- tokenizer->_reconsume_current_input = false;
670
- emit_char(parser, *c, output);
671
- ++tokenizer->_temporary_buffer_emit;
672
- tokenizer->_reconsume_current_input = saved_reconsume_state;
673
- return true;
674
- }
675
-
676
- // Sets up the tokenizer to begin flushing the temporary buffer.
677
- // This resets the input iterator stream to the start of the last tag, sets up
678
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
679
- // the first character in it. It returns true if a character was emitted, false
680
- // otherwise.
681
- static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
548
+ // emit_char advances the input stream. _reconsume_current_input should
549
+ // *never* be set when emitting from the mark since those characters have
550
+ // already been advanced past.
551
+ assert(!tokenizer->_reconsume_current_input);
552
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
553
+ }
554
+
555
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
556
+ // including, the current code point. This resets the input iterator stream to
557
+ // the mark, sets up _resume_pos, and then emits the first character in it.
558
+ // Returns EMIT_TOKEN.
559
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
682
560
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
683
- assert(tokenizer->_temporary_buffer.data);
561
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
684
562
  utf8iterator_reset(&tokenizer->_input);
685
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
686
- return maybe_emit_from_temporary_buffer(parser, output);
563
+ // Now that we have reset the input, we need to advance through it.
564
+ tokenizer->_reconsume_current_input = false;
565
+ StateResult result = maybe_emit_from_mark(parser, output);
566
+ assert(result == EMIT_TOKEN);
567
+ return result;
687
568
  }
688
569
 
689
570
  // Appends a codepoint to the current tag buffer. If
@@ -703,6 +584,19 @@ static void append_char_to_tag_buffer (
703
584
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
704
585
  }
705
586
 
587
+ // Like above but append a string.
588
+ static void append_string_to_tag_buffer (
589
+ GumboParser* parser,
590
+ GumboStringPiece* str,
591
+ bool reinitilize_position_on_first
592
+ ) {
593
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
594
+ if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ reset_tag_buffer_start_point(parser);
596
+ }
597
+ gumbo_string_buffer_append_string(str, buffer);
598
+ }
599
+
706
600
  // (Re-)initialize the tag buffer. This also resets the original_text pointer
707
601
  // and _start_pos field to point to the current position.
708
602
  static void initialize_tag_buffer(GumboParser* parser) {
@@ -713,6 +607,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
713
607
  reset_tag_buffer_start_point(parser);
714
608
  }
715
609
 
610
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
611
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
612
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
+ switch (tokenizer->_return_state) {
614
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
615
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
616
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
617
+ return true;
618
+ default:
619
+ return false;
620
+ }
621
+ }
622
+
623
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
624
+ // For each code point in the temporary buffer, add to the current attribute
625
+ // value if the character reference was consumed as part of an attribute or
626
+ // emit the code point as a character token.
627
+ static StateResult flush_code_points_consumed_as_character_reference (
628
+ GumboParser* parser,
629
+ GumboToken* output
630
+ ) {
631
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
632
+ if (character_reference_part_of_attribute(parser)) {
633
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
634
+ assert(start);
635
+ GumboStringPiece str = {
636
+ .data = start,
637
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
638
+ };
639
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
640
+ append_string_to_tag_buffer(parser, &str, unquoted);
641
+ return CONTINUE;
642
+ }
643
+ return emit_from_mark(parser, output);
644
+ }
645
+
646
+ // After a character reference has been successfully constructed, the standard
647
+ // says to set the temporary buffer equal to the empty string, append the code
648
+ // point(s) associated with the reference and flush code points consumed as a
649
+ // character reference.
650
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
651
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
652
+ // That doesn't work for us because we use the temporary buffer in lock step
653
+ // with the input for position and that would fail if we inserted a different
654
+ // number of code points. So duplicate a bit of the above logic.
655
+ static StateResult flush_char_ref (
656
+ GumboParser* parser,
657
+ int first,
658
+ int second,
659
+ GumboToken* output
660
+ ) {
661
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
662
+ if (character_reference_part_of_attribute(parser)) {
663
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
664
+ append_char_to_tag_buffer(parser, first, unquoted);
665
+ if (second != kGumboNoChar)
666
+ append_char_to_tag_buffer(parser, second, unquoted);
667
+ return CONTINUE;
668
+ }
669
+ tokenizer->_buffered_emit_char = second;
670
+ return emit_char(parser, first, output);
671
+ }
672
+
673
+
716
674
  // Initializes the tag_state to start a new tag, keeping track of the opening
717
675
  // positions and original text. Takes a boolean indicating whether this is a
718
676
  // start or end tag.
@@ -725,7 +683,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
725
683
  assert(is_alpha(c));
726
684
 
727
685
  initialize_tag_buffer(parser);
728
- gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
729
686
 
730
687
  assert(tag_state->_name == NULL);
731
688
  assert(tag_state->_attributes.data == NULL);
@@ -765,7 +722,10 @@ static void copy_over_original_tag_text (
765
722
  original_text->data = tag_state->_original_text;
766
723
  original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
767
724
  tag_state->_original_text;
768
- if (original_text->data[original_text->length - 1] == '\r') {
725
+ if (
726
+ original_text->length
727
+ && original_text->data[original_text->length - 1] == '\r'
728
+ ) {
769
729
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
770
730
  // appended to the end of original text even when it's really the first part
771
731
  // of the next character. If we detect this situation, shrink the length of
@@ -801,40 +761,45 @@ static void finish_tag_name(GumboParser* parser) {
801
761
  }
802
762
 
803
763
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
804
- static void add_duplicate_attr_error (
805
- GumboParser* parser,
806
- int original_index,
807
- int new_index
808
- ) {
764
+ static void add_duplicate_attr_error(GumboParser* parser) {
765
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
809
766
  GumboError* error = gumbo_add_error(parser);
810
767
  if (!error) {
811
768
  return;
812
769
  }
813
770
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
771
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
815
772
  error->position = tag_state->_start_pos;
816
- error->original_text = tag_state->_original_text;
817
- error->v.duplicate_attr.original_index = original_index;
818
- error->v.duplicate_attr.new_index = new_index;
819
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
820
- reinitialize_tag_buffer(parser);
773
+ error->original_text.data = tag_state->_original_text;
774
+ error->original_text.length =
775
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
776
+ error->v.tokenizer.state = tokenizer->_state;
821
777
  }
822
778
 
823
779
  // Creates a new attribute in the current tag, copying the current tag buffer to
824
780
  // the attribute's name. The attribute's value starts out as the empty string
825
781
  // (following the "Boolean attributes" section of the spec) and is only
826
782
  // overwritten on finish_attribute_value(). If the attribute has already been
827
- // specified, the new attribute is dropped, a parse error is added, and the
828
- // function returns false. Otherwise, this returns true.
829
- static bool finish_attribute_name(GumboParser* parser) {
783
+ // specified, the new attribute is dropped and a parse error is added
784
+ static void finish_attribute_name(GumboParser* parser) {
830
785
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
831
786
  GumboTagState* tag_state = &tokenizer->_tag_state;
787
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
788
+
789
+ int max_attributes = parser->_options->max_attributes;
790
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
791
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
792
+ gumbo_debug("Attributes limit exceeded.\n");
793
+ reinitialize_tag_buffer(parser);
794
+ tag_state->_drop_next_attr_value = true;
795
+ return;
796
+ }
797
+
832
798
  // May've been set by a previous attribute without a value; reset it here.
833
799
  tag_state->_drop_next_attr_value = false;
834
800
  assert(tag_state->_attributes.data);
835
801
  assert(tag_state->_attributes.capacity);
836
802
 
837
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
838
803
  for (unsigned int i = 0; i < attributes->length; ++i) {
839
804
  GumboAttribute* attr = attributes->data[i];
840
805
  if (
@@ -846,9 +811,10 @@ static bool finish_attribute_name(GumboParser* parser) {
846
811
  )
847
812
  ) {
848
813
  // Identical attribute; bail.
849
- add_duplicate_attr_error(parser, i, attributes->length);
814
+ add_duplicate_attr_error(parser);
815
+ reinitialize_tag_buffer(parser);
850
816
  tag_state->_drop_next_attr_value = true;
851
- return false;
817
+ return;
852
818
  }
853
819
  }
854
820
 
@@ -870,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
870
836
  );
871
837
  gumbo_vector_add(attr, attributes);
872
838
  reinitialize_tag_buffer(parser);
873
- return true;
874
839
  }
875
840
 
876
841
  // Finishes an attribute value. This sets the value of the most recently added
@@ -911,22 +876,23 @@ void gumbo_tokenizer_state_init (
911
876
  GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
912
877
  parser->_tokenizer_state = tokenizer;
913
878
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
879
+ tokenizer->_return_state = GUMBO_LEX_DATA;
880
+ tokenizer->_character_reference_code = 0;
914
881
  tokenizer->_reconsume_current_input = false;
915
- tokenizer->_is_current_node_foreign = false;
882
+ tokenizer->_is_adjusted_current_node_foreign = false;
916
883
  tokenizer->_is_in_cdata = false;
917
884
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
885
  tokenizer->_tag_state._name = NULL;
919
886
 
920
887
  tokenizer->_buffered_emit_char = kGumboNoChar;
921
888
  gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
922
- tokenizer->_temporary_buffer_emit = NULL;
889
+ tokenizer->_resume_pos = NULL;
923
890
 
924
891
  mark_tag_state_as_empty(&tokenizer->_tag_state);
925
892
 
926
- gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
927
- tokenizer->_token_start = text;
928
893
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
929
894
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
895
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
930
896
  doc_type_state_init(parser);
931
897
  }
932
898
 
@@ -936,7 +902,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
936
902
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
937
903
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
938
904
  gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
- gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
905
  assert(tokenizer->_tag_state._name == NULL);
941
906
  assert(tokenizer->_tag_state._attributes.data == NULL);
942
907
  gumbo_free(tokenizer);
@@ -946,17 +911,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
946
911
  parser->_tokenizer_state->_state = state;
947
912
  }
948
913
 
949
- void gumbo_tokenizer_set_is_current_node_foreign (
914
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
950
915
  GumboParser* parser,
951
916
  bool is_foreign
952
917
  ) {
953
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
918
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
954
919
  gumbo_debug (
955
920
  "Toggling is_current_node_foreign to %s.\n",
956
921
  is_foreign ? "true" : "false"
957
922
  );
958
923
  }
959
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
924
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
925
+ }
926
+
927
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
928
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
929
+ tokenizer->_reconsume_current_input = true;
930
+ tokenizer->_state = state;
960
931
  }
961
932
 
962
933
  // https://html.spec.whatwg.org/multipage/parsing.html#data-state
@@ -968,37 +939,24 @@ static StateResult handle_data_state (
968
939
  ) {
969
940
  switch (c) {
970
941
  case '&':
971
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
972
- // The char_ref machinery expects to be on the & so it can mark that
973
- // and return to it if the text isn't a char ref, so we need to
974
- // reconsume it.
975
- tokenizer->_reconsume_current_input = true;
976
- return NEXT_CHAR;
942
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
943
+ set_mark(parser);
944
+ tokenizer->_return_state = GUMBO_LEX_DATA;
945
+ return CONTINUE;
977
946
  case '<':
978
947
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
979
- clear_temporary_buffer(parser);
980
- append_char_to_temporary_buffer(parser, '<');
981
- return NEXT_CHAR;
948
+ set_mark(parser);
949
+ return CONTINUE;
982
950
  case '\0':
983
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
984
- emit_char(parser, c, output);
985
- return RETURN_ERROR;
951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
952
+ return emit_char(parser, c, output);
953
+ case -1:
954
+ return emit_eof(parser, output);
986
955
  default:
987
- return emit_current_char(parser, output);
956
+ return emit_char(parser, c, output);
988
957
  }
989
958
  }
990
959
 
991
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
- static StateResult handle_char_ref_in_data_state (
993
- GumboParser* parser,
994
- GumboTokenizerState* UNUSED_ARG(tokenizer),
995
- int UNUSED_ARG(c),
996
- GumboToken* output
997
- ) {
998
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
999
- return emit_char_ref(parser, ' ', false, output);
1000
- }
1001
-
1002
960
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
961
  static StateResult handle_rcdata_state (
1004
962
  GumboParser* parser,
@@ -1008,34 +966,23 @@ static StateResult handle_rcdata_state (
1008
966
  ) {
1009
967
  switch (c) {
1010
968
  case '&':
1011
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
1012
- tokenizer->_reconsume_current_input = true;
1013
- return NEXT_CHAR;
969
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
970
+ set_mark(parser);
971
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
972
+ return CONTINUE;
1014
973
  case '<':
1015
974
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
1016
- clear_temporary_buffer(parser);
1017
- append_char_to_temporary_buffer(parser, '<');
1018
- return NEXT_CHAR;
975
+ set_mark(parser);
976
+ return CONTINUE;
1019
977
  case '\0':
1020
978
  return emit_replacement_char(parser, output);
1021
979
  case -1:
1022
980
  return emit_eof(parser, output);
1023
981
  default:
1024
- return emit_current_char(parser, output);
982
+ return emit_char(parser, c, output);
1025
983
  }
1026
984
  }
1027
985
 
1028
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
- static StateResult handle_char_ref_in_rcdata_state (
1030
- GumboParser* parser,
1031
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
- int UNUSED_ARG(c),
1033
- GumboToken* output
1034
- ) {
1035
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1036
- return emit_char_ref(parser, ' ', false, output);
1037
- }
1038
-
1039
986
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
987
  static StateResult handle_rawtext_state (
1041
988
  GumboParser* parser,
@@ -1046,20 +993,19 @@ static StateResult handle_rawtext_state (
1046
993
  switch (c) {
1047
994
  case '<':
1048
995
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
1049
- clear_temporary_buffer(parser);
1050
- append_char_to_temporary_buffer(parser, '<');
1051
- return NEXT_CHAR;
996
+ set_mark(parser);
997
+ return CONTINUE;
1052
998
  case '\0':
1053
999
  return emit_replacement_char(parser, output);
1054
1000
  case -1:
1055
1001
  return emit_eof(parser, output);
1056
1002
  default:
1057
- return emit_current_char(parser, output);
1003
+ return emit_char(parser, c, output);
1058
1004
  }
1059
1005
  }
1060
1006
 
1061
1007
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
- static StateResult handle_script_state (
1008
+ static StateResult handle_script_data_state (
1063
1009
  GumboParser* parser,
1064
1010
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
1011
  int c,
@@ -1067,16 +1013,15 @@ static StateResult handle_script_state (
1067
1013
  ) {
1068
1014
  switch (c) {
1069
1015
  case '<':
1070
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
1071
- clear_temporary_buffer(parser);
1072
- append_char_to_temporary_buffer(parser, '<');
1073
- return NEXT_CHAR;
1016
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1017
+ set_mark(parser);
1018
+ return CONTINUE;
1074
1019
  case '\0':
1075
1020
  return emit_replacement_char(parser, output);
1076
1021
  case -1:
1077
1022
  return emit_eof(parser, output);
1078
1023
  default:
1079
- return emit_current_char(parser, output);
1024
+ return emit_char(parser, c, output);
1080
1025
  }
1081
1026
  }
1082
1027
 
@@ -1093,75 +1038,75 @@ static StateResult handle_plaintext_state (
1093
1038
  case -1:
1094
1039
  return emit_eof(parser, output);
1095
1040
  default:
1096
- return emit_current_char(parser, output);
1041
+ return emit_char(parser, c, output);
1097
1042
  }
1098
1043
  }
1099
1044
 
1100
1045
  // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
1046
  static StateResult handle_tag_open_state (
1102
1047
  GumboParser* parser,
1103
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1048
+ GumboTokenizerState* tokenizer,
1104
1049
  int c,
1105
1050
  GumboToken* output
1106
1051
  ) {
1107
- assert(temporary_buffer_equals(parser, "<"));
1108
1052
  switch (c) {
1109
1053
  case '!':
1110
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1111
1055
  clear_temporary_buffer(parser);
1112
- return NEXT_CHAR;
1056
+ return CONTINUE;
1113
1057
  case '/':
1114
1058
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1115
- append_char_to_temporary_buffer(parser, '/');
1116
- return NEXT_CHAR;
1059
+ return CONTINUE;
1117
1060
  case '?':
1118
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1061
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1119
1062
  clear_temporary_buffer(parser);
1120
- append_char_to_temporary_buffer(parser, '?');
1121
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1122
- return NEXT_CHAR;
1063
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1064
+ return CONTINUE;
1065
+ case -1:
1066
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1067
+ // Switch to data to emit EOF.
1068
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1069
+ return emit_from_mark(parser, output);
1123
1070
  default:
1124
1071
  if (is_alpha(c)) {
1125
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1072
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1126
1073
  start_new_tag(parser, true);
1127
- return NEXT_CHAR;
1128
- } else {
1129
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1130
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1131
- emit_temporary_buffer(parser, output);
1132
- return RETURN_ERROR;
1074
+ return CONTINUE;
1133
1075
  }
1076
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1077
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1078
+ return emit_from_mark(parser, output);
1134
1079
  }
1135
1080
  }
1136
1081
 
1137
1082
  // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
1083
  static StateResult handle_end_tag_open_state (
1139
1084
  GumboParser* parser,
1140
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1085
+ GumboTokenizerState* tokenizer,
1141
1086
  int c,
1142
1087
  GumboToken* output
1143
1088
  ) {
1144
- assert(temporary_buffer_equals(parser, "</"));
1145
1089
  switch (c) {
1146
1090
  case '>':
1147
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1091
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1148
1092
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1149
- return NEXT_CHAR;
1093
+ return CONTINUE;
1150
1094
  case -1:
1151
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1152
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1153
- return emit_temporary_buffer(parser, output);
1095
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1096
+ // Similar to the tag open state except we need to emit '<' and '/'
1097
+ // before the EOF.
1098
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1099
+ return emit_from_mark(parser, output);
1154
1100
  default:
1155
1101
  if (is_alpha(c)) {
1156
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1102
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1157
1103
  start_new_tag(parser, false);
1158
1104
  } else {
1159
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1160
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1105
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1106
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1161
1107
  clear_temporary_buffer(parser);
1162
- append_char_to_temporary_buffer(parser, c);
1163
1108
  }
1164
- return NEXT_CHAR;
1109
+ return CONTINUE;
1165
1110
  }
1166
1111
  }
1167
1112
 
@@ -1179,27 +1124,26 @@ static StateResult handle_tag_name_state (
1179
1124
  case ' ':
1180
1125
  finish_tag_name(parser);
1181
1126
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1182
- return NEXT_CHAR;
1127
+ return CONTINUE;
1183
1128
  case '/':
1184
1129
  finish_tag_name(parser);
1185
1130
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1186
- return NEXT_CHAR;
1131
+ return CONTINUE;
1187
1132
  case '>':
1188
1133
  finish_tag_name(parser);
1189
1134
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1190
1135
  return emit_current_tag(parser, output);
1191
1136
  case '\0':
1192
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1137
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1193
1138
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1194
- return NEXT_CHAR;
1139
+ return CONTINUE;
1195
1140
  case -1:
1196
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1141
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1197
1142
  abandon_current_tag(parser);
1198
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1199
- return NEXT_CHAR;
1143
+ return emit_eof(parser, output);
1200
1144
  default:
1201
1145
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1202
- return NEXT_CHAR;
1146
+ return CONTINUE;
1203
1147
  }
1204
1148
  }
1205
1149
 
@@ -1210,36 +1154,29 @@ static StateResult handle_rcdata_lt_state (
1210
1154
  int c,
1211
1155
  GumboToken* output
1212
1156
  ) {
1213
- assert(temporary_buffer_equals(parser, "<"));
1214
1157
  if (c == '/') {
1215
1158
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1216
- append_char_to_temporary_buffer(parser, '/');
1217
- return NEXT_CHAR;
1159
+ return CONTINUE;
1218
1160
  } else {
1219
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1220
- tokenizer->_reconsume_current_input = true;
1221
- return emit_temporary_buffer(parser, output);
1161
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1162
+ return emit_from_mark(parser, output);
1222
1163
  }
1223
1164
  }
1224
1165
 
1225
1166
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
1167
  static StateResult handle_rcdata_end_tag_open_state (
1227
1168
  GumboParser* parser,
1228
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1169
+ GumboTokenizerState* tokenizer,
1229
1170
  int c,
1230
1171
  GumboToken* output
1231
1172
  ) {
1232
- assert(temporary_buffer_equals(parser, "</"));
1233
1173
  if (is_alpha(c)) {
1234
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1174
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1235
1175
  start_new_tag(parser, false);
1236
- append_char_to_temporary_buffer(parser, c);
1237
- return NEXT_CHAR;
1238
- } else {
1239
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1240
- return emit_temporary_buffer(parser, output);
1176
+ return CONTINUE;
1241
1177
  }
1242
- return true;
1178
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1179
+ return emit_from_mark(parser, output);
1243
1180
  }
1244
1181
 
1245
1182
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
@@ -1250,33 +1187,39 @@ static StateResult handle_rcdata_end_tag_name_state (
1250
1187
  GumboToken* output
1251
1188
  ) {
1252
1189
  UNUSED_IF_NDEBUG(tokenizer);
1253
- assert(tokenizer->_temporary_buffer.length >= 2);
1254
1190
  if (is_alpha(c)) {
1255
1191
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1256
- append_char_to_temporary_buffer(parser, c);
1257
- return NEXT_CHAR;
1258
- } else if (is_appropriate_end_tag(parser)) {
1259
- switch (c) {
1260
- case '\t':
1261
- case '\n':
1262
- case '\f':
1263
- case ' ':
1264
- finish_tag_name(parser);
1265
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1266
- return NEXT_CHAR;
1267
- case '/':
1268
- finish_tag_name(parser);
1269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1270
- return NEXT_CHAR;
1271
- case '>':
1272
- finish_tag_name(parser);
1273
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1274
- return emit_current_tag(parser, output);
1192
+ return CONTINUE;
1193
+ }
1194
+ switch (c) {
1195
+ case '\t':
1196
+ case '\n':
1197
+ case '\f':
1198
+ case ' ':
1199
+ if (is_appropriate_end_tag(parser)) {
1200
+ finish_tag_name(parser);
1201
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1202
+ return CONTINUE;
1203
+ }
1204
+ break;
1205
+ case '/':
1206
+ if (is_appropriate_end_tag(parser)) {
1207
+ finish_tag_name(parser);
1208
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1209
+ return CONTINUE;
1210
+ }
1211
+ break;
1212
+ case '>':
1213
+ if (is_appropriate_end_tag(parser)) {
1214
+ finish_tag_name(parser);
1215
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216
+ return emit_current_tag(parser, output);
1275
1217
  }
1218
+ break;
1276
1219
  }
1277
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1278
1220
  abandon_current_tag(parser);
1279
- return emit_temporary_buffer(parser, output);
1221
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1222
+ return emit_from_mark(parser, output);
1280
1223
  }
1281
1224
 
1282
1225
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
@@ -1286,34 +1229,29 @@ static StateResult handle_rawtext_lt_state (
1286
1229
  int c,
1287
1230
  GumboToken* output
1288
1231
  ) {
1289
- assert(temporary_buffer_equals(parser, "<"));
1290
1232
  if (c == '/') {
1291
1233
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1292
- append_char_to_temporary_buffer(parser, '/');
1293
- return NEXT_CHAR;
1234
+ return CONTINUE;
1294
1235
  } else {
1295
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1296
- tokenizer->_reconsume_current_input = true;
1297
- return emit_temporary_buffer(parser, output);
1236
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1237
+ return emit_from_mark(parser, output);
1298
1238
  }
1299
1239
  }
1300
1240
 
1301
1241
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
1242
  static StateResult handle_rawtext_end_tag_open_state (
1303
1243
  GumboParser* parser,
1304
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1244
+ GumboTokenizerState* tokenizer,
1305
1245
  int c,
1306
1246
  GumboToken* output
1307
1247
  ) {
1308
- assert(temporary_buffer_equals(parser, "</"));
1309
1248
  if (is_alpha(c)) {
1310
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1249
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1311
1250
  start_new_tag(parser, false);
1312
- append_char_to_temporary_buffer(parser, c);
1313
- return NEXT_CHAR;
1251
+ return CONTINUE;
1314
1252
  } else {
1315
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1316
- return emit_temporary_buffer(parser, output);
1253
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1254
+ return emit_from_mark(parser, output);
1317
1255
  }
1318
1256
  }
1319
1257
 
@@ -1324,153 +1262,156 @@ static StateResult handle_rawtext_end_tag_name_state (
1324
1262
  int c,
1325
1263
  GumboToken* output
1326
1264
  ) {
1327
- assert(tokenizer->_temporary_buffer.length >= 2);
1328
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1329
- tokenizer->_tag_state._buffer.data);
1330
1265
  if (is_alpha(c)) {
1331
1266
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1332
- append_char_to_temporary_buffer(parser, c);
1333
- return NEXT_CHAR;
1334
- } else if (is_appropriate_end_tag(parser)) {
1335
- gumbo_debug("Is an appropriate end tag.\n");
1336
- switch (c) {
1337
- case '\t':
1338
- case '\n':
1339
- case '\f':
1340
- case ' ':
1341
- finish_tag_name(parser);
1342
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1343
- return NEXT_CHAR;
1344
- case '/':
1345
- finish_tag_name(parser);
1346
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1347
- return NEXT_CHAR;
1348
- case '>':
1349
- finish_tag_name(parser);
1350
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1351
- return emit_current_tag(parser, output);
1267
+ return CONTINUE;
1268
+ }
1269
+ switch (c) {
1270
+ case '\t':
1271
+ case '\n':
1272
+ case '\f':
1273
+ case ' ':
1274
+ if (is_appropriate_end_tag(parser)) {
1275
+ finish_tag_name(parser);
1276
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1277
+ return CONTINUE;
1278
+ }
1279
+ break;
1280
+ case '/':
1281
+ if (is_appropriate_end_tag(parser)) {
1282
+ finish_tag_name(parser);
1283
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1284
+ return CONTINUE;
1285
+ }
1286
+ break;
1287
+ case '>':
1288
+ if (is_appropriate_end_tag(parser)) {
1289
+ finish_tag_name(parser);
1290
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1291
+ return emit_current_tag(parser, output);
1352
1292
  }
1293
+ break;
1353
1294
  }
1354
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1355
1295
  abandon_current_tag(parser);
1356
- return emit_temporary_buffer(parser, output);
1296
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1297
+ return emit_from_mark(parser, output);
1357
1298
  }
1358
1299
 
1359
1300
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
- static StateResult handle_script_lt_state (
1301
+ static StateResult handle_script_data_lt_state (
1361
1302
  GumboParser* parser,
1362
1303
  GumboTokenizerState* tokenizer,
1363
1304
  int c,
1364
1305
  GumboToken* output
1365
1306
  ) {
1366
- assert(temporary_buffer_equals(parser, "<"));
1367
1307
  if (c == '/') {
1368
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1369
- append_char_to_temporary_buffer(parser, '/');
1370
- return NEXT_CHAR;
1371
- } else if (c == '!') {
1372
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1373
- append_char_to_temporary_buffer(parser, '!');
1374
- return emit_temporary_buffer(parser, output);
1375
- } else {
1376
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1377
- tokenizer->_reconsume_current_input = true;
1378
- return emit_temporary_buffer(parser, output);
1308
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1309
+ return CONTINUE;
1310
+ }
1311
+ if (c == '!') {
1312
+ // This is the only place we don't reconsume the input before emitting the
1313
+ // temporary buffer. Since the current position is stored and the current
1314
+ // character is not emitted, we need to advance the input and then
1315
+ // reconsume.
1316
+ utf8iterator_next(&tokenizer->_input);
1317
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1318
+ return emit_from_mark(parser, output);
1379
1319
  }
1320
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1321
+ return emit_from_mark(parser, output);
1380
1322
  }
1381
1323
 
1382
1324
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
- static StateResult handle_script_end_tag_open_state (
1325
+ static StateResult handle_script_data_end_tag_open_state (
1384
1326
  GumboParser* parser,
1385
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1327
+ GumboTokenizerState* tokenizer,
1386
1328
  int c,
1387
1329
  GumboToken* output
1388
1330
  ) {
1389
- assert(temporary_buffer_equals(parser, "</"));
1390
1331
  if (is_alpha(c)) {
1391
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1332
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1392
1333
  start_new_tag(parser, false);
1393
- append_char_to_temporary_buffer(parser, c);
1394
- return NEXT_CHAR;
1395
- } else {
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
- return emit_temporary_buffer(parser, output);
1334
+ return CONTINUE;
1398
1335
  }
1336
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1337
+ return emit_from_mark(parser, output);
1399
1338
  }
1400
1339
 
1401
1340
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
- static StateResult handle_script_end_tag_name_state (
1341
+ static StateResult handle_script_data_end_tag_name_state (
1403
1342
  GumboParser* parser,
1404
1343
  GumboTokenizerState* tokenizer,
1405
1344
  int c,
1406
1345
  GumboToken* output
1407
1346
  ) {
1408
- UNUSED_IF_NDEBUG(tokenizer);
1409
- assert(tokenizer->_temporary_buffer.length >= 2);
1410
1347
  if (is_alpha(c)) {
1411
1348
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1412
- append_char_to_temporary_buffer(parser, c);
1413
- return NEXT_CHAR;
1414
- } else if (is_appropriate_end_tag(parser)) {
1415
- switch (c) {
1416
- case '\t':
1417
- case '\n':
1418
- case '\f':
1419
- case ' ':
1420
- finish_tag_name(parser);
1421
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1422
- return NEXT_CHAR;
1423
- case '/':
1424
- finish_tag_name(parser);
1425
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1426
- return NEXT_CHAR;
1427
- case '>':
1428
- finish_tag_name(parser);
1429
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1430
- return emit_current_tag(parser, output);
1349
+ return CONTINUE;
1350
+ }
1351
+ switch (c) {
1352
+ case '\t':
1353
+ case '\n':
1354
+ case '\f':
1355
+ case ' ':
1356
+ if (is_appropriate_end_tag(parser)) {
1357
+ finish_tag_name(parser);
1358
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1359
+ return CONTINUE;
1360
+ }
1361
+ break;
1362
+ case '/':
1363
+ if (is_appropriate_end_tag(parser)) {
1364
+ finish_tag_name(parser);
1365
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1366
+ return CONTINUE;
1367
+ }
1368
+ break;
1369
+ case '>':
1370
+ if (is_appropriate_end_tag(parser)) {
1371
+ finish_tag_name(parser);
1372
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1373
+ return emit_current_tag(parser, output);
1431
1374
  }
1375
+ break;
1432
1376
  }
1433
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1434
1377
  abandon_current_tag(parser);
1435
- return emit_temporary_buffer(parser, output);
1378
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1379
+ return emit_from_mark(parser, output);
1436
1380
  }
1437
1381
 
1438
1382
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
- static StateResult handle_script_escaped_start_state (
1383
+ static StateResult handle_script_data_escaped_start_state (
1440
1384
  GumboParser* parser,
1441
1385
  GumboTokenizerState* tokenizer,
1442
1386
  int c,
1443
1387
  GumboToken* output
1444
1388
  ) {
1445
1389
  if (c == '-') {
1446
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1447
- return emit_current_char(parser, output);
1448
- } else {
1449
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1450
- tokenizer->_reconsume_current_input = true;
1451
- return NEXT_CHAR;
1390
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1391
+ return emit_char(parser, c, output);
1452
1392
  }
1393
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1394
+ return CONTINUE;
1453
1395
  }
1454
1396
 
1455
1397
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
- static StateResult handle_script_escaped_start_dash_state (
1398
+ static StateResult handle_script_data_escaped_start_dash_state (
1457
1399
  GumboParser* parser,
1458
1400
  GumboTokenizerState* tokenizer,
1459
1401
  int c,
1460
1402
  GumboToken* output
1461
1403
  ) {
1462
1404
  if (c == '-') {
1463
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1464
- return emit_current_char(parser, output);
1405
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1406
+ return emit_char(parser, c, output);
1465
1407
  } else {
1466
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1467
- tokenizer->_reconsume_current_input = true;
1468
- return NEXT_CHAR;
1408
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1409
+ return CONTINUE;
1469
1410
  }
1470
1411
  }
1471
1412
 
1472
1413
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
- static StateResult handle_script_escaped_state (
1414
+ static StateResult handle_script_data_escaped_state (
1474
1415
  GumboParser* parser,
1475
1416
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
1417
  int c,
@@ -1478,25 +1419,25 @@ static StateResult handle_script_escaped_state (
1478
1419
  ) {
1479
1420
  switch (c) {
1480
1421
  case '-':
1481
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1482
- return emit_current_char(parser, output);
1422
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1423
+ return emit_char(parser, c, output);
1483
1424
  case '<':
1484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1425
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1485
1426
  clear_temporary_buffer(parser);
1486
- append_char_to_temporary_buffer(parser, c);
1487
- return NEXT_CHAR;
1427
+ set_mark(parser);
1428
+ return CONTINUE;
1488
1429
  case '\0':
1489
1430
  return emit_replacement_char(parser, output);
1490
1431
  case -1:
1491
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1432
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1492
1433
  return emit_eof(parser, output);
1493
1434
  default:
1494
- return emit_current_char(parser, output);
1435
+ return emit_char(parser, c, output);
1495
1436
  }
1496
1437
  }
1497
1438
 
1498
1439
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
- static StateResult handle_script_escaped_dash_state (
1440
+ static StateResult handle_script_data_escaped_dash_state (
1500
1441
  GumboParser* parser,
1501
1442
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
1443
  int c,
@@ -1504,28 +1445,27 @@ static StateResult handle_script_escaped_dash_state (
1504
1445
  ) {
1505
1446
  switch (c) {
1506
1447
  case '-':
1507
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1508
- return emit_current_char(parser, output);
1448
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1449
+ return emit_char(parser, c, output);
1509
1450
  case '<':
1510
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1451
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1511
1452
  clear_temporary_buffer(parser);
1512
- append_char_to_temporary_buffer(parser, c);
1513
- return NEXT_CHAR;
1453
+ set_mark(parser);
1454
+ return CONTINUE;
1514
1455
  case '\0':
1515
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1456
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1516
1457
  return emit_replacement_char(parser, output);
1517
1458
  case -1:
1518
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1519
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1520
- return NEXT_CHAR;
1459
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1460
+ return emit_eof(parser, output);
1521
1461
  default:
1522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1523
- return emit_current_char(parser, output);
1462
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1463
+ return emit_char(parser, c, output);
1524
1464
  }
1525
1465
  }
1526
1466
 
1527
1467
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
- static StateResult handle_script_escaped_dash_dash_state (
1468
+ static StateResult handle_script_data_escaped_dash_dash_state (
1529
1469
  GumboParser* parser,
1530
1470
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
1471
  int c,
@@ -1533,113 +1473,107 @@ static StateResult handle_script_escaped_dash_dash_state (
1533
1473
  ) {
1534
1474
  switch (c) {
1535
1475
  case '-':
1536
- return emit_current_char(parser, output);
1476
+ return emit_char(parser, c, output);
1537
1477
  case '<':
1538
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1478
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1539
1479
  clear_temporary_buffer(parser);
1540
- append_char_to_temporary_buffer(parser, c);
1541
- return NEXT_CHAR;
1480
+ set_mark(parser);
1481
+ return CONTINUE;
1542
1482
  case '>':
1543
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
- return emit_current_char(parser, output);
1483
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1484
+ return emit_char(parser, c, output);
1545
1485
  case '\0':
1546
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1486
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1547
1487
  return emit_replacement_char(parser, output);
1548
1488
  case -1:
1549
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
- return NEXT_CHAR;
1489
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1490
+ return emit_eof(parser, output);
1552
1491
  default:
1553
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1554
- return emit_current_char(parser, output);
1492
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1493
+ return emit_char(parser, c, output);
1555
1494
  }
1556
1495
  }
1557
1496
 
1558
1497
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
- static StateResult handle_script_escaped_lt_state (
1498
+ static StateResult handle_script_data_escaped_lt_state (
1560
1499
  GumboParser* parser,
1561
1500
  GumboTokenizerState* tokenizer,
1562
1501
  int c,
1563
1502
  GumboToken* output
1564
1503
  ) {
1565
- assert(temporary_buffer_equals(parser, "<"));
1566
- assert(!tokenizer->_script_data_buffer.length);
1504
+ assert(temporary_buffer_is_empty(parser));
1567
1505
  if (c == '/') {
1568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1569
- append_char_to_temporary_buffer(parser, c);
1570
- return NEXT_CHAR;
1571
- } else if (is_alpha(c)) {
1572
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1573
- append_char_to_temporary_buffer(parser, c);
1574
- gumbo_string_buffer_append_codepoint (
1575
- ensure_lowercase(c),
1576
- &tokenizer->_script_data_buffer
1577
- );
1578
- return emit_temporary_buffer(parser, output);
1579
- } else {
1580
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1581
- return emit_temporary_buffer(parser, output);
1506
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1507
+ return CONTINUE;
1508
+ }
1509
+ if (is_alpha(c)) {
1510
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1511
+ return emit_from_mark(parser, output);
1582
1512
  }
1513
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1514
+ return emit_from_mark(parser, output);
1583
1515
  }
1584
1516
 
1585
1517
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
- static StateResult handle_script_escaped_end_tag_open_state (
1518
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1587
1519
  GumboParser* parser,
1588
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1520
+ GumboTokenizerState* tokenizer,
1589
1521
  int c,
1590
1522
  GumboToken* output
1591
1523
  ) {
1592
- assert(temporary_buffer_equals(parser, "</"));
1593
1524
  if (is_alpha(c)) {
1594
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1525
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1595
1526
  start_new_tag(parser, false);
1596
- append_char_to_temporary_buffer(parser, c);
1597
- return NEXT_CHAR;
1598
- } else {
1599
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1600
- return emit_temporary_buffer(parser, output);
1527
+ return CONTINUE;
1601
1528
  }
1529
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1530
+ return emit_from_mark(parser, output);
1602
1531
  }
1603
1532
 
1604
1533
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
- static StateResult handle_script_escaped_end_tag_name_state (
1534
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1606
1535
  GumboParser* parser,
1607
1536
  GumboTokenizerState* tokenizer,
1608
1537
  int c,
1609
1538
  GumboToken* output
1610
1539
  ) {
1611
- UNUSED_IF_NDEBUG(tokenizer);
1612
- assert(tokenizer->_temporary_buffer.length >= 2);
1613
1540
  if (is_alpha(c)) {
1614
1541
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1615
- append_char_to_temporary_buffer(parser, c);
1616
- return NEXT_CHAR;
1617
- } else if (is_appropriate_end_tag(parser)) {
1618
- switch (c) {
1619
- case '\t':
1620
- case '\n':
1621
- case '\f':
1622
- case ' ':
1623
- finish_tag_name(parser);
1624
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1625
- return NEXT_CHAR;
1626
- case '/':
1627
- finish_tag_name(parser);
1628
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1629
- return NEXT_CHAR;
1630
- case '>':
1631
- finish_tag_name(parser);
1632
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1633
- return emit_current_tag(parser, output);
1542
+ return CONTINUE;
1543
+ }
1544
+ switch (c) {
1545
+ case '\t':
1546
+ case '\n':
1547
+ case '\f':
1548
+ case ' ':
1549
+ if (is_appropriate_end_tag(parser)) {
1550
+ finish_tag_name(parser);
1551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1552
+ return CONTINUE;
1553
+ }
1554
+ break;
1555
+ case '/':
1556
+ if (is_appropriate_end_tag(parser)) {
1557
+ finish_tag_name(parser);
1558
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1559
+ return CONTINUE;
1560
+ }
1561
+ break;
1562
+ case '>':
1563
+ if (is_appropriate_end_tag(parser)) {
1564
+ finish_tag_name(parser);
1565
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1566
+ return emit_current_tag(parser, output);
1634
1567
  }
1568
+ break;
1635
1569
  }
1636
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1637
1570
  abandon_current_tag(parser);
1638
- return emit_temporary_buffer(parser, output);
1571
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1572
+ return emit_from_mark(parser, output);
1639
1573
  }
1640
1574
 
1641
1575
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
- static StateResult handle_script_double_escaped_start_state (
1576
+ static StateResult handle_script_data_double_escaped_start_state (
1643
1577
  GumboParser* parser,
1644
1578
  GumboTokenizerState* tokenizer,
1645
1579
  int c,
@@ -1656,29 +1590,23 @@ static StateResult handle_script_double_escaped_start_state (
1656
1590
  parser,
1657
1591
  gumbo_string_equals (
1658
1592
  &kScriptTag,
1659
- (GumboStringPiece*) &tokenizer->_script_data_buffer
1593
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1660
1594
  )
1661
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
- : GUMBO_LEX_SCRIPT_ESCAPED
1595
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1596
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1663
1597
  );
1664
- return emit_current_char(parser, output);
1665
- default:
1666
- if (is_alpha(c)) {
1667
- gumbo_string_buffer_append_codepoint (
1668
- ensure_lowercase(c),
1669
- &tokenizer->_script_data_buffer
1670
- );
1671
- return emit_current_char(parser, output);
1672
- } else {
1673
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1674
- tokenizer->_reconsume_current_input = true;
1675
- return NEXT_CHAR;
1676
- }
1598
+ return emit_char(parser, c, output);
1599
+ }
1600
+ if (is_alpha(c)) {
1601
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1602
+ return emit_char(parser, c, output);
1677
1603
  }
1604
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1605
+ return CONTINUE;
1678
1606
  }
1679
1607
 
1680
1608
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
- static StateResult handle_script_double_escaped_state (
1609
+ static StateResult handle_script_data_double_escaped_state (
1682
1610
  GumboParser* parser,
1683
1611
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
1612
  int c,
@@ -1686,24 +1614,23 @@ static StateResult handle_script_double_escaped_state (
1686
1614
  ) {
1687
1615
  switch (c) {
1688
1616
  case '-':
1689
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1690
- return emit_current_char(parser, output);
1617
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1618
+ return emit_char(parser, c, output);
1691
1619
  case '<':
1692
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1693
- return emit_current_char(parser, output);
1620
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1621
+ return emit_char(parser, c, output);
1694
1622
  case '\0':
1695
1623
  return emit_replacement_char(parser, output);
1696
1624
  case -1:
1697
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1698
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
- return NEXT_CHAR;
1625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1626
+ return emit_eof(parser, output);
1700
1627
  default:
1701
- return emit_current_char(parser, output);
1628
+ return emit_char(parser, c, output);
1702
1629
  }
1703
1630
  }
1704
1631
 
1705
1632
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
- static StateResult handle_script_double_escaped_dash_state (
1633
+ static StateResult handle_script_data_double_escaped_dash_state (
1707
1634
  GumboParser* parser,
1708
1635
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
1636
  int c,
@@ -1712,26 +1639,25 @@ static StateResult handle_script_double_escaped_dash_state (
1712
1639
  switch (c) {
1713
1640
  case '-':
1714
1641
  gumbo_tokenizer_set_state(
1715
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1716
- return emit_current_char(parser, output);
1642
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1643
+ return emit_char(parser, c, output);
1717
1644
  case '<':
1718
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1719
- return emit_current_char(parser, output);
1645
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1646
+ return emit_char(parser, c, output);
1720
1647
  case '\0':
1721
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1722
1649
  return emit_replacement_char(parser, output);
1723
1650
  case -1:
1724
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1726
- return NEXT_CHAR;
1651
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1652
+ return emit_eof(parser, output);
1727
1653
  default:
1728
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1729
- return emit_current_char(parser, output);
1654
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1655
+ return emit_char(parser, c, output);
1730
1656
  }
1731
1657
  }
1732
1658
 
1733
1659
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
- static StateResult handle_script_double_escaped_dash_dash_state (
1660
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1735
1661
  GumboParser* parser,
1736
1662
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
1663
  int c,
@@ -1739,46 +1665,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
1739
1665
  ) {
1740
1666
  switch (c) {
1741
1667
  case '-':
1742
- return emit_current_char(parser, output);
1668
+ return emit_char(parser, c, output);
1743
1669
  case '<':
1744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1745
- return emit_current_char(parser, output);
1670
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1671
+ return emit_char(parser, c, output);
1746
1672
  case '>':
1747
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1748
- return emit_current_char(parser, output);
1673
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1674
+ return emit_char(parser, c, output);
1749
1675
  case '\0':
1750
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1676
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1751
1677
  return emit_replacement_char(parser, output);
1752
1678
  case -1:
1753
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1754
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1755
- return NEXT_CHAR;
1679
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1680
+ return emit_eof(parser, output);
1756
1681
  default:
1757
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1758
- return emit_current_char(parser, output);
1682
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1683
+ return emit_char(parser, c, output);
1759
1684
  }
1760
1685
  }
1761
1686
 
1762
1687
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
- static StateResult handle_script_double_escaped_lt_state (
1688
+ static StateResult handle_script_data_double_escaped_lt_state (
1764
1689
  GumboParser* parser,
1765
1690
  GumboTokenizerState* tokenizer,
1766
1691
  int c,
1767
1692
  GumboToken* output
1768
1693
  ) {
1769
1694
  if (c == '/') {
1770
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1771
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1772
- return emit_current_char(parser, output);
1695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1696
+ clear_temporary_buffer(parser);
1697
+ return emit_char(parser, c, output);
1773
1698
  } else {
1774
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1775
- tokenizer->_reconsume_current_input = true;
1776
- return NEXT_CHAR;
1699
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1700
+ return CONTINUE;
1777
1701
  }
1778
1702
  }
1779
1703
 
1780
1704
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
- static StateResult handle_script_double_escaped_end_state (
1705
+ static StateResult handle_script_data_double_escaped_end_state (
1782
1706
  GumboParser* parser,
1783
1707
  GumboTokenizerState* tokenizer,
1784
1708
  int c,
@@ -1793,29 +1717,23 @@ static StateResult handle_script_double_escaped_end_state (
1793
1717
  case '>':
1794
1718
  gumbo_tokenizer_set_state(
1795
1719
  parser, gumbo_string_equals(&kScriptTag,
1796
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1797
- ? GUMBO_LEX_SCRIPT_ESCAPED
1798
- : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1799
- return emit_current_char(parser, output);
1800
- default:
1801
- if (is_alpha(c)) {
1802
- gumbo_string_buffer_append_codepoint (
1803
- ensure_lowercase(c),
1804
- &tokenizer->_script_data_buffer
1805
- );
1806
- return emit_current_char(parser, output);
1807
- } else {
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1809
- tokenizer->_reconsume_current_input = true;
1810
- return NEXT_CHAR;
1811
- }
1720
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1721
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1722
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1723
+ return emit_char(parser, c, output);
1724
+ }
1725
+ if (is_alpha(c)) {
1726
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1727
+ return emit_char(parser, c, output);
1812
1728
  }
1729
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1730
+ return CONTINUE;
1813
1731
  }
1814
1732
 
1815
1733
  // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
1734
  static StateResult handle_before_attr_name_state (
1817
1735
  GumboParser* parser,
1818
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1736
+ GumboTokenizerState* tokenizer,
1819
1737
  int c,
1820
1738
  GumboToken* output
1821
1739
  ) {
@@ -1824,40 +1742,27 @@ static StateResult handle_before_attr_name_state (
1824
1742
  case '\n':
1825
1743
  case '\f':
1826
1744
  case ' ':
1827
- return NEXT_CHAR;
1745
+ return CONTINUE;
1828
1746
  case '/':
1829
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1830
- return NEXT_CHAR;
1831
1747
  case '>':
1832
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1833
- return emit_current_tag(parser, output);
1834
- case '\0':
1835
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1836
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1837
- append_char_to_temporary_buffer(parser, 0xfffd);
1838
- return NEXT_CHAR;
1839
1748
  case -1:
1840
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1841
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1842
- abandon_current_tag(parser);
1843
- return NEXT_CHAR;
1844
- case '"':
1845
- case '\'':
1846
- case '<':
1749
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1750
+ return CONTINUE;
1847
1751
  case '=':
1848
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1849
- // Fall through.
1850
- default:
1752
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1851
1753
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1852
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1853
- return NEXT_CHAR;
1754
+ append_char_to_tag_buffer(parser, c, true);
1755
+ return CONTINUE;
1756
+ default:
1757
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1758
+ return CONTINUE;
1854
1759
  }
1855
1760
  }
1856
1761
 
1857
1762
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
1763
  static StateResult handle_attr_name_state (
1859
1764
  GumboParser* parser,
1860
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1765
+ GumboTokenizerState* tokenizer,
1861
1766
  int c,
1862
1767
  GumboToken* output
1863
1768
  ) {
@@ -1866,45 +1771,35 @@ static StateResult handle_attr_name_state (
1866
1771
  case '\n':
1867
1772
  case '\f':
1868
1773
  case ' ':
1869
- finish_attribute_name(parser);
1870
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1871
- return NEXT_CHAR;
1872
1774
  case '/':
1775
+ case '>':
1776
+ case -1:
1873
1777
  finish_attribute_name(parser);
1874
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1875
- return NEXT_CHAR;
1778
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1779
+ return CONTINUE;
1876
1780
  case '=':
1877
1781
  finish_attribute_name(parser);
1878
1782
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1879
- return NEXT_CHAR;
1880
- case '>':
1881
- finish_attribute_name(parser);
1882
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
- return emit_current_tag(parser, output);
1783
+ return CONTINUE;
1884
1784
  case '\0':
1885
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1886
1786
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1887
- return NEXT_CHAR;
1888
- case -1:
1889
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
- abandon_current_tag(parser);
1891
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1892
- return NEXT_CHAR;
1787
+ return CONTINUE;
1893
1788
  case '"':
1894
1789
  case '\'':
1895
1790
  case '<':
1896
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1791
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1897
1792
  // Fall through.
1898
1793
  default:
1899
1794
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1900
- return NEXT_CHAR;
1795
+ return CONTINUE;
1901
1796
  }
1902
1797
  }
1903
1798
 
1904
1799
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
1800
  static StateResult handle_after_attr_name_state (
1906
1801
  GumboParser* parser,
1907
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1802
+ GumboTokenizerState* tokenizer,
1908
1803
  int c,
1909
1804
  GumboToken* output
1910
1805
  ) {
@@ -1913,35 +1808,23 @@ static StateResult handle_after_attr_name_state (
1913
1808
  case '\n':
1914
1809
  case '\f':
1915
1810
  case ' ':
1916
- return NEXT_CHAR;
1811
+ return CONTINUE;
1917
1812
  case '/':
1918
1813
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1919
- return NEXT_CHAR;
1814
+ return CONTINUE;
1920
1815
  case '=':
1921
1816
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1922
- return NEXT_CHAR;
1817
+ return CONTINUE;
1923
1818
  case '>':
1924
1819
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1925
1820
  return emit_current_tag(parser, output);
1926
- case '\0':
1927
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1928
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1929
- append_char_to_temporary_buffer(parser, 0xfffd);
1930
- return NEXT_CHAR;
1931
1821
  case -1:
1932
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1822
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1934
1823
  abandon_current_tag(parser);
1935
- return NEXT_CHAR;
1936
- case '"':
1937
- case '\'':
1938
- case '<':
1939
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1940
- // Fall through.
1824
+ return emit_eof(parser, output);
1941
1825
  default:
1942
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1943
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1944
- return NEXT_CHAR;
1826
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1827
+ return CONTINUE;
1945
1828
  }
1946
1829
  }
1947
1830
 
@@ -1957,45 +1840,22 @@ static StateResult handle_before_attr_value_state (
1957
1840
  case '\n':
1958
1841
  case '\f':
1959
1842
  case ' ':
1960
- return NEXT_CHAR;
1843
+ return CONTINUE;
1961
1844
  case '"':
1962
1845
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1963
1846
  reset_tag_buffer_start_point(parser);
1964
- return NEXT_CHAR;
1965
- case '&':
1966
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1967
- tokenizer->_reconsume_current_input = true;
1968
- return NEXT_CHAR;
1847
+ return CONTINUE;
1969
1848
  case '\'':
1970
1849
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1971
1850
  reset_tag_buffer_start_point(parser);
1972
- return NEXT_CHAR;
1973
- case '\0':
1974
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1975
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1976
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1977
- return NEXT_CHAR;
1978
- case -1:
1979
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1980
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1981
- abandon_current_tag(parser);
1982
- tokenizer->_reconsume_current_input = true;
1983
- return NEXT_CHAR;
1851
+ return CONTINUE;
1984
1852
  case '>':
1985
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1853
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1986
1854
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1987
- emit_current_tag(parser, output);
1988
- return RETURN_ERROR;
1989
- case '<':
1990
- case '=':
1991
- case '`':
1992
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1993
- // Fall through.
1994
- default:
1995
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1996
- append_char_to_tag_buffer(parser, c, true);
1997
- return NEXT_CHAR;
1855
+ return emit_current_tag(parser, output);
1998
1856
  }
1857
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1858
+ return CONTINUE;
1999
1859
  }
2000
1860
 
2001
1861
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
@@ -2003,30 +1863,28 @@ static StateResult handle_attr_value_double_quoted_state (
2003
1863
  GumboParser* parser,
2004
1864
  GumboTokenizerState* tokenizer,
2005
1865
  int c,
2006
- GumboToken* UNUSED_ARG(output)
1866
+ GumboToken* output
2007
1867
  ) {
2008
1868
  switch (c) {
2009
1869
  case '"':
2010
1870
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2011
- return NEXT_CHAR;
1871
+ return CONTINUE;
2012
1872
  case '&':
2013
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2014
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2015
- tokenizer->_reconsume_current_input = true;
2016
- return NEXT_CHAR;
1873
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1874
+ set_mark(parser);
1875
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1876
+ return CONTINUE;
2017
1877
  case '\0':
2018
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1878
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2019
1879
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2020
- return NEXT_CHAR;
1880
+ return CONTINUE;
2021
1881
  case -1:
2022
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1882
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2024
1883
  abandon_current_tag(parser);
2025
- tokenizer->_reconsume_current_input = true;
2026
- return NEXT_CHAR;
1884
+ return emit_eof(parser, output);
2027
1885
  default:
2028
1886
  append_char_to_tag_buffer(parser, c, false);
2029
- return NEXT_CHAR;
1887
+ return CONTINUE;
2030
1888
  }
2031
1889
  }
2032
1890
 
@@ -2035,30 +1893,28 @@ static StateResult handle_attr_value_single_quoted_state (
2035
1893
  GumboParser* parser,
2036
1894
  GumboTokenizerState* tokenizer,
2037
1895
  int c,
2038
- GumboToken* UNUSED_ARG(output)
1896
+ GumboToken* output
2039
1897
  ) {
2040
1898
  switch (c) {
2041
1899
  case '\'':
2042
1900
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2043
- return NEXT_CHAR;
1901
+ return CONTINUE;
2044
1902
  case '&':
2045
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2046
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2047
- tokenizer->_reconsume_current_input = true;
2048
- return NEXT_CHAR;
1903
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1904
+ set_mark(parser);
1905
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1906
+ return CONTINUE;
2049
1907
  case '\0':
2050
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2051
1909
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2052
- return NEXT_CHAR;
1910
+ return CONTINUE;
2053
1911
  case -1:
2054
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
2055
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1912
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2056
1913
  abandon_current_tag(parser);
2057
- tokenizer->_reconsume_current_input = true;
2058
- return NEXT_CHAR;
1914
+ return emit_eof(parser, output);
2059
1915
  default:
2060
1916
  append_char_to_tag_buffer(parser, c, false);
2061
- return NEXT_CHAR;
1917
+ return CONTINUE;
2062
1918
  }
2063
1919
  }
2064
1920
 
@@ -2076,89 +1932,35 @@ static StateResult handle_attr_value_unquoted_state (
2076
1932
  case ' ':
2077
1933
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2078
1934
  finish_attribute_value(parser);
2079
- return NEXT_CHAR;
1935
+ return CONTINUE;
2080
1936
  case '&':
2081
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2082
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2083
- tokenizer->_reconsume_current_input = true;
2084
- return NEXT_CHAR;
1937
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1938
+ set_mark(parser);
1939
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1940
+ return CONTINUE;
2085
1941
  case '>':
2086
1942
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2087
1943
  finish_attribute_value(parser);
2088
1944
  return emit_current_tag(parser, output);
2089
1945
  case '\0':
2090
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1946
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2091
1947
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
2092
- return NEXT_CHAR;
1948
+ return CONTINUE;
2093
1949
  case -1:
2094
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
2095
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096
- tokenizer->_reconsume_current_input = true;
1950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2097
1951
  abandon_current_tag(parser);
2098
- return NEXT_CHAR;
2099
- case '<':
2100
- case '=':
1952
+ return emit_eof(parser, output);
2101
1953
  case '"':
2102
1954
  case '\'':
1955
+ case '<':
1956
+ case '=':
2103
1957
  case '`':
2104
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1958
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
2105
1959
  // Fall through.
2106
1960
  default:
2107
1961
  append_char_to_tag_buffer(parser, c, true);
2108
- return NEXT_CHAR;
2109
- }
2110
- }
2111
-
2112
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
- static StateResult handle_char_ref_in_attr_value_state (
2114
- GumboParser* parser,
2115
- GumboTokenizerState* tokenizer,
2116
- int UNUSED_ARG(c),
2117
- GumboToken* UNUSED_ARG(output)
2118
- ) {
2119
- OneOrTwoCodepoints char_ref;
2120
- int allowed_char;
2121
- bool is_unquoted = false;
2122
- switch (tokenizer->_tag_state._attr_value_state) {
2123
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
2124
- allowed_char = '"';
2125
- break;
2126
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
2127
- allowed_char = '\'';
2128
- break;
2129
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
2130
- allowed_char = '>';
2131
- is_unquoted = true;
2132
- break;
2133
- default:
2134
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
2135
- // get that the assert(0) means this codepath will never happen.
2136
- allowed_char = ' ';
2137
- assert(0);
1962
+ return CONTINUE;
2138
1963
  }
2139
-
2140
- // Ignore the status, since we don't have a convenient way of signalling that
2141
- // a parser error has occurred when the error occurs in the middle of a
2142
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2143
- // but that's a low priority fix.
2144
- gumbo_consume_char_ref (
2145
- parser,
2146
- &tokenizer->_input,
2147
- allowed_char,
2148
- true,
2149
- &char_ref
2150
- );
2151
- if (char_ref.first != kGumboNoChar) {
2152
- tokenizer->_reconsume_current_input = true;
2153
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
2154
- if (char_ref.second != kGumboNoChar) {
2155
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
2156
- }
2157
- } else {
2158
- append_char_to_tag_buffer(parser, '&', is_unquoted);
2159
- }
2160
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
2161
- return NEXT_CHAR;
2162
1964
  }
2163
1965
 
2164
1966
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
@@ -2175,24 +1977,21 @@ static StateResult handle_after_attr_value_quoted_state (
2175
1977
  case '\f':
2176
1978
  case ' ':
2177
1979
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2178
- return NEXT_CHAR;
1980
+ return CONTINUE;
2179
1981
  case '/':
2180
1982
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
2181
- return NEXT_CHAR;
1983
+ return CONTINUE;
2182
1984
  case '>':
2183
1985
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184
1986
  return emit_current_tag(parser, output);
2185
1987
  case -1:
2186
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
2187
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1988
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2188
1989
  abandon_current_tag(parser);
2189
- tokenizer->_reconsume_current_input = true;
2190
- return NEXT_CHAR;
1990
+ return emit_eof(parser, output);
2191
1991
  default:
2192
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
2193
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2194
- tokenizer->_reconsume_current_input = true;
2195
- return NEXT_CHAR;
1992
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1993
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1994
+ return CONTINUE;
2196
1995
  }
2197
1996
  }
2198
1997
 
@@ -2209,15 +2008,13 @@ static StateResult handle_self_closing_start_tag_state (
2209
2008
  tokenizer->_tag_state._is_self_closing = true;
2210
2009
  return emit_current_tag(parser, output);
2211
2010
  case -1:
2212
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
2213
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2011
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2214
2012
  abandon_current_tag(parser);
2215
- return NEXT_CHAR;
2013
+ return emit_eof(parser, output);
2216
2014
  default:
2217
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
2218
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2219
- tokenizer->_reconsume_current_input = true;
2220
- return NEXT_CHAR;
2015
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2016
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2017
+ return CONTINUE;
2221
2018
  }
2222
2019
  }
2223
2020
 
@@ -2228,21 +2025,27 @@ static StateResult handle_bogus_comment_state (
2228
2025
  int c,
2229
2026
  GumboToken* output
2230
2027
  ) {
2231
- while (c != '>' && c != -1) {
2232
- if (c == '\0') {
2233
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
- c = 0xFFFD;
2235
- }
2028
+ switch (c) {
2029
+ case '>':
2030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2031
+ return emit_comment(parser, output);
2032
+ case -1:
2033
+ // We need to emit the comment and then the EOF, so reconsume in data
2034
+ // state.
2035
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2036
+ return emit_comment(parser, output);
2037
+ case '\0':
2038
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2039
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2040
+ return CONTINUE;
2041
+ default:
2236
2042
  append_char_to_temporary_buffer(parser, c);
2237
- utf8iterator_next(&tokenizer->_input);
2238
- c = utf8iterator_current(&tokenizer->_input);
2043
+ return CONTINUE;
2239
2044
  }
2240
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
- return emit_comment(parser, output);
2242
2045
  }
2243
2046
 
2244
2047
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
- static StateResult handle_markup_declaration_state (
2048
+ static StateResult handle_markup_declaration_open_state (
2246
2049
  GumboParser* parser,
2247
2050
  GumboTokenizerState* tokenizer,
2248
2051
  int UNUSED_ARG(c),
@@ -2253,21 +2056,21 @@ static StateResult handle_markup_declaration_state (
2253
2056
  &tokenizer->_input,
2254
2057
  "--",
2255
2058
  sizeof("--") - 1,
2256
- true
2059
+ /* case sensitive */ true
2257
2060
  )
2258
2061
  ) {
2259
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2260
- tokenizer->_reconsume_current_input = true;
2261
- } else if (
2062
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2063
+ return CONTINUE;
2064
+ }
2065
+ if (
2262
2066
  utf8iterator_maybe_consume_match (
2263
2067
  &tokenizer->_input,
2264
2068
  "DOCTYPE",
2265
2069
  sizeof("DOCTYPE") - 1,
2266
- false
2070
+ /* case sensitive */ false
2267
2071
  )
2268
2072
  ) {
2269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2270
- tokenizer->_reconsume_current_input = true;
2073
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2271
2074
  // If we get here, we know we'll eventually emit a doctype token, so now is
2272
2075
  // the time to initialize the doctype strings. (Not in doctype_state_init,
2273
2076
  // since then they'll leak if ownership never gets transferred to the
@@ -2275,24 +2078,35 @@ static StateResult handle_markup_declaration_state (
2275
2078
  tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
2079
  tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
2080
  tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
- } else if (
2279
- tokenizer->_is_current_node_foreign
2280
- && utf8iterator_maybe_consume_match (
2081
+ return CONTINUE;
2082
+ }
2083
+ if (
2084
+ utf8iterator_maybe_consume_match (
2281
2085
  &tokenizer->_input,
2282
2086
  "[CDATA[", sizeof("[CDATA[") - 1,
2283
- true
2087
+ /* case sensitive */ true
2284
2088
  )
2285
2089
  ) {
2286
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2287
- tokenizer->_is_in_cdata = true;
2288
- tokenizer->_reconsume_current_input = true;
2289
- } else {
2290
- tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2291
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2292
- tokenizer->_reconsume_current_input = true;
2293
- clear_temporary_buffer(parser);
2090
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2091
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2092
+ tokenizer->_is_in_cdata = true;
2093
+ // Start the token after the <![CDATA[.
2094
+ reset_token_start_point(tokenizer);
2095
+ } else {
2096
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2097
+ clear_temporary_buffer(parser);
2098
+ append_string_to_temporary_buffer (
2099
+ parser,
2100
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2101
+ );
2102
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2103
+ }
2104
+ return CONTINUE;
2294
2105
  }
2295
- return NEXT_CHAR;
2106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2108
+ clear_temporary_buffer(parser);
2109
+ return CONTINUE;
2296
2110
  }
2297
2111
 
2298
2112
  // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
@@ -2305,26 +2119,14 @@ static StateResult handle_comment_start_state (
2305
2119
  switch (c) {
2306
2120
  case '-':
2307
2121
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2308
- return NEXT_CHAR;
2309
- case '\0':
2310
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2312
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2313
- return NEXT_CHAR;
2122
+ return CONTINUE;
2314
2123
  case '>':
2315
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2124
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2316
2125
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2317
- emit_comment(parser, output);
2318
- return RETURN_ERROR;
2319
- case -1:
2320
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2321
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2322
- emit_comment(parser, output);
2323
- return RETURN_ERROR;
2126
+ return emit_comment(parser, output);
2324
2127
  default:
2325
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2326
- append_char_to_temporary_buffer(parser, c);
2327
- return NEXT_CHAR;
2128
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2129
+ return CONTINUE;
2328
2130
  }
2329
2131
  }
2330
2132
 
@@ -2338,28 +2140,20 @@ static StateResult handle_comment_start_dash_state (
2338
2140
  switch (c) {
2339
2141
  case '-':
2340
2142
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2341
- return NEXT_CHAR;
2342
- case '\0':
2343
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2344
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2345
- append_char_to_temporary_buffer(parser, '-');
2346
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2347
- return NEXT_CHAR;
2143
+ return CONTINUE;
2348
2144
  case '>':
2349
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2145
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2350
2146
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2351
- emit_comment(parser, output);
2352
- return RETURN_ERROR;
2147
+ return emit_comment(parser, output);
2353
2148
  case -1:
2354
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
- emit_comment(parser, output);
2357
- return RETURN_ERROR;
2149
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2150
+ // Switch to data to emit the EOF next.
2151
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2152
+ return emit_comment(parser, output);
2358
2153
  default:
2359
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2360
2155
  append_char_to_temporary_buffer(parser, '-');
2361
- append_char_to_temporary_buffer(parser, c);
2362
- return NEXT_CHAR;
2156
+ return CONTINUE;
2363
2157
  }
2364
2158
  }
2365
2159
 
@@ -2371,21 +2165,99 @@ static StateResult handle_comment_state (
2371
2165
  GumboToken* output
2372
2166
  ) {
2373
2167
  switch (c) {
2168
+ case '<':
2169
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2170
+ append_char_to_temporary_buffer(parser, c);
2171
+ return CONTINUE;
2374
2172
  case '-':
2375
2173
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2376
- return NEXT_CHAR;
2174
+ return CONTINUE;
2377
2175
  case '\0':
2378
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2379
2177
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2380
- return NEXT_CHAR;
2178
+ return CONTINUE;
2381
2179
  case -1:
2382
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2383
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2384
- emit_comment(parser, output);
2385
- return RETURN_ERROR;
2180
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2181
+ // Switch to data to emit the EOF token next.
2182
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2183
+ return emit_comment(parser, output);
2386
2184
  default:
2387
2185
  append_char_to_temporary_buffer(parser, c);
2388
- return NEXT_CHAR;
2186
+ return CONTINUE;
2187
+ }
2188
+ }
2189
+
2190
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2191
+ static StateResult handle_comment_lt_state (
2192
+ GumboParser* parser,
2193
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2194
+ int c,
2195
+ GumboToken* output
2196
+ ) {
2197
+ switch (c) {
2198
+ case '!':
2199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2200
+ append_char_to_temporary_buffer(parser, c);
2201
+ return CONTINUE;
2202
+ case '<':
2203
+ append_char_to_temporary_buffer(parser, c);
2204
+ return CONTINUE;
2205
+ default:
2206
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2207
+ return CONTINUE;
2208
+ }
2209
+ }
2210
+
2211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2212
+ static StateResult handle_comment_lt_bang_state (
2213
+ GumboParser* parser,
2214
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2215
+ int c,
2216
+ GumboToken* output
2217
+ ) {
2218
+ switch (c) {
2219
+ case '-':
2220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2221
+ return CONTINUE;
2222
+ default:
2223
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2224
+ return CONTINUE;
2225
+ }
2226
+ }
2227
+
2228
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2229
+ static StateResult handle_comment_lt_bang_dash_state (
2230
+ GumboParser* parser,
2231
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2232
+ int c,
2233
+ GumboToken* output
2234
+ ) {
2235
+ switch (c) {
2236
+ case '-':
2237
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2238
+ return CONTINUE;
2239
+ default:
2240
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2241
+ return CONTINUE;
2242
+ }
2243
+ }
2244
+
2245
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2246
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2247
+ GumboParser* parser,
2248
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2249
+ int c,
2250
+ GumboToken* output
2251
+ ) {
2252
+ switch (c) {
2253
+ case '>':
2254
+ case -1:
2255
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2256
+ return CONTINUE;
2257
+ default:
2258
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2259
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2260
+ return CONTINUE;
2389
2261
  }
2390
2262
  }
2391
2263
 
@@ -2397,25 +2269,18 @@ static StateResult handle_comment_end_dash_state (
2397
2269
  GumboToken* output
2398
2270
  ) {
2399
2271
  switch (c) {
2400
- case '-':
2401
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2402
- return NEXT_CHAR;
2403
- case '\0':
2404
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2405
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2406
- append_char_to_temporary_buffer(parser, '-');
2407
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2408
- return NEXT_CHAR;
2409
- case -1:
2410
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2411
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2412
- emit_comment(parser, output);
2413
- return RETURN_ERROR;
2414
- default:
2415
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2416
- append_char_to_temporary_buffer(parser, '-');
2417
- append_char_to_temporary_buffer(parser, c);
2418
- return NEXT_CHAR;
2272
+ case '-':
2273
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2274
+ return CONTINUE;
2275
+ case -1:
2276
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2277
+ // Switch to data to emit EOF next.
2278
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2279
+ return emit_comment(parser, output);
2280
+ default:
2281
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2282
+ append_char_to_temporary_buffer(parser, '-');
2283
+ return CONTINUE;
2419
2284
  }
2420
2285
  }
2421
2286
 
@@ -2430,35 +2295,22 @@ static StateResult handle_comment_end_state (
2430
2295
  case '>':
2431
2296
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2432
2297
  return emit_comment(parser, output);
2433
- case '\0':
2434
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2435
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2436
- append_char_to_temporary_buffer(parser, '-');
2437
- append_char_to_temporary_buffer(parser, '-');
2438
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2439
- return NEXT_CHAR;
2440
2298
  case '!':
2441
- tokenizer_add_parse_error(
2442
- parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2443
2299
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2444
- return NEXT_CHAR;
2300
+ return CONTINUE;
2445
2301
  case '-':
2446
- tokenizer_add_parse_error(
2447
- parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2448
2302
  append_char_to_temporary_buffer(parser, '-');
2449
- return NEXT_CHAR;
2303
+ return CONTINUE;
2450
2304
  case -1:
2451
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2305
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2306
+ // Switch to data to emit EOF next.
2452
2307
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2453
- emit_comment(parser, output);
2454
- return RETURN_ERROR;
2308
+ return emit_comment(parser, output);
2455
2309
  default:
2456
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2457
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2310
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2458
2311
  append_char_to_temporary_buffer(parser, '-');
2459
2312
  append_char_to_temporary_buffer(parser, '-');
2460
- append_char_to_temporary_buffer(parser, c);
2461
- return NEXT_CHAR;
2313
+ return CONTINUE;
2462
2314
  }
2463
2315
  }
2464
2316
 
@@ -2475,30 +2327,22 @@ static StateResult handle_comment_end_bang_state (
2475
2327
  append_char_to_temporary_buffer(parser, '-');
2476
2328
  append_char_to_temporary_buffer(parser, '-');
2477
2329
  append_char_to_temporary_buffer(parser, '!');
2478
- return NEXT_CHAR;
2330
+ return CONTINUE;
2479
2331
  case '>':
2332
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2480
2333
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2481
2334
  return emit_comment(parser, output);
2482
- case '\0':
2483
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2485
- append_char_to_temporary_buffer(parser, '-');
2486
- append_char_to_temporary_buffer(parser, '-');
2487
- append_char_to_temporary_buffer(parser, '!');
2488
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2489
- return NEXT_CHAR;
2490
2335
  case -1:
2491
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2336
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2337
+ // Switch to data to emit EOF next.
2492
2338
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
- emit_comment(parser, output);
2494
- return RETURN_ERROR;
2339
+ return emit_comment(parser, output);
2495
2340
  default:
2496
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2341
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2497
2342
  append_char_to_temporary_buffer(parser, '-');
2498
2343
  append_char_to_temporary_buffer(parser, '-');
2499
2344
  append_char_to_temporary_buffer(parser, '!');
2500
- append_char_to_temporary_buffer(parser, c);
2501
- return NEXT_CHAR;
2345
+ return CONTINUE;
2502
2346
  }
2503
2347
  }
2504
2348
 
@@ -2509,26 +2353,27 @@ static StateResult handle_doctype_state (
2509
2353
  int c,
2510
2354
  GumboToken* output
2511
2355
  ) {
2512
- assert(!tokenizer->_temporary_buffer.length);
2356
+ assert(temporary_buffer_is_empty(parser));
2513
2357
  switch (c) {
2514
2358
  case '\t':
2515
2359
  case '\n':
2516
2360
  case '\f':
2517
2361
  case ' ':
2518
2362
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2519
- return NEXT_CHAR;
2363
+ return CONTINUE;
2364
+ case '>':
2365
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2366
+ return CONTINUE;
2520
2367
  case -1:
2521
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2523
2369
  tokenizer->_doc_type_state.force_quirks = true;
2524
- emit_doctype(parser, output);
2525
- return RETURN_ERROR;
2370
+ // Switch to data to emit EOF next.
2371
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2372
+ return emit_doctype(parser, output);
2526
2373
  default:
2527
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2528
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2529
- tokenizer->_reconsume_current_input = true;
2530
- tokenizer->_doc_type_state.force_quirks = true;
2531
- return NEXT_CHAR;
2374
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2375
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2376
+ return CONTINUE;
2532
2377
  }
2533
2378
  }
2534
2379
 
@@ -2544,30 +2389,27 @@ static StateResult handle_before_doctype_name_state (
2544
2389
  case '\n':
2545
2390
  case '\f':
2546
2391
  case ' ':
2547
- return NEXT_CHAR;
2392
+ return CONTINUE;
2548
2393
  case '\0':
2549
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2394
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2550
2395
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2551
- tokenizer->_doc_type_state.force_quirks = true;
2552
2396
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2553
- return NEXT_CHAR;
2397
+ return CONTINUE;
2554
2398
  case '>':
2555
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2399
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2556
2400
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2557
2401
  tokenizer->_doc_type_state.force_quirks = true;
2558
- emit_doctype(parser, output);
2559
- return RETURN_ERROR;
2402
+ return emit_doctype(parser, output);
2560
2403
  case -1:
2561
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2562
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2404
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2563
2405
  tokenizer->_doc_type_state.force_quirks = true;
2564
- emit_doctype(parser, output);
2565
- return RETURN_ERROR;
2406
+ // Switch to data to emit EOF next.
2407
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2408
+ return emit_doctype(parser, output);
2566
2409
  default:
2567
2410
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2568
- tokenizer->_doc_type_state.force_quirks = false;
2569
2411
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2570
- return NEXT_CHAR;
2412
+ return CONTINUE;
2571
2413
  }
2572
2414
  }
2573
2415
 
@@ -2586,30 +2428,26 @@ static StateResult handle_doctype_name_state (
2586
2428
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2587
2429
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2588
2430
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2589
- return NEXT_CHAR;
2431
+ return CONTINUE;
2590
2432
  case '>':
2591
2433
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
2434
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2593
2435
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2594
- emit_doctype(parser, output);
2595
- return RETURN_SUCCESS;
2436
+ return emit_doctype(parser, output);
2596
2437
  case '\0':
2597
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2438
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2598
2439
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2599
- return NEXT_CHAR;
2440
+ return CONTINUE;
2600
2441
  case -1:
2601
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2602
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2442
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2443
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2603
2444
  tokenizer->_doc_type_state.force_quirks = true;
2604
2445
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2605
2446
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2606
- emit_doctype(parser, output);
2607
- return RETURN_ERROR;
2447
+ return emit_doctype(parser, output);
2608
2448
  default:
2609
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2610
- tokenizer->_doc_type_state.force_quirks = false;
2611
2449
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2612
- return NEXT_CHAR;
2450
+ return CONTINUE;
2613
2451
  }
2614
2452
  }
2615
2453
 
@@ -2625,35 +2463,29 @@ static StateResult handle_after_doctype_name_state (
2625
2463
  case '\n':
2626
2464
  case '\f':
2627
2465
  case ' ':
2628
- return NEXT_CHAR;
2466
+ return CONTINUE;
2629
2467
  case '>':
2630
2468
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2631
- emit_doctype(parser, output);
2632
- return RETURN_SUCCESS;
2469
+ return emit_doctype(parser, output);
2633
2470
  case -1:
2634
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2471
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
2472
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636
2473
  tokenizer->_doc_type_state.force_quirks = true;
2637
- emit_doctype(parser, output);
2638
- return RETURN_ERROR;
2474
+ return emit_doctype(parser, output);
2639
2475
  default:
2640
2476
  if (utf8iterator_maybe_consume_match(
2641
2477
  &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2642
- gumbo_tokenizer_set_state(
2643
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2644
- tokenizer->_reconsume_current_input = true;
2478
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2645
2479
  } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2646
2480
  sizeof("SYSTEM") - 1, false)) {
2647
- gumbo_tokenizer_set_state(
2648
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2649
- tokenizer->_reconsume_current_input = true;
2481
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2650
2482
  } else {
2651
2483
  tokenizer_add_parse_error(
2652
- parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2653
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2484
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2485
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2654
2486
  tokenizer->_doc_type_state.force_quirks = true;
2655
2487
  }
2656
- return NEXT_CHAR;
2488
+ return CONTINUE;
2657
2489
  }
2658
2490
  }
2659
2491
 
@@ -2670,37 +2502,34 @@ static StateResult handle_after_doctype_public_keyword_state (
2670
2502
  case '\f':
2671
2503
  case ' ':
2672
2504
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2673
- return NEXT_CHAR;
2505
+ return CONTINUE;
2674
2506
  case '"':
2675
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2507
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2676
2508
  assert(temporary_buffer_is_empty(parser));
2677
2509
  gumbo_tokenizer_set_state(
2678
2510
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2679
- return NEXT_CHAR;
2511
+ return CONTINUE;
2680
2512
  case '\'':
2681
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2513
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2682
2514
  assert(temporary_buffer_is_empty(parser));
2683
2515
  gumbo_tokenizer_set_state(
2684
2516
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2685
- return NEXT_CHAR;
2517
+ return CONTINUE;
2686
2518
  case '>':
2687
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2519
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2688
2520
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
2521
  tokenizer->_doc_type_state.force_quirks = true;
2690
- emit_doctype(parser, output);
2691
- return RETURN_ERROR;
2522
+ return emit_doctype(parser, output);
2692
2523
  case -1:
2693
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2694
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2524
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2525
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2695
2526
  tokenizer->_doc_type_state.force_quirks = true;
2696
- emit_doctype(parser, output);
2697
- return RETURN_ERROR;
2527
+ return emit_doctype(parser, output);
2698
2528
  default:
2699
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2700
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2529
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2530
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2701
2531
  tokenizer->_doc_type_state.force_quirks = true;
2702
- emit_doctype(parser, output);
2703
- return RETURN_ERROR;
2532
+ return CONTINUE;
2704
2533
  }
2705
2534
  }
2706
2535
 
@@ -2716,35 +2545,32 @@ static StateResult handle_before_doctype_public_id_state (
2716
2545
  case '\n':
2717
2546
  case '\f':
2718
2547
  case ' ':
2719
- return NEXT_CHAR;
2548
+ return CONTINUE;
2720
2549
  case '"':
2721
2550
  assert(temporary_buffer_is_empty(parser));
2722
2551
  gumbo_tokenizer_set_state(
2723
2552
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2724
- return NEXT_CHAR;
2553
+ return CONTINUE;
2725
2554
  case '\'':
2726
2555
  assert(temporary_buffer_is_empty(parser));
2727
2556
  gumbo_tokenizer_set_state(
2728
2557
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2729
- return NEXT_CHAR;
2558
+ return CONTINUE;
2730
2559
  case '>':
2731
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2560
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2732
2561
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2733
2562
  tokenizer->_doc_type_state.force_quirks = true;
2734
- emit_doctype(parser, output);
2735
- return RETURN_ERROR;
2563
+ return emit_doctype(parser, output);
2736
2564
  case -1:
2737
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2738
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2565
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2566
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2739
2567
  tokenizer->_doc_type_state.force_quirks = true;
2740
- emit_doctype(parser, output);
2741
- return RETURN_ERROR;
2568
+ return emit_doctype(parser, output);
2742
2569
  default:
2743
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2570
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2571
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2745
2572
  tokenizer->_doc_type_state.force_quirks = true;
2746
- emit_doctype(parser, output);
2747
- return RETURN_ERROR;
2573
+ return CONTINUE;
2748
2574
  }
2749
2575
  }
2750
2576
 
@@ -2759,28 +2585,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
2759
2585
  case '"':
2760
2586
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2761
2587
  finish_doctype_public_id(parser);
2762
- return NEXT_CHAR;
2588
+ return CONTINUE;
2763
2589
  case '\0':
2764
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2590
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2765
2591
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2766
- return NEXT_CHAR;
2592
+ return CONTINUE;
2767
2593
  case '>':
2768
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2594
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2769
2595
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2770
2596
  tokenizer->_doc_type_state.force_quirks = true;
2771
2597
  finish_doctype_public_id(parser);
2772
- emit_doctype(parser, output);
2773
- return RETURN_ERROR;
2598
+ return emit_doctype(parser, output);
2774
2599
  case -1:
2775
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2600
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2601
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2777
2602
  tokenizer->_doc_type_state.force_quirks = true;
2778
2603
  finish_doctype_public_id(parser);
2779
- emit_doctype(parser, output);
2780
- return RETURN_ERROR;
2604
+ return emit_doctype(parser, output);
2781
2605
  default:
2782
2606
  append_char_to_temporary_buffer(parser, c);
2783
- return NEXT_CHAR;
2607
+ return CONTINUE;
2784
2608
  }
2785
2609
  }
2786
2610
 
@@ -2795,28 +2619,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
2795
2619
  case '\'':
2796
2620
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2797
2621
  finish_doctype_public_id(parser);
2798
- return NEXT_CHAR;
2622
+ return CONTINUE;
2799
2623
  case '\0':
2800
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2624
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2801
2625
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2802
- return NEXT_CHAR;
2626
+ return CONTINUE;
2803
2627
  case '>':
2804
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2628
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2805
2629
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
2630
  tokenizer->_doc_type_state.force_quirks = true;
2807
2631
  finish_doctype_public_id(parser);
2808
- emit_doctype(parser, output);
2809
- return RETURN_ERROR;
2632
+ return emit_doctype(parser, output);
2810
2633
  case -1:
2811
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2812
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2634
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2813
2636
  tokenizer->_doc_type_state.force_quirks = true;
2814
2637
  finish_doctype_public_id(parser);
2815
- emit_doctype(parser, output);
2816
- return RETURN_ERROR;
2638
+ return emit_doctype(parser, output);
2817
2639
  default:
2818
2640
  append_char_to_temporary_buffer(parser, c);
2819
- return NEXT_CHAR;
2641
+ return CONTINUE;
2820
2642
  }
2821
2643
  }
2822
2644
 
@@ -2834,35 +2656,38 @@ static StateResult handle_after_doctype_public_id_state (
2834
2656
  case ' ':
2835
2657
  gumbo_tokenizer_set_state(
2836
2658
  parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2837
- return NEXT_CHAR;
2659
+ return CONTINUE;
2838
2660
  case '>':
2839
2661
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2840
- emit_doctype(parser, output);
2841
- return RETURN_SUCCESS;
2662
+ return emit_doctype(parser, output);
2842
2663
  case '"':
2843
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2664
+ tokenizer_add_parse_error (
2665
+ parser,
2666
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2667
+ );
2844
2668
  assert(temporary_buffer_is_empty(parser));
2845
2669
  gumbo_tokenizer_set_state(
2846
2670
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2847
- return NEXT_CHAR;
2671
+ return CONTINUE;
2848
2672
  case '\'':
2849
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2673
+ tokenizer_add_parse_error (
2674
+ parser,
2675
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2676
+ );
2850
2677
  assert(temporary_buffer_is_empty(parser));
2851
2678
  gumbo_tokenizer_set_state(
2852
2679
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2853
- return NEXT_CHAR;
2680
+ return CONTINUE;
2854
2681
  case -1:
2855
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2856
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2857
- tokenizer->_reconsume_current_input = true;
2682
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2683
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2858
2684
  tokenizer->_doc_type_state.force_quirks = true;
2859
- emit_doctype(parser, output);
2860
- return RETURN_ERROR;
2685
+ return emit_doctype(parser, output);
2861
2686
  default:
2862
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2863
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2687
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2688
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2864
2689
  tokenizer->_doc_type_state.force_quirks = true;
2865
- return NEXT_CHAR;
2690
+ return CONTINUE;
2866
2691
  }
2867
2692
  }
2868
2693
 
@@ -2878,33 +2703,30 @@ static StateResult handle_between_doctype_public_system_id_state (
2878
2703
  case '\n':
2879
2704
  case '\f':
2880
2705
  case ' ':
2881
- return NEXT_CHAR;
2706
+ return CONTINUE;
2882
2707
  case '>':
2883
2708
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2884
- emit_doctype(parser, output);
2885
- return RETURN_SUCCESS;
2709
+ return emit_doctype(parser, output);
2886
2710
  case '"':
2887
2711
  assert(temporary_buffer_is_empty(parser));
2888
2712
  gumbo_tokenizer_set_state(
2889
2713
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2890
- return NEXT_CHAR;
2714
+ return CONTINUE;
2891
2715
  case '\'':
2892
2716
  assert(temporary_buffer_is_empty(parser));
2893
2717
  gumbo_tokenizer_set_state(
2894
2718
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2895
- return NEXT_CHAR;
2719
+ return CONTINUE;
2896
2720
  case -1:
2897
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2898
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2721
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2722
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2899
2723
  tokenizer->_doc_type_state.force_quirks = true;
2900
- emit_doctype(parser, output);
2901
- return RETURN_ERROR;
2724
+ return emit_doctype(parser, output);
2902
2725
  default:
2903
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2904
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2726
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2727
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2905
2728
  tokenizer->_doc_type_state.force_quirks = true;
2906
- emit_doctype(parser, output);
2907
- return RETURN_ERROR;
2729
+ return CONTINUE;
2908
2730
  }
2909
2731
  }
2910
2732
 
@@ -2921,36 +2743,34 @@ static StateResult handle_after_doctype_system_keyword_state (
2921
2743
  case '\f':
2922
2744
  case ' ':
2923
2745
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2924
- return NEXT_CHAR;
2746
+ return CONTINUE;
2925
2747
  case '"':
2926
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2748
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2927
2749
  assert(temporary_buffer_is_empty(parser));
2928
2750
  gumbo_tokenizer_set_state(
2929
2751
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2930
- return NEXT_CHAR;
2752
+ return CONTINUE;
2931
2753
  case '\'':
2932
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2754
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2933
2755
  assert(temporary_buffer_is_empty(parser));
2934
2756
  gumbo_tokenizer_set_state(
2935
2757
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2936
- return NEXT_CHAR;
2758
+ return CONTINUE;
2937
2759
  case '>':
2938
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2760
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2939
2761
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2940
2762
  tokenizer->_doc_type_state.force_quirks = true;
2941
- emit_doctype(parser, output);
2942
- return RETURN_ERROR;
2763
+ return emit_doctype(parser, output);
2943
2764
  case -1:
2944
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2945
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2765
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2766
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2946
2767
  tokenizer->_doc_type_state.force_quirks = true;
2947
- emit_doctype(parser, output);
2948
- return RETURN_ERROR;
2768
+ return emit_doctype(parser, output);
2949
2769
  default:
2950
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2951
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2770
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2771
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2952
2772
  tokenizer->_doc_type_state.force_quirks = true;
2953
- return NEXT_CHAR;
2773
+ return CONTINUE;
2954
2774
  }
2955
2775
  }
2956
2776
 
@@ -2966,34 +2786,32 @@ static StateResult handle_before_doctype_system_id_state (
2966
2786
  case '\n':
2967
2787
  case '\f':
2968
2788
  case ' ':
2969
- return NEXT_CHAR;
2789
+ return CONTINUE;
2970
2790
  case '"':
2971
2791
  assert(temporary_buffer_is_empty(parser));
2972
2792
  gumbo_tokenizer_set_state(
2973
2793
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2974
- return NEXT_CHAR;
2794
+ return CONTINUE;
2975
2795
  case '\'':
2976
2796
  assert(temporary_buffer_is_empty(parser));
2977
2797
  gumbo_tokenizer_set_state(
2978
2798
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2979
- return NEXT_CHAR;
2799
+ return CONTINUE;
2980
2800
  case '>':
2981
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2801
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2982
2802
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2983
2803
  tokenizer->_doc_type_state.force_quirks = true;
2984
- emit_doctype(parser, output);
2985
- return RETURN_ERROR;
2804
+ return emit_doctype(parser, output);
2986
2805
  case -1:
2987
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2988
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2807
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2989
2808
  tokenizer->_doc_type_state.force_quirks = true;
2990
- emit_doctype(parser, output);
2991
- return RETURN_ERROR;
2809
+ return emit_doctype(parser, output);
2992
2810
  default:
2993
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2994
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2811
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2812
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2995
2813
  tokenizer->_doc_type_state.force_quirks = true;
2996
- return NEXT_CHAR;
2814
+ return CONTINUE;
2997
2815
  }
2998
2816
  }
2999
2817
 
@@ -3008,28 +2826,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
3008
2826
  case '"':
3009
2827
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3010
2828
  finish_doctype_system_id(parser);
3011
- return NEXT_CHAR;
2829
+ return CONTINUE;
3012
2830
  case '\0':
3013
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2831
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3014
2832
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3015
- return NEXT_CHAR;
2833
+ return CONTINUE;
3016
2834
  case '>':
3017
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2835
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3018
2836
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3019
2837
  tokenizer->_doc_type_state.force_quirks = true;
3020
2838
  finish_doctype_system_id(parser);
3021
- emit_doctype(parser, output);
3022
- return RETURN_ERROR;
2839
+ return emit_doctype(parser, output);
3023
2840
  case -1:
3024
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3025
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2841
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2842
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3026
2843
  tokenizer->_doc_type_state.force_quirks = true;
3027
2844
  finish_doctype_system_id(parser);
3028
- emit_doctype(parser, output);
3029
- return RETURN_ERROR;
2845
+ return emit_doctype(parser, output);
3030
2846
  default:
3031
2847
  append_char_to_temporary_buffer(parser, c);
3032
- return NEXT_CHAR;
2848
+ return CONTINUE;
3033
2849
  }
3034
2850
  }
3035
2851
 
@@ -3044,28 +2860,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
3044
2860
  case '\'':
3045
2861
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3046
2862
  finish_doctype_system_id(parser);
3047
- return NEXT_CHAR;
2863
+ return CONTINUE;
3048
2864
  case '\0':
3049
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2865
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3050
2866
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3051
- return NEXT_CHAR;
2867
+ return CONTINUE;
3052
2868
  case '>':
3053
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2869
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3054
2870
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3055
2871
  tokenizer->_doc_type_state.force_quirks = true;
3056
2872
  finish_doctype_system_id(parser);
3057
- emit_doctype(parser, output);
3058
- return RETURN_ERROR;
2873
+ return emit_doctype(parser, output);
3059
2874
  case -1:
3060
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3061
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2875
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2876
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3062
2877
  tokenizer->_doc_type_state.force_quirks = true;
3063
2878
  finish_doctype_system_id(parser);
3064
- emit_doctype(parser, output);
3065
- return RETURN_ERROR;
2879
+ return emit_doctype(parser, output);
3066
2880
  default:
3067
2881
  append_char_to_temporary_buffer(parser, c);
3068
- return NEXT_CHAR;
2882
+ return CONTINUE;
3069
2883
  }
3070
2884
  }
3071
2885
 
@@ -3081,21 +2895,19 @@ static StateResult handle_after_doctype_system_id_state (
3081
2895
  case '\n':
3082
2896
  case '\f':
3083
2897
  case ' ':
3084
- return NEXT_CHAR;
2898
+ return CONTINUE;
3085
2899
  case '>':
3086
2900
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3087
- emit_doctype(parser, output);
3088
- return RETURN_SUCCESS;
2901
+ return emit_doctype(parser, output);
3089
2902
  case -1:
3090
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3091
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2903
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2904
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3092
2905
  tokenizer->_doc_type_state.force_quirks = true;
3093
- emit_doctype(parser, output);
3094
- return RETURN_ERROR;
2906
+ return emit_doctype(parser, output);
3095
2907
  default:
3096
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3097
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
3098
- return NEXT_CHAR;
2908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2909
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2910
+ return CONTINUE;
3099
2911
  }
3100
2912
  }
3101
2913
 
@@ -3106,33 +2918,370 @@ static StateResult handle_bogus_doctype_state (
3106
2918
  int c,
3107
2919
  GumboToken* output
3108
2920
  ) {
3109
- if (c == '>' || c == -1) {
2921
+ switch (c) {
2922
+ case '>':
3110
2923
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3111
- emit_doctype(parser, output);
3112
- return RETURN_ERROR;
2924
+ return emit_doctype(parser, output);
2925
+ case '\0':
2926
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2927
+ return CONTINUE;
2928
+ case -1:
2929
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2930
+ return emit_doctype(parser, output);
2931
+ default:
2932
+ return CONTINUE;
3113
2933
  }
3114
- return NEXT_CHAR;
3115
2934
  }
3116
2935
 
3117
2936
  // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
- static StateResult handle_cdata_state (
2937
+ static StateResult handle_cdata_section_state (
3119
2938
  GumboParser* parser,
3120
2939
  GumboTokenizerState* tokenizer,
3121
2940
  int c,
3122
2941
  GumboToken* output
3123
2942
  ) {
3124
- if (c == -1 || utf8iterator_maybe_consume_match(
3125
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
3126
- tokenizer->_reconsume_current_input = true;
2943
+ switch (c) {
2944
+ case ']':
2945
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2946
+ set_mark(parser);
2947
+ return CONTINUE;
2948
+ case -1:
2949
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2950
+ return emit_eof(parser, output);
2951
+ default:
2952
+ return emit_char(parser, c, output);
2953
+ }
2954
+ }
2955
+
2956
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2957
+ static StateResult handle_cdata_section_bracket_state (
2958
+ GumboParser* parser,
2959
+ GumboTokenizerState* tokenizer,
2960
+ int c,
2961
+ GumboToken* output
2962
+ ) {
2963
+ switch (c) {
2964
+ case ']':
2965
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2966
+ return CONTINUE;
2967
+ default:
2968
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2969
+ // Emit the ].
2970
+ return emit_from_mark(parser, output);
2971
+ }
2972
+ }
2973
+
2974
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2975
+ static StateResult handle_cdata_section_end_state (
2976
+ GumboParser* parser,
2977
+ GumboTokenizerState* tokenizer,
2978
+ int c,
2979
+ GumboToken* output
2980
+ ) {
2981
+ switch (c) {
2982
+ case ']':
2983
+ {
2984
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2985
+ // of the three in a row we've seen. So let's emit one token from the
2986
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2987
+ // advance one). Next, let's clear the temporary buffer which will set the
2988
+ // mark to the middle of the three brackets. Finally, let's move to the
2989
+ // appropriate state.
2990
+ StateResult result = emit_from_mark(parser, output);
2991
+ tokenizer->_resume_pos = NULL;
2992
+ set_mark(parser);
2993
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2994
+ return result;
2995
+ }
2996
+ case '>':
2997
+ // We're done with CDATA so move past the >, reset the token start point
2998
+ // to point after the >, and then reconsume in the data state.
2999
+ utf8iterator_next(&tokenizer->_input);
3127
3000
  reset_token_start_point(tokenizer);
3128
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3001
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3129
3002
  tokenizer->_is_in_cdata = false;
3130
- return NEXT_CHAR;
3131
- } else {
3132
- return emit_current_char(parser, output);
3003
+ return CONTINUE;
3004
+ default:
3005
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3006
+ return emit_from_mark(parser, output);
3007
+ }
3008
+ }
3009
+
3010
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3011
+ static StateResult handle_character_reference_state (
3012
+ GumboParser* parser,
3013
+ GumboTokenizerState* tokenizer,
3014
+ int c,
3015
+ GumboToken* output
3016
+ ) {
3017
+ if (gumbo_ascii_isalnum(c)) {
3018
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3019
+ return CONTINUE;
3020
+ }
3021
+ if (c == '#') {
3022
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3023
+ return CONTINUE;
3024
+ }
3025
+ reconsume_in_state(parser, tokenizer->_return_state);
3026
+ return flush_code_points_consumed_as_character_reference(parser, output);
3027
+ }
3028
+
3029
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3030
+ static StateResult handle_named_character_reference_state (
3031
+ GumboParser* parser,
3032
+ GumboTokenizerState* tokenizer,
3033
+ int c,
3034
+ GumboToken* output
3035
+ ) {
3036
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3037
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3038
+ int code_point[2];
3039
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3040
+
3041
+ if (size > 0) {
3042
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3043
+ int next = utf8iterator_current(&tokenizer->_input);
3044
+ reconsume_in_state(parser, tokenizer->_return_state);
3045
+ if (character_reference_part_of_attribute(parser)
3046
+ && cur[size-1] != ';'
3047
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3048
+ GumboStringPiece str = { .data = cur, .length = size };
3049
+ append_string_to_temporary_buffer(parser, &str);
3050
+ return flush_code_points_consumed_as_character_reference(parser, output);
3051
+ }
3052
+ if (cur[size-1] != ';')
3053
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3054
+ reconsume_in_state(parser, tokenizer->_return_state);
3055
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3056
+ }
3057
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3058
+ return flush_code_points_consumed_as_character_reference(parser, output);
3059
+ }
3060
+
3061
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3062
+ static StateResult handle_ambiguous_ampersand_state (
3063
+ GumboParser* parser,
3064
+ GumboTokenizerState* tokenizer,
3065
+ int c,
3066
+ GumboToken* output
3067
+ ) {
3068
+ if (gumbo_ascii_isalnum(c)) {
3069
+ if (character_reference_part_of_attribute(parser)) {
3070
+ append_char_to_tag_buffer(parser, c, true);
3071
+ return CONTINUE;
3072
+ }
3073
+ return emit_char(parser, c, output);
3074
+ }
3075
+ if (c == ';') {
3076
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3077
+ // fall through
3078
+ }
3079
+ reconsume_in_state(parser, tokenizer->_return_state);
3080
+ return CONTINUE;
3081
+ }
3082
+
3083
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3084
+ static StateResult handle_numeric_character_reference_state (
3085
+ GumboParser* parser,
3086
+ GumboTokenizerState* tokenizer,
3087
+ int c,
3088
+ GumboToken* output
3089
+ ) {
3090
+ tokenizer->_character_reference_code = 0;
3091
+ switch (c) {
3092
+ case 'x':
3093
+ case 'X':
3094
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3095
+ return CONTINUE;
3096
+ default:
3097
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3098
+ return CONTINUE;
3133
3099
  }
3134
3100
  }
3135
3101
 
3102
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
3103
+ static StateResult handle_hexadecimal_character_reference_start_state (
3104
+ GumboParser* parser,
3105
+ GumboTokenizerState* tokenizer,
3106
+ int c,
3107
+ GumboToken* output
3108
+ ) {
3109
+ if (gumbo_ascii_isxdigit(c)) {
3110
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3111
+ return CONTINUE;
3112
+ }
3113
+ tokenizer_add_char_ref_error (
3114
+ parser,
3115
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3116
+ -1
3117
+ );
3118
+ reconsume_in_state(parser, tokenizer->_return_state);
3119
+ return flush_code_points_consumed_as_character_reference(parser, output);
3120
+ }
3121
+
3122
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3123
+ static StateResult handle_decimal_character_reference_start_state (
3124
+ GumboParser* parser,
3125
+ GumboTokenizerState* tokenizer,
3126
+ int c,
3127
+ GumboToken* output
3128
+ ) {
3129
+ if (gumbo_ascii_isdigit(c)) {
3130
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3131
+ return CONTINUE;
3132
+ }
3133
+ tokenizer_add_char_ref_error (
3134
+ parser,
3135
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3136
+ -1
3137
+ );
3138
+ reconsume_in_state(parser, tokenizer->_return_state);
3139
+ return flush_code_points_consumed_as_character_reference(parser, output);
3140
+ }
3141
+
3142
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
3143
+ static StateResult handle_hexadecimal_character_reference_state (
3144
+ GumboParser* parser,
3145
+ GumboTokenizerState* tokenizer,
3146
+ int c,
3147
+ GumboToken* output
3148
+ ) {
3149
+ if (gumbo_ascii_isdigit(c)) {
3150
+ tokenizer->_character_reference_code =
3151
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3152
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3153
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3154
+ return CONTINUE;
3155
+ }
3156
+ if (gumbo_ascii_isupper_xdigit(c)) {
3157
+ tokenizer->_character_reference_code =
3158
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3159
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3160
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3161
+ return CONTINUE;
3162
+ }
3163
+ if (gumbo_ascii_islower_xdigit(c)) {
3164
+ tokenizer->_character_reference_code =
3165
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3166
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3167
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3168
+ return CONTINUE;
3169
+ }
3170
+ if (c == ';') {
3171
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3172
+ return CONTINUE;
3173
+ }
3174
+ tokenizer_add_char_ref_error(
3175
+ parser,
3176
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3177
+ tokenizer->_character_reference_code
3178
+ );
3179
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3180
+ return CONTINUE;
3181
+ }
3182
+
3183
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3184
+ static StateResult handle_decimal_character_reference_state (
3185
+ GumboParser* parser,
3186
+ GumboTokenizerState* tokenizer,
3187
+ int c,
3188
+ GumboToken* output
3189
+ ) {
3190
+ if (gumbo_ascii_isdigit(c)) {
3191
+ tokenizer->_character_reference_code =
3192
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3193
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3194
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3195
+ return CONTINUE;
3196
+ }
3197
+ if (c == ';') {
3198
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3199
+ return CONTINUE;
3200
+ }
3201
+ tokenizer_add_char_ref_error(
3202
+ parser,
3203
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3204
+ tokenizer->_character_reference_code
3205
+ );
3206
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3207
+ return CONTINUE;
3208
+ }
3209
+
3210
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3211
+ static StateResult handle_numeric_character_reference_end_state (
3212
+ GumboParser* parser,
3213
+ GumboTokenizerState* tokenizer,
3214
+ int c,
3215
+ GumboToken* output
3216
+ ) {
3217
+ c = tokenizer->_character_reference_code;
3218
+ if (c == 0) {
3219
+ tokenizer_add_char_ref_error(
3220
+ parser,
3221
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3222
+ c
3223
+ );
3224
+ c = kUtf8ReplacementChar;
3225
+ } else if (c > kUtf8MaxChar) {
3226
+ tokenizer_add_char_ref_error(
3227
+ parser,
3228
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3229
+ c
3230
+ );
3231
+ c = kUtf8ReplacementChar;
3232
+ } else if (utf8_is_surrogate(c)) {
3233
+ tokenizer_add_char_ref_error(
3234
+ parser,
3235
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3236
+ c
3237
+ );
3238
+ c = kUtf8ReplacementChar;
3239
+ } else if (utf8_is_noncharacter(c)) {
3240
+ tokenizer_add_char_ref_error(
3241
+ parser,
3242
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3243
+ c
3244
+ );
3245
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3246
+ tokenizer_add_char_ref_error(
3247
+ parser,
3248
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3249
+ c
3250
+ );
3251
+ switch (c) {
3252
+ case 0x80: c = 0x20AC; break;
3253
+ case 0x82: c = 0x201A; break;
3254
+ case 0x83: c = 0x0192; break;
3255
+ case 0x84: c = 0x201E; break;
3256
+ case 0x85: c = 0x2026; break;
3257
+ case 0x86: c = 0x2020; break;
3258
+ case 0x87: c = 0x2021; break;
3259
+ case 0x88: c = 0x02C6; break;
3260
+ case 0x89: c = 0x2030; break;
3261
+ case 0x8A: c = 0x0160; break;
3262
+ case 0x8B: c = 0x2039; break;
3263
+ case 0x8C: c = 0x0152; break;
3264
+ case 0x8E: c = 0x017D; break;
3265
+ case 0x91: c = 0x2018; break;
3266
+ case 0x92: c = 0x2019; break;
3267
+ case 0x93: c = 0x201C; break;
3268
+ case 0x94: c = 0x201D; break;
3269
+ case 0x95: c = 0x2022; break;
3270
+ case 0x96: c = 0x2013; break;
3271
+ case 0x97: c = 0x2014; break;
3272
+ case 0x98: c = 0x02DC; break;
3273
+ case 0x99: c = 0x2122; break;
3274
+ case 0x9A: c = 0x0161; break;
3275
+ case 0x9B: c = 0x203A; break;
3276
+ case 0x9C: c = 0x0153; break;
3277
+ case 0x9E: c = 0x017E; break;
3278
+ case 0x9F: c = 0x0178; break;
3279
+ }
3280
+ }
3281
+ reconsume_in_state(parser, tokenizer->_return_state);
3282
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3283
+ }
3284
+
3136
3285
  typedef StateResult (*GumboLexerStateFunction) (
3137
3286
  GumboParser* parser,
3138
3287
  GumboTokenizerState* tokenizer,
@@ -3141,77 +3290,89 @@ typedef StateResult (*GumboLexerStateFunction) (
3141
3290
  );
3142
3291
 
3143
3292
  static GumboLexerStateFunction dispatch_table[] = {
3144
- handle_data_state,
3145
- handle_char_ref_in_data_state,
3146
- handle_rcdata_state,
3147
- handle_char_ref_in_rcdata_state,
3148
- handle_rawtext_state,
3149
- handle_script_state,
3150
- handle_plaintext_state,
3151
- handle_tag_open_state,
3152
- handle_end_tag_open_state,
3153
- handle_tag_name_state,
3154
- handle_rcdata_lt_state,
3155
- handle_rcdata_end_tag_open_state,
3156
- handle_rcdata_end_tag_name_state,
3157
- handle_rawtext_lt_state,
3158
- handle_rawtext_end_tag_open_state,
3159
- handle_rawtext_end_tag_name_state,
3160
- handle_script_lt_state,
3161
- handle_script_end_tag_open_state,
3162
- handle_script_end_tag_name_state,
3163
- handle_script_escaped_start_state,
3164
- handle_script_escaped_start_dash_state,
3165
- handle_script_escaped_state,
3166
- handle_script_escaped_dash_state,
3167
- handle_script_escaped_dash_dash_state,
3168
- handle_script_escaped_lt_state,
3169
- handle_script_escaped_end_tag_open_state,
3170
- handle_script_escaped_end_tag_name_state,
3171
- handle_script_double_escaped_start_state,
3172
- handle_script_double_escaped_state,
3173
- handle_script_double_escaped_dash_state,
3174
- handle_script_double_escaped_dash_dash_state,
3175
- handle_script_double_escaped_lt_state,
3176
- handle_script_double_escaped_end_state,
3177
- handle_before_attr_name_state,
3178
- handle_attr_name_state,
3179
- handle_after_attr_name_state,
3180
- handle_before_attr_value_state,
3181
- handle_attr_value_double_quoted_state,
3182
- handle_attr_value_single_quoted_state,
3183
- handle_attr_value_unquoted_state,
3184
- handle_char_ref_in_attr_value_state,
3185
- handle_after_attr_value_quoted_state,
3186
- handle_self_closing_start_tag_state,
3187
- handle_bogus_comment_state,
3188
- handle_markup_declaration_state,
3189
- handle_comment_start_state,
3190
- handle_comment_start_dash_state,
3191
- handle_comment_state,
3192
- handle_comment_end_dash_state,
3193
- handle_comment_end_state,
3194
- handle_comment_end_bang_state,
3195
- handle_doctype_state,
3196
- handle_before_doctype_name_state,
3197
- handle_doctype_name_state,
3198
- handle_after_doctype_name_state,
3199
- handle_after_doctype_public_keyword_state,
3200
- handle_before_doctype_public_id_state,
3201
- handle_doctype_public_id_double_quoted_state,
3202
- handle_doctype_public_id_single_quoted_state,
3203
- handle_after_doctype_public_id_state,
3204
- handle_between_doctype_public_system_id_state,
3205
- handle_after_doctype_system_keyword_state,
3206
- handle_before_doctype_system_id_state,
3207
- handle_doctype_system_id_double_quoted_state,
3208
- handle_doctype_system_id_single_quoted_state,
3209
- handle_after_doctype_system_id_state,
3210
- handle_bogus_doctype_state,
3211
- handle_cdata_state
3293
+ [GUMBO_LEX_DATA] = handle_data_state,
3294
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3295
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3296
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3297
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3298
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3299
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3300
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3301
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3302
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3304
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3305
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3307
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3324
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3325
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3326
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3327
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3328
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3331
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3332
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3333
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3334
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3335
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3336
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3337
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3338
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3339
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3342
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3344
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3345
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3346
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3347
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3348
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3350
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3351
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3353
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3354
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3355
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3356
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3357
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3359
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3360
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3361
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3362
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3364
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3365
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3366
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3367
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3368
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3369
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3370
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3371
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3372
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3212
3373
  };
3213
3374
 
3214
- bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3375
+ void gumbo_lex(GumboParser* parser, GumboToken* output) {
3215
3376
  // Because of the spec requirements that...
3216
3377
  //
3217
3378
  // 1. Tokens be handled immediately by the parser upon emission.
@@ -3236,15 +3397,15 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3236
3397
  // isn't consumed twice.
3237
3398
  tokenizer->_reconsume_current_input = false;
3238
3399
  tokenizer->_buffered_emit_char = kGumboNoChar;
3239
- return true;
3400
+ return;
3240
3401
  }
3241
3402
 
3242
- if (maybe_emit_from_temporary_buffer(parser, output)) {
3243
- return true;
3403
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3404
+ return;
3244
3405
  }
3245
3406
 
3246
3407
  while (1) {
3247
- assert(!tokenizer->_temporary_buffer_emit);
3408
+ assert(!tokenizer->_resume_pos);
3248
3409
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3249
3410
  int c = utf8iterator_current(&tokenizer->_input);
3250
3411
  GumboTokenizerEnum state = tokenizer->_state;
@@ -3255,11 +3416,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3255
3416
  bool should_advance = !tokenizer->_reconsume_current_input;
3256
3417
  tokenizer->_reconsume_current_input = false;
3257
3418
 
3258
- if (result == RETURN_SUCCESS) {
3259
- return true;
3260
- } else if (result == RETURN_ERROR) {
3261
- return false;
3262
- }
3419
+ if (result == EMIT_TOKEN)
3420
+ return;
3263
3421
 
3264
3422
  if (should_advance) {
3265
3423
  utf8iterator_next(&tokenizer->_input);
@@ -3285,12 +3443,16 @@ void gumbo_token_destroy(GumboToken* token) {
3285
3443
  }
3286
3444
  }
3287
3445
  gumbo_free((void*) token->v.start_tag.attributes.data);
3288
- if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3446
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3289
3447
  gumbo_free(token->v.start_tag.name);
3448
+ token->v.start_tag.name = NULL;
3449
+ }
3290
3450
  return;
3291
3451
  case GUMBO_TOKEN_END_TAG:
3292
- if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3452
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3293
3453
  gumbo_free(token->v.end_tag.name);
3454
+ token->v.end_tag.name = NULL;
3455
+ }
3294
3456
  break;
3295
3457
  case GUMBO_TOKEN_COMMENT:
3296
3458
  gumbo_free((void*) token->v.text);