nokogumbo 2.0.0.pre.alpha → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -79,7 +79,7 @@ void gumbo_string_buffer_append_codepoint (
79
79
  }
80
80
 
81
81
  void gumbo_string_buffer_append_string (
82
- GumboStringPiece* str,
82
+ const GumboStringPiece* str,
83
83
  GumboStringBuffer* output
84
84
  ) {
85
85
  maybe_resize_string_buffer(str->length, output);
@@ -47,7 +47,7 @@ void gumbo_string_buffer_append_codepoint (
47
47
 
48
48
  // Appends a string onto the end of the GumboStringBuffer.
49
49
  void gumbo_string_buffer_append_string (
50
- GumboStringPiece* str,
50
+ const GumboStringPiece* str,
51
51
  GumboStringBuffer* output
52
52
  );
53
53
 
@@ -0,0 +1,79 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #include <assert.h>
18
+
19
+ #include "ascii.h"
20
+ #include "token_buffer.h"
21
+ #include "tokenizer.h"
22
+ #include "util.h"
23
+
24
+ struct GumboInternalCharacterToken {
25
+ GumboSourcePosition position;
26
+ GumboStringPiece original_text;
27
+ int c;
28
+ };
29
+
30
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
31
+ buffer->data = NULL;
32
+ buffer->length = 0;
33
+ buffer->capacity = 0;
34
+ }
35
+
36
+ void gumbo_character_token_buffer_append (
37
+ const GumboToken* token,
38
+ GumboCharacterTokenBuffer* buffer
39
+ ) {
40
+ assert(token->type == GUMBO_TOKEN_WHITESPACE
41
+ || token->type == GUMBO_TOKEN_CHARACTER);
42
+ if (buffer->length == buffer->capacity) {
43
+ if (buffer->capacity == 0)
44
+ buffer->capacity = 10;
45
+ else
46
+ buffer->capacity *= 2;
47
+ size_t bytes = sizeof(*buffer->data) * buffer->capacity;
48
+ buffer->data = gumbo_realloc(buffer->data, bytes);
49
+ }
50
+ size_t index = buffer->length++;
51
+ buffer->data[index].position = token->position;
52
+ buffer->data[index].original_text = token->original_text;
53
+ buffer->data[index].c = token->v.character;
54
+ }
55
+
56
+ void gumbo_character_token_buffer_get (
57
+ const GumboCharacterTokenBuffer* buffer,
58
+ size_t index,
59
+ struct GumboInternalToken* output
60
+ ) {
61
+ assert(index < buffer->length);
62
+ int c = buffer->data[index].c;
63
+ output->type = gumbo_ascii_isspace(c)?
64
+ GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
65
+ output->position = buffer->data[index].position;
66
+ output->original_text = buffer->data[index].original_text;
67
+ output->v.character = c;
68
+ }
69
+
70
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
71
+ buffer->length = 0;
72
+ }
73
+
74
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
75
+ gumbo_free(buffer->data);
76
+ buffer->data = NULL;
77
+ buffer->length = 0;
78
+ buffer->capacity = 0;
79
+ }
@@ -0,0 +1,71 @@
1
+ /*
2
+ Copyright 2018 Stephen Checkoway
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ #ifndef GUMBO_TOKEN_BUFFER_H
18
+ #define GUMBO_TOKEN_BUFFER_H
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalCharacterToken;
30
+ struct GumboInternalToken;
31
+
32
+ // A struct representing a growable sequence of character (and whitespace)
33
+ // tokens.
34
+ typedef struct {
35
+ // A pointer to the start of the sequence.
36
+ struct GumboInternalCharacterToken* data;
37
+
38
+ // The length of the sequence.
39
+ size_t length;
40
+
41
+ // The capacity of the buffer.
42
+ size_t capacity;
43
+ } GumboCharacterTokenBuffer;
44
+
45
+ // Initializes a new GumboCharacterTokenBuffer.
46
+ void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
47
+
48
+ // Appends a character (or whitespace) token.
49
+ void gumbo_character_token_buffer_append (
50
+ const struct GumboInternalToken* token,
51
+ GumboCharacterTokenBuffer* buffer
52
+ );
53
+
54
+ void gumbo_character_token_buffer_get (
55
+ const GumboCharacterTokenBuffer* buffer,
56
+ size_t index,
57
+ struct GumboInternalToken* output
58
+ );
59
+
60
+ // Reinitialize this character token buffer. This clears it by setting length=0. It
61
+ // does not zero out the buffer itself.
62
+ void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
63
+
64
+ // Deallocates this GumboCharacterTokenBuffer.
65
+ void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
66
+
67
+ #ifdef __cplusplus
68
+ }
69
+ #endif
70
+
71
+ #endif // GUMBO_TOKEN_BUFFER_H
@@ -1,5 +1,7 @@
1
1
  /*
2
2
  Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
3
5
 
4
6
  Licensed under the Apache License, Version 2.0 (the "License");
5
7
  you may not use this file except in compliance with the License.
@@ -18,10 +20,7 @@
18
20
  Coding conventions specific to this file:
19
21
 
20
22
  1. Functions that fill in a token should be named emit_*, and should be
21
- followed immediately by a return from the tokenizer (true if no error
22
- occurred, false if an error occurred). Sometimes the emit functions
23
- themselves return a boolean so that they can be combined with the return
24
- statement; in this case, they should match this convention.
23
+ followed immediately by a return from the tokenizer.
25
24
  2. Functions that shuffle data from temporaries to final API structures
26
25
  should be named finish_*, and be called just before the tokenizer exits the
27
26
  state that accumulates the temporary.
@@ -60,15 +59,18 @@
60
59
  #include "util.h"
61
60
  #include "vector.h"
62
61
 
63
- // Compared against _script_data_buffer to determine if we're in
62
+ // Compared against _temporary_buffer to determine if we're in
64
63
  // double-escaped script mode.
65
64
  static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
66
65
 
67
- // An enum for the return value of each individual state.
66
+ // An enum for the return value of each individual state. Each of the emit_*
67
+ // functions should return EMIT_TOKEN and should be called as
68
+ // return emit_foo(parser, ..., output);
69
+ // Each of the handle_*_state functions that do not return emit_* should
70
+ // instead return CONTINUE to indicate to gumbo_lex to continue lexing.
68
71
  typedef enum {
69
- RETURN_ERROR, // Return false (error) from the tokenizer.
70
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
71
- NEXT_CHAR // Proceed to the next character and continue lexing.
72
+ EMIT_TOKEN,
73
+ CONTINUE,
72
74
  } StateResult;
73
75
 
74
76
  // This is a struct containing state necessary to build up a tag token,
@@ -103,12 +105,6 @@ typedef struct GumboInternalTagState {
103
105
  // the attribute value, but shouldn't overwrite the existing value.
104
106
  bool _drop_next_attr_value;
105
107
 
106
- // The state that caused the tokenizer to switch into a character reference in
107
- // attribute value state. This is used to set the additional allowed
108
- // character, and is switched back to on completion. Initialized as the
109
- // tokenizer enters the character reference state.
110
- GumboTokenizerEnum _attr_value_state;
111
-
112
108
  // The last start tag to have been emitted by the tokenizer. This is
113
109
  // necessary to check for appropriate end tags.
114
110
  GumboTag _last_start_tag;
@@ -133,10 +129,10 @@ typedef struct GumboInternalTokenizerState {
133
129
  // "Reconsume the current input character in..."
134
130
  bool _reconsume_current_input;
135
131
 
136
- // A flag indicating whether the current node is a foreign element. This is
137
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
138
- // markup declaration state.
139
- bool _is_current_node_foreign;
132
+ // A flag indicating whether the adjusted current node is a foreign element.
133
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
134
+ // checked in the markup declaration state.
135
+ bool _is_adjusted_current_node_foreign;
140
136
 
141
137
  // A flag indicating whether the tokenizer is in a CDATA section. If so, then
142
138
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
@@ -159,27 +155,24 @@ typedef struct GumboInternalTokenizerState {
159
155
 
160
156
  // A temporary buffer to accumulate characters, as described by the "temporary
161
157
  // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
162
- // way: we record the specific character to go into the buffer, which may
163
- // sometimes be a lowercased version of the actual input character. However,
164
- // we *also* use utf8iterator_mark() to record the position at tag start.
165
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
166
- // to the start of it, and then increment it for each call to the tokenizer.
167
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
168
- // input stream, so that tokens emitted by emit_char have the correct position
169
- // and original text.
158
+ // way: In situations where the spec calls for inserting characters into the
159
+ // temporary buffer that exactly match the input in order to emit them as
160
+ // character tokens, we don't actually do it.
161
+ // Instead, we mark the input and reset the input to it using set_mark() and
162
+ // emit_from_mark(). We do use the temporary buffer for other uses such as
163
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
170
164
  GumboStringBuffer _temporary_buffer;
171
165
 
172
- // The current cursor position we're emitting from within
173
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
174
- const char* _temporary_buffer_emit;
166
+ // The position to resume normal operation after we start emitting from the
167
+ // mark. NULL whenever we're not emitting from the mark.
168
+ const char* _resume_pos;
169
+
170
+ // The character reference state uses a return state to return to the state
171
+ // it was invoked from.
172
+ GumboTokenizerEnum _return_state;
175
173
 
176
- // The temporary buffer is also used by the spec to check whether we should
177
- // enter the script data double escaped state, but we can't use the same
178
- // buffer for both because we have to flush out "<s" as emits while still
179
- // maintaining the context that will eventually become "script". This is a
180
- // separate buffer that's used in place of the temporary buffer for states
181
- // that may enter the script data double escape start state.
182
- GumboStringBuffer _script_data_buffer;
174
+ // Numeric character reference.
175
+ uint32_t _character_reference_code;
183
176
 
184
177
  // Pointer to the beginning of the current token in the original buffer; used
185
178
  // to record the original text.
@@ -201,123 +194,66 @@ typedef struct GumboInternalTokenizerState {
201
194
  Utf8Iterator _input;
202
195
  } GumboTokenizerState;
203
196
 
204
- // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
197
+ // Adds a parse error to the parser's error struct.
205
198
  static void tokenizer_add_parse_error (
206
199
  GumboParser* parser,
207
200
  GumboErrorType type
208
201
  ) {
202
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209
203
  GumboError* error = gumbo_add_error(parser);
210
204
  if (!error) {
211
205
  return;
212
206
  }
207
+ const Utf8Iterator* input = &tokenizer->_input;
208
+ utf8iterator_get_position(input, &error->position);
209
+ error->original_text.data = utf8iterator_get_char_pointer(input);
210
+ error->original_text.length = utf8iterator_get_width(input);
211
+ error->type = type;
212
+ error->v.tokenizer.state = tokenizer->_state;
213
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
214
+ }
215
+
216
+ // Adds an error pointing at the start of the character reference.
217
+ static void tokenizer_add_char_ref_error (
218
+ struct GumboInternalParser* parser,
219
+ GumboErrorType type,
220
+ int codepoint
221
+ ) {
213
222
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
214
- utf8iterator_get_position(&tokenizer->_input, &error->position);
215
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
223
+ GumboError* error = gumbo_add_error(parser);
224
+ if (!error)
225
+ return;
226
+ Utf8Iterator* input = &tokenizer->_input;
216
227
  error->type = type;
217
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
218
- switch (tokenizer->_state) {
219
- case GUMBO_LEX_DATA:
220
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
221
- break;
222
- case GUMBO_LEX_CHAR_REF_IN_DATA:
223
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
224
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
225
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
226
- break;
227
- case GUMBO_LEX_RCDATA:
228
- case GUMBO_LEX_RCDATA_LT:
229
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
230
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
231
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
232
- break;
233
- case GUMBO_LEX_RAWTEXT:
234
- case GUMBO_LEX_RAWTEXT_LT:
235
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
236
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
237
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
238
- break;
239
- case GUMBO_LEX_PLAINTEXT:
240
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
241
- break;
242
- case GUMBO_LEX_SCRIPT:
243
- case GUMBO_LEX_SCRIPT_LT:
244
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
245
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
246
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
247
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
248
- case GUMBO_LEX_SCRIPT_ESCAPED:
249
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
250
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
251
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
252
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
253
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
254
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
255
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
256
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
257
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
258
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
259
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
260
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
261
- break;
262
- case GUMBO_LEX_TAG_OPEN:
263
- case GUMBO_LEX_END_TAG_OPEN:
264
- case GUMBO_LEX_TAG_NAME:
265
- case GUMBO_LEX_BEFORE_ATTR_NAME:
266
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
267
- break;
268
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
269
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
270
- break;
271
- case GUMBO_LEX_ATTR_NAME:
272
- case GUMBO_LEX_AFTER_ATTR_NAME:
273
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
274
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
275
- break;
276
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
277
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
278
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
279
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
280
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
281
- break;
282
- case GUMBO_LEX_BOGUS_COMMENT:
283
- case GUMBO_LEX_COMMENT_START:
284
- case GUMBO_LEX_COMMENT_START_DASH:
285
- case GUMBO_LEX_COMMENT:
286
- case GUMBO_LEX_COMMENT_END_DASH:
287
- case GUMBO_LEX_COMMENT_END:
288
- case GUMBO_LEX_COMMENT_END_BANG:
289
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
290
- break;
291
- case GUMBO_LEX_MARKUP_DECLARATION:
292
- case GUMBO_LEX_DOCTYPE:
293
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
294
- case GUMBO_LEX_DOCTYPE_NAME:
295
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
296
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
297
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
298
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
299
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
300
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
301
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
302
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
303
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
304
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
305
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
306
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
307
- case GUMBO_LEX_BOGUS_DOCTYPE:
308
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
309
- break;
310
- case GUMBO_LEX_CDATA:
311
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
312
- break;
313
- }
228
+ error->position = utf8iterator_get_mark_position(input);
229
+ const char* mark = utf8iterator_get_mark_pointer(input);
230
+ error->original_text.data = mark;
231
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
232
+ error->v.tokenizer.state = tokenizer->_state;
233
+ error->v.tokenizer.codepoint = codepoint;
234
+ }
235
+
236
+ // Adds an error pointing at the start of the token.
237
+ static void tokenizer_add_token_parse_error (
238
+ GumboParser* parser,
239
+ GumboErrorType type
240
+ ) {
241
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
242
+ GumboError* error = gumbo_add_error(parser);
243
+ if (!error)
244
+ return;
245
+ Utf8Iterator* input = &tokenizer->_input;
246
+ error->type = type;
247
+ error->position = tokenizer->_token_start_pos;
248
+ error->original_text.data = tokenizer->_token_start;
249
+ error->original_text.length =
250
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
251
+ error->v.tokenizer.state = tokenizer->_state;
252
+ error->v.tokenizer.codepoint = 0;
314
253
  }
315
254
 
316
255
  static bool is_alpha(int c) {
317
- // We don't use the ISO C isalpha() function here because it depends
318
- // on the current locale, whereas the behavior in the HTML5 spec is
319
- // locale-independent.
320
- return ((unsigned) c | 32) - 'a' < 26;
256
+ return gumbo_ascii_isalpha(c);
321
257
  }
322
258
 
323
259
  static int ensure_lowercase(int c) {
@@ -347,24 +283,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
347
283
  }
348
284
 
349
285
  // Empties the temporary buffer so that it can start accumulating characters.
350
- // Because this needs to reset the utf8iterator_mark to the beginning of the
351
- // text that will eventually be emitted, it needs to be called a couple of
352
- // states before the spec says "Set the temporary buffer to the empty string".
353
- // In general, this should be called whenever there's a transition to a
354
- // "less-than sign state". The initial < and possibly / then need to be
355
- // appended to the temporary buffer, their presence needs to be accounted for in
356
- // states that compare the temporary buffer against a literal value, and
357
- // spec stanzas that say "emit a < and / character token along with a character
358
- // token for each character in the temporary buffer" need to be adjusted to
359
- // account for the presence of the < and / inside the temporary buffer.
360
286
  static void clear_temporary_buffer(GumboParser* parser) {
361
287
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
362
- assert(!tokenizer->_temporary_buffer_emit);
363
- utf8iterator_mark(&tokenizer->_input);
364
288
  gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
365
- // The temporary buffer and script data buffer are the same object in the
366
- // spec, so the script data buffer should be cleared as well.
367
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
368
289
  }
369
290
 
370
291
  // Appends a codepoint to the temporary buffer.
@@ -378,25 +299,20 @@ static void append_char_to_temporary_buffer (
378
299
  );
379
300
  }
380
301
 
381
- #ifndef NDEBUG
382
- static bool temporary_buffer_equals__ (
383
- const GumboParser* parser,
384
- const char* text,
385
- size_t text_len
302
+ static void append_string_to_temporary_buffer (
303
+ GumboParser* parser,
304
+ const GumboStringPiece* str
386
305
  ) {
387
- const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
388
- return
389
- text_len == buf->length
390
- && memcmp(buf->data, text, text_len) == 0;
306
+ gumbo_string_buffer_append_string (
307
+ str,
308
+ &parser->_tokenizer_state->_temporary_buffer
309
+ );
391
310
  }
392
311
 
393
- #define temporary_buffer_equals(parser, text) \
394
- temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
395
312
 
396
313
  static bool temporary_buffer_is_empty(const GumboParser* parser) {
397
314
  return parser->_tokenizer_state->_temporary_buffer.length == 0;
398
315
  }
399
- #endif
400
316
 
401
317
  static void doc_type_state_init(GumboParser* parser) {
402
318
  GumboTokenDocType* doc_type_state =
@@ -493,56 +409,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
493
409
  }
494
410
 
495
411
  // Writes a single specified character to the output token.
496
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
412
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
497
413
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
498
414
  output->v.character = c;
499
415
  finish_token(parser, output);
416
+ return EMIT_TOKEN;
500
417
  }
501
418
 
502
419
  // Writes a replacement character token and records a parse error.
503
- // Always returns RETURN_ERROR, per gumbo_lex return value.
420
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
504
421
  static StateResult emit_replacement_char(
505
422
  GumboParser* parser, GumboToken* output) {
506
423
  // In all cases, this is because of a null byte in the input stream.
507
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
424
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
508
425
  emit_char(parser, kUtf8ReplacementChar, output);
509
- return RETURN_ERROR;
426
+ return EMIT_TOKEN;
510
427
  }
511
428
 
512
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
429
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
513
430
  static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
514
- emit_char(parser, -1, output);
515
- return RETURN_SUCCESS;
516
- }
517
-
518
- // Writes the current input character out as a character token.
519
- // Always returns RETURN_SUCCESS.
520
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
521
- emit_char(
522
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
523
- return RETURN_SUCCESS;
431
+ return emit_char(parser, -1, output);
524
432
  }
525
433
 
526
434
  // Writes out a doctype token, copying it from the tokenizer state.
527
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
435
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
528
436
  output->type = GUMBO_TOKEN_DOCTYPE;
529
437
  output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
530
438
  finish_token(parser, output);
531
439
  doc_type_state_init(parser);
440
+ return EMIT_TOKEN;
532
441
  }
533
442
 
534
443
  // Clears the tag name and, in debug builds only, sets the attribute vector data to NULL so
535
444
  // it can be asserted on tag creation, verifying that there are no memory leaks.
536
445
  static void mark_tag_state_as_empty(GumboTagState* tag_state) {
537
446
  UNUSED_IF_NDEBUG(tag_state);
538
- #ifndef NDEBUG
539
447
  tag_state->_name = NULL;
448
+ #ifndef NDEBUG
540
449
  tag_state->_attributes = kGumboEmptyVector;
541
450
  #endif
542
451
  }
543
452
 
544
453
  // Writes out the current tag as a start or end tag token.
545
- // Always returns RETURN_SUCCESS.
454
+ // Always returns EMIT_TOKEN.
546
455
  static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
547
456
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
548
457
  if (tag_state->_is_start_tag) {
@@ -559,7 +468,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
559
468
  output->type = GUMBO_TOKEN_END_TAG;
560
469
  output->v.end_tag.tag = tag_state->_tag;
561
470
  output->v.end_tag.name = tag_state->_name;
562
- output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
471
+ if (tag_state->_is_self_closing)
472
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
473
+ if (tag_state->_attributes.length > 0)
474
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
563
475
  // In end tags, ownership of the attributes vector is not transferred to the
564
476
  // token, but it's still initialized as normal, so it must be manually
565
477
  // deallocated. There may also be attributes to destroy, in certain broken
@@ -582,7 +494,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
582
494
  assert(output->original_text.length >= 2);
583
495
  assert(output->original_text.data[0] == '<');
584
496
  assert(output->original_text.data[output->original_text.length - 1] == '>');
585
- return RETURN_SUCCESS;
497
+ return EMIT_TOKEN;
586
498
  }
587
499
 
588
500
  // In some states, we speculatively start a tag, but don't know whether it'll be
@@ -600,90 +512,59 @@ static void abandon_current_tag(GumboParser* parser) {
600
512
  gumbo_debug("Abandoning current tag.\n");
601
513
  }
602
514
 
603
- // Wraps the gumbo_consume_char_ref function to handle its output and make the
604
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
605
- // error occurred, RETURN_SUCCESS otherwise.
606
- static StateResult emit_char_ref (
607
- GumboParser* parser,
608
- int additional_allowed_char,
609
- bool UNUSED_ARG(is_in_attribute),
610
- GumboToken* output
611
- ) {
612
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
- OneOrTwoCodepoints char_ref;
614
- bool status = gumbo_consume_char_ref (
615
- parser,
616
- &tokenizer->_input,
617
- additional_allowed_char,
618
- false,
619
- &char_ref
620
- );
621
- if (char_ref.first != kGumboNoChar) {
622
- // gumbo_consume_char_ref ends with the iterator pointing at the next
623
- // character, so we need to be sure not advance it again before
624
- // reading the next token.
625
- tokenizer->_reconsume_current_input = true;
626
- emit_char(parser, char_ref.first, output);
627
- tokenizer->_buffered_emit_char = char_ref.second;
628
- } else {
629
- emit_char(parser, '&', output);
630
- }
631
- return status ? RETURN_SUCCESS : RETURN_ERROR;
632
- }
633
-
634
515
  // Emits a comment token. Comments use the temporary buffer to accumulate their
635
516
  // data, and then it's copied over and released to the 'text' field of the
636
- // GumboToken union. Always returns RETURN_SUCCESS.
517
+ // GumboToken union. Always returns EMIT_TOKEN.
637
518
  static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
638
519
  output->type = GUMBO_TOKEN_COMMENT;
639
520
  finish_temporary_buffer(parser, &output->v.text);
640
521
  finish_token(parser, output);
641
- return RETURN_SUCCESS;
522
+ return EMIT_TOKEN;
642
523
  }
643
524
 
644
- // Checks to see we should be flushing accumulated characters in the temporary
645
- // buffer, and fills the output token with the next output character if so.
646
- // Returns true if a character has been emitted and the tokenizer should
647
- // immediately return, false if we're at the end of the temporary buffer and
648
- // should resume normal operation.
649
- static bool maybe_emit_from_temporary_buffer(
650
- GumboParser* parser, GumboToken* output) {
525
+ static void set_mark(GumboParser* parser) {
526
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
527
+ utf8iterator_mark(&tokenizer->_input);
528
+ }
529
+
530
+ // Checks to see if we should be emitting characters from the mark, and fills the
531
+ // output token with the next output character if so.
532
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
533
+ // immediately return, CONTINUE if we should resume normal operation.
534
+ static StateResult maybe_emit_from_mark (
535
+ GumboParser* parser,
536
+ GumboToken* output
537
+ ) {
651
538
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
652
- const char* c = tokenizer->_temporary_buffer_emit;
653
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
539
+ const char* pos = tokenizer->_resume_pos;
654
540
 
655
- if (!c || c >= buffer->data + buffer->length) {
656
- tokenizer->_temporary_buffer_emit = NULL;
657
- return false;
541
+ if (!pos)
542
+ return CONTINUE;
543
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
544
+ tokenizer->_resume_pos = NULL;
545
+ return CONTINUE;
658
546
  }
659
547
 
660
- assert(*c == utf8iterator_current(&tokenizer->_input));
661
- // emit_char also advances the input stream. We need to do some juggling of
662
- // the _reconsume_current_input flag to get the proper behavior when emitting
663
- // previous tokens. Basically, _reconsume_current_input should *never* be set
664
- // when emitting anything from the temporary buffer, since those characters
665
- // have already been advanced past. However, it should be preserved so that
666
- // when the *next* character is encountered again, the tokenizer knows not to
667
- // advance past it.
668
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
669
- tokenizer->_reconsume_current_input = false;
670
- emit_char(parser, *c, output);
671
- ++tokenizer->_temporary_buffer_emit;
672
- tokenizer->_reconsume_current_input = saved_reconsume_state;
673
- return true;
674
- }
675
-
676
- // Sets up the tokenizer to begin flushing the temporary buffer.
677
- // This resets the input iterator stream to the start of the last tag, sets up
678
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
679
- // the first character in it. It returns true if a character was emitted, false
680
- // otherwise.
681
- static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
548
+ // emit_char advances the input stream. _reconsume_current_input should
549
+ // *never* be set when emitting from the mark since those characters have
550
+ // already been advanced past.
551
+ assert(!tokenizer->_reconsume_current_input);
552
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
553
+ }
554
+
555
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
556
+ // including, the current code point. This resets the input iterator stream to
557
+ // the mark, sets up _resume_pos, and then emits the first character in it.
558
+ // Returns EMIT_TOKEN.
559
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
682
560
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
683
- assert(tokenizer->_temporary_buffer.data);
561
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
684
562
  utf8iterator_reset(&tokenizer->_input);
685
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
686
- return maybe_emit_from_temporary_buffer(parser, output);
563
+ // Now that we have reset the input, we need to advance through it.
564
+ tokenizer->_reconsume_current_input = false;
565
+ StateResult result = maybe_emit_from_mark(parser, output);
566
+ assert(result == EMIT_TOKEN);
567
+ return result;
687
568
  }
688
569
 
689
570
  // Appends a codepoint to the current tag buffer. If
@@ -703,6 +584,19 @@ static void append_char_to_tag_buffer (
703
584
  gumbo_string_buffer_append_codepoint(codepoint, buffer);
704
585
  }
705
586
 
587
+ // Like above but append a string.
588
+ static void append_string_to_tag_buffer (
589
+ GumboParser* parser,
590
+ GumboStringPiece* str,
591
+ bool reinitilize_position_on_first
592
+ ) {
593
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
594
+ if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ reset_tag_buffer_start_point(parser);
596
+ }
597
+ gumbo_string_buffer_append_string(str, buffer);
598
+ }
599
+
706
600
  // (Re-)initialize the tag buffer. This also resets the original_text pointer
707
601
  // and _start_pos field to point to the current position.
708
602
  static void initialize_tag_buffer(GumboParser* parser) {
@@ -713,6 +607,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
713
607
  reset_tag_buffer_start_point(parser);
714
608
  }
715
609
 
610
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
611
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
612
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
+ switch (tokenizer->_return_state) {
614
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
615
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
616
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
617
+ return true;
618
+ default:
619
+ return false;
620
+ }
621
+ }
622
+
623
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
624
+ // For each code point in the temporary buffer, add to the current attribute
625
+ // value if the character reference was consumed as part of an attribute or
626
+ // emit the code point as a character token.
627
+ static StateResult flush_code_points_consumed_as_character_reference (
628
+ GumboParser* parser,
629
+ GumboToken* output
630
+ ) {
631
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
632
+ if (character_reference_part_of_attribute(parser)) {
633
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
634
+ assert(start);
635
+ GumboStringPiece str = {
636
+ .data = start,
637
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
638
+ };
639
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
640
+ append_string_to_tag_buffer(parser, &str, unquoted);
641
+ return CONTINUE;
642
+ }
643
+ return emit_from_mark(parser, output);
644
+ }
645
+
646
+ // After a character reference has been successfully constructed, the standard
647
+ // says to set the temporary buffer equal to the empty string, append the code
648
+ // point(s) associated with the reference and flush code points consumed as a
649
+ // character reference.
650
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
651
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
652
+ // That doesn't work for us because we use the temporary buffer in lock step
653
+ // with the input for position and that would fail if we inserted a different
654
+ // number of code points. So duplicate a bit of the above logic.
655
+ static StateResult flush_char_ref (
656
+ GumboParser* parser,
657
+ int first,
658
+ int second,
659
+ GumboToken* output
660
+ ) {
661
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
662
+ if (character_reference_part_of_attribute(parser)) {
663
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
664
+ append_char_to_tag_buffer(parser, first, unquoted);
665
+ if (second != kGumboNoChar)
666
+ append_char_to_tag_buffer(parser, second, unquoted);
667
+ return CONTINUE;
668
+ }
669
+ tokenizer->_buffered_emit_char = second;
670
+ return emit_char(parser, first, output);
671
+ }
672
+
673
+
716
674
  // Initializes the tag_state to start a new tag, keeping track of the opening
717
675
  // positions and original text. Takes a boolean indicating whether this is a
718
676
  // start or end tag.
@@ -725,7 +683,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
725
683
  assert(is_alpha(c));
726
684
 
727
685
  initialize_tag_buffer(parser);
728
- gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
729
686
 
730
687
  assert(tag_state->_name == NULL);
731
688
  assert(tag_state->_attributes.data == NULL);
@@ -765,7 +722,10 @@ static void copy_over_original_tag_text (
765
722
  original_text->data = tag_state->_original_text;
766
723
  original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
767
724
  tag_state->_original_text;
768
- if (original_text->data[original_text->length - 1] == '\r') {
725
+ if (
726
+ original_text->length
727
+ && original_text->data[original_text->length - 1] == '\r'
728
+ ) {
769
729
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
770
730
  // appended to the end of original text even when it's really the first part
771
731
  // of the next character. If we detect this situation, shrink the length of
@@ -801,40 +761,45 @@ static void finish_tag_name(GumboParser* parser) {
801
761
  }
802
762
 
803
763
  // Adds a GUMBO_ERR_DUPLICATE_ATTRIBUTE parse error to the parser's error struct.
804
- static void add_duplicate_attr_error (
805
- GumboParser* parser,
806
- int original_index,
807
- int new_index
808
- ) {
764
+ static void add_duplicate_attr_error(GumboParser* parser) {
765
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
809
766
  GumboError* error = gumbo_add_error(parser);
810
767
  if (!error) {
811
768
  return;
812
769
  }
813
770
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
771
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
815
772
  error->position = tag_state->_start_pos;
816
- error->original_text = tag_state->_original_text;
817
- error->v.duplicate_attr.original_index = original_index;
818
- error->v.duplicate_attr.new_index = new_index;
819
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
820
- reinitialize_tag_buffer(parser);
773
+ error->original_text.data = tag_state->_original_text;
774
+ error->original_text.length =
775
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
776
+ error->v.tokenizer.state = tokenizer->_state;
821
777
  }
822
778
 
823
779
  // Creates a new attribute in the current tag, copying the current tag buffer to
824
780
  // the attribute's name. The attribute's value starts out as the empty string
825
781
  // (following the "Boolean attributes" section of the spec) and is only
826
782
  // overwritten on finish_attribute_value(). If the attribute has already been
827
- // specified, the new attribute is dropped, a parse error is added, and the
828
- // function returns false. Otherwise, this returns true.
829
- static bool finish_attribute_name(GumboParser* parser) {
783
+ // specified, the new attribute is dropped and a parse error is added.
784
+ static void finish_attribute_name(GumboParser* parser) {
830
785
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
831
786
  GumboTagState* tag_state = &tokenizer->_tag_state;
787
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
788
+
789
+ int max_attributes = parser->_options->max_attributes;
790
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
791
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
792
+ gumbo_debug("Attributes limit exceeded.\n");
793
+ reinitialize_tag_buffer(parser);
794
+ tag_state->_drop_next_attr_value = true;
795
+ return;
796
+ }
797
+
832
798
  // May've been set by a previous attribute without a value; reset it here.
833
799
  tag_state->_drop_next_attr_value = false;
834
800
  assert(tag_state->_attributes.data);
835
801
  assert(tag_state->_attributes.capacity);
836
802
 
837
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
838
803
  for (unsigned int i = 0; i < attributes->length; ++i) {
839
804
  GumboAttribute* attr = attributes->data[i];
840
805
  if (
@@ -846,9 +811,10 @@ static bool finish_attribute_name(GumboParser* parser) {
846
811
  )
847
812
  ) {
848
813
  // Identical attribute; bail.
849
- add_duplicate_attr_error(parser, i, attributes->length);
814
+ add_duplicate_attr_error(parser);
815
+ reinitialize_tag_buffer(parser);
850
816
  tag_state->_drop_next_attr_value = true;
851
- return false;
817
+ return;
852
818
  }
853
819
  }
854
820
 
@@ -870,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
870
836
  );
871
837
  gumbo_vector_add(attr, attributes);
872
838
  reinitialize_tag_buffer(parser);
873
- return true;
874
839
  }
875
840
 
876
841
  // Finishes an attribute value. This sets the value of the most recently added
@@ -911,22 +876,23 @@ void gumbo_tokenizer_state_init (
911
876
  GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
912
877
  parser->_tokenizer_state = tokenizer;
913
878
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
879
+ tokenizer->_return_state = GUMBO_LEX_DATA;
880
+ tokenizer->_character_reference_code = 0;
914
881
  tokenizer->_reconsume_current_input = false;
915
- tokenizer->_is_current_node_foreign = false;
882
+ tokenizer->_is_adjusted_current_node_foreign = false;
916
883
  tokenizer->_is_in_cdata = false;
917
884
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
918
885
  tokenizer->_tag_state._name = NULL;
919
886
 
920
887
  tokenizer->_buffered_emit_char = kGumboNoChar;
921
888
  gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
922
- tokenizer->_temporary_buffer_emit = NULL;
889
+ tokenizer->_resume_pos = NULL;
923
890
 
924
891
  mark_tag_state_as_empty(&tokenizer->_tag_state);
925
892
 
926
- gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
927
- tokenizer->_token_start = text;
928
893
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
929
894
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
895
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
930
896
  doc_type_state_init(parser);
931
897
  }
932
898
 
@@ -936,7 +902,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
936
902
  assert(tokenizer->_doc_type_state.public_identifier == NULL);
937
903
  assert(tokenizer->_doc_type_state.system_identifier == NULL);
938
904
  gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
939
- gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
940
905
  assert(tokenizer->_tag_state._name == NULL);
941
906
  assert(tokenizer->_tag_state._attributes.data == NULL);
942
907
  gumbo_free(tokenizer);
@@ -946,17 +911,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
946
911
  parser->_tokenizer_state->_state = state;
947
912
  }
948
913
 
949
- void gumbo_tokenizer_set_is_current_node_foreign (
914
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
950
915
  GumboParser* parser,
951
916
  bool is_foreign
952
917
  ) {
953
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
918
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
954
919
  gumbo_debug (
955
920
  "Toggling is_current_node_foreign to %s.\n",
956
921
  is_foreign ? "true" : "false"
957
922
  );
958
923
  }
959
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
924
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
925
+ }
926
+
927
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
928
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
929
+ tokenizer->_reconsume_current_input = true;
930
+ tokenizer->_state = state;
960
931
  }
961
932
 
962
933
  // https://html.spec.whatwg.org/multipage/parsing.html#data-state
@@ -968,37 +939,24 @@ static StateResult handle_data_state (
968
939
  ) {
969
940
  switch (c) {
970
941
  case '&':
971
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
972
- // The char_ref machinery expects to be on the & so it can mark that
973
- // and return to it if the text isn't a char ref, so we need to
974
- // reconsume it.
975
- tokenizer->_reconsume_current_input = true;
976
- return NEXT_CHAR;
942
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
943
+ set_mark(parser);
944
+ tokenizer->_return_state = GUMBO_LEX_DATA;
945
+ return CONTINUE;
977
946
  case '<':
978
947
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
979
- clear_temporary_buffer(parser);
980
- append_char_to_temporary_buffer(parser, '<');
981
- return NEXT_CHAR;
948
+ set_mark(parser);
949
+ return CONTINUE;
982
950
  case '\0':
983
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
984
- emit_char(parser, c, output);
985
- return RETURN_ERROR;
951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
952
+ return emit_char(parser, c, output);
953
+ case -1:
954
+ return emit_eof(parser, output);
986
955
  default:
987
- return emit_current_char(parser, output);
956
+ return emit_char(parser, c, output);
988
957
  }
989
958
  }
990
959
 
991
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
992
- static StateResult handle_char_ref_in_data_state (
993
- GumboParser* parser,
994
- GumboTokenizerState* UNUSED_ARG(tokenizer),
995
- int UNUSED_ARG(c),
996
- GumboToken* output
997
- ) {
998
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
999
- return emit_char_ref(parser, ' ', false, output);
1000
- }
1001
-
1002
960
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
1003
961
  static StateResult handle_rcdata_state (
1004
962
  GumboParser* parser,
@@ -1008,34 +966,23 @@ static StateResult handle_rcdata_state (
1008
966
  ) {
1009
967
  switch (c) {
1010
968
  case '&':
1011
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
1012
- tokenizer->_reconsume_current_input = true;
1013
- return NEXT_CHAR;
969
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
970
+ set_mark(parser);
971
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
972
+ return CONTINUE;
1014
973
  case '<':
1015
974
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
1016
- clear_temporary_buffer(parser);
1017
- append_char_to_temporary_buffer(parser, '<');
1018
- return NEXT_CHAR;
975
+ set_mark(parser);
976
+ return CONTINUE;
1019
977
  case '\0':
1020
978
  return emit_replacement_char(parser, output);
1021
979
  case -1:
1022
980
  return emit_eof(parser, output);
1023
981
  default:
1024
- return emit_current_char(parser, output);
982
+ return emit_char(parser, c, output);
1025
983
  }
1026
984
  }
1027
985
 
1028
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
1029
- static StateResult handle_char_ref_in_rcdata_state (
1030
- GumboParser* parser,
1031
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
- int UNUSED_ARG(c),
1033
- GumboToken* output
1034
- ) {
1035
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1036
- return emit_char_ref(parser, ' ', false, output);
1037
- }
1038
-
1039
986
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
1040
987
  static StateResult handle_rawtext_state (
1041
988
  GumboParser* parser,
@@ -1046,20 +993,19 @@ static StateResult handle_rawtext_state (
1046
993
  switch (c) {
1047
994
  case '<':
1048
995
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
1049
- clear_temporary_buffer(parser);
1050
- append_char_to_temporary_buffer(parser, '<');
1051
- return NEXT_CHAR;
996
+ set_mark(parser);
997
+ return CONTINUE;
1052
998
  case '\0':
1053
999
  return emit_replacement_char(parser, output);
1054
1000
  case -1:
1055
1001
  return emit_eof(parser, output);
1056
1002
  default:
1057
- return emit_current_char(parser, output);
1003
+ return emit_char(parser, c, output);
1058
1004
  }
1059
1005
  }
1060
1006
 
1061
1007
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1062
- static StateResult handle_script_state (
1008
+ static StateResult handle_script_data_state (
1063
1009
  GumboParser* parser,
1064
1010
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1065
1011
  int c,
@@ -1067,16 +1013,15 @@ static StateResult handle_script_state (
1067
1013
  ) {
1068
1014
  switch (c) {
1069
1015
  case '<':
1070
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
1071
- clear_temporary_buffer(parser);
1072
- append_char_to_temporary_buffer(parser, '<');
1073
- return NEXT_CHAR;
1016
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1017
+ set_mark(parser);
1018
+ return CONTINUE;
1074
1019
  case '\0':
1075
1020
  return emit_replacement_char(parser, output);
1076
1021
  case -1:
1077
1022
  return emit_eof(parser, output);
1078
1023
  default:
1079
- return emit_current_char(parser, output);
1024
+ return emit_char(parser, c, output);
1080
1025
  }
1081
1026
  }
1082
1027
 
@@ -1093,75 +1038,75 @@ static StateResult handle_plaintext_state (
1093
1038
  case -1:
1094
1039
  return emit_eof(parser, output);
1095
1040
  default:
1096
- return emit_current_char(parser, output);
1041
+ return emit_char(parser, c, output);
1097
1042
  }
1098
1043
  }
1099
1044
 
1100
1045
  // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1101
1046
  static StateResult handle_tag_open_state (
1102
1047
  GumboParser* parser,
1103
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1048
+ GumboTokenizerState* tokenizer,
1104
1049
  int c,
1105
1050
  GumboToken* output
1106
1051
  ) {
1107
- assert(temporary_buffer_equals(parser, "<"));
1108
1052
  switch (c) {
1109
1053
  case '!':
1110
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1111
1055
  clear_temporary_buffer(parser);
1112
- return NEXT_CHAR;
1056
+ return CONTINUE;
1113
1057
  case '/':
1114
1058
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1115
- append_char_to_temporary_buffer(parser, '/');
1116
- return NEXT_CHAR;
1059
+ return CONTINUE;
1117
1060
  case '?':
1118
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1061
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1119
1062
  clear_temporary_buffer(parser);
1120
- append_char_to_temporary_buffer(parser, '?');
1121
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1122
- return NEXT_CHAR;
1063
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1064
+ return CONTINUE;
1065
+ case -1:
1066
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1067
+ // Switch to data to emit EOF.
1068
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1069
+ return emit_from_mark(parser, output);
1123
1070
  default:
1124
1071
  if (is_alpha(c)) {
1125
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1072
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1126
1073
  start_new_tag(parser, true);
1127
- return NEXT_CHAR;
1128
- } else {
1129
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1130
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1131
- emit_temporary_buffer(parser, output);
1132
- return RETURN_ERROR;
1074
+ return CONTINUE;
1133
1075
  }
1076
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1077
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1078
+ return emit_from_mark(parser, output);
1134
1079
  }
1135
1080
  }
1136
1081
 
1137
1082
  // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1138
1083
  static StateResult handle_end_tag_open_state (
1139
1084
  GumboParser* parser,
1140
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1085
+ GumboTokenizerState* tokenizer,
1141
1086
  int c,
1142
1087
  GumboToken* output
1143
1088
  ) {
1144
- assert(temporary_buffer_equals(parser, "</"));
1145
1089
  switch (c) {
1146
1090
  case '>':
1147
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1091
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1148
1092
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1149
- return NEXT_CHAR;
1093
+ return CONTINUE;
1150
1094
  case -1:
1151
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1152
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1153
- return emit_temporary_buffer(parser, output);
1095
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1096
+ // Similar to the tag open state except we need to emit '<' and '/'
1097
+ // before the EOF.
1098
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1099
+ return emit_from_mark(parser, output);
1154
1100
  default:
1155
1101
  if (is_alpha(c)) {
1156
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1102
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1157
1103
  start_new_tag(parser, false);
1158
1104
  } else {
1159
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1160
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1105
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1106
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1161
1107
  clear_temporary_buffer(parser);
1162
- append_char_to_temporary_buffer(parser, c);
1163
1108
  }
1164
- return NEXT_CHAR;
1109
+ return CONTINUE;
1165
1110
  }
1166
1111
  }
1167
1112
 
@@ -1179,27 +1124,26 @@ static StateResult handle_tag_name_state (
1179
1124
  case ' ':
1180
1125
  finish_tag_name(parser);
1181
1126
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1182
- return NEXT_CHAR;
1127
+ return CONTINUE;
1183
1128
  case '/':
1184
1129
  finish_tag_name(parser);
1185
1130
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1186
- return NEXT_CHAR;
1131
+ return CONTINUE;
1187
1132
  case '>':
1188
1133
  finish_tag_name(parser);
1189
1134
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1190
1135
  return emit_current_tag(parser, output);
1191
1136
  case '\0':
1192
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1137
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1193
1138
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1194
- return NEXT_CHAR;
1139
+ return CONTINUE;
1195
1140
  case -1:
1196
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1141
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1197
1142
  abandon_current_tag(parser);
1198
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1199
- return NEXT_CHAR;
1143
+ return emit_eof(parser, output);
1200
1144
  default:
1201
1145
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1202
- return NEXT_CHAR;
1146
+ return CONTINUE;
1203
1147
  }
1204
1148
  }
1205
1149
 
@@ -1210,36 +1154,29 @@ static StateResult handle_rcdata_lt_state (
1210
1154
  int c,
1211
1155
  GumboToken* output
1212
1156
  ) {
1213
- assert(temporary_buffer_equals(parser, "<"));
1214
1157
  if (c == '/') {
1215
1158
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1216
- append_char_to_temporary_buffer(parser, '/');
1217
- return NEXT_CHAR;
1159
+ return CONTINUE;
1218
1160
  } else {
1219
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1220
- tokenizer->_reconsume_current_input = true;
1221
- return emit_temporary_buffer(parser, output);
1161
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1162
+ return emit_from_mark(parser, output);
1222
1163
  }
1223
1164
  }
1224
1165
 
1225
1166
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1226
1167
  static StateResult handle_rcdata_end_tag_open_state (
1227
1168
  GumboParser* parser,
1228
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1169
+ GumboTokenizerState* tokenizer,
1229
1170
  int c,
1230
1171
  GumboToken* output
1231
1172
  ) {
1232
- assert(temporary_buffer_equals(parser, "</"));
1233
1173
  if (is_alpha(c)) {
1234
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1174
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1235
1175
  start_new_tag(parser, false);
1236
- append_char_to_temporary_buffer(parser, c);
1237
- return NEXT_CHAR;
1238
- } else {
1239
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1240
- return emit_temporary_buffer(parser, output);
1176
+ return CONTINUE;
1241
1177
  }
1242
- return true;
1178
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1179
+ return emit_from_mark(parser, output);
1243
1180
  }
1244
1181
 
1245
1182
  // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
@@ -1250,33 +1187,39 @@ static StateResult handle_rcdata_end_tag_name_state (
1250
1187
  GumboToken* output
1251
1188
  ) {
1252
1189
  UNUSED_IF_NDEBUG(tokenizer);
1253
- assert(tokenizer->_temporary_buffer.length >= 2);
1254
1190
  if (is_alpha(c)) {
1255
1191
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1256
- append_char_to_temporary_buffer(parser, c);
1257
- return NEXT_CHAR;
1258
- } else if (is_appropriate_end_tag(parser)) {
1259
- switch (c) {
1260
- case '\t':
1261
- case '\n':
1262
- case '\f':
1263
- case ' ':
1264
- finish_tag_name(parser);
1265
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1266
- return NEXT_CHAR;
1267
- case '/':
1268
- finish_tag_name(parser);
1269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1270
- return NEXT_CHAR;
1271
- case '>':
1272
- finish_tag_name(parser);
1273
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1274
- return emit_current_tag(parser, output);
1192
+ return CONTINUE;
1193
+ }
1194
+ switch (c) {
1195
+ case '\t':
1196
+ case '\n':
1197
+ case '\f':
1198
+ case ' ':
1199
+ if (is_appropriate_end_tag(parser)) {
1200
+ finish_tag_name(parser);
1201
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1202
+ return CONTINUE;
1203
+ }
1204
+ break;
1205
+ case '/':
1206
+ if (is_appropriate_end_tag(parser)) {
1207
+ finish_tag_name(parser);
1208
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1209
+ return CONTINUE;
1210
+ }
1211
+ break;
1212
+ case '>':
1213
+ if (is_appropriate_end_tag(parser)) {
1214
+ finish_tag_name(parser);
1215
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216
+ return emit_current_tag(parser, output);
1275
1217
  }
1218
+ break;
1276
1219
  }
1277
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1278
1220
  abandon_current_tag(parser);
1279
- return emit_temporary_buffer(parser, output);
1221
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1222
+ return emit_from_mark(parser, output);
1280
1223
  }
1281
1224
 
1282
1225
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
@@ -1286,34 +1229,29 @@ static StateResult handle_rawtext_lt_state (
1286
1229
  int c,
1287
1230
  GumboToken* output
1288
1231
  ) {
1289
- assert(temporary_buffer_equals(parser, "<"));
1290
1232
  if (c == '/') {
1291
1233
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1292
- append_char_to_temporary_buffer(parser, '/');
1293
- return NEXT_CHAR;
1234
+ return CONTINUE;
1294
1235
  } else {
1295
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1296
- tokenizer->_reconsume_current_input = true;
1297
- return emit_temporary_buffer(parser, output);
1236
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1237
+ return emit_from_mark(parser, output);
1298
1238
  }
1299
1239
  }
1300
1240
 
1301
1241
  // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1302
1242
  static StateResult handle_rawtext_end_tag_open_state (
1303
1243
  GumboParser* parser,
1304
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1244
+ GumboTokenizerState* tokenizer,
1305
1245
  int c,
1306
1246
  GumboToken* output
1307
1247
  ) {
1308
- assert(temporary_buffer_equals(parser, "</"));
1309
1248
  if (is_alpha(c)) {
1310
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1249
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1311
1250
  start_new_tag(parser, false);
1312
- append_char_to_temporary_buffer(parser, c);
1313
- return NEXT_CHAR;
1251
+ return CONTINUE;
1314
1252
  } else {
1315
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1316
- return emit_temporary_buffer(parser, output);
1253
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1254
+ return emit_from_mark(parser, output);
1317
1255
  }
1318
1256
  }
1319
1257
 
@@ -1324,153 +1262,156 @@ static StateResult handle_rawtext_end_tag_name_state (
1324
1262
  int c,
1325
1263
  GumboToken* output
1326
1264
  ) {
1327
- assert(tokenizer->_temporary_buffer.length >= 2);
1328
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1329
- tokenizer->_tag_state._buffer.data);
1330
1265
  if (is_alpha(c)) {
1331
1266
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1332
- append_char_to_temporary_buffer(parser, c);
1333
- return NEXT_CHAR;
1334
- } else if (is_appropriate_end_tag(parser)) {
1335
- gumbo_debug("Is an appropriate end tag.\n");
1336
- switch (c) {
1337
- case '\t':
1338
- case '\n':
1339
- case '\f':
1340
- case ' ':
1341
- finish_tag_name(parser);
1342
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1343
- return NEXT_CHAR;
1344
- case '/':
1345
- finish_tag_name(parser);
1346
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1347
- return NEXT_CHAR;
1348
- case '>':
1349
- finish_tag_name(parser);
1350
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1351
- return emit_current_tag(parser, output);
1267
+ return CONTINUE;
1268
+ }
1269
+ switch (c) {
1270
+ case '\t':
1271
+ case '\n':
1272
+ case '\f':
1273
+ case ' ':
1274
+ if (is_appropriate_end_tag(parser)) {
1275
+ finish_tag_name(parser);
1276
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1277
+ return CONTINUE;
1278
+ }
1279
+ break;
1280
+ case '/':
1281
+ if (is_appropriate_end_tag(parser)) {
1282
+ finish_tag_name(parser);
1283
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1284
+ return CONTINUE;
1285
+ }
1286
+ break;
1287
+ case '>':
1288
+ if (is_appropriate_end_tag(parser)) {
1289
+ finish_tag_name(parser);
1290
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1291
+ return emit_current_tag(parser, output);
1352
1292
  }
1293
+ break;
1353
1294
  }
1354
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1355
1295
  abandon_current_tag(parser);
1356
- return emit_temporary_buffer(parser, output);
1296
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1297
+ return emit_from_mark(parser, output);
1357
1298
  }
1358
1299
 
1359
1300
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1360
- static StateResult handle_script_lt_state (
1301
+ static StateResult handle_script_data_lt_state (
1361
1302
  GumboParser* parser,
1362
1303
  GumboTokenizerState* tokenizer,
1363
1304
  int c,
1364
1305
  GumboToken* output
1365
1306
  ) {
1366
- assert(temporary_buffer_equals(parser, "<"));
1367
1307
  if (c == '/') {
1368
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1369
- append_char_to_temporary_buffer(parser, '/');
1370
- return NEXT_CHAR;
1371
- } else if (c == '!') {
1372
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1373
- append_char_to_temporary_buffer(parser, '!');
1374
- return emit_temporary_buffer(parser, output);
1375
- } else {
1376
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1377
- tokenizer->_reconsume_current_input = true;
1378
- return emit_temporary_buffer(parser, output);
1308
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1309
+ return CONTINUE;
1310
+ }
1311
+ if (c == '!') {
1312
+ // This is the only place we don't reconsume the input before emitting the
1313
+ // temporary buffer. Since the current position is stored and the current
1314
+ // character is not emitted, we need to advance the input and then
1315
+ // reconsume.
1316
+ utf8iterator_next(&tokenizer->_input);
1317
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1318
+ return emit_from_mark(parser, output);
1379
1319
  }
1320
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1321
+ return emit_from_mark(parser, output);
1380
1322
  }
1381
1323
 
1382
1324
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1383
- static StateResult handle_script_end_tag_open_state (
1325
+ static StateResult handle_script_data_end_tag_open_state (
1384
1326
  GumboParser* parser,
1385
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1327
+ GumboTokenizerState* tokenizer,
1386
1328
  int c,
1387
1329
  GumboToken* output
1388
1330
  ) {
1389
- assert(temporary_buffer_equals(parser, "</"));
1390
1331
  if (is_alpha(c)) {
1391
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1332
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1392
1333
  start_new_tag(parser, false);
1393
- append_char_to_temporary_buffer(parser, c);
1394
- return NEXT_CHAR;
1395
- } else {
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
- return emit_temporary_buffer(parser, output);
1334
+ return CONTINUE;
1398
1335
  }
1336
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1337
+ return emit_from_mark(parser, output);
1399
1338
  }
1400
1339
 
1401
1340
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1402
- static StateResult handle_script_end_tag_name_state (
1341
+ static StateResult handle_script_data_end_tag_name_state (
1403
1342
  GumboParser* parser,
1404
1343
  GumboTokenizerState* tokenizer,
1405
1344
  int c,
1406
1345
  GumboToken* output
1407
1346
  ) {
1408
- UNUSED_IF_NDEBUG(tokenizer);
1409
- assert(tokenizer->_temporary_buffer.length >= 2);
1410
1347
  if (is_alpha(c)) {
1411
1348
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1412
- append_char_to_temporary_buffer(parser, c);
1413
- return NEXT_CHAR;
1414
- } else if (is_appropriate_end_tag(parser)) {
1415
- switch (c) {
1416
- case '\t':
1417
- case '\n':
1418
- case '\f':
1419
- case ' ':
1420
- finish_tag_name(parser);
1421
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1422
- return NEXT_CHAR;
1423
- case '/':
1424
- finish_tag_name(parser);
1425
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1426
- return NEXT_CHAR;
1427
- case '>':
1428
- finish_tag_name(parser);
1429
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1430
- return emit_current_tag(parser, output);
1349
+ return CONTINUE;
1350
+ }
1351
+ switch (c) {
1352
+ case '\t':
1353
+ case '\n':
1354
+ case '\f':
1355
+ case ' ':
1356
+ if (is_appropriate_end_tag(parser)) {
1357
+ finish_tag_name(parser);
1358
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1359
+ return CONTINUE;
1360
+ }
1361
+ break;
1362
+ case '/':
1363
+ if (is_appropriate_end_tag(parser)) {
1364
+ finish_tag_name(parser);
1365
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1366
+ return CONTINUE;
1367
+ }
1368
+ break;
1369
+ case '>':
1370
+ if (is_appropriate_end_tag(parser)) {
1371
+ finish_tag_name(parser);
1372
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1373
+ return emit_current_tag(parser, output);
1431
1374
  }
1375
+ break;
1432
1376
  }
1433
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1434
1377
  abandon_current_tag(parser);
1435
- return emit_temporary_buffer(parser, output);
1378
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1379
+ return emit_from_mark(parser, output);
1436
1380
  }
1437
1381
 
1438
1382
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1439
- static StateResult handle_script_escaped_start_state (
1383
+ static StateResult handle_script_data_escaped_start_state (
1440
1384
  GumboParser* parser,
1441
1385
  GumboTokenizerState* tokenizer,
1442
1386
  int c,
1443
1387
  GumboToken* output
1444
1388
  ) {
1445
1389
  if (c == '-') {
1446
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1447
- return emit_current_char(parser, output);
1448
- } else {
1449
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1450
- tokenizer->_reconsume_current_input = true;
1451
- return NEXT_CHAR;
1390
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1391
+ return emit_char(parser, c, output);
1452
1392
  }
1393
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1394
+ return CONTINUE;
1453
1395
  }
1454
1396
 
1455
1397
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1456
- static StateResult handle_script_escaped_start_dash_state (
1398
+ static StateResult handle_script_data_escaped_start_dash_state (
1457
1399
  GumboParser* parser,
1458
1400
  GumboTokenizerState* tokenizer,
1459
1401
  int c,
1460
1402
  GumboToken* output
1461
1403
  ) {
1462
1404
  if (c == '-') {
1463
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1464
- return emit_current_char(parser, output);
1405
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1406
+ return emit_char(parser, c, output);
1465
1407
  } else {
1466
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1467
- tokenizer->_reconsume_current_input = true;
1468
- return NEXT_CHAR;
1408
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1409
+ return CONTINUE;
1469
1410
  }
1470
1411
  }
1471
1412
 
1472
1413
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1473
- static StateResult handle_script_escaped_state (
1414
+ static StateResult handle_script_data_escaped_state (
1474
1415
  GumboParser* parser,
1475
1416
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1476
1417
  int c,
@@ -1478,25 +1419,25 @@ static StateResult handle_script_escaped_state (
1478
1419
  ) {
1479
1420
  switch (c) {
1480
1421
  case '-':
1481
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1482
- return emit_current_char(parser, output);
1422
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1423
+ return emit_char(parser, c, output);
1483
1424
  case '<':
1484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1425
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1485
1426
  clear_temporary_buffer(parser);
1486
- append_char_to_temporary_buffer(parser, c);
1487
- return NEXT_CHAR;
1427
+ set_mark(parser);
1428
+ return CONTINUE;
1488
1429
  case '\0':
1489
1430
  return emit_replacement_char(parser, output);
1490
1431
  case -1:
1491
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1432
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1492
1433
  return emit_eof(parser, output);
1493
1434
  default:
1494
- return emit_current_char(parser, output);
1435
+ return emit_char(parser, c, output);
1495
1436
  }
1496
1437
  }
1497
1438
 
1498
1439
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1499
- static StateResult handle_script_escaped_dash_state (
1440
+ static StateResult handle_script_data_escaped_dash_state (
1500
1441
  GumboParser* parser,
1501
1442
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1502
1443
  int c,
@@ -1504,28 +1445,27 @@ static StateResult handle_script_escaped_dash_state (
1504
1445
  ) {
1505
1446
  switch (c) {
1506
1447
  case '-':
1507
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1508
- return emit_current_char(parser, output);
1448
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1449
+ return emit_char(parser, c, output);
1509
1450
  case '<':
1510
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1451
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1511
1452
  clear_temporary_buffer(parser);
1512
- append_char_to_temporary_buffer(parser, c);
1513
- return NEXT_CHAR;
1453
+ set_mark(parser);
1454
+ return CONTINUE;
1514
1455
  case '\0':
1515
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1456
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1516
1457
  return emit_replacement_char(parser, output);
1517
1458
  case -1:
1518
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1519
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1520
- return NEXT_CHAR;
1459
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1460
+ return emit_eof(parser, output);
1521
1461
  default:
1522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1523
- return emit_current_char(parser, output);
1462
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1463
+ return emit_char(parser, c, output);
1524
1464
  }
1525
1465
  }
1526
1466
 
1527
1467
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1528
- static StateResult handle_script_escaped_dash_dash_state (
1468
+ static StateResult handle_script_data_escaped_dash_dash_state (
1529
1469
  GumboParser* parser,
1530
1470
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1531
1471
  int c,
@@ -1533,113 +1473,107 @@ static StateResult handle_script_escaped_dash_dash_state (
1533
1473
  ) {
1534
1474
  switch (c) {
1535
1475
  case '-':
1536
- return emit_current_char(parser, output);
1476
+ return emit_char(parser, c, output);
1537
1477
  case '<':
1538
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1478
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1539
1479
  clear_temporary_buffer(parser);
1540
- append_char_to_temporary_buffer(parser, c);
1541
- return NEXT_CHAR;
1480
+ set_mark(parser);
1481
+ return CONTINUE;
1542
1482
  case '>':
1543
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
- return emit_current_char(parser, output);
1483
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1484
+ return emit_char(parser, c, output);
1545
1485
  case '\0':
1546
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1486
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1547
1487
  return emit_replacement_char(parser, output);
1548
1488
  case -1:
1549
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
- return NEXT_CHAR;
1489
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1490
+ return emit_eof(parser, output);
1552
1491
  default:
1553
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1554
- return emit_current_char(parser, output);
1492
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1493
+ return emit_char(parser, c, output);
1555
1494
  }
1556
1495
  }
1557
1496
 
1558
1497
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1559
- static StateResult handle_script_escaped_lt_state (
1498
+ static StateResult handle_script_data_escaped_lt_state (
1560
1499
  GumboParser* parser,
1561
1500
  GumboTokenizerState* tokenizer,
1562
1501
  int c,
1563
1502
  GumboToken* output
1564
1503
  ) {
1565
- assert(temporary_buffer_equals(parser, "<"));
1566
- assert(!tokenizer->_script_data_buffer.length);
1504
+ assert(temporary_buffer_is_empty(parser));
1567
1505
  if (c == '/') {
1568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1569
- append_char_to_temporary_buffer(parser, c);
1570
- return NEXT_CHAR;
1571
- } else if (is_alpha(c)) {
1572
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1573
- append_char_to_temporary_buffer(parser, c);
1574
- gumbo_string_buffer_append_codepoint (
1575
- ensure_lowercase(c),
1576
- &tokenizer->_script_data_buffer
1577
- );
1578
- return emit_temporary_buffer(parser, output);
1579
- } else {
1580
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1581
- return emit_temporary_buffer(parser, output);
1506
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1507
+ return CONTINUE;
1508
+ }
1509
+ if (is_alpha(c)) {
1510
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1511
+ return emit_from_mark(parser, output);
1582
1512
  }
1513
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1514
+ return emit_from_mark(parser, output);
1583
1515
  }
1584
1516
 
1585
1517
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1586
- static StateResult handle_script_escaped_end_tag_open_state (
1518
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1587
1519
  GumboParser* parser,
1588
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1520
+ GumboTokenizerState* tokenizer,
1589
1521
  int c,
1590
1522
  GumboToken* output
1591
1523
  ) {
1592
- assert(temporary_buffer_equals(parser, "</"));
1593
1524
  if (is_alpha(c)) {
1594
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1525
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1595
1526
  start_new_tag(parser, false);
1596
- append_char_to_temporary_buffer(parser, c);
1597
- return NEXT_CHAR;
1598
- } else {
1599
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1600
- return emit_temporary_buffer(parser, output);
1527
+ return CONTINUE;
1601
1528
  }
1529
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1530
+ return emit_from_mark(parser, output);
1602
1531
  }
1603
1532
 
1604
1533
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1605
- static StateResult handle_script_escaped_end_tag_name_state (
1534
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1606
1535
  GumboParser* parser,
1607
1536
  GumboTokenizerState* tokenizer,
1608
1537
  int c,
1609
1538
  GumboToken* output
1610
1539
  ) {
1611
- UNUSED_IF_NDEBUG(tokenizer);
1612
- assert(tokenizer->_temporary_buffer.length >= 2);
1613
1540
  if (is_alpha(c)) {
1614
1541
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1615
- append_char_to_temporary_buffer(parser, c);
1616
- return NEXT_CHAR;
1617
- } else if (is_appropriate_end_tag(parser)) {
1618
- switch (c) {
1619
- case '\t':
1620
- case '\n':
1621
- case '\f':
1622
- case ' ':
1623
- finish_tag_name(parser);
1624
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1625
- return NEXT_CHAR;
1626
- case '/':
1627
- finish_tag_name(parser);
1628
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1629
- return NEXT_CHAR;
1630
- case '>':
1631
- finish_tag_name(parser);
1632
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1633
- return emit_current_tag(parser, output);
1542
+ return CONTINUE;
1543
+ }
1544
+ switch (c) {
1545
+ case '\t':
1546
+ case '\n':
1547
+ case '\f':
1548
+ case ' ':
1549
+ if (is_appropriate_end_tag(parser)) {
1550
+ finish_tag_name(parser);
1551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1552
+ return CONTINUE;
1553
+ }
1554
+ break;
1555
+ case '/':
1556
+ if (is_appropriate_end_tag(parser)) {
1557
+ finish_tag_name(parser);
1558
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1559
+ return CONTINUE;
1560
+ }
1561
+ break;
1562
+ case '>':
1563
+ if (is_appropriate_end_tag(parser)) {
1564
+ finish_tag_name(parser);
1565
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1566
+ return emit_current_tag(parser, output);
1634
1567
  }
1568
+ break;
1635
1569
  }
1636
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1637
1570
  abandon_current_tag(parser);
1638
- return emit_temporary_buffer(parser, output);
1571
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1572
+ return emit_from_mark(parser, output);
1639
1573
  }
1640
1574
 
1641
1575
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1642
- static StateResult handle_script_double_escaped_start_state (
1576
+ static StateResult handle_script_data_double_escaped_start_state (
1643
1577
  GumboParser* parser,
1644
1578
  GumboTokenizerState* tokenizer,
1645
1579
  int c,
@@ -1656,29 +1590,23 @@ static StateResult handle_script_double_escaped_start_state (
1656
1590
  parser,
1657
1591
  gumbo_string_equals (
1658
1592
  &kScriptTag,
1659
- (GumboStringPiece*) &tokenizer->_script_data_buffer
1593
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1660
1594
  )
1661
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1662
- : GUMBO_LEX_SCRIPT_ESCAPED
1595
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1596
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1663
1597
  );
1664
- return emit_current_char(parser, output);
1665
- default:
1666
- if (is_alpha(c)) {
1667
- gumbo_string_buffer_append_codepoint (
1668
- ensure_lowercase(c),
1669
- &tokenizer->_script_data_buffer
1670
- );
1671
- return emit_current_char(parser, output);
1672
- } else {
1673
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1674
- tokenizer->_reconsume_current_input = true;
1675
- return NEXT_CHAR;
1676
- }
1598
+ return emit_char(parser, c, output);
1599
+ }
1600
+ if (is_alpha(c)) {
1601
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1602
+ return emit_char(parser, c, output);
1677
1603
  }
1604
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1605
+ return CONTINUE;
1678
1606
  }
1679
1607
 
1680
1608
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1681
- static StateResult handle_script_double_escaped_state (
1609
+ static StateResult handle_script_data_double_escaped_state (
1682
1610
  GumboParser* parser,
1683
1611
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1684
1612
  int c,
@@ -1686,24 +1614,23 @@ static StateResult handle_script_double_escaped_state (
1686
1614
  ) {
1687
1615
  switch (c) {
1688
1616
  case '-':
1689
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1690
- return emit_current_char(parser, output);
1617
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1618
+ return emit_char(parser, c, output);
1691
1619
  case '<':
1692
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1693
- return emit_current_char(parser, output);
1620
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1621
+ return emit_char(parser, c, output);
1694
1622
  case '\0':
1695
1623
  return emit_replacement_char(parser, output);
1696
1624
  case -1:
1697
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1698
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
- return NEXT_CHAR;
1625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1626
+ return emit_eof(parser, output);
1700
1627
  default:
1701
- return emit_current_char(parser, output);
1628
+ return emit_char(parser, c, output);
1702
1629
  }
1703
1630
  }
1704
1631
 
1705
1632
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1706
- static StateResult handle_script_double_escaped_dash_state (
1633
+ static StateResult handle_script_data_double_escaped_dash_state (
1707
1634
  GumboParser* parser,
1708
1635
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1709
1636
  int c,
@@ -1712,26 +1639,25 @@ static StateResult handle_script_double_escaped_dash_state (
1712
1639
  switch (c) {
1713
1640
  case '-':
1714
1641
  gumbo_tokenizer_set_state(
1715
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1716
- return emit_current_char(parser, output);
1642
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1643
+ return emit_char(parser, c, output);
1717
1644
  case '<':
1718
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1719
- return emit_current_char(parser, output);
1645
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1646
+ return emit_char(parser, c, output);
1720
1647
  case '\0':
1721
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1722
1649
  return emit_replacement_char(parser, output);
1723
1650
  case -1:
1724
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1726
- return NEXT_CHAR;
1651
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1652
+ return emit_eof(parser, output);
1727
1653
  default:
1728
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1729
- return emit_current_char(parser, output);
1654
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1655
+ return emit_char(parser, c, output);
1730
1656
  }
1731
1657
  }
1732
1658
 
1733
1659
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1734
- static StateResult handle_script_double_escaped_dash_dash_state (
1660
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1735
1661
  GumboParser* parser,
1736
1662
  GumboTokenizerState* UNUSED_ARG(tokenizer),
1737
1663
  int c,
@@ -1739,46 +1665,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
1739
1665
  ) {
1740
1666
  switch (c) {
1741
1667
  case '-':
1742
- return emit_current_char(parser, output);
1668
+ return emit_char(parser, c, output);
1743
1669
  case '<':
1744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1745
- return emit_current_char(parser, output);
1670
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1671
+ return emit_char(parser, c, output);
1746
1672
  case '>':
1747
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1748
- return emit_current_char(parser, output);
1673
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1674
+ return emit_char(parser, c, output);
1749
1675
  case '\0':
1750
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1676
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1751
1677
  return emit_replacement_char(parser, output);
1752
1678
  case -1:
1753
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1754
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1755
- return NEXT_CHAR;
1679
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1680
+ return emit_eof(parser, output);
1756
1681
  default:
1757
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1758
- return emit_current_char(parser, output);
1682
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1683
+ return emit_char(parser, c, output);
1759
1684
  }
1760
1685
  }
1761
1686
 
1762
1687
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1763
- static StateResult handle_script_double_escaped_lt_state (
1688
+ static StateResult handle_script_data_double_escaped_lt_state (
1764
1689
  GumboParser* parser,
1765
1690
  GumboTokenizerState* tokenizer,
1766
1691
  int c,
1767
1692
  GumboToken* output
1768
1693
  ) {
1769
1694
  if (c == '/') {
1770
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1771
- gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
1772
- return emit_current_char(parser, output);
1695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1696
+ clear_temporary_buffer(parser);
1697
+ return emit_char(parser, c, output);
1773
1698
  } else {
1774
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1775
- tokenizer->_reconsume_current_input = true;
1776
- return NEXT_CHAR;
1699
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1700
+ return CONTINUE;
1777
1701
  }
1778
1702
  }
1779
1703
 
1780
1704
  // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1781
- static StateResult handle_script_double_escaped_end_state (
1705
+ static StateResult handle_script_data_double_escaped_end_state (
1782
1706
  GumboParser* parser,
1783
1707
  GumboTokenizerState* tokenizer,
1784
1708
  int c,
@@ -1793,29 +1717,23 @@ static StateResult handle_script_double_escaped_end_state (
1793
1717
  case '>':
1794
1718
  gumbo_tokenizer_set_state(
1795
1719
  parser, gumbo_string_equals(&kScriptTag,
1796
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1797
- ? GUMBO_LEX_SCRIPT_ESCAPED
1798
- : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1799
- return emit_current_char(parser, output);
1800
- default:
1801
- if (is_alpha(c)) {
1802
- gumbo_string_buffer_append_codepoint (
1803
- ensure_lowercase(c),
1804
- &tokenizer->_script_data_buffer
1805
- );
1806
- return emit_current_char(parser, output);
1807
- } else {
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1809
- tokenizer->_reconsume_current_input = true;
1810
- return NEXT_CHAR;
1811
- }
1720
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1721
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1722
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1723
+ return emit_char(parser, c, output);
1724
+ }
1725
+ if (is_alpha(c)) {
1726
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1727
+ return emit_char(parser, c, output);
1812
1728
  }
1729
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1730
+ return CONTINUE;
1813
1731
  }
1814
1732
 
1815
1733
  // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1816
1734
  static StateResult handle_before_attr_name_state (
1817
1735
  GumboParser* parser,
1818
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1736
+ GumboTokenizerState* tokenizer,
1819
1737
  int c,
1820
1738
  GumboToken* output
1821
1739
  ) {
@@ -1824,40 +1742,27 @@ static StateResult handle_before_attr_name_state (
1824
1742
  case '\n':
1825
1743
  case '\f':
1826
1744
  case ' ':
1827
- return NEXT_CHAR;
1745
+ return CONTINUE;
1828
1746
  case '/':
1829
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1830
- return NEXT_CHAR;
1831
1747
  case '>':
1832
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1833
- return emit_current_tag(parser, output);
1834
- case '\0':
1835
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1836
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1837
- append_char_to_temporary_buffer(parser, 0xfffd);
1838
- return NEXT_CHAR;
1839
1748
  case -1:
1840
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1841
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1842
- abandon_current_tag(parser);
1843
- return NEXT_CHAR;
1844
- case '"':
1845
- case '\'':
1846
- case '<':
1749
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1750
+ return CONTINUE;
1847
1751
  case '=':
1848
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1849
- // Fall through.
1850
- default:
1752
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1851
1753
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1852
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1853
- return NEXT_CHAR;
1754
+ append_char_to_tag_buffer(parser, c, true);
1755
+ return CONTINUE;
1756
+ default:
1757
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1758
+ return CONTINUE;
1854
1759
  }
1855
1760
  }
1856
1761
 
1857
1762
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1858
1763
  static StateResult handle_attr_name_state (
1859
1764
  GumboParser* parser,
1860
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1765
+ GumboTokenizerState* tokenizer,
1861
1766
  int c,
1862
1767
  GumboToken* output
1863
1768
  ) {
@@ -1866,45 +1771,35 @@ static StateResult handle_attr_name_state (
1866
1771
  case '\n':
1867
1772
  case '\f':
1868
1773
  case ' ':
1869
- finish_attribute_name(parser);
1870
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1871
- return NEXT_CHAR;
1872
1774
  case '/':
1775
+ case '>':
1776
+ case -1:
1873
1777
  finish_attribute_name(parser);
1874
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1875
- return NEXT_CHAR;
1778
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1779
+ return CONTINUE;
1876
1780
  case '=':
1877
1781
  finish_attribute_name(parser);
1878
1782
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1879
- return NEXT_CHAR;
1880
- case '>':
1881
- finish_attribute_name(parser);
1882
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1883
- return emit_current_tag(parser, output);
1783
+ return CONTINUE;
1884
1784
  case '\0':
1885
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1886
1786
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1887
- return NEXT_CHAR;
1888
- case -1:
1889
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
- abandon_current_tag(parser);
1891
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1892
- return NEXT_CHAR;
1787
+ return CONTINUE;
1893
1788
  case '"':
1894
1789
  case '\'':
1895
1790
  case '<':
1896
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1791
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1897
1792
  // Fall through.
1898
1793
  default:
1899
1794
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1900
- return NEXT_CHAR;
1795
+ return CONTINUE;
1901
1796
  }
1902
1797
  }
1903
1798
 
1904
1799
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1905
1800
  static StateResult handle_after_attr_name_state (
1906
1801
  GumboParser* parser,
1907
- GumboTokenizerState* UNUSED_ARG(tokenizer),
1802
+ GumboTokenizerState* tokenizer,
1908
1803
  int c,
1909
1804
  GumboToken* output
1910
1805
  ) {
@@ -1913,35 +1808,23 @@ static StateResult handle_after_attr_name_state (
1913
1808
  case '\n':
1914
1809
  case '\f':
1915
1810
  case ' ':
1916
- return NEXT_CHAR;
1811
+ return CONTINUE;
1917
1812
  case '/':
1918
1813
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1919
- return NEXT_CHAR;
1814
+ return CONTINUE;
1920
1815
  case '=':
1921
1816
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1922
- return NEXT_CHAR;
1817
+ return CONTINUE;
1923
1818
  case '>':
1924
1819
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1925
1820
  return emit_current_tag(parser, output);
1926
- case '\0':
1927
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1928
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1929
- append_char_to_temporary_buffer(parser, 0xfffd);
1930
- return NEXT_CHAR;
1931
1821
  case -1:
1932
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1822
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1934
1823
  abandon_current_tag(parser);
1935
- return NEXT_CHAR;
1936
- case '"':
1937
- case '\'':
1938
- case '<':
1939
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1940
- // Fall through.
1824
+ return emit_eof(parser, output);
1941
1825
  default:
1942
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1943
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1944
- return NEXT_CHAR;
1826
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1827
+ return CONTINUE;
1945
1828
  }
1946
1829
  }
1947
1830
 
@@ -1957,45 +1840,22 @@ static StateResult handle_before_attr_value_state (
1957
1840
  case '\n':
1958
1841
  case '\f':
1959
1842
  case ' ':
1960
- return NEXT_CHAR;
1843
+ return CONTINUE;
1961
1844
  case '"':
1962
1845
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1963
1846
  reset_tag_buffer_start_point(parser);
1964
- return NEXT_CHAR;
1965
- case '&':
1966
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1967
- tokenizer->_reconsume_current_input = true;
1968
- return NEXT_CHAR;
1847
+ return CONTINUE;
1969
1848
  case '\'':
1970
1849
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1971
1850
  reset_tag_buffer_start_point(parser);
1972
- return NEXT_CHAR;
1973
- case '\0':
1974
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1975
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1976
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1977
- return NEXT_CHAR;
1978
- case -1:
1979
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1980
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1981
- abandon_current_tag(parser);
1982
- tokenizer->_reconsume_current_input = true;
1983
- return NEXT_CHAR;
1851
+ return CONTINUE;
1984
1852
  case '>':
1985
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1853
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1986
1854
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1987
- emit_current_tag(parser, output);
1988
- return RETURN_ERROR;
1989
- case '<':
1990
- case '=':
1991
- case '`':
1992
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1993
- // Fall through.
1994
- default:
1995
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1996
- append_char_to_tag_buffer(parser, c, true);
1997
- return NEXT_CHAR;
1855
+ return emit_current_tag(parser, output);
1998
1856
  }
1857
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1858
+ return CONTINUE;
1999
1859
  }
2000
1860
 
2001
1861
  // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
@@ -2003,30 +1863,28 @@ static StateResult handle_attr_value_double_quoted_state (
2003
1863
  GumboParser* parser,
2004
1864
  GumboTokenizerState* tokenizer,
2005
1865
  int c,
2006
- GumboToken* UNUSED_ARG(output)
1866
+ GumboToken* output
2007
1867
  ) {
2008
1868
  switch (c) {
2009
1869
  case '"':
2010
1870
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2011
- return NEXT_CHAR;
1871
+ return CONTINUE;
2012
1872
  case '&':
2013
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2014
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2015
- tokenizer->_reconsume_current_input = true;
2016
- return NEXT_CHAR;
1873
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1874
+ set_mark(parser);
1875
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1876
+ return CONTINUE;
2017
1877
  case '\0':
2018
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1878
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2019
1879
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2020
- return NEXT_CHAR;
1880
+ return CONTINUE;
2021
1881
  case -1:
2022
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1882
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2024
1883
  abandon_current_tag(parser);
2025
- tokenizer->_reconsume_current_input = true;
2026
- return NEXT_CHAR;
1884
+ return emit_eof(parser, output);
2027
1885
  default:
2028
1886
  append_char_to_tag_buffer(parser, c, false);
2029
- return NEXT_CHAR;
1887
+ return CONTINUE;
2030
1888
  }
2031
1889
  }
2032
1890
 
@@ -2035,30 +1893,28 @@ static StateResult handle_attr_value_single_quoted_state (
2035
1893
  GumboParser* parser,
2036
1894
  GumboTokenizerState* tokenizer,
2037
1895
  int c,
2038
- GumboToken* UNUSED_ARG(output)
1896
+ GumboToken* output
2039
1897
  ) {
2040
1898
  switch (c) {
2041
1899
  case '\'':
2042
1900
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
2043
- return NEXT_CHAR;
1901
+ return CONTINUE;
2044
1902
  case '&':
2045
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2046
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2047
- tokenizer->_reconsume_current_input = true;
2048
- return NEXT_CHAR;
1903
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1904
+ set_mark(parser);
1905
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1906
+ return CONTINUE;
2049
1907
  case '\0':
2050
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2051
1909
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
2052
- return NEXT_CHAR;
1910
+ return CONTINUE;
2053
1911
  case -1:
2054
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
2055
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1912
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2056
1913
  abandon_current_tag(parser);
2057
- tokenizer->_reconsume_current_input = true;
2058
- return NEXT_CHAR;
1914
+ return emit_eof(parser, output);
2059
1915
  default:
2060
1916
  append_char_to_tag_buffer(parser, c, false);
2061
- return NEXT_CHAR;
1917
+ return CONTINUE;
2062
1918
  }
2063
1919
  }
2064
1920
 
@@ -2076,89 +1932,35 @@ static StateResult handle_attr_value_unquoted_state (
2076
1932
  case ' ':
2077
1933
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2078
1934
  finish_attribute_value(parser);
2079
- return NEXT_CHAR;
1935
+ return CONTINUE;
2080
1936
  case '&':
2081
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
2082
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
2083
- tokenizer->_reconsume_current_input = true;
2084
- return NEXT_CHAR;
1937
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1938
+ set_mark(parser);
1939
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1940
+ return CONTINUE;
2085
1941
  case '>':
2086
1942
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2087
1943
  finish_attribute_value(parser);
2088
1944
  return emit_current_tag(parser, output);
2089
1945
  case '\0':
2090
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1946
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2091
1947
  append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
2092
- return NEXT_CHAR;
1948
+ return CONTINUE;
2093
1949
  case -1:
2094
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
2095
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2096
- tokenizer->_reconsume_current_input = true;
1950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2097
1951
  abandon_current_tag(parser);
2098
- return NEXT_CHAR;
2099
- case '<':
2100
- case '=':
1952
+ return emit_eof(parser, output);
2101
1953
  case '"':
2102
1954
  case '\'':
1955
+ case '<':
1956
+ case '=':
2103
1957
  case '`':
2104
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1958
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
2105
1959
  // Fall through.
2106
1960
  default:
2107
1961
  append_char_to_tag_buffer(parser, c, true);
2108
- return NEXT_CHAR;
2109
- }
2110
- }
2111
-
2112
- // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
2113
- static StateResult handle_char_ref_in_attr_value_state (
2114
- GumboParser* parser,
2115
- GumboTokenizerState* tokenizer,
2116
- int UNUSED_ARG(c),
2117
- GumboToken* UNUSED_ARG(output)
2118
- ) {
2119
- OneOrTwoCodepoints char_ref;
2120
- int allowed_char;
2121
- bool is_unquoted = false;
2122
- switch (tokenizer->_tag_state._attr_value_state) {
2123
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
2124
- allowed_char = '"';
2125
- break;
2126
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
2127
- allowed_char = '\'';
2128
- break;
2129
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
2130
- allowed_char = '>';
2131
- is_unquoted = true;
2132
- break;
2133
- default:
2134
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
2135
- // get that the assert(0) means this codepath will never happen.
2136
- allowed_char = ' ';
2137
- assert(0);
1962
+ return CONTINUE;
2138
1963
  }
2139
-
2140
- // Ignore the status, since we don't have a convenient way of signalling that
2141
- // a parser error has occurred when the error occurs in the middle of a
2142
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
2143
- // but that's a low priority fix.
2144
- gumbo_consume_char_ref (
2145
- parser,
2146
- &tokenizer->_input,
2147
- allowed_char,
2148
- true,
2149
- &char_ref
2150
- );
2151
- if (char_ref.first != kGumboNoChar) {
2152
- tokenizer->_reconsume_current_input = true;
2153
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
2154
- if (char_ref.second != kGumboNoChar) {
2155
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
2156
- }
2157
- } else {
2158
- append_char_to_tag_buffer(parser, '&', is_unquoted);
2159
- }
2160
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
2161
- return NEXT_CHAR;
2162
1964
  }
2163
1965
 
2164
1966
  // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
@@ -2175,24 +1977,21 @@ static StateResult handle_after_attr_value_quoted_state (
2175
1977
  case '\f':
2176
1978
  case ' ':
2177
1979
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2178
- return NEXT_CHAR;
1980
+ return CONTINUE;
2179
1981
  case '/':
2180
1982
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
2181
- return NEXT_CHAR;
1983
+ return CONTINUE;
2182
1984
  case '>':
2183
1985
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2184
1986
  return emit_current_tag(parser, output);
2185
1987
  case -1:
2186
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
2187
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1988
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2188
1989
  abandon_current_tag(parser);
2189
- tokenizer->_reconsume_current_input = true;
2190
- return NEXT_CHAR;
1990
+ return emit_eof(parser, output);
2191
1991
  default:
2192
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
2193
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2194
- tokenizer->_reconsume_current_input = true;
2195
- return NEXT_CHAR;
1992
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1993
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1994
+ return CONTINUE;
2196
1995
  }
2197
1996
  }
2198
1997
 
@@ -2209,15 +2008,13 @@ static StateResult handle_self_closing_start_tag_state (
2209
2008
  tokenizer->_tag_state._is_self_closing = true;
2210
2009
  return emit_current_tag(parser, output);
2211
2010
  case -1:
2212
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
2213
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2011
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2214
2012
  abandon_current_tag(parser);
2215
- return NEXT_CHAR;
2013
+ return emit_eof(parser, output);
2216
2014
  default:
2217
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
2218
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2219
- tokenizer->_reconsume_current_input = true;
2220
- return NEXT_CHAR;
2015
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2016
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2017
+ return CONTINUE;
2221
2018
  }
2222
2019
  }
2223
2020
 
@@ -2228,21 +2025,27 @@ static StateResult handle_bogus_comment_state (
2228
2025
  int c,
2229
2026
  GumboToken* output
2230
2027
  ) {
2231
- while (c != '>' && c != -1) {
2232
- if (c == '\0') {
2233
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
- c = 0xFFFD;
2235
- }
2028
+ switch (c) {
2029
+ case '>':
2030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2031
+ return emit_comment(parser, output);
2032
+ case -1:
2033
+ // We need to emit the comment and then the EOF, so reconsume in data
2034
+ // state.
2035
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2036
+ return emit_comment(parser, output);
2037
+ case '\0':
2038
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2039
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2040
+ return CONTINUE;
2041
+ default:
2236
2042
  append_char_to_temporary_buffer(parser, c);
2237
- utf8iterator_next(&tokenizer->_input);
2238
- c = utf8iterator_current(&tokenizer->_input);
2043
+ return CONTINUE;
2239
2044
  }
2240
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
- return emit_comment(parser, output);
2242
2045
  }
2243
2046
 
2244
2047
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2245
- static StateResult handle_markup_declaration_state (
2048
+ static StateResult handle_markup_declaration_open_state (
2246
2049
  GumboParser* parser,
2247
2050
  GumboTokenizerState* tokenizer,
2248
2051
  int UNUSED_ARG(c),
@@ -2253,21 +2056,21 @@ static StateResult handle_markup_declaration_state (
2253
2056
  &tokenizer->_input,
2254
2057
  "--",
2255
2058
  sizeof("--") - 1,
2256
- true
2059
+ /* case sensitive */ true
2257
2060
  )
2258
2061
  ) {
2259
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2260
- tokenizer->_reconsume_current_input = true;
2261
- } else if (
2062
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2063
+ return CONTINUE;
2064
+ }
2065
+ if (
2262
2066
  utf8iterator_maybe_consume_match (
2263
2067
  &tokenizer->_input,
2264
2068
  "DOCTYPE",
2265
2069
  sizeof("DOCTYPE") - 1,
2266
- false
2070
+ /* case sensitive */ false
2267
2071
  )
2268
2072
  ) {
2269
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2270
- tokenizer->_reconsume_current_input = true;
2073
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2271
2074
  // If we get here, we know we'll eventually emit a doctype token, so now is
2272
2075
  // the time to initialize the doctype strings. (Not in doctype_state_init,
2273
2076
  // since then they'll leak if ownership never gets transferred to the
@@ -2275,24 +2078,35 @@ static StateResult handle_markup_declaration_state (
2275
2078
  tokenizer->_doc_type_state.name = gumbo_strdup("");
2276
2079
  tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2277
2080
  tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2278
- } else if (
2279
- tokenizer->_is_current_node_foreign
2280
- && utf8iterator_maybe_consume_match (
2081
+ return CONTINUE;
2082
+ }
2083
+ if (
2084
+ utf8iterator_maybe_consume_match (
2281
2085
  &tokenizer->_input,
2282
2086
  "[CDATA[", sizeof("[CDATA[") - 1,
2283
- true
2087
+ /* case sensitive */ true
2284
2088
  )
2285
2089
  ) {
2286
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2287
- tokenizer->_is_in_cdata = true;
2288
- tokenizer->_reconsume_current_input = true;
2289
- } else {
2290
- tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2291
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2292
- tokenizer->_reconsume_current_input = true;
2293
- clear_temporary_buffer(parser);
2090
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2091
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2092
+ tokenizer->_is_in_cdata = true;
2093
+ // Start the token after the <![CDATA[.
2094
+ reset_token_start_point(tokenizer);
2095
+ } else {
2096
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2097
+ clear_temporary_buffer(parser);
2098
+ append_string_to_temporary_buffer (
2099
+ parser,
2100
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2101
+ );
2102
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2103
+ }
2104
+ return CONTINUE;
2294
2105
  }
2295
- return NEXT_CHAR;
2106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2108
+ clear_temporary_buffer(parser);
2109
+ return CONTINUE;
2296
2110
  }
2297
2111
 
2298
2112
  // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
@@ -2305,26 +2119,14 @@ static StateResult handle_comment_start_state (
2305
2119
  switch (c) {
2306
2120
  case '-':
2307
2121
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2308
- return NEXT_CHAR;
2309
- case '\0':
2310
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2312
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2313
- return NEXT_CHAR;
2122
+ return CONTINUE;
2314
2123
  case '>':
2315
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2124
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2316
2125
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2317
- emit_comment(parser, output);
2318
- return RETURN_ERROR;
2319
- case -1:
2320
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2321
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2322
- emit_comment(parser, output);
2323
- return RETURN_ERROR;
2126
+ return emit_comment(parser, output);
2324
2127
  default:
2325
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2326
- append_char_to_temporary_buffer(parser, c);
2327
- return NEXT_CHAR;
2128
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2129
+ return CONTINUE;
2328
2130
  }
2329
2131
  }
2330
2132
 
@@ -2338,28 +2140,20 @@ static StateResult handle_comment_start_dash_state (
2338
2140
  switch (c) {
2339
2141
  case '-':
2340
2142
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2341
- return NEXT_CHAR;
2342
- case '\0':
2343
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2344
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2345
- append_char_to_temporary_buffer(parser, '-');
2346
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2347
- return NEXT_CHAR;
2143
+ return CONTINUE;
2348
2144
  case '>':
2349
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2145
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2350
2146
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2351
- emit_comment(parser, output);
2352
- return RETURN_ERROR;
2147
+ return emit_comment(parser, output);
2353
2148
  case -1:
2354
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
- emit_comment(parser, output);
2357
- return RETURN_ERROR;
2149
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2150
+ // Switch to data to emit the EOF next.
2151
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2152
+ return emit_comment(parser, output);
2358
2153
  default:
2359
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2360
2155
  append_char_to_temporary_buffer(parser, '-');
2361
- append_char_to_temporary_buffer(parser, c);
2362
- return NEXT_CHAR;
2156
+ return CONTINUE;
2363
2157
  }
2364
2158
  }
2365
2159
 
@@ -2371,21 +2165,99 @@ static StateResult handle_comment_state (
2371
2165
  GumboToken* output
2372
2166
  ) {
2373
2167
  switch (c) {
2168
+ case '<':
2169
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2170
+ append_char_to_temporary_buffer(parser, c);
2171
+ return CONTINUE;
2374
2172
  case '-':
2375
2173
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2376
- return NEXT_CHAR;
2174
+ return CONTINUE;
2377
2175
  case '\0':
2378
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2379
2177
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2380
- return NEXT_CHAR;
2178
+ return CONTINUE;
2381
2179
  case -1:
2382
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2383
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2384
- emit_comment(parser, output);
2385
- return RETURN_ERROR;
2180
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2181
+ // Switch to data to emit the EOF token next.
2182
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2183
+ return emit_comment(parser, output);
2386
2184
  default:
2387
2185
  append_char_to_temporary_buffer(parser, c);
2388
- return NEXT_CHAR;
2186
+ return CONTINUE;
2187
+ }
2188
+ }
2189
+
2190
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2191
+ static StateResult handle_comment_lt_state (
2192
+ GumboParser* parser,
2193
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2194
+ int c,
2195
+ GumboToken* output
2196
+ ) {
2197
+ switch (c) {
2198
+ case '!':
2199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2200
+ append_char_to_temporary_buffer(parser, c);
2201
+ return CONTINUE;
2202
+ case '<':
2203
+ append_char_to_temporary_buffer(parser, c);
2204
+ return CONTINUE;
2205
+ default:
2206
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2207
+ return CONTINUE;
2208
+ }
2209
+ }
2210
+
2211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2212
+ static StateResult handle_comment_lt_bang_state (
2213
+ GumboParser* parser,
2214
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2215
+ int c,
2216
+ GumboToken* output
2217
+ ) {
2218
+ switch (c) {
2219
+ case '-':
2220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2221
+ return CONTINUE;
2222
+ default:
2223
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2224
+ return CONTINUE;
2225
+ }
2226
+ }
2227
+
2228
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2229
+ static StateResult handle_comment_lt_bang_dash_state (
2230
+ GumboParser* parser,
2231
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2232
+ int c,
2233
+ GumboToken* output
2234
+ ) {
2235
+ switch (c) {
2236
+ case '-':
2237
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2238
+ return CONTINUE;
2239
+ default:
2240
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2241
+ return CONTINUE;
2242
+ }
2243
+ }
2244
+
2245
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2246
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2247
+ GumboParser* parser,
2248
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2249
+ int c,
2250
+ GumboToken* output
2251
+ ) {
2252
+ switch (c) {
2253
+ case '>':
2254
+ case -1:
2255
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2256
+ return CONTINUE;
2257
+ default:
2258
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2259
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2260
+ return CONTINUE;
2389
2261
  }
2390
2262
  }
2391
2263
 
@@ -2397,25 +2269,18 @@ static StateResult handle_comment_end_dash_state (
2397
2269
  GumboToken* output
2398
2270
  ) {
2399
2271
  switch (c) {
2400
- case '-':
2401
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2402
- return NEXT_CHAR;
2403
- case '\0':
2404
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2405
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2406
- append_char_to_temporary_buffer(parser, '-');
2407
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2408
- return NEXT_CHAR;
2409
- case -1:
2410
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2411
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2412
- emit_comment(parser, output);
2413
- return RETURN_ERROR;
2414
- default:
2415
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2416
- append_char_to_temporary_buffer(parser, '-');
2417
- append_char_to_temporary_buffer(parser, c);
2418
- return NEXT_CHAR;
2272
+ case '-':
2273
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2274
+ return CONTINUE;
2275
+ case -1:
2276
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2277
+ // Switch to data to emit EOF next.
2278
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2279
+ return emit_comment(parser, output);
2280
+ default:
2281
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2282
+ append_char_to_temporary_buffer(parser, '-');
2283
+ return CONTINUE;
2419
2284
  }
2420
2285
  }
2421
2286
 
@@ -2430,35 +2295,22 @@ static StateResult handle_comment_end_state (
2430
2295
  case '>':
2431
2296
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2432
2297
  return emit_comment(parser, output);
2433
- case '\0':
2434
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2435
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2436
- append_char_to_temporary_buffer(parser, '-');
2437
- append_char_to_temporary_buffer(parser, '-');
2438
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2439
- return NEXT_CHAR;
2440
2298
  case '!':
2441
- tokenizer_add_parse_error(
2442
- parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2443
2299
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2444
- return NEXT_CHAR;
2300
+ return CONTINUE;
2445
2301
  case '-':
2446
- tokenizer_add_parse_error(
2447
- parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2448
2302
  append_char_to_temporary_buffer(parser, '-');
2449
- return NEXT_CHAR;
2303
+ return CONTINUE;
2450
2304
  case -1:
2451
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2305
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2306
+ // Switch to data to emit EOF next.
2452
2307
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2453
- emit_comment(parser, output);
2454
- return RETURN_ERROR;
2308
+ return emit_comment(parser, output);
2455
2309
  default:
2456
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2457
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2310
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2458
2311
  append_char_to_temporary_buffer(parser, '-');
2459
2312
  append_char_to_temporary_buffer(parser, '-');
2460
- append_char_to_temporary_buffer(parser, c);
2461
- return NEXT_CHAR;
2313
+ return CONTINUE;
2462
2314
  }
2463
2315
  }
2464
2316
 
@@ -2475,30 +2327,22 @@ static StateResult handle_comment_end_bang_state (
2475
2327
  append_char_to_temporary_buffer(parser, '-');
2476
2328
  append_char_to_temporary_buffer(parser, '-');
2477
2329
  append_char_to_temporary_buffer(parser, '!');
2478
- return NEXT_CHAR;
2330
+ return CONTINUE;
2479
2331
  case '>':
2332
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2480
2333
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2481
2334
  return emit_comment(parser, output);
2482
- case '\0':
2483
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2484
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2485
- append_char_to_temporary_buffer(parser, '-');
2486
- append_char_to_temporary_buffer(parser, '-');
2487
- append_char_to_temporary_buffer(parser, '!');
2488
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2489
- return NEXT_CHAR;
2490
2335
  case -1:
2491
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2336
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2337
+ // Switch to data to emit EOF next.
2492
2338
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
- emit_comment(parser, output);
2494
- return RETURN_ERROR;
2339
+ return emit_comment(parser, output);
2495
2340
  default:
2496
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2341
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2497
2342
  append_char_to_temporary_buffer(parser, '-');
2498
2343
  append_char_to_temporary_buffer(parser, '-');
2499
2344
  append_char_to_temporary_buffer(parser, '!');
2500
- append_char_to_temporary_buffer(parser, c);
2501
- return NEXT_CHAR;
2345
+ return CONTINUE;
2502
2346
  }
2503
2347
  }
2504
2348
 
@@ -2509,26 +2353,27 @@ static StateResult handle_doctype_state (
2509
2353
  int c,
2510
2354
  GumboToken* output
2511
2355
  ) {
2512
- assert(!tokenizer->_temporary_buffer.length);
2356
+ assert(temporary_buffer_is_empty(parser));
2513
2357
  switch (c) {
2514
2358
  case '\t':
2515
2359
  case '\n':
2516
2360
  case '\f':
2517
2361
  case ' ':
2518
2362
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2519
- return NEXT_CHAR;
2363
+ return CONTINUE;
2364
+ case '>':
2365
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2366
+ return CONTINUE;
2520
2367
  case -1:
2521
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2523
2369
  tokenizer->_doc_type_state.force_quirks = true;
2524
- emit_doctype(parser, output);
2525
- return RETURN_ERROR;
2370
+ // Switch to data to emit EOF next.
2371
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2372
+ return emit_doctype(parser, output);
2526
2373
  default:
2527
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2528
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2529
- tokenizer->_reconsume_current_input = true;
2530
- tokenizer->_doc_type_state.force_quirks = true;
2531
- return NEXT_CHAR;
2374
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2375
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2376
+ return CONTINUE;
2532
2377
  }
2533
2378
  }
2534
2379
 
@@ -2544,30 +2389,27 @@ static StateResult handle_before_doctype_name_state (
2544
2389
  case '\n':
2545
2390
  case '\f':
2546
2391
  case ' ':
2547
- return NEXT_CHAR;
2392
+ return CONTINUE;
2548
2393
  case '\0':
2549
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2394
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2550
2395
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2551
- tokenizer->_doc_type_state.force_quirks = true;
2552
2396
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2553
- return NEXT_CHAR;
2397
+ return CONTINUE;
2554
2398
  case '>':
2555
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2399
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2556
2400
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2557
2401
  tokenizer->_doc_type_state.force_quirks = true;
2558
- emit_doctype(parser, output);
2559
- return RETURN_ERROR;
2402
+ return emit_doctype(parser, output);
2560
2403
  case -1:
2561
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2562
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2404
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2563
2405
  tokenizer->_doc_type_state.force_quirks = true;
2564
- emit_doctype(parser, output);
2565
- return RETURN_ERROR;
2406
+ // Switch to data to emit EOF next.
2407
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2408
+ return emit_doctype(parser, output);
2566
2409
  default:
2567
2410
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2568
- tokenizer->_doc_type_state.force_quirks = false;
2569
2411
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2570
- return NEXT_CHAR;
2412
+ return CONTINUE;
2571
2413
  }
2572
2414
  }
2573
2415
 
@@ -2586,30 +2428,26 @@ static StateResult handle_doctype_name_state (
2586
2428
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2587
2429
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2588
2430
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2589
- return NEXT_CHAR;
2431
+ return CONTINUE;
2590
2432
  case '>':
2591
2433
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
2434
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2593
2435
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2594
- emit_doctype(parser, output);
2595
- return RETURN_SUCCESS;
2436
+ return emit_doctype(parser, output);
2596
2437
  case '\0':
2597
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2438
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2598
2439
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2599
- return NEXT_CHAR;
2440
+ return CONTINUE;
2600
2441
  case -1:
2601
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2602
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2442
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2443
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2603
2444
  tokenizer->_doc_type_state.force_quirks = true;
2604
2445
  gumbo_free((void*) tokenizer->_doc_type_state.name);
2605
2446
  finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2606
- emit_doctype(parser, output);
2607
- return RETURN_ERROR;
2447
+ return emit_doctype(parser, output);
2608
2448
  default:
2609
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2610
- tokenizer->_doc_type_state.force_quirks = false;
2611
2449
  append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2612
- return NEXT_CHAR;
2450
+ return CONTINUE;
2613
2451
  }
2614
2452
  }
2615
2453
 
@@ -2625,35 +2463,29 @@ static StateResult handle_after_doctype_name_state (
2625
2463
  case '\n':
2626
2464
  case '\f':
2627
2465
  case ' ':
2628
- return NEXT_CHAR;
2466
+ return CONTINUE;
2629
2467
  case '>':
2630
2468
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2631
- emit_doctype(parser, output);
2632
- return RETURN_SUCCESS;
2469
+ return emit_doctype(parser, output);
2633
2470
  case -1:
2634
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2471
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
2472
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2636
2473
  tokenizer->_doc_type_state.force_quirks = true;
2637
- emit_doctype(parser, output);
2638
- return RETURN_ERROR;
2474
+ return emit_doctype(parser, output);
2639
2475
  default:
2640
2476
  if (utf8iterator_maybe_consume_match(
2641
2477
  &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2642
- gumbo_tokenizer_set_state(
2643
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2644
- tokenizer->_reconsume_current_input = true;
2478
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2645
2479
  } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2646
2480
  sizeof("SYSTEM") - 1, false)) {
2647
- gumbo_tokenizer_set_state(
2648
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2649
- tokenizer->_reconsume_current_input = true;
2481
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2650
2482
  } else {
2651
2483
  tokenizer_add_parse_error(
2652
- parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2653
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2484
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2485
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2654
2486
  tokenizer->_doc_type_state.force_quirks = true;
2655
2487
  }
2656
- return NEXT_CHAR;
2488
+ return CONTINUE;
2657
2489
  }
2658
2490
  }
2659
2491
 
@@ -2670,37 +2502,34 @@ static StateResult handle_after_doctype_public_keyword_state (
2670
2502
  case '\f':
2671
2503
  case ' ':
2672
2504
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2673
- return NEXT_CHAR;
2505
+ return CONTINUE;
2674
2506
  case '"':
2675
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2507
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2676
2508
  assert(temporary_buffer_is_empty(parser));
2677
2509
  gumbo_tokenizer_set_state(
2678
2510
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2679
- return NEXT_CHAR;
2511
+ return CONTINUE;
2680
2512
  case '\'':
2681
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2513
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2682
2514
  assert(temporary_buffer_is_empty(parser));
2683
2515
  gumbo_tokenizer_set_state(
2684
2516
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2685
- return NEXT_CHAR;
2517
+ return CONTINUE;
2686
2518
  case '>':
2687
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2519
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2688
2520
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
2521
  tokenizer->_doc_type_state.force_quirks = true;
2690
- emit_doctype(parser, output);
2691
- return RETURN_ERROR;
2522
+ return emit_doctype(parser, output);
2692
2523
  case -1:
2693
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2694
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2524
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2525
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2695
2526
  tokenizer->_doc_type_state.force_quirks = true;
2696
- emit_doctype(parser, output);
2697
- return RETURN_ERROR;
2527
+ return emit_doctype(parser, output);
2698
2528
  default:
2699
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2700
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2529
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2530
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2701
2531
  tokenizer->_doc_type_state.force_quirks = true;
2702
- emit_doctype(parser, output);
2703
- return RETURN_ERROR;
2532
+ return CONTINUE;
2704
2533
  }
2705
2534
  }
2706
2535
 
@@ -2716,35 +2545,32 @@ static StateResult handle_before_doctype_public_id_state (
2716
2545
  case '\n':
2717
2546
  case '\f':
2718
2547
  case ' ':
2719
- return NEXT_CHAR;
2548
+ return CONTINUE;
2720
2549
  case '"':
2721
2550
  assert(temporary_buffer_is_empty(parser));
2722
2551
  gumbo_tokenizer_set_state(
2723
2552
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2724
- return NEXT_CHAR;
2553
+ return CONTINUE;
2725
2554
  case '\'':
2726
2555
  assert(temporary_buffer_is_empty(parser));
2727
2556
  gumbo_tokenizer_set_state(
2728
2557
  parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2729
- return NEXT_CHAR;
2558
+ return CONTINUE;
2730
2559
  case '>':
2731
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2560
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2732
2561
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2733
2562
  tokenizer->_doc_type_state.force_quirks = true;
2734
- emit_doctype(parser, output);
2735
- return RETURN_ERROR;
2563
+ return emit_doctype(parser, output);
2736
2564
  case -1:
2737
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2738
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2565
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2566
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2739
2567
  tokenizer->_doc_type_state.force_quirks = true;
2740
- emit_doctype(parser, output);
2741
- return RETURN_ERROR;
2568
+ return emit_doctype(parser, output);
2742
2569
  default:
2743
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2744
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2570
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2571
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2745
2572
  tokenizer->_doc_type_state.force_quirks = true;
2746
- emit_doctype(parser, output);
2747
- return RETURN_ERROR;
2573
+ return CONTINUE;
2748
2574
  }
2749
2575
  }
2750
2576
 
@@ -2759,28 +2585,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
2759
2585
  case '"':
2760
2586
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2761
2587
  finish_doctype_public_id(parser);
2762
- return NEXT_CHAR;
2588
+ return CONTINUE;
2763
2589
  case '\0':
2764
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2590
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2765
2591
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2766
- return NEXT_CHAR;
2592
+ return CONTINUE;
2767
2593
  case '>':
2768
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2594
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2769
2595
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2770
2596
  tokenizer->_doc_type_state.force_quirks = true;
2771
2597
  finish_doctype_public_id(parser);
2772
- emit_doctype(parser, output);
2773
- return RETURN_ERROR;
2598
+ return emit_doctype(parser, output);
2774
2599
  case -1:
2775
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2600
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2601
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2777
2602
  tokenizer->_doc_type_state.force_quirks = true;
2778
2603
  finish_doctype_public_id(parser);
2779
- emit_doctype(parser, output);
2780
- return RETURN_ERROR;
2604
+ return emit_doctype(parser, output);
2781
2605
  default:
2782
2606
  append_char_to_temporary_buffer(parser, c);
2783
- return NEXT_CHAR;
2607
+ return CONTINUE;
2784
2608
  }
2785
2609
  }
2786
2610
 
@@ -2795,28 +2619,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
2795
2619
  case '\'':
2796
2620
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2797
2621
  finish_doctype_public_id(parser);
2798
- return NEXT_CHAR;
2622
+ return CONTINUE;
2799
2623
  case '\0':
2800
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2624
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2801
2625
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2802
- return NEXT_CHAR;
2626
+ return CONTINUE;
2803
2627
  case '>':
2804
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2628
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2805
2629
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
2630
  tokenizer->_doc_type_state.force_quirks = true;
2807
2631
  finish_doctype_public_id(parser);
2808
- emit_doctype(parser, output);
2809
- return RETURN_ERROR;
2632
+ return emit_doctype(parser, output);
2810
2633
  case -1:
2811
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2812
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2634
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2813
2636
  tokenizer->_doc_type_state.force_quirks = true;
2814
2637
  finish_doctype_public_id(parser);
2815
- emit_doctype(parser, output);
2816
- return RETURN_ERROR;
2638
+ return emit_doctype(parser, output);
2817
2639
  default:
2818
2640
  append_char_to_temporary_buffer(parser, c);
2819
- return NEXT_CHAR;
2641
+ return CONTINUE;
2820
2642
  }
2821
2643
  }
2822
2644
 
@@ -2834,35 +2656,38 @@ static StateResult handle_after_doctype_public_id_state (
2834
2656
  case ' ':
2835
2657
  gumbo_tokenizer_set_state(
2836
2658
  parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2837
- return NEXT_CHAR;
2659
+ return CONTINUE;
2838
2660
  case '>':
2839
2661
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2840
- emit_doctype(parser, output);
2841
- return RETURN_SUCCESS;
2662
+ return emit_doctype(parser, output);
2842
2663
  case '"':
2843
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2664
+ tokenizer_add_parse_error (
2665
+ parser,
2666
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2667
+ );
2844
2668
  assert(temporary_buffer_is_empty(parser));
2845
2669
  gumbo_tokenizer_set_state(
2846
2670
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2847
- return NEXT_CHAR;
2671
+ return CONTINUE;
2848
2672
  case '\'':
2849
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2673
+ tokenizer_add_parse_error (
2674
+ parser,
2675
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2676
+ );
2850
2677
  assert(temporary_buffer_is_empty(parser));
2851
2678
  gumbo_tokenizer_set_state(
2852
2679
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2853
- return NEXT_CHAR;
2680
+ return CONTINUE;
2854
2681
  case -1:
2855
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2856
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2857
- tokenizer->_reconsume_current_input = true;
2682
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2683
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2858
2684
  tokenizer->_doc_type_state.force_quirks = true;
2859
- emit_doctype(parser, output);
2860
- return RETURN_ERROR;
2685
+ return emit_doctype(parser, output);
2861
2686
  default:
2862
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2863
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2687
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2688
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2864
2689
  tokenizer->_doc_type_state.force_quirks = true;
2865
- return NEXT_CHAR;
2690
+ return CONTINUE;
2866
2691
  }
2867
2692
  }
2868
2693
 
@@ -2878,33 +2703,30 @@ static StateResult handle_between_doctype_public_system_id_state (
2878
2703
  case '\n':
2879
2704
  case '\f':
2880
2705
  case ' ':
2881
- return NEXT_CHAR;
2706
+ return CONTINUE;
2882
2707
  case '>':
2883
2708
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2884
- emit_doctype(parser, output);
2885
- return RETURN_SUCCESS;
2709
+ return emit_doctype(parser, output);
2886
2710
  case '"':
2887
2711
  assert(temporary_buffer_is_empty(parser));
2888
2712
  gumbo_tokenizer_set_state(
2889
2713
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2890
- return NEXT_CHAR;
2714
+ return CONTINUE;
2891
2715
  case '\'':
2892
2716
  assert(temporary_buffer_is_empty(parser));
2893
2717
  gumbo_tokenizer_set_state(
2894
2718
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2895
- return NEXT_CHAR;
2719
+ return CONTINUE;
2896
2720
  case -1:
2897
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2898
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2721
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2722
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2899
2723
  tokenizer->_doc_type_state.force_quirks = true;
2900
- emit_doctype(parser, output);
2901
- return RETURN_ERROR;
2724
+ return emit_doctype(parser, output);
2902
2725
  default:
2903
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2904
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2726
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2727
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2905
2728
  tokenizer->_doc_type_state.force_quirks = true;
2906
- emit_doctype(parser, output);
2907
- return RETURN_ERROR;
2729
+ return CONTINUE;
2908
2730
  }
2909
2731
  }
2910
2732
 
@@ -2921,36 +2743,34 @@ static StateResult handle_after_doctype_system_keyword_state (
2921
2743
  case '\f':
2922
2744
  case ' ':
2923
2745
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2924
- return NEXT_CHAR;
2746
+ return CONTINUE;
2925
2747
  case '"':
2926
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2748
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2927
2749
  assert(temporary_buffer_is_empty(parser));
2928
2750
  gumbo_tokenizer_set_state(
2929
2751
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2930
- return NEXT_CHAR;
2752
+ return CONTINUE;
2931
2753
  case '\'':
2932
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2754
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2933
2755
  assert(temporary_buffer_is_empty(parser));
2934
2756
  gumbo_tokenizer_set_state(
2935
2757
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2936
- return NEXT_CHAR;
2758
+ return CONTINUE;
2937
2759
  case '>':
2938
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2760
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2939
2761
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2940
2762
  tokenizer->_doc_type_state.force_quirks = true;
2941
- emit_doctype(parser, output);
2942
- return RETURN_ERROR;
2763
+ return emit_doctype(parser, output);
2943
2764
  case -1:
2944
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2945
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2765
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2766
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2946
2767
  tokenizer->_doc_type_state.force_quirks = true;
2947
- emit_doctype(parser, output);
2948
- return RETURN_ERROR;
2768
+ return emit_doctype(parser, output);
2949
2769
  default:
2950
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2951
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2770
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2771
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2952
2772
  tokenizer->_doc_type_state.force_quirks = true;
2953
- return NEXT_CHAR;
2773
+ return CONTINUE;
2954
2774
  }
2955
2775
  }
2956
2776
 
@@ -2966,34 +2786,32 @@ static StateResult handle_before_doctype_system_id_state (
2966
2786
  case '\n':
2967
2787
  case '\f':
2968
2788
  case ' ':
2969
- return NEXT_CHAR;
2789
+ return CONTINUE;
2970
2790
  case '"':
2971
2791
  assert(temporary_buffer_is_empty(parser));
2972
2792
  gumbo_tokenizer_set_state(
2973
2793
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2974
- return NEXT_CHAR;
2794
+ return CONTINUE;
2975
2795
  case '\'':
2976
2796
  assert(temporary_buffer_is_empty(parser));
2977
2797
  gumbo_tokenizer_set_state(
2978
2798
  parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2979
- return NEXT_CHAR;
2799
+ return CONTINUE;
2980
2800
  case '>':
2981
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2801
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2982
2802
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2983
2803
  tokenizer->_doc_type_state.force_quirks = true;
2984
- emit_doctype(parser, output);
2985
- return RETURN_ERROR;
2804
+ return emit_doctype(parser, output);
2986
2805
  case -1:
2987
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2988
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2806
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2807
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2989
2808
  tokenizer->_doc_type_state.force_quirks = true;
2990
- emit_doctype(parser, output);
2991
- return RETURN_ERROR;
2809
+ return emit_doctype(parser, output);
2992
2810
  default:
2993
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2994
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2811
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2812
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2995
2813
  tokenizer->_doc_type_state.force_quirks = true;
2996
- return NEXT_CHAR;
2814
+ return CONTINUE;
2997
2815
  }
2998
2816
  }
2999
2817
 
@@ -3008,28 +2826,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
3008
2826
  case '"':
3009
2827
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3010
2828
  finish_doctype_system_id(parser);
3011
- return NEXT_CHAR;
2829
+ return CONTINUE;
3012
2830
  case '\0':
3013
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2831
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3014
2832
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3015
- return NEXT_CHAR;
2833
+ return CONTINUE;
3016
2834
  case '>':
3017
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2835
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3018
2836
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3019
2837
  tokenizer->_doc_type_state.force_quirks = true;
3020
2838
  finish_doctype_system_id(parser);
3021
- emit_doctype(parser, output);
3022
- return RETURN_ERROR;
2839
+ return emit_doctype(parser, output);
3023
2840
  case -1:
3024
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3025
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2841
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2842
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3026
2843
  tokenizer->_doc_type_state.force_quirks = true;
3027
2844
  finish_doctype_system_id(parser);
3028
- emit_doctype(parser, output);
3029
- return RETURN_ERROR;
2845
+ return emit_doctype(parser, output);
3030
2846
  default:
3031
2847
  append_char_to_temporary_buffer(parser, c);
3032
- return NEXT_CHAR;
2848
+ return CONTINUE;
3033
2849
  }
3034
2850
  }
3035
2851
 
@@ -3044,28 +2860,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
3044
2860
  case '\'':
3045
2861
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
3046
2862
  finish_doctype_system_id(parser);
3047
- return NEXT_CHAR;
2863
+ return CONTINUE;
3048
2864
  case '\0':
3049
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2865
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
3050
2866
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
3051
- return NEXT_CHAR;
2867
+ return CONTINUE;
3052
2868
  case '>':
3053
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2869
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
3054
2870
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3055
2871
  tokenizer->_doc_type_state.force_quirks = true;
3056
2872
  finish_doctype_system_id(parser);
3057
- emit_doctype(parser, output);
3058
- return RETURN_ERROR;
2873
+ return emit_doctype(parser, output);
3059
2874
  case -1:
3060
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3061
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2875
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2876
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3062
2877
  tokenizer->_doc_type_state.force_quirks = true;
3063
2878
  finish_doctype_system_id(parser);
3064
- emit_doctype(parser, output);
3065
- return RETURN_ERROR;
2879
+ return emit_doctype(parser, output);
3066
2880
  default:
3067
2881
  append_char_to_temporary_buffer(parser, c);
3068
- return NEXT_CHAR;
2882
+ return CONTINUE;
3069
2883
  }
3070
2884
  }
3071
2885
 
@@ -3081,21 +2895,19 @@ static StateResult handle_after_doctype_system_id_state (
3081
2895
  case '\n':
3082
2896
  case '\f':
3083
2897
  case ' ':
3084
- return NEXT_CHAR;
2898
+ return CONTINUE;
3085
2899
  case '>':
3086
2900
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3087
- emit_doctype(parser, output);
3088
- return RETURN_SUCCESS;
2901
+ return emit_doctype(parser, output);
3089
2902
  case -1:
3090
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
3091
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2903
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2904
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3092
2905
  tokenizer->_doc_type_state.force_quirks = true;
3093
- emit_doctype(parser, output);
3094
- return RETURN_ERROR;
2906
+ return emit_doctype(parser, output);
3095
2907
  default:
3096
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
3097
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
3098
- return NEXT_CHAR;
2908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2909
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2910
+ return CONTINUE;
3099
2911
  }
3100
2912
  }
3101
2913
 
@@ -3106,33 +2918,370 @@ static StateResult handle_bogus_doctype_state (
3106
2918
  int c,
3107
2919
  GumboToken* output
3108
2920
  ) {
3109
- if (c == '>' || c == -1) {
2921
+ switch (c) {
2922
+ case '>':
3110
2923
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3111
- emit_doctype(parser, output);
3112
- return RETURN_ERROR;
2924
+ return emit_doctype(parser, output);
2925
+ case '\0':
2926
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2927
+ return CONTINUE;
2928
+ case -1:
2929
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2930
+ return emit_doctype(parser, output);
2931
+ default:
2932
+ return CONTINUE;
3113
2933
  }
3114
- return NEXT_CHAR;
3115
2934
  }
3116
2935
 
3117
2936
  // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
3118
- static StateResult handle_cdata_state (
2937
+ static StateResult handle_cdata_section_state (
3119
2938
  GumboParser* parser,
3120
2939
  GumboTokenizerState* tokenizer,
3121
2940
  int c,
3122
2941
  GumboToken* output
3123
2942
  ) {
3124
- if (c == -1 || utf8iterator_maybe_consume_match(
3125
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
3126
- tokenizer->_reconsume_current_input = true;
2943
+ switch (c) {
2944
+ case ']':
2945
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2946
+ set_mark(parser);
2947
+ return CONTINUE;
2948
+ case -1:
2949
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2950
+ return emit_eof(parser, output);
2951
+ default:
2952
+ return emit_char(parser, c, output);
2953
+ }
2954
+ }
2955
+
2956
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2957
+ static StateResult handle_cdata_section_bracket_state (
2958
+ GumboParser* parser,
2959
+ GumboTokenizerState* tokenizer,
2960
+ int c,
2961
+ GumboToken* output
2962
+ ) {
2963
+ switch (c) {
2964
+ case ']':
2965
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2966
+ return CONTINUE;
2967
+ default:
2968
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2969
+ // Emit the ].
2970
+ return emit_from_mark(parser, output);
2971
+ }
2972
+ }
2973
+
2974
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2975
+ static StateResult handle_cdata_section_end_state (
2976
+ GumboParser* parser,
2977
+ GumboTokenizerState* tokenizer,
2978
+ int c,
2979
+ GumboToken* output
2980
+ ) {
2981
+ switch (c) {
2982
+ case ']':
2983
+ {
2984
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2985
+ // of the three in a row we've seen. So let's emit one token from the
2986
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2987
+ // advance one). Next, let's clear the temporary buffer which will set the
2988
+ // mark to the middle of the three brackets. Finally, let's move to the
2989
+ // appropriate state.
2990
+ StateResult result = emit_from_mark(parser, output);
2991
+ tokenizer->_resume_pos = NULL;
2992
+ set_mark(parser);
2993
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2994
+ return result;
2995
+ }
2996
+ case '>':
2997
+ // We're done with CDATA so move past the >, reset the token start point
2998
+ // to point after the >, and then reconsume in the data state.
2999
+ utf8iterator_next(&tokenizer->_input);
3127
3000
  reset_token_start_point(tokenizer);
3128
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
3001
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3129
3002
  tokenizer->_is_in_cdata = false;
3130
- return NEXT_CHAR;
3131
- } else {
3132
- return emit_current_char(parser, output);
3003
+ return CONTINUE;
3004
+ default:
3005
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3006
+ return emit_from_mark(parser, output);
3007
+ }
3008
+ }
3009
+
3010
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3011
+ static StateResult handle_character_reference_state (
3012
+ GumboParser* parser,
3013
+ GumboTokenizerState* tokenizer,
3014
+ int c,
3015
+ GumboToken* output
3016
+ ) {
3017
+ if (gumbo_ascii_isalnum(c)) {
3018
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3019
+ return CONTINUE;
3020
+ }
3021
+ if (c == '#') {
3022
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3023
+ return CONTINUE;
3024
+ }
3025
+ reconsume_in_state(parser, tokenizer->_return_state);
3026
+ return flush_code_points_consumed_as_character_reference(parser, output);
3027
+ }
3028
+
3029
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3030
+ static StateResult handle_named_character_reference_state (
3031
+ GumboParser* parser,
3032
+ GumboTokenizerState* tokenizer,
3033
+ int c,
3034
+ GumboToken* output
3035
+ ) {
3036
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3037
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3038
+ int code_point[2];
3039
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3040
+
3041
+ if (size > 0) {
3042
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3043
+ int next = utf8iterator_current(&tokenizer->_input);
3044
+ reconsume_in_state(parser, tokenizer->_return_state);
3045
+ if (character_reference_part_of_attribute(parser)
3046
+ && cur[size-1] != ';'
3047
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3048
+ GumboStringPiece str = { .data = cur, .length = size };
3049
+ append_string_to_temporary_buffer(parser, &str);
3050
+ return flush_code_points_consumed_as_character_reference(parser, output);
3051
+ }
3052
+ if (cur[size-1] != ';')
3053
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3054
+ reconsume_in_state(parser, tokenizer->_return_state);
3055
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3056
+ }
3057
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3058
+ return flush_code_points_consumed_as_character_reference(parser, output);
3059
+ }
3060
+
3061
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3062
+ static StateResult handle_ambiguous_ampersand_state (
3063
+ GumboParser* parser,
3064
+ GumboTokenizerState* tokenizer,
3065
+ int c,
3066
+ GumboToken* output
3067
+ ) {
3068
+ if (gumbo_ascii_isalnum(c)) {
3069
+ if (character_reference_part_of_attribute(parser)) {
3070
+ append_char_to_tag_buffer(parser, c, true);
3071
+ return CONTINUE;
3072
+ }
3073
+ return emit_char(parser, c, output);
3074
+ }
3075
+ if (c == ';') {
3076
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3077
+ // fall through
3078
+ }
3079
+ reconsume_in_state(parser, tokenizer->_return_state);
3080
+ return CONTINUE;
3081
+ }
3082
+
3083
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3084
+ static StateResult handle_numeric_character_reference_state (
3085
+ GumboParser* parser,
3086
+ GumboTokenizerState* tokenizer,
3087
+ int c,
3088
+ GumboToken* output
3089
+ ) {
3090
+ tokenizer->_character_reference_code = 0;
3091
+ switch (c) {
3092
+ case 'x':
3093
+ case 'X':
3094
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3095
+ return CONTINUE;
3096
+ default:
3097
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3098
+ return CONTINUE;
3133
3099
  }
3134
3100
  }
3135
3101
 
3102
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-start-state
3103
+ static StateResult handle_hexadecimal_character_reference_start_state (
3104
+ GumboParser* parser,
3105
+ GumboTokenizerState* tokenizer,
3106
+ int c,
3107
+ GumboToken* output
3108
+ ) {
3109
+ if (gumbo_ascii_isxdigit(c)) {
3110
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3111
+ return CONTINUE;
3112
+ }
3113
+ tokenizer_add_char_ref_error (
3114
+ parser,
3115
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3116
+ -1
3117
+ );
3118
+ reconsume_in_state(parser, tokenizer->_return_state);
3119
+ return flush_code_points_consumed_as_character_reference(parser, output);
3120
+ }
3121
+
3122
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3123
+ static StateResult handle_decimal_character_reference_start_state (
3124
+ GumboParser* parser,
3125
+ GumboTokenizerState* tokenizer,
3126
+ int c,
3127
+ GumboToken* output
3128
+ ) {
3129
+ if (gumbo_ascii_isdigit(c)) {
3130
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3131
+ return CONTINUE;
3132
+ }
3133
+ tokenizer_add_char_ref_error (
3134
+ parser,
3135
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3136
+ -1
3137
+ );
3138
+ reconsume_in_state(parser, tokenizer->_return_state);
3139
+ return flush_code_points_consumed_as_character_reference(parser, output);
3140
+ }
3141
+
3142
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-state
3143
+ static StateResult handle_hexadecimal_character_reference_state (
3144
+ GumboParser* parser,
3145
+ GumboTokenizerState* tokenizer,
3146
+ int c,
3147
+ GumboToken* output
3148
+ ) {
3149
+ if (gumbo_ascii_isdigit(c)) {
3150
+ tokenizer->_character_reference_code =
3151
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3152
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3153
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3154
+ return CONTINUE;
3155
+ }
3156
+ if (gumbo_ascii_isupper_xdigit(c)) {
3157
+ tokenizer->_character_reference_code =
3158
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3159
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3160
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3161
+ return CONTINUE;
3162
+ }
3163
+ if (gumbo_ascii_islower_xdigit(c)) {
3164
+ tokenizer->_character_reference_code =
3165
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3166
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3167
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3168
+ return CONTINUE;
3169
+ }
3170
+ if (c == ';') {
3171
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3172
+ return CONTINUE;
3173
+ }
3174
+ tokenizer_add_char_ref_error(
3175
+ parser,
3176
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3177
+ tokenizer->_character_reference_code
3178
+ );
3179
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3180
+ return CONTINUE;
3181
+ }
3182
+
3183
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3184
+ static StateResult handle_decimal_character_reference_state (
3185
+ GumboParser* parser,
3186
+ GumboTokenizerState* tokenizer,
3187
+ int c,
3188
+ GumboToken* output
3189
+ ) {
3190
+ if (gumbo_ascii_isdigit(c)) {
3191
+ tokenizer->_character_reference_code =
3192
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3193
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3194
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3195
+ return CONTINUE;
3196
+ }
3197
+ if (c == ';') {
3198
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3199
+ return CONTINUE;
3200
+ }
3201
+ tokenizer_add_char_ref_error(
3202
+ parser,
3203
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3204
+ tokenizer->_character_reference_code
3205
+ );
3206
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3207
+ return CONTINUE;
3208
+ }
3209
+
3210
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3211
+ static StateResult handle_numeric_character_reference_end_state (
3212
+ GumboParser* parser,
3213
+ GumboTokenizerState* tokenizer,
3214
+ int c,
3215
+ GumboToken* output
3216
+ ) {
3217
+ c = tokenizer->_character_reference_code;
3218
+ if (c == 0) {
3219
+ tokenizer_add_char_ref_error(
3220
+ parser,
3221
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3222
+ c
3223
+ );
3224
+ c = kUtf8ReplacementChar;
3225
+ } else if (c > kUtf8MaxChar) {
3226
+ tokenizer_add_char_ref_error(
3227
+ parser,
3228
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3229
+ c
3230
+ );
3231
+ c = kUtf8ReplacementChar;
3232
+ } else if (utf8_is_surrogate(c)) {
3233
+ tokenizer_add_char_ref_error(
3234
+ parser,
3235
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3236
+ c
3237
+ );
3238
+ c = kUtf8ReplacementChar;
3239
+ } else if (utf8_is_noncharacter(c)) {
3240
+ tokenizer_add_char_ref_error(
3241
+ parser,
3242
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3243
+ c
3244
+ );
3245
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3246
+ tokenizer_add_char_ref_error(
3247
+ parser,
3248
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3249
+ c
3250
+ );
3251
+ switch (c) {
3252
+ case 0x80: c = 0x20AC; break;
3253
+ case 0x82: c = 0x201A; break;
3254
+ case 0x83: c = 0x0192; break;
3255
+ case 0x84: c = 0x201E; break;
3256
+ case 0x85: c = 0x2026; break;
3257
+ case 0x86: c = 0x2020; break;
3258
+ case 0x87: c = 0x2021; break;
3259
+ case 0x88: c = 0x02C6; break;
3260
+ case 0x89: c = 0x2030; break;
3261
+ case 0x8A: c = 0x0160; break;
3262
+ case 0x8B: c = 0x2039; break;
3263
+ case 0x8C: c = 0x0152; break;
3264
+ case 0x8E: c = 0x017D; break;
3265
+ case 0x91: c = 0x2018; break;
3266
+ case 0x92: c = 0x2019; break;
3267
+ case 0x93: c = 0x201C; break;
3268
+ case 0x94: c = 0x201D; break;
3269
+ case 0x95: c = 0x2022; break;
3270
+ case 0x96: c = 0x2013; break;
3271
+ case 0x97: c = 0x2014; break;
3272
+ case 0x98: c = 0x02DC; break;
3273
+ case 0x99: c = 0x2122; break;
3274
+ case 0x9A: c = 0x0161; break;
3275
+ case 0x9B: c = 0x203A; break;
3276
+ case 0x9C: c = 0x0153; break;
3277
+ case 0x9E: c = 0x017E; break;
3278
+ case 0x9F: c = 0x0178; break;
3279
+ }
3280
+ }
3281
+ reconsume_in_state(parser, tokenizer->_return_state);
3282
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3283
+ }
3284
+
3136
3285
  typedef StateResult (*GumboLexerStateFunction) (
3137
3286
  GumboParser* parser,
3138
3287
  GumboTokenizerState* tokenizer,
@@ -3141,77 +3290,89 @@ typedef StateResult (*GumboLexerStateFunction) (
3141
3290
  );
3142
3291
 
3143
3292
  static GumboLexerStateFunction dispatch_table[] = {
3144
- handle_data_state,
3145
- handle_char_ref_in_data_state,
3146
- handle_rcdata_state,
3147
- handle_char_ref_in_rcdata_state,
3148
- handle_rawtext_state,
3149
- handle_script_state,
3150
- handle_plaintext_state,
3151
- handle_tag_open_state,
3152
- handle_end_tag_open_state,
3153
- handle_tag_name_state,
3154
- handle_rcdata_lt_state,
3155
- handle_rcdata_end_tag_open_state,
3156
- handle_rcdata_end_tag_name_state,
3157
- handle_rawtext_lt_state,
3158
- handle_rawtext_end_tag_open_state,
3159
- handle_rawtext_end_tag_name_state,
3160
- handle_script_lt_state,
3161
- handle_script_end_tag_open_state,
3162
- handle_script_end_tag_name_state,
3163
- handle_script_escaped_start_state,
3164
- handle_script_escaped_start_dash_state,
3165
- handle_script_escaped_state,
3166
- handle_script_escaped_dash_state,
3167
- handle_script_escaped_dash_dash_state,
3168
- handle_script_escaped_lt_state,
3169
- handle_script_escaped_end_tag_open_state,
3170
- handle_script_escaped_end_tag_name_state,
3171
- handle_script_double_escaped_start_state,
3172
- handle_script_double_escaped_state,
3173
- handle_script_double_escaped_dash_state,
3174
- handle_script_double_escaped_dash_dash_state,
3175
- handle_script_double_escaped_lt_state,
3176
- handle_script_double_escaped_end_state,
3177
- handle_before_attr_name_state,
3178
- handle_attr_name_state,
3179
- handle_after_attr_name_state,
3180
- handle_before_attr_value_state,
3181
- handle_attr_value_double_quoted_state,
3182
- handle_attr_value_single_quoted_state,
3183
- handle_attr_value_unquoted_state,
3184
- handle_char_ref_in_attr_value_state,
3185
- handle_after_attr_value_quoted_state,
3186
- handle_self_closing_start_tag_state,
3187
- handle_bogus_comment_state,
3188
- handle_markup_declaration_state,
3189
- handle_comment_start_state,
3190
- handle_comment_start_dash_state,
3191
- handle_comment_state,
3192
- handle_comment_end_dash_state,
3193
- handle_comment_end_state,
3194
- handle_comment_end_bang_state,
3195
- handle_doctype_state,
3196
- handle_before_doctype_name_state,
3197
- handle_doctype_name_state,
3198
- handle_after_doctype_name_state,
3199
- handle_after_doctype_public_keyword_state,
3200
- handle_before_doctype_public_id_state,
3201
- handle_doctype_public_id_double_quoted_state,
3202
- handle_doctype_public_id_single_quoted_state,
3203
- handle_after_doctype_public_id_state,
3204
- handle_between_doctype_public_system_id_state,
3205
- handle_after_doctype_system_keyword_state,
3206
- handle_before_doctype_system_id_state,
3207
- handle_doctype_system_id_double_quoted_state,
3208
- handle_doctype_system_id_single_quoted_state,
3209
- handle_after_doctype_system_id_state,
3210
- handle_bogus_doctype_state,
3211
- handle_cdata_state
3293
+ [GUMBO_LEX_DATA] = handle_data_state,
3294
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3295
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3296
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3297
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3298
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3299
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3300
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3301
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3302
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3304
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3305
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3307
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3324
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3325
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3326
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3327
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3328
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3331
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3332
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3333
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3334
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3335
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3336
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3337
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3338
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3339
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3342
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3344
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3345
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3346
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3347
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3348
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3350
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3351
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3353
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3354
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3355
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3356
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3357
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3359
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3360
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3361
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3362
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3364
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3365
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3366
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3367
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3368
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3369
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3370
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3371
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3372
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3212
3373
  };
3213
3374
 
3214
- bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3375
+ void gumbo_lex(GumboParser* parser, GumboToken* output) {
3215
3376
  // Because of the spec requirements that...
3216
3377
  //
3217
3378
  // 1. Tokens be handled immediately by the parser upon emission.
@@ -3236,15 +3397,15 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3236
3397
  // isn't consumed twice.
3237
3398
  tokenizer->_reconsume_current_input = false;
3238
3399
  tokenizer->_buffered_emit_char = kGumboNoChar;
3239
- return true;
3400
+ return;
3240
3401
  }
3241
3402
 
3242
- if (maybe_emit_from_temporary_buffer(parser, output)) {
3243
- return true;
3403
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3404
+ return;
3244
3405
  }
3245
3406
 
3246
3407
  while (1) {
3247
- assert(!tokenizer->_temporary_buffer_emit);
3408
+ assert(!tokenizer->_resume_pos);
3248
3409
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3249
3410
  int c = utf8iterator_current(&tokenizer->_input);
3250
3411
  GumboTokenizerEnum state = tokenizer->_state;
@@ -3255,11 +3416,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
3255
3416
  bool should_advance = !tokenizer->_reconsume_current_input;
3256
3417
  tokenizer->_reconsume_current_input = false;
3257
3418
 
3258
- if (result == RETURN_SUCCESS) {
3259
- return true;
3260
- } else if (result == RETURN_ERROR) {
3261
- return false;
3262
- }
3419
+ if (result == EMIT_TOKEN)
3420
+ return;
3263
3421
 
3264
3422
  if (should_advance) {
3265
3423
  utf8iterator_next(&tokenizer->_input);
@@ -3285,12 +3443,16 @@ void gumbo_token_destroy(GumboToken* token) {
3285
3443
  }
3286
3444
  }
3287
3445
  gumbo_free((void*) token->v.start_tag.attributes.data);
3288
- if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
3446
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3289
3447
  gumbo_free(token->v.start_tag.name);
3448
+ token->v.start_tag.name = NULL;
3449
+ }
3290
3450
  return;
3291
3451
  case GUMBO_TOKEN_END_TAG:
3292
- if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
3452
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3293
3453
  gumbo_free(token->v.end_tag.name);
3454
+ token->v.end_tag.name = NULL;
3455
+ }
3294
3456
  break;
3295
3457
  case GUMBO_TOKEN_COMMENT:
3296
3458
  gumbo_free((void*) token->v.text);