nokogumbo 0.5 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/work/tokenizer.c ADDED
@@ -0,0 +1,2978 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Coding conventions specific to this file:
18
+ //
19
+ // 1. Functions that fill in a token should be named emit_*, and should be
20
+ // followed immediately by a return from the tokenizer (true if no error
21
+ // occurred, false if an error occurred). Sometimes the emit functions
22
+ // themselves return a boolean so that they can be combined with the return
23
+ // statement; in this case, they should match this convention.
24
+ // 2. Functions that shuffle data from temporaries to final API structures
25
+ // should be named finish_*, and be called just before the tokenizer exits the
26
+ // state that accumulates the temporary.
27
+ // 3. All internal data structures should be kept in an initialized state from
28
+ // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ // and reset, it should be deallocated and immediately reinitialized.
30
+ // 4. Make sure there are appropriate break statements following each state.
31
+ // 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ // good idea, and should go at the entry point of each state when added.
33
+ // 6. Statement order within states goes:
34
+ // 1. Add parse errors, if appropriate.
35
+ // 2. Call finish_* functions to build up tag state.
36
+ // 3. Switch to new state. Set _reconsume flag if appropriate.
37
+ // 4. Perform any other temporary buffer manipulation.
38
+ // 5. Emit tokens.
39
+ // 6. Return/break.
40
+ // This order ensures that we can verify that every emit is followed by a
41
+ // return, ensures that the correct state is recorded with any parse errors, and
42
+ // prevents parse error position from being messed up by possible mark/resets in
43
+ // temporary buffer manipulation.
44
+
45
+
46
+ #include "tokenizer.h"
47
+
48
+ #include <assert.h>
49
+ #include <stdbool.h>
50
+ #include <string.h>
51
+
52
+ #include "attribute.h"
53
+ #include "char_ref.h"
54
+ #include "error.h"
55
+ #include "gumbo.h"
56
+ #include "parser.h"
57
+ #include "string_buffer.h"
58
+ #include "string_piece.h"
59
+ #include "token_type.h"
60
+ #include "tokenizer_states.h"
61
+ #include "utf8.h"
62
+ #include "util.h"
63
+ #include "vector.h"
64
+
65
+ // Compared against _script_data_buffer to determine if we're in double-escaped
66
+ // script mode.
67
+ const GumboStringPiece kScriptTag = { "script", 6 };
68
+
69
// Outcome reported by each individual tokenizer state handler.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
75
+
76
+ // This is a struct containing state necessary to build up a tag token,
77
+ // character by character.
78
+ typedef struct _GumboTagState {
79
+ // A buffer to accumulate characters for various GumboStringPiece fields.
80
+ GumboStringBuffer _buffer;
81
+
82
+ // A pointer to the start of the original text corresponding to the contents
83
+ // of the buffer.
84
+ const char* _original_text;
85
+
86
+ // The current tag enum, computed once the tag name state has finished so that
87
+ // the buffer can be re-used for building up attributes.
88
+ GumboTag _tag;
89
+
90
+ // The starting location of the text in the buffer.
91
+ GumboSourcePosition _start_pos;
92
+
93
+ // The current list of attributes. This is copied (and ownership of its data
94
+ // transferred) to the GumboStartTag token upon completion of the tag. New
95
+ // attributes are added as soon as their attribute name state is complete, and
96
+ // values are filled in by operating on _attributes.data[attributes.length-1].
97
+ GumboVector /* GumboAttribute */ _attributes;
98
+
99
+ // If true, the next attribute value to be finished should be dropped. This
100
+ // happens if a duplicate attribute name is encountered - we want to consume
101
+ // the attribute value, but shouldn't overwrite the existing value.
102
+ bool _drop_next_attr_value;
103
+
104
+ // The state that caused the tokenizer to switch into a character reference in
105
+ // attribute value state. This is used to set the additional allowed
106
+ // character, and is switched back to on completion. Initialized as the
107
+ // tokenizer enters the character reference state.
108
+ GumboTokenizerEnum _attr_value_state;
109
+
110
+ // The last start tag to have been emitted by the tokenizer. This is
111
+ // necessary to check for appropriate end tags.
112
+ GumboTag _last_start_tag;
113
+
114
+ // If true, then this is a start tag. If false, it's an end tag. This is
115
+ // necessary to generate the appropriate token type at tag-closing time.
116
+ bool _is_start_tag;
117
+
118
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
119
+ bool _is_self_closing;
120
+ } GumboTagState;
121
+
122
+ // This is the main tokenizer state struct, containing all state used by in
123
+ // tokenizing the input stream.
124
+ typedef struct _GumboTokenizerState {
125
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
126
+ GumboTokenizerEnum _state;
127
+
128
+ // A flag indicating whether the current input character needs to reconsumed
129
+ // in another state, or whether the next input character should be read for
130
+ // the next iteration of the state loop. This is set when the spec reads
131
+ // "Reconsume the current input character in..."
132
+ bool _reconsume_current_input;
133
+
134
+ // A flag indicating whether the current node is a foreign element. This is
135
+ // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
136
+ // markup declaration state.
137
+ bool _is_current_node_foreign;
138
+
139
+ // Certain states (notably character references) may emit two character tokens
140
+ // at once, but the contract for lex() fills in only one token at a time. The
141
+ // extra character is buffered here, and then this is checked on entry to
142
+ // lex(). If a character is stored here, it's immediately emitted and control
143
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
144
+ // stored.'
145
+ //
146
+ // Note that characters emitted through this mechanism will have their source
147
+ // position marked as the character under the mark, i.e. multiple characters
148
+ // may be emitted with the same position. This is desirable for character
149
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
150
+ // mechanism if the buffered characters must have their original positions in
151
+ // the document.
152
+ int _buffered_emit_char;
153
+
154
+ // A temporary buffer to accumulate characters, as described by the "temporary
155
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
156
+ // way: we record the specific character to go into the buffer, which may
157
+ // sometimes be a lowercased version of the actual input character. However,
158
+ // we *also* use utf8iterator_mark() to record the position at tag start.
159
+ // When we start flushing the temporary buffer, we set _temporary_buffer_emit
160
+ // to the start of it, and then increment it for each call to the tokenizer.
161
+ // We also call utf8iterator_reset(), and utf8iterator_next() through the
162
+ // input stream, so that tokens emitted by emit_char have the correct position
163
+ // and original text.
164
+ GumboStringBuffer _temporary_buffer;
165
+
166
+ // The current cursor position we're emitting from within
167
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
168
+ const char* _temporary_buffer_emit;
169
+
170
+ // The temporary buffer is also used by the spec to check whether we should
171
+ // enter the script data double escaped state, but we can't use the same
172
+ // buffer for both because we have to flush out "<s" as emits while still
173
+ // maintaining the context that will eventually become "script". This is a
174
+ // separate buffer that's used in place of the temporary buffer for states
175
+ // that may enter the script data double escape start state.
176
+ GumboStringBuffer _script_data_buffer;
177
+
178
+ // Pointer to the beginning of the current token in the original buffer; used
179
+ // to record the original text.
180
+ const char* _token_start;
181
+
182
+ // GumboSourcePosition recording the source location of the start of the
183
+ // current token.
184
+ GumboSourcePosition _token_start_pos;
185
+
186
+ // Current tag state.
187
+ GumboTagState _tag_state;
188
+
189
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
190
+ // not used for anything else in the doctype states), and then freshly
191
+ // allocate the strings in the doctype token, then copy it over on emit.
192
+ GumboTokenDocType _doc_type_state;
193
+
194
+ // The UTF8Iterator over the tokenizer input.
195
+ Utf8Iterator _input;
196
+ } GumboTokenizerState;
197
+
198
+ // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
199
+ static void add_parse_error(GumboParser* parser, GumboErrorType type) {
200
+ GumboError* error = gumbo_add_error(parser);
201
+ if (!error) {
202
+ return;
203
+ }
204
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
205
+ utf8iterator_get_position(&tokenizer->_input, &error->position);
206
+ error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
207
+ error->type = type;
208
+ error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
209
+ switch (tokenizer->_state) {
210
+ case GUMBO_LEX_DATA:
211
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
212
+ break;
213
+ case GUMBO_LEX_CHAR_REF_IN_DATA:
214
+ case GUMBO_LEX_CHAR_REF_IN_RCDATA:
215
+ case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
216
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
217
+ break;
218
+ case GUMBO_LEX_RCDATA:
219
+ case GUMBO_LEX_RCDATA_LT:
220
+ case GUMBO_LEX_RCDATA_END_TAG_OPEN:
221
+ case GUMBO_LEX_RCDATA_END_TAG_NAME:
222
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
223
+ break;
224
+ case GUMBO_LEX_RAWTEXT:
225
+ case GUMBO_LEX_RAWTEXT_LT:
226
+ case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
227
+ case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
228
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
229
+ break;
230
+ case GUMBO_LEX_PLAINTEXT:
231
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
232
+ break;
233
+ case GUMBO_LEX_SCRIPT:
234
+ case GUMBO_LEX_SCRIPT_LT:
235
+ case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
236
+ case GUMBO_LEX_SCRIPT_END_TAG_NAME:
237
+ case GUMBO_LEX_SCRIPT_ESCAPED_START:
238
+ case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
239
+ case GUMBO_LEX_SCRIPT_ESCAPED:
240
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
241
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
242
+ case GUMBO_LEX_SCRIPT_ESCAPED_LT:
243
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
244
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
245
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
246
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
247
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
248
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
249
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
250
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
251
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
252
+ break;
253
+ case GUMBO_LEX_TAG_OPEN:
254
+ case GUMBO_LEX_END_TAG_OPEN:
255
+ case GUMBO_LEX_TAG_NAME:
256
+ case GUMBO_LEX_BEFORE_ATTR_NAME:
257
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
258
+ break;
259
+ case GUMBO_LEX_SELF_CLOSING_START_TAG:
260
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
261
+ break;
262
+ case GUMBO_LEX_ATTR_NAME:
263
+ case GUMBO_LEX_AFTER_ATTR_NAME:
264
+ case GUMBO_LEX_BEFORE_ATTR_VALUE:
265
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
266
+ break;
267
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
268
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
269
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
270
+ case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
271
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
272
+ break;
273
+ case GUMBO_LEX_BOGUS_COMMENT:
274
+ case GUMBO_LEX_COMMENT_START:
275
+ case GUMBO_LEX_COMMENT_START_DASH:
276
+ case GUMBO_LEX_COMMENT:
277
+ case GUMBO_LEX_COMMENT_END_DASH:
278
+ case GUMBO_LEX_COMMENT_END:
279
+ case GUMBO_LEX_COMMENT_END_BANG:
280
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
281
+ break;
282
+ case GUMBO_LEX_MARKUP_DECLARATION:
283
+ case GUMBO_LEX_DOCTYPE:
284
+ case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
285
+ case GUMBO_LEX_DOCTYPE_NAME:
286
+ case GUMBO_LEX_AFTER_DOCTYPE_NAME:
287
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
288
+ case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
289
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
290
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
291
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
292
+ case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
293
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
294
+ case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
295
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
296
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
297
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
298
+ case GUMBO_LEX_BOGUS_DOCTYPE:
299
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
300
+ break;
301
+ case GUMBO_LEX_CDATA:
302
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
303
+ break;
304
+ }
305
+ }
306
+
307
// Returns true if c is an ASCII letter (A-Z or a-z).
//
// The ISO C isupper/islower functions are deliberately not used: they depend
// on the program's locale, whereas the HTML5 spec's behavior is the same in
// every locale.
static bool is_alpha(int c) {
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return c >= 'A' && c <= 'Z';
}
313
+
314
// Maps ASCII uppercase letters to their lowercase form; every other value is
// returned unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + 0x20;
}
317
+
318
+ static GumboTokenType get_char_token_type(int c) {
319
+ switch (c) {
320
+ case '\t':
321
+ case '\n':
322
+ case '\r':
323
+ case '\f':
324
+ case ' ':
325
+ return GUMBO_TOKEN_WHITESPACE;
326
+ case 0:
327
+ gumbo_debug("Emitted null byte.\n");
328
+ return GUMBO_TOKEN_NULL;
329
+ case -1:
330
+ return GUMBO_TOKEN_EOF;
331
+ default:
332
+ return GUMBO_TOKEN_CHARACTER;
333
+ }
334
+ }
335
+
336
+ // Starts recording characters in the temporary buffer.
337
+ // Because this needs to reset the utf8iterator_mark to the beginning of the
338
+ // text that will eventually be emitted, it needs to be called a couple of
339
+ // states before the spec says "Set the temporary buffer to the empty string".
340
+ // In general, this should be called whenever there's a transition to a
341
+ // "less-than sign state". The initial < and possibly / then need to be
342
+ // appended to the temporary buffer, their presence needs to be accounted for in
343
+ // states that compare the temporary buffer against a literal value, and
344
+ // spec stanzas that say "emit a < and / character token along with a character
345
+ // token for each character in the temporary buffer" need to be adjusted to
346
+ // account for the presence of the < and / inside the temporary buffer.
347
+ static void clear_temporary_buffer(GumboParser* parser) {
348
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
+ assert(!tokenizer->_temporary_buffer_emit);
350
+ utf8iterator_mark(&tokenizer->_input);
351
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
353
+ // The temporary buffer and script data buffer are the same object in the
354
+ // spec, so the script data buffer should be cleared as well.
355
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
357
+ }
358
+
359
+ // Appends a codepoint to the temporary buffer.
360
+ static void append_char_to_temporary_buffer(
361
+ GumboParser* parser, int codepoint) {
362
+ gumbo_string_buffer_append_codepoint(
363
+ parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
364
+ }
365
+
366
+ // Checks to see if the temporary buffer equals a certain string.
367
+ // Make sure this remains side-effect free; it's used in assertions.
368
+ #ifndef NDEBUG
369
+ static bool temporary_buffer_equals(
370
+ GumboParser* parser, const char* text) {
371
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
372
+ // TODO(jdtang): See if the extra strlen is a performance problem, and replace
373
+ // it with an explicit sizeof(literal) if necessary. I don't think it will
374
+ // be, as this is only used in a couple of rare states.
375
+ int text_len = strlen(text);
376
+ return text_len == buffer->length &&
377
+ memcmp(buffer->data, text, text_len) == 0;
378
+ }
379
+ #endif
380
+
381
+ static void doc_type_state_init(GumboParser* parser) {
382
+ GumboTokenDocType* doc_type_state =
383
+ &parser->_tokenizer_state->_doc_type_state;
384
+ // We initialize these to NULL here so that we don't end up leaking memory if
385
+ // we never see a doctype token. When we do see a doctype token, we reset
386
+ // them to a freshly-allocated empty string so that we can present a uniform
387
+ // interface to client code and not make them check for null. Ownership is
388
+ // transferred to the doctype token when it's emitted.
389
+ doc_type_state->name = NULL;
390
+ doc_type_state->public_identifier = NULL;
391
+ doc_type_state->system_identifier = NULL;
392
+ doc_type_state->force_quirks = false;
393
+ doc_type_state->has_public_identifier = false;
394
+ doc_type_state->has_system_identifier = false;
395
+ }
396
+
397
+ // Sets the token original_text and position to the current iterator position.
398
+ // This is necessary because [CDATA[ sections may include text that is ignored
399
+ // by the tokenizer.
400
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
401
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
402
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
403
+ }
404
+
405
+ // Sets the tag buffer original text and start point to the current iterator
406
+ // position. This is necessary because attribute names & values may have
407
+ // whitespace preceeding them, and so we can't assume that the actual token
408
+ // starting point was the end of the last tag buffer usage.
409
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
410
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
411
+ GumboTagState* tag_state = &tokenizer->_tag_state;
412
+
413
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
414
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
415
+ }
416
+
417
+ // Moves the temporary buffer contents over to the specified output string,
418
+ // and clears the temporary buffer.
419
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
420
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
421
+ *output =
422
+ gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
423
+ clear_temporary_buffer(parser);
424
+ }
425
+
426
+ // Advances the iterator past the end of the token, and then fills in the
427
+ // relevant position fields. It's assumed that after every emit, the tokenizer
428
+ // will immediately return (letting the tree-construction stage read the filled
429
+ // in Token). Thus, it's safe to advance the input stream here, since it will
430
+ // bypass the advance at the bottom of the state machine loop.
431
+ //
432
+ // Since this advances the iterator and resets the current input, make sure to
433
+ // call it after you've recorded any other data you need for the token.
434
+ static void finish_token(GumboParser* parser, GumboToken* token) {
435
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
436
+ if (!tokenizer->_reconsume_current_input) {
437
+ utf8iterator_next(&tokenizer->_input);
438
+ }
439
+
440
+ token->position = tokenizer->_token_start_pos;
441
+ token->original_text.data = tokenizer->_token_start;
442
+ reset_token_start_point(tokenizer);
443
+ token->original_text.length =
444
+ tokenizer->_token_start - token->original_text.data;
445
+ if (token->original_text.data[token->original_text.length - 1] == '\r') {
446
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
447
+ // means that the next token may start one past a \r character. The pointer
448
+ // arithmetic above results in that \r being appended to the original text
449
+ // of the preceding token, so we have to adjust its length here to chop the
450
+ // \r off.
451
+ --token->original_text.length;
452
+ }
453
+ }
454
+
455
+ // Records the doctype public ID, assumed to be in the temporary buffer.
456
+ // Convenience method that also sets has_public_identifier to true.
457
+ static void finish_doctype_public_id(GumboParser* parser) {
458
+ GumboTokenDocType* doc_type_state =
459
+ &parser->_tokenizer_state->_doc_type_state;
460
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
461
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
462
+ doc_type_state->has_public_identifier = true;
463
+ }
464
+
465
+ // Records the doctype system ID, assumed to be in the temporary buffer.
466
+ // Convenience method that also sets has_system_identifier to true.
467
+ static void finish_doctype_system_id(GumboParser* parser) {
468
+ GumboTokenDocType* doc_type_state =
469
+ &parser->_tokenizer_state->_doc_type_state;
470
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
471
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
472
+ doc_type_state->has_system_identifier = true;
473
+ }
474
+
475
+ // Writes a single specified character to the output token.
476
+ static void emit_char(GumboParser* parser, int c, GumboToken* output) {
477
+ output->type = get_char_token_type(c);
478
+ output->v.character = c;
479
+ finish_token(parser, output);
480
+ }
481
+
482
+ // Writes a replacement character token and records a parse error.
483
+ // Always returns RETURN_ERROR, per gumbo_lex return value.
484
+ static StateResult emit_replacement_char(
485
+ GumboParser* parser, GumboToken* output) {
486
+ // In all cases, this is because of a null byte in the input stream.
487
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
488
+ emit_char(parser, kUtf8ReplacementChar, output);
489
+ return RETURN_ERROR;
490
+ }
491
+
492
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
493
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
494
+ emit_char(parser, -1, output);
495
+ return RETURN_SUCCESS;
496
+ }
497
+
498
+ // Writes the current input character out as a character token.
499
+ // Always returns RETURN_SUCCESS.
500
+ static bool emit_current_char(GumboParser* parser, GumboToken* output) {
501
+ emit_char(
502
+ parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
503
+ return RETURN_SUCCESS;
504
+ }
505
+
506
+ // Writes out a doctype token, copying it from the tokenizer state.
507
+ static void emit_doctype(GumboParser* parser, GumboToken* output) {
508
+ output->type = GUMBO_TOKEN_DOCTYPE;
509
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
510
+ finish_token(parser, output);
511
+ doc_type_state_init(parser);
512
+ }
513
+
514
+ // Debug-only function that explicitly sets the attribute vector data to NULL so
515
+ // it can be asserted on tag creation, verifying that there are no memory leaks.
516
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
517
+ #ifndef NDEBUG
518
+ tag_state->_attributes = kGumboEmptyVector;
519
+ #endif
520
+ }
521
+
522
+ // Writes out the current tag as a start or end tag token.
523
+ // Always returns RETURN_SUCCESS.
524
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
525
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
526
+ if (tag_state->_is_start_tag) {
527
+ output->type = GUMBO_TOKEN_START_TAG;
528
+ output->v.start_tag.tag = tag_state->_tag;
529
+ output->v.start_tag.attributes = tag_state->_attributes;
530
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
531
+ tag_state->_last_start_tag = tag_state->_tag;
532
+ mark_tag_state_as_empty(tag_state);
533
+ gumbo_debug("Emitted start tag %s.\n",
534
+ gumbo_normalized_tagname(tag_state->_tag));
535
+ } else {
536
+ output->type = GUMBO_TOKEN_END_TAG;
537
+ output->v.end_tag = tag_state->_tag;
538
+ // In end tags, ownership of the attributes vector is not transferred to the
539
+ // token, but it's still initialized as normal, so it must be manually
540
+ // deallocated. There may also be attributes to destroy, in certain broken
541
+ // cases like </div</th> (the "th" is an attribute there).
542
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
543
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
544
+ }
545
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
546
+ mark_tag_state_as_empty(tag_state);
547
+ gumbo_debug("Emitted end tag %s.\n",
548
+ gumbo_normalized_tagname(tag_state->_tag));
549
+ }
550
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
551
+ finish_token(parser, output);
552
+ gumbo_debug("Original text = %.*s.\n", output->original_text.length, output->original_text.data);
553
+ assert(output->original_text.length >= 2);
554
+ assert(output->original_text.data[0] == '<');
555
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
556
+ return RETURN_SUCCESS;
557
+ }
558
+
559
+ // In some states, we speculatively start a tag, but don't know whether it'll be
560
+ // emitted as tag token or as a series of character tokens until we finish it.
561
+ // We need to abandon the tag we'd started & free its memory in that case to
562
+ // avoid a memory leak.
563
+ static void abandon_current_tag(GumboParser* parser) {
564
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
565
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
566
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
567
+ }
568
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
569
+ mark_tag_state_as_empty(tag_state);
570
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
571
+ gumbo_debug("Abandoning current tag.\n");
572
+ }
573
+
574
+ // Wraps the consume_char_ref function to handle its output and make the
575
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
576
+ // error occurred, RETURN_SUCCESS otherwise.
577
+ static StateResult emit_char_ref(
578
+ GumboParser* parser, int additional_allowed_char,
579
+ bool is_in_attribute, GumboToken* output) {
580
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
581
+ OneOrTwoCodepoints char_ref;
582
+ bool status = consume_char_ref(
583
+ parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
584
+ if (char_ref.first != kGumboNoChar) {
585
+ // consume_char_ref ends with the iterator pointing at the next character,
586
+ // so we need to be sure not advance it again before reading the next token.
587
+ tokenizer->_reconsume_current_input = true;
588
+ emit_char(parser, char_ref.first, output);
589
+ tokenizer->_buffered_emit_char = char_ref.second;
590
+ } else {
591
+ emit_char(parser, '&', output);
592
+ }
593
+ return status ? RETURN_SUCCESS : RETURN_ERROR;
594
+ }
595
+
596
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
597
+ // data, and then it's copied over and released to the 'text' field of the
598
+ // GumboToken union. Always returns RETURN_SUCCESS.
599
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
600
+ output->type = GUMBO_TOKEN_COMMENT;
601
+ finish_temporary_buffer(parser, &output->v.text);
602
+ finish_token(parser, output);
603
+ return RETURN_SUCCESS;
604
+ }
605
+
606
+ // Checks to see we should be flushing accumulated characters in the temporary
607
+ // buffer, and fills the output token with the next output character if so.
608
+ // Returns true if a character has been emitted and the tokenizer should
609
+ // immediately return, false if we're at the end of the temporary buffer and
610
+ // should resume normal operation.
611
+ static bool maybe_emit_from_temporary_buffer(
612
+ GumboParser* parser, GumboToken* output) {
613
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
614
+ const char* c = tokenizer->_temporary_buffer_emit;
615
+ GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
616
+
617
+ if (!c || c >= buffer->data + buffer->length) {
618
+ tokenizer->_temporary_buffer_emit = NULL;
619
+ return false;
620
+ }
621
+
622
+ assert(*c == utf8iterator_current(&tokenizer->_input));
623
+ // emit_char also advances the input stream. We need to do some juggling of
624
+ // the _reconsume_current_input flag to get the proper behavior when emitting
625
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
626
+ // when emitting anything from the temporary buffer, since those characters
627
+ // have already been advanced past. However, it should be preserved so that
628
+ // when the *next* character is encountered again, the tokenizer knows not to
629
+ // advance past it.
630
+ bool saved_reconsume_state = tokenizer->_reconsume_current_input;
631
+ tokenizer->_reconsume_current_input = false;
632
+ emit_char(parser, *c, output);
633
+ ++tokenizer->_temporary_buffer_emit;
634
+ tokenizer->_reconsume_current_input = saved_reconsume_state;
635
+ return true;
636
+ }
637
+
638
+ // Sets up the tokenizer to begin flushing the temporary buffer.
639
+ // This resets the input iterator stream to the start of the last tag, sets up
640
+ // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
641
+ // the first character in it. It returns true if a character was emitted, false
642
+ // otherwise.
643
+ static bool emit_temporary_buffer(
644
+ GumboParser* parser, GumboToken* output) {
645
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
646
+ assert(tokenizer->_temporary_buffer.data);
647
+ utf8iterator_reset(&tokenizer->_input);
648
+ tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
649
+ return maybe_emit_from_temporary_buffer(parser, output);
650
+ }
651
+
652
+ // Appends a codepoint to the current tag buffer. If
653
+ // reinitilize_position_on_first is set, this also initializes the tag buffer
654
+ // start point; the only time you would *not* want to pass true for this
655
+ // parameter is if you want the original_text to include character (like an
656
+ // opening quote) that doesn't appear in the value.
657
+ static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
658
+ bool reinitilize_position_on_first) {
659
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
660
+ if (buffer->length == 0 && reinitilize_position_on_first) {
661
+ reset_tag_buffer_start_point(parser);
662
+ }
663
+ gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
664
+ }
665
+
666
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
667
+ // and _start_pos field to point to the current position.
668
+ static void initialize_tag_buffer(GumboParser* parser) {
669
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
670
+ GumboTagState* tag_state = &tokenizer->_tag_state;
671
+
672
+ gumbo_string_buffer_init(parser, &tag_state->_buffer);
673
+ reset_tag_buffer_start_point(parser);
674
+ }
675
+
676
+ // Initializes the tag_state to start a new tag, keeping track of the opening
677
+ // positions and original text. Takes a boolean indicating whether this is a
678
+ // start or end tag.
679
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
680
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
681
+ GumboTagState* tag_state = &tokenizer->_tag_state;
682
+ int c = utf8iterator_current(&tokenizer->_input);
683
+ assert(is_alpha(c));
684
+ c = ensure_lowercase(c);
685
+ assert(is_alpha(c));
686
+
687
+ initialize_tag_buffer(parser);
688
+ gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
689
+
690
+ assert(tag_state->_attributes.data == NULL);
691
+ gumbo_vector_init(parser, 4, &tag_state->_attributes);
692
+ tag_state->_drop_next_attr_value = false;
693
+ tag_state->_is_start_tag = is_start_tag;
694
+ tag_state->_is_self_closing = false;
695
+ gumbo_debug("Starting new tag.\n");
696
+ }
697
+
698
+ // Fills in the specified char* with the contents of the tag buffer.
699
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
700
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
701
+ GumboTagState* tag_state = &tokenizer->_tag_state;
702
+ *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
703
+ }
704
+
705
+ // Fills in:
706
+ // * The original_text GumboStringPiece with the portion of the original
707
+ // buffer that corresponds to the tag buffer.
708
+ // * The start_pos GumboSourcePosition with the start position of the tag
709
+ // buffer.
710
+ // * The end_pos GumboSourcePosition with the current source position.
711
+ static void copy_over_original_tag_text(
712
+ GumboParser* parser, GumboStringPiece* original_text,
713
+ GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
714
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
715
+ GumboTagState* tag_state = &tokenizer->_tag_state;
716
+
717
+ original_text->data = tag_state->_original_text;
718
+ original_text->length =
719
+ utf8iterator_get_char_pointer(&tokenizer->_input) -
720
+ tag_state->_original_text;
721
+ if (original_text->data[original_text->length - 1] == '\r') {
722
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
723
+ // appended to the end of original text even when it's really the first part
724
+ // of the next character. If we detect this situation, shrink the length of
725
+ // the original text by 1 to remove the carriage return.
726
+ --original_text->length;
727
+ }
728
+ *start_pos = tag_state->_start_pos;
729
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
730
+ }
731
+
732
+ // Releases and then re-initializes the tag buffer.
733
+ static void reinitialize_tag_buffer(GumboParser* parser) {
734
+ gumbo_parser_deallocate(
735
+ parser, parser->_tokenizer_state->_tag_state._buffer.data);
736
+ initialize_tag_buffer(parser);
737
+ }
738
+
739
+ // Moves some data from the temporary buffer over the the tag-based fields in
740
+ // TagState.
741
+ static void finish_tag_name(GumboParser* parser) {
742
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
743
+ GumboTagState* tag_state = &tokenizer->_tag_state;
744
+
745
+ const char* temp;
746
+ copy_over_tag_buffer(parser, &temp);
747
+ tag_state->_tag = gumbo_tag_enum(temp);
748
+ reinitialize_tag_buffer(parser);
749
+ gumbo_parser_deallocate(parser, (void*) temp);
750
+ }
751
+
752
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
753
+ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
754
+ int original_index, int new_index) {
755
+ GumboError* error = gumbo_add_error(parser);
756
+ if (!error) {
757
+ return;
758
+ }
759
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
760
+ error->type = GUMBO_ERR_DUPLICATE_ATTR;
761
+ error->position = tag_state->_start_pos;
762
+ error->original_text = tag_state->_original_text;
763
+ error->v.duplicate_attr.original_index = original_index;
764
+ error->v.duplicate_attr.new_index = new_index;
765
+ copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
766
+ reinitialize_tag_buffer(parser);
767
+ }
768
+
769
+ // Creates a new attribute in the current tag, copying the current tag buffer to
770
+ // the attribute's name. The attribute's value starts out as the empty string
771
+ // (following the "Boolean attributes" section of the spec) and is only
772
+ // overwritten on finish_attribute_value(). If the attribute has already been
773
+ // specified, the new attribute is dropped, a parse error is added, and the
774
+ // function returns false. Otherwise, this returns true.
775
+ static bool finish_attribute_name(GumboParser* parser) {
776
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
777
+ GumboTagState* tag_state = &tokenizer->_tag_state;
778
+ // May've been set by a previous attribute without a value; reset it here.
779
+ tag_state->_drop_next_attr_value = false;
780
+ assert(tag_state->_attributes.data);
781
+ assert(tag_state->_attributes.capacity);
782
+
783
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
784
+ for (int i = 0; i < attributes->length; ++i) {
785
+ GumboAttribute* attr = attributes->data[i];
786
+ if (strlen(attr->name) == tag_state->_buffer.length &&
787
+ memcmp(attr->name, tag_state->_buffer.data,
788
+ tag_state->_buffer.length) == 0) {
789
+ // Identical attribute; bail.
790
+ add_duplicate_attr_error(
791
+ parser, attr->name, i, attributes->length);
792
+ tag_state->_drop_next_attr_value = true;
793
+ return false;
794
+ }
795
+ }
796
+
797
+ GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
798
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
799
+ copy_over_tag_buffer(parser, &attr->name);
800
+ copy_over_original_tag_text(parser, &attr->original_name,
801
+ &attr->name_start, &attr->name_end);
802
+ attr->value = gumbo_copy_stringz(parser, "");
803
+ copy_over_original_tag_text(parser, &attr->original_value,
804
+ &attr->name_start, &attr->name_end);
805
+ gumbo_vector_add(parser, attr, attributes);
806
+ reinitialize_tag_buffer(parser);
807
+ return true;
808
+ }
809
+
810
+ // Finishes an attribute value. This sets the value of the most recently added
811
+ // attribute to the current contents of the tag buffer.
812
+ static void finish_attribute_value(GumboParser* parser) {
813
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
+ if (tag_state->_drop_next_attr_value) {
815
+ // Duplicate attribute name detected in an earlier state, so we have to
816
+ // ignore the value.
817
+ tag_state->_drop_next_attr_value = false;
818
+ return;
819
+ }
820
+
821
+ GumboAttribute* attr =
822
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
823
+ gumbo_parser_deallocate(parser, (void*) attr->value);
824
+ copy_over_tag_buffer(parser, &attr->value);
825
+ copy_over_original_tag_text(parser, &attr->original_value,
826
+ &attr->value_start, &attr->value_end);
827
+ reinitialize_tag_buffer(parser);
828
+ }
829
+
830
+ // Returns true if the current end tag matches the last start tag emitted.
831
+ static bool is_appropriate_end_tag(GumboParser* parser) {
832
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
833
+ assert(!tag_state->_is_start_tag);
834
+ // Null terminate the current string buffer, so it can be passed to
835
+ // gumbo_tag_enum, but don't increment the length in case we need to dump the
836
+ // buffer as character tokens.
837
+ gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
838
+ --tag_state->_buffer.length;
839
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
840
+ tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
841
+ }
842
+
843
+ void gumbo_tokenizer_state_init(
844
+ GumboParser* parser, const char* text, size_t text_length) {
845
+ GumboTokenizerState* tokenizer =
846
+ gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
847
+ parser->_tokenizer_state = tokenizer;
848
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
849
+ tokenizer->_reconsume_current_input = false;
850
+ tokenizer->_is_current_node_foreign = false;
851
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
852
+
853
+ tokenizer->_buffered_emit_char = kGumboNoChar;
854
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
855
+ tokenizer->_temporary_buffer_emit = NULL;
856
+
857
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
858
+
859
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
860
+ tokenizer->_token_start = text;
861
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
862
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
863
+ doc_type_state_init(parser);
864
+ }
865
+
866
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
867
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
868
+ assert(tokenizer->_doc_type_state.name == NULL);
869
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
870
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
871
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
872
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
873
+ gumbo_parser_deallocate(parser, tokenizer);
874
+ }
875
+
876
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
877
+ parser->_tokenizer_state->_state = state;
878
+ }
879
+
880
+ void gumbo_tokenizer_set_is_current_node_foreign(
881
+ GumboParser* parser, bool is_foreign) {
882
+ if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
883
+ gumbo_debug("Toggling is_current_node_foreign to %s.\n",
884
+ is_foreign ? "true" : "false");
885
+ }
886
+ parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
887
+ }
888
+
889
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
890
+ static StateResult handle_data_state(
891
+ GumboParser* parser, GumboTokenizerState* tokenizer,
892
+ int c, GumboToken* output) {
893
+ switch (c) {
894
+ case '&':
895
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
896
+ // The char_ref machinery expects to be on the & so it can mark that
897
+ // and return to it if the text isn't a char ref, so we need to
898
+ // reconsume it.
899
+ tokenizer->_reconsume_current_input = true;
900
+ return NEXT_CHAR;
901
+ case '<':
902
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
903
+ clear_temporary_buffer(parser);
904
+ append_char_to_temporary_buffer(parser, '<');
905
+ return NEXT_CHAR;
906
+ case '\0':
907
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
908
+ emit_char(parser, c, output);
909
+ return RETURN_ERROR;
910
+ default:
911
+ return emit_current_char(parser, output);
912
+ }
913
+ }
914
+
915
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
916
+ static StateResult handle_char_ref_in_data_state(
917
+ GumboParser* parser, GumboTokenizerState* tokenizer,
918
+ int c, GumboToken* output) {
919
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
920
+ return emit_char_ref(parser, ' ', false, output);
921
+ }
922
+
923
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
924
+ static StateResult handle_rcdata_state(
925
+ GumboParser* parser, GumboTokenizerState* tokenizer,
926
+ int c, GumboToken* output) {
927
+ switch (c) {
928
+ case '&':
929
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930
+ tokenizer->_reconsume_current_input = true;
931
+ return NEXT_CHAR;
932
+ case '<':
933
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934
+ clear_temporary_buffer(parser);
935
+ append_char_to_temporary_buffer(parser, '<');
936
+ return NEXT_CHAR;
937
+ case '\0':
938
+ return emit_replacement_char(parser, output);
939
+ case -1:
940
+ return emit_eof(parser, output);
941
+ default:
942
+ return emit_current_char(parser, output);
943
+ }
944
+ }
945
+
946
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
+ static StateResult handle_char_ref_in_rcdata_state(
948
+ GumboParser* parser, GumboTokenizerState* tokenizer,
949
+ int c, GumboToken* output) {
950
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
951
+ return emit_char_ref(parser, ' ', false, output);
952
+ }
953
+
954
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
955
+ static StateResult handle_rawtext_state(
956
+ GumboParser* parser, GumboTokenizerState* tokenizer,
957
+ int c, GumboToken* output) {
958
+ switch (c) {
959
+ case '<':
960
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
961
+ clear_temporary_buffer(parser);
962
+ append_char_to_temporary_buffer(parser, '<');
963
+ return NEXT_CHAR;
964
+ case '\0':
965
+ return emit_replacement_char(parser, output);
966
+ case -1:
967
+ return emit_eof(parser, output);
968
+ default:
969
+ return emit_current_char(parser, output);
970
+ }
971
+ }
972
+
973
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
974
+ static StateResult handle_script_state(
975
+ GumboParser* parser, GumboTokenizerState* tokenizer,
976
+ int c, GumboToken* output) {
977
+ switch (c) {
978
+ case '<':
979
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
980
+ clear_temporary_buffer(parser);
981
+ append_char_to_temporary_buffer(parser, '<');
982
+ return NEXT_CHAR;
983
+ case '\0':
984
+ return emit_replacement_char(parser, output);
985
+ case -1:
986
+ return emit_eof(parser, output);
987
+ default:
988
+ return emit_current_char(parser, output);
989
+ }
990
+ }
991
+
992
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
993
+ static StateResult handle_plaintext_state(
994
+ GumboParser* parser, GumboTokenizerState* tokenizer,
995
+ int c, GumboToken* output) {
996
+ switch (c) {
997
+ case '\0':
998
+ return emit_replacement_char(parser, output);
999
+ case -1:
1000
+ return emit_eof(parser, output);
1001
+ default:
1002
+ return emit_current_char(parser, output);
1003
+ }
1004
+ }
1005
+
1006
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1007
+ static StateResult handle_tag_open_state(
1008
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1009
+ int c, GumboToken* output) {
1010
+ assert(temporary_buffer_equals(parser, "<"));
1011
+ switch (c) {
1012
+ case '!':
1013
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1014
+ clear_temporary_buffer(parser);
1015
+ return NEXT_CHAR;
1016
+ case '/':
1017
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1018
+ append_char_to_temporary_buffer(parser, '/');
1019
+ return NEXT_CHAR;
1020
+ case '?':
1021
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1022
+ clear_temporary_buffer(parser);
1023
+ append_char_to_temporary_buffer(parser, '?');
1024
+ add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1025
+ return NEXT_CHAR;
1026
+ default:
1027
+ if (is_alpha(c)) {
1028
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1029
+ start_new_tag(parser, true);
1030
+ return NEXT_CHAR;
1031
+ } else {
1032
+ add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1033
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1034
+ emit_temporary_buffer(parser, output);
1035
+ return RETURN_ERROR;
1036
+ }
1037
+ }
1038
+ }
1039
+
1040
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1041
+ static StateResult handle_end_tag_open_state(
1042
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1043
+ int c, GumboToken* output) {
1044
+ assert(temporary_buffer_equals(parser, "</"));
1045
+ switch (c) {
1046
+ case '>':
1047
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1048
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1049
+ return NEXT_CHAR;
1050
+ case -1:
1051
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1052
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1053
+ return emit_temporary_buffer(parser, output);
1054
+ default:
1055
+ if (is_alpha(c)) {
1056
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1057
+ start_new_tag(parser, false);
1058
+ } else {
1059
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1060
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1061
+ clear_temporary_buffer(parser);
1062
+ append_char_to_temporary_buffer(parser, c);
1063
+ }
1064
+ return NEXT_CHAR;
1065
+ }
1066
+ }
1067
+
1068
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1069
+ static StateResult handle_tag_name_state(
1070
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1071
+ int c, GumboToken* output) {
1072
+ switch (c) {
1073
+ case '\t':
1074
+ case '\n':
1075
+ case '\f':
1076
+ case ' ':
1077
+ finish_tag_name(parser);
1078
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1079
+ return NEXT_CHAR;
1080
+ case '/':
1081
+ finish_tag_name(parser);
1082
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1083
+ return NEXT_CHAR;
1084
+ case '>':
1085
+ finish_tag_name(parser);
1086
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1087
+ return emit_current_tag(parser, output);
1088
+ case '\0':
1089
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1090
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1091
+ return NEXT_CHAR;
1092
+ case -1:
1093
+ add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1094
+ abandon_current_tag(parser);
1095
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1096
+ return NEXT_CHAR;
1097
+ default:
1098
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1099
+ return NEXT_CHAR;
1100
+ }
1101
+ }
1102
+
1103
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1104
+ static StateResult handle_rcdata_lt_state(
1105
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1106
+ int c, GumboToken* output) {
1107
+ assert(temporary_buffer_equals(parser, "<"));
1108
+ if (c == '/') {
1109
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1110
+ append_char_to_temporary_buffer(parser, '/');
1111
+ return NEXT_CHAR;
1112
+ } else {
1113
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1114
+ tokenizer->_reconsume_current_input = true;
1115
+ return emit_temporary_buffer(parser, output);
1116
+ }
1117
+ }
1118
+
1119
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1120
+ static StateResult handle_rcdata_end_tag_open_state(
1121
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1122
+ int c, GumboToken* output) {
1123
+ assert(temporary_buffer_equals(parser, "</"));
1124
+ if (is_alpha(c)) {
1125
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1126
+ start_new_tag(parser, false);
1127
+ append_char_to_temporary_buffer(parser, c);
1128
+ return NEXT_CHAR;
1129
+ } else {
1130
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1131
+ return emit_temporary_buffer(parser, output);
1132
+ }
1133
+ return true;
1134
+ }
1135
+
1136
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1137
+ static StateResult handle_rcdata_end_tag_name_state(
1138
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1139
+ int c, GumboToken* output) {
1140
+ assert(tokenizer->_temporary_buffer.length >= 2);
1141
+ if (is_alpha(c)) {
1142
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1143
+ append_char_to_temporary_buffer(parser, c);
1144
+ return NEXT_CHAR;
1145
+ } else if (is_appropriate_end_tag(parser)) {
1146
+ switch (c) {
1147
+ case '\t':
1148
+ case '\n':
1149
+ case '\f':
1150
+ case ' ':
1151
+ finish_tag_name(parser);
1152
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1153
+ return NEXT_CHAR;
1154
+ case '/':
1155
+ finish_tag_name(parser);
1156
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1157
+ return NEXT_CHAR;
1158
+ case '>':
1159
+ finish_tag_name(parser);
1160
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1161
+ return emit_current_tag(parser, output);
1162
+ }
1163
+ }
1164
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1165
+ abandon_current_tag(parser);
1166
+ return emit_temporary_buffer(parser, output);
1167
+ }
1168
+
1169
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1170
+ static StateResult handle_rawtext_lt_state(
1171
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1172
+ int c, GumboToken* output) {
1173
+ assert(temporary_buffer_equals(parser, "<"));
1174
+ if (c == '/') {
1175
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1176
+ append_char_to_temporary_buffer(parser, '/');
1177
+ return NEXT_CHAR;
1178
+ } else {
1179
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1180
+ tokenizer->_reconsume_current_input = true;
1181
+ return emit_temporary_buffer(parser, output);
1182
+ }
1183
+ }
1184
+
1185
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1186
+ static StateResult handle_rawtext_end_tag_open_state(
1187
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1188
+ int c, GumboToken* output) {
1189
+ assert(temporary_buffer_equals(parser, "</"));
1190
+ if (is_alpha(c)) {
1191
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1192
+ start_new_tag(parser, false);
1193
+ append_char_to_temporary_buffer(parser, c);
1194
+ return NEXT_CHAR;
1195
+ } else {
1196
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1197
+ return emit_temporary_buffer(parser, output);
1198
+ }
1199
+ }
1200
+
1201
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1202
+ static StateResult handle_rawtext_end_tag_name_state(
1203
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1204
+ int c, GumboToken* output) {
1205
+ assert(tokenizer->_temporary_buffer.length >= 2);
1206
+ gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1207
+ tokenizer->_tag_state._buffer.data);
1208
+ if (is_alpha(c)) {
1209
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1210
+ append_char_to_temporary_buffer(parser, c);
1211
+ return NEXT_CHAR;
1212
+ } else if (is_appropriate_end_tag(parser)) {
1213
+ gumbo_debug("Is an appropriate end tag.\n");
1214
+ switch (c) {
1215
+ case '\t':
1216
+ case '\n':
1217
+ case '\f':
1218
+ case ' ':
1219
+ finish_tag_name(parser);
1220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1221
+ return NEXT_CHAR;
1222
+ case '/':
1223
+ finish_tag_name(parser);
1224
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1225
+ return NEXT_CHAR;
1226
+ case '>':
1227
+ finish_tag_name(parser);
1228
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1229
+ return emit_current_tag(parser, output);
1230
+ }
1231
+ }
1232
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1233
+ abandon_current_tag(parser);
1234
+ return emit_temporary_buffer(parser, output);
1235
+ }
1236
+
1237
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1238
+ static StateResult handle_script_lt_state(
1239
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1240
+ int c, GumboToken* output) {
1241
+ assert(temporary_buffer_equals(parser, "<"));
1242
+ if (c == '/') {
1243
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1244
+ append_char_to_temporary_buffer(parser, '/');
1245
+ return NEXT_CHAR;
1246
+ } else if (c == '!') {
1247
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1248
+ append_char_to_temporary_buffer(parser, '!');
1249
+ return emit_temporary_buffer(parser, output);
1250
+ } else {
1251
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1252
+ tokenizer->_reconsume_current_input = true;
1253
+ return emit_temporary_buffer(parser, output);
1254
+ }
1255
+ }
1256
+
1257
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1258
+ static StateResult handle_script_end_tag_open_state(
1259
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1260
+ int c, GumboToken* output) {
1261
+ assert(temporary_buffer_equals(parser, "</"));
1262
+ if (is_alpha(c)) {
1263
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1264
+ start_new_tag(parser, false);
1265
+ append_char_to_temporary_buffer(parser, c);
1266
+ return NEXT_CHAR;
1267
+ } else {
1268
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1269
+ return emit_temporary_buffer(parser, output);
1270
+ }
1271
+ }
1272
+
1273
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1274
+ static StateResult handle_script_end_tag_name_state(
1275
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1276
+ int c, GumboToken* output) {
1277
+ assert(tokenizer->_temporary_buffer.length >= 2);
1278
+ if (is_alpha(c)) {
1279
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1280
+ append_char_to_temporary_buffer(parser, c);
1281
+ return NEXT_CHAR;
1282
+ } else if (is_appropriate_end_tag(parser)) {
1283
+ switch (c) {
1284
+ case '\t':
1285
+ case '\n':
1286
+ case '\f':
1287
+ case ' ':
1288
+ finish_tag_name(parser);
1289
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1290
+ return NEXT_CHAR;
1291
+ case '/':
1292
+ finish_tag_name(parser);
1293
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1294
+ return NEXT_CHAR;
1295
+ case '>':
1296
+ finish_tag_name(parser);
1297
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1298
+ return emit_current_tag(parser, output);
1299
+ }
1300
+ }
1301
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1302
+ abandon_current_tag(parser);
1303
+ return emit_temporary_buffer(parser, output);
1304
+ }
1305
+
1306
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1307
+ static StateResult handle_script_escaped_start_state(
1308
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1309
+ int c, GumboToken* output) {
1310
+ if (c == '-') {
1311
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1312
+ return emit_current_char(parser, output);
1313
+ } else {
1314
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1315
+ tokenizer->_reconsume_current_input = true;
1316
+ return NEXT_CHAR;
1317
+ }
1318
+ }
1319
+
1320
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1321
+ static StateResult handle_script_escaped_start_dash_state(
1322
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1323
+ int c, GumboToken* output) {
1324
+ if (c == '-') {
1325
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1326
+ return emit_current_char(parser, output);
1327
+ } else {
1328
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1329
+ tokenizer->_reconsume_current_input = true;
1330
+ return NEXT_CHAR;
1331
+ }
1332
+ }
1333
+
1334
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1335
+ static StateResult handle_script_escaped_state(
1336
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1337
+ int c, GumboToken* output) {
1338
+ switch (c) {
1339
+ case '-':
1340
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1341
+ return emit_current_char(parser, output);
1342
+ case '<':
1343
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1344
+ clear_temporary_buffer(parser);
1345
+ append_char_to_temporary_buffer(parser, c);
1346
+ return NEXT_CHAR;
1347
+ case '\0':
1348
+ return emit_replacement_char(parser, output);
1349
+ case -1:
1350
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1351
+ return emit_eof(parser, output);
1352
+ default:
1353
+ return emit_current_char(parser, output);
1354
+ }
1355
+ }
1356
+
1357
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1358
+ static StateResult handle_script_escaped_dash_state(
1359
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1360
+ int c, GumboToken* output) {
1361
+ switch (c) {
1362
+ case '-':
1363
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1364
+ return emit_current_char(parser, output);
1365
+ case '<':
1366
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1367
+ clear_temporary_buffer(parser);
1368
+ append_char_to_temporary_buffer(parser, c);
1369
+ return NEXT_CHAR;
1370
+ case '\0':
1371
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1372
+ return emit_replacement_char(parser, output);
1373
+ case -1:
1374
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1375
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1376
+ return NEXT_CHAR;
1377
+ default:
1378
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379
+ return emit_current_char(parser, output);
1380
+ }
1381
+ }
1382
+
1383
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1384
+ static StateResult handle_script_escaped_dash_dash_state(
1385
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1386
+ int c, GumboToken* output) {
1387
+ switch (c) {
1388
+ case '-':
1389
+ return emit_current_char(parser, output);
1390
+ case '<':
1391
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1392
+ clear_temporary_buffer(parser);
1393
+ append_char_to_temporary_buffer(parser, c);
1394
+ return NEXT_CHAR;
1395
+ case '>':
1396
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
+ return emit_current_char(parser, output);
1398
+ case '\0':
1399
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1400
+ return emit_replacement_char(parser, output);
1401
+ case -1:
1402
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1403
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1404
+ return NEXT_CHAR;
1405
+ default:
1406
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407
+ return emit_current_char(parser, output);
1408
+ }
1409
+ }
1410
+
1411
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1412
+ static StateResult handle_script_escaped_lt_state(
1413
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1414
+ int c, GumboToken* output) {
1415
+ assert(temporary_buffer_equals(parser, "<"));
1416
+ assert(!tokenizer->_script_data_buffer.length);
1417
+ if (c == '/') {
1418
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1419
+ append_char_to_temporary_buffer(parser, c);
1420
+ return NEXT_CHAR;
1421
+ } else if (is_alpha(c)) {
1422
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1423
+ append_char_to_temporary_buffer(parser, c);
1424
+ gumbo_string_buffer_append_codepoint(
1425
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1426
+ return emit_temporary_buffer(parser, output);
1427
+ } else {
1428
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1429
+ return emit_temporary_buffer(parser, output);
1430
+ }
1431
+ }
1432
+
1433
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1434
+ static StateResult handle_script_escaped_end_tag_open_state(
1435
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1436
+ int c, GumboToken* output) {
1437
+ assert(temporary_buffer_equals(parser, "</"));
1438
+ if (is_alpha(c)) {
1439
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1440
+ start_new_tag(parser, false);
1441
+ append_char_to_temporary_buffer(parser, c);
1442
+ return NEXT_CHAR;
1443
+ } else {
1444
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1445
+ return emit_temporary_buffer(parser, output);
1446
+ }
1447
+ }
1448
+
1449
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1450
+ static StateResult handle_script_escaped_end_tag_name_state(
1451
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1452
+ int c, GumboToken* output) {
1453
+ assert(tokenizer->_temporary_buffer.length >= 2);
1454
+ if (is_alpha(c)) {
1455
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1456
+ append_char_to_temporary_buffer(parser, c);
1457
+ return NEXT_CHAR;
1458
+ } else if (is_appropriate_end_tag(parser)) {
1459
+ switch (c) {
1460
+ case '\t':
1461
+ case '\n':
1462
+ case '\f':
1463
+ case ' ':
1464
+ finish_tag_name(parser);
1465
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1466
+ return NEXT_CHAR;
1467
+ case '/':
1468
+ finish_tag_name(parser);
1469
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1470
+ return NEXT_CHAR;
1471
+ case '>':
1472
+ finish_tag_name(parser);
1473
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1474
+ return emit_current_tag(parser, output);
1475
+ }
1476
+ }
1477
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1478
+ abandon_current_tag(parser);
1479
+ return emit_temporary_buffer(parser, output);
1480
+ }
1481
+
1482
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1483
+ static StateResult handle_script_double_escaped_start_state(
1484
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1485
+ int c, GumboToken* output) {
1486
+ switch (c) {
1487
+ case '\t':
1488
+ case '\n':
1489
+ case '\f':
1490
+ case ' ':
1491
+ case '/':
1492
+ case '>':
1493
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1494
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1495
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
1496
+ return emit_current_char(parser, output);
1497
+ default:
1498
+ if (is_alpha(c)) {
1499
+ gumbo_string_buffer_append_codepoint(
1500
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1501
+ return emit_current_char(parser, output);
1502
+ } else {
1503
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1504
+ tokenizer->_reconsume_current_input = true;
1505
+ return NEXT_CHAR;
1506
+ }
1507
+ }
1508
+ }
1509
+
1510
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1511
+ static StateResult handle_script_double_escaped_state(
1512
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1513
+ int c, GumboToken* output) {
1514
+ switch (c) {
1515
+ case '-':
1516
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1517
+ return emit_current_char(parser, output);
1518
+ case '<':
1519
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1520
+ return emit_current_char(parser, output);
1521
+ case '\0':
1522
+ return emit_replacement_char(parser, output);
1523
+ case -1:
1524
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1525
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1526
+ return NEXT_CHAR;
1527
+ default:
1528
+ return emit_current_char(parser, output);
1529
+ }
1530
+ }
1531
+
1532
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1533
+ static StateResult handle_script_double_escaped_dash_state(
1534
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1535
+ int c, GumboToken* output) {
1536
+ switch (c) {
1537
+ case '-':
1538
+ gumbo_tokenizer_set_state(
1539
+ parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1540
+ return emit_current_char(parser, output);
1541
+ case '<':
1542
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1543
+ return emit_current_char(parser, output);
1544
+ case '\0':
1545
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1546
+ return emit_replacement_char(parser, output);
1547
+ case -1:
1548
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1549
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1550
+ return NEXT_CHAR;
1551
+ default:
1552
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1553
+ return emit_current_char(parser, output);
1554
+ }
1555
+ }
1556
+
1557
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1558
+ static StateResult handle_script_double_escaped_dash_dash_state(
1559
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1560
+ int c, GumboToken* output) {
1561
+ switch (c) {
1562
+ case '-':
1563
+ return emit_current_char(parser, output);
1564
+ case '<':
1565
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1566
+ return emit_current_char(parser, output);
1567
+ case '>':
1568
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1569
+ return emit_current_char(parser, output);
1570
+ case '\0':
1571
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1572
+ return emit_replacement_char(parser, output);
1573
+ case -1:
1574
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1575
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1576
+ return NEXT_CHAR;
1577
+ default:
1578
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1579
+ return emit_current_char(parser, output);
1580
+ }
1581
+ }
1582
+
1583
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1584
+ static StateResult handle_script_double_escaped_lt_state(
1585
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1586
+ int c, GumboToken* output) {
1587
+ if (c == '/') {
1588
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1589
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1590
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1591
+ return emit_current_char(parser, output);
1592
+ } else {
1593
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1594
+ tokenizer->_reconsume_current_input = true;
1595
+ return NEXT_CHAR;
1596
+ }
1597
+
1598
+ }
1599
+
1600
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1601
+ static StateResult handle_script_double_escaped_end_state(
1602
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1603
+ int c, GumboToken* output) {
1604
+ switch (c) {
1605
+ case '\t':
1606
+ case '\n':
1607
+ case '\f':
1608
+ case ' ':
1609
+ case '/':
1610
+ case '>':
1611
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1612
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1613
+ ? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1614
+ return emit_current_char(parser, output);
1615
+ default:
1616
+ if (is_alpha(c)) {
1617
+ gumbo_string_buffer_append_codepoint(
1618
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1619
+ return emit_current_char(parser, output);
1620
+ } else {
1621
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1622
+ tokenizer->_reconsume_current_input = true;
1623
+ return NEXT_CHAR;
1624
+ }
1625
+ }
1626
+ }
1627
+
1628
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1629
+ static StateResult handle_before_attr_name_state(
1630
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1631
+ int c, GumboToken* output) {
1632
+ switch (c) {
1633
+ case '\t':
1634
+ case '\n':
1635
+ case '\f':
1636
+ case ' ':
1637
+ return NEXT_CHAR;
1638
+ case '/':
1639
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1640
+ return NEXT_CHAR;
1641
+ case '>':
1642
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1643
+ return emit_current_tag(parser, output);
1644
+ case '\0':
1645
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1646
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1647
+ append_char_to_temporary_buffer(parser, 0xfffd);
1648
+ return NEXT_CHAR;
1649
+ case -1:
1650
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1651
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1652
+ abandon_current_tag(parser);
1653
+ return NEXT_CHAR;
1654
+ case '"':
1655
+ case '\'':
1656
+ case '<':
1657
+ case '=':
1658
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1659
+ // Fall through.
1660
+ default:
1661
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1662
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1663
+ return NEXT_CHAR;
1664
+ }
1665
+ }
1666
+
1667
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1668
+ static StateResult handle_attr_name_state(
1669
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1670
+ int c, GumboToken* output) {
1671
+ switch (c) {
1672
+ case '\t':
1673
+ case '\n':
1674
+ case '\f':
1675
+ case ' ':
1676
+ finish_attribute_name(parser);
1677
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1678
+ return NEXT_CHAR;
1679
+ case '/':
1680
+ finish_attribute_name(parser);
1681
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1682
+ return NEXT_CHAR;
1683
+ case '=':
1684
+ finish_attribute_name(parser);
1685
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1686
+ return NEXT_CHAR;
1687
+ case '>':
1688
+ finish_attribute_name(parser);
1689
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1690
+ return emit_current_tag(parser, output);
1691
+ case '\0':
1692
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1693
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1694
+ return NEXT_CHAR;
1695
+ case -1:
1696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1697
+ abandon_current_tag(parser);
1698
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1699
+ return NEXT_CHAR;
1700
+ case '"':
1701
+ case '\'':
1702
+ case '<':
1703
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1704
+ // Fall through.
1705
+ default:
1706
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1707
+ return NEXT_CHAR;
1708
+ }
1709
+ }
1710
+
1711
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1712
+ static StateResult handle_after_attr_name_state(
1713
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1714
+ int c, GumboToken* output) {
1715
+ switch (c) {
1716
+ case '\t':
1717
+ case '\n':
1718
+ case '\f':
1719
+ case ' ':
1720
+ return NEXT_CHAR;
1721
+ case '/':
1722
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1723
+ return NEXT_CHAR;
1724
+ case '=':
1725
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1726
+ return NEXT_CHAR;
1727
+ case '>':
1728
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1729
+ return emit_current_tag(parser, output);
1730
+ case '\0':
1731
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1732
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1733
+ append_char_to_temporary_buffer(parser, 0xfffd);
1734
+ return NEXT_CHAR;
1735
+ case -1:
1736
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1737
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1738
+ abandon_current_tag(parser);
1739
+ return NEXT_CHAR;
1740
+ case '"':
1741
+ case '\'':
1742
+ case '<':
1743
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1744
+ // Fall through.
1745
+ default:
1746
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1747
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1748
+ return NEXT_CHAR;
1749
+ }
1750
+ }
1751
+
1752
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1753
+ static StateResult handle_before_attr_value_state(
1754
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1755
+ int c, GumboToken* output) {
1756
+ switch (c) {
1757
+ case '\t':
1758
+ case '\n':
1759
+ case '\f':
1760
+ case ' ':
1761
+ return NEXT_CHAR;
1762
+ case '"':
1763
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1764
+ reset_tag_buffer_start_point(parser);
1765
+ return NEXT_CHAR;
1766
+ case '&':
1767
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1768
+ tokenizer->_reconsume_current_input = true;
1769
+ return NEXT_CHAR;
1770
+ case '\'':
1771
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1772
+ reset_tag_buffer_start_point(parser);
1773
+ return NEXT_CHAR;
1774
+ case '\0':
1775
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1776
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1777
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1778
+ return NEXT_CHAR;
1779
+ case -1:
1780
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1781
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1782
+ abandon_current_tag(parser);
1783
+ tokenizer->_reconsume_current_input = true;
1784
+ return NEXT_CHAR;
1785
+ case '>':
1786
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1787
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1788
+ emit_current_tag(parser, output);
1789
+ return RETURN_ERROR;
1790
+ case '<':
1791
+ case '=':
1792
+ case '`':
1793
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1794
+ // Fall through.
1795
+ default:
1796
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1797
+ append_char_to_tag_buffer(parser, c, true);
1798
+ return NEXT_CHAR;
1799
+ }
1800
+ }
1801
+
1802
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1803
+ static StateResult handle_attr_value_double_quoted_state(
1804
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1805
+ int c, GumboToken* output) {
1806
+ switch (c) {
1807
+ case '"':
1808
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1809
+ return NEXT_CHAR;
1810
+ case '&':
1811
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1812
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1813
+ tokenizer->_reconsume_current_input = true;
1814
+ return NEXT_CHAR;
1815
+ case '\0':
1816
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1817
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1818
+ return NEXT_CHAR;
1819
+ case -1:
1820
+ add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1821
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1822
+ abandon_current_tag(parser);
1823
+ tokenizer->_reconsume_current_input = true;
1824
+ return NEXT_CHAR;
1825
+ default:
1826
+ append_char_to_tag_buffer(parser, c, false);
1827
+ return NEXT_CHAR;
1828
+ }
1829
+ }
1830
+
1831
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1832
+ static StateResult handle_attr_value_single_quoted_state(
1833
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1834
+ int c, GumboToken* output) {
1835
+ switch (c) {
1836
+ case '\'':
1837
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1838
+ return NEXT_CHAR;
1839
+ case '&':
1840
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1841
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1842
+ tokenizer->_reconsume_current_input = true;
1843
+ return NEXT_CHAR;
1844
+ case '\0':
1845
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1846
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1847
+ return NEXT_CHAR;
1848
+ case -1:
1849
+ add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1850
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1851
+ abandon_current_tag(parser);
1852
+ tokenizer->_reconsume_current_input = true;
1853
+ return NEXT_CHAR;
1854
+ default:
1855
+ append_char_to_tag_buffer(parser, c, false);
1856
+ return NEXT_CHAR;
1857
+ }
1858
+ }
1859
+
1860
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1861
+ static StateResult handle_attr_value_unquoted_state(
1862
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1863
+ int c, GumboToken* output) {
1864
+ switch (c) {
1865
+ case '\t':
1866
+ case '\n':
1867
+ case '\f':
1868
+ case ' ':
1869
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1870
+ finish_attribute_value(parser);
1871
+ return NEXT_CHAR;
1872
+ case '&':
1873
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1874
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1875
+ tokenizer->_reconsume_current_input = true;
1876
+ return NEXT_CHAR;
1877
+ case '>':
1878
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1879
+ finish_attribute_value(parser);
1880
+ return emit_current_tag(parser, output);
1881
+ case '\0':
1882
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1883
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1884
+ return NEXT_CHAR;
1885
+ case -1:
1886
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1887
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1888
+ tokenizer->_reconsume_current_input = true;
1889
+ abandon_current_tag(parser);
1890
+ return NEXT_CHAR;
1891
+ case '<':
1892
+ case '=':
1893
+ case '"':
1894
+ case '\'':
1895
+ case '`':
1896
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1897
+ // Fall through.
1898
+ default:
1899
+ append_char_to_tag_buffer(parser, c, true);
1900
+ return NEXT_CHAR;
1901
+ }
1902
+ }
1903
+
1904
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1905
+ static StateResult handle_char_ref_in_attr_value_state(
1906
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1907
+ int c, GumboToken* output) {
1908
+ OneOrTwoCodepoints char_ref;
1909
+ int allowed_char;
1910
+ bool is_unquoted = false;
1911
+ switch (tokenizer->_tag_state._attr_value_state) {
1912
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1913
+ allowed_char = '"';
1914
+ break;
1915
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1916
+ allowed_char = '\'';
1917
+ break;
1918
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1919
+ allowed_char = '>';
1920
+ is_unquoted = true;
1921
+ break;
1922
+ default:
1923
+ // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1924
+ // get that the assert(0) means this codepath will never happen.
1925
+ allowed_char = ' ';
1926
+ assert(0);
1927
+ }
1928
+
1929
+ // Ignore the status, since we don't have a convenient way of signalling that
1930
+ // a parser error has occurred when the error occurs in the middle of a
1931
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1932
+ // but that's a low priority fix.
1933
+ consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1934
+ if (char_ref.first != kGumboNoChar) {
1935
+ tokenizer->_reconsume_current_input = true;
1936
+ append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1937
+ if (char_ref.second != kGumboNoChar) {
1938
+ append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1939
+ }
1940
+ } else {
1941
+ append_char_to_tag_buffer(parser, '&', is_unquoted);
1942
+ }
1943
+ gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1944
+ return NEXT_CHAR;
1945
+ }
1946
+
1947
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1948
+ static StateResult handle_after_attr_value_quoted_state(
1949
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1950
+ int c, GumboToken* output) {
1951
+ finish_attribute_value(parser);
1952
+ switch (c) {
1953
+ case '\t':
1954
+ case '\n':
1955
+ case '\f':
1956
+ case ' ':
1957
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1958
+ return NEXT_CHAR;
1959
+ case '/':
1960
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1961
+ return NEXT_CHAR;
1962
+ case '>':
1963
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1964
+ return emit_current_tag(parser, output);
1965
+ case -1:
1966
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1967
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1968
+ abandon_current_tag(parser);
1969
+ tokenizer->_reconsume_current_input = true;
1970
+ return NEXT_CHAR;
1971
+ default:
1972
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1973
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1974
+ tokenizer->_reconsume_current_input = true;
1975
+ return NEXT_CHAR;
1976
+ }
1977
+ }
1978
+
1979
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1980
+ static StateResult handle_self_closing_start_tag_state(
1981
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1982
+ int c, GumboToken* output) {
1983
+ switch (c) {
1984
+ case '>':
1985
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1986
+ tokenizer->_tag_state._is_self_closing = true;
1987
+ return emit_current_tag(parser, output);
1988
+ case -1:
1989
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1990
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1991
+ abandon_current_tag(parser);
1992
+ return NEXT_CHAR;
1993
+ default:
1994
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1995
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1996
+ tokenizer->_reconsume_current_input = true;
1997
+ return NEXT_CHAR;
1998
+ }
1999
+ }
2000
+
2001
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
2002
+ static StateResult handle_bogus_comment_state(
2003
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2004
+ int c, GumboToken* output) {
2005
+ while (c != '>' && c != -1) {
2006
+ if (c == '\0') {
2007
+ c = 0xFFFD;
2008
+ }
2009
+ append_char_to_temporary_buffer(parser, c);
2010
+ utf8iterator_next(&tokenizer->_input);
2011
+ c = utf8iterator_current(&tokenizer->_input);
2012
+ }
2013
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2014
+ return emit_comment(parser, output);
2015
+ }
2016
+
2017
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
2018
+ static StateResult handle_markup_declaration_state(
2019
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2020
+ int c, GumboToken* output) {
2021
+ if (utf8iterator_maybe_consume_match(
2022
+ &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2024
+ tokenizer->_reconsume_current_input = true;
2025
+ } else if (utf8iterator_maybe_consume_match(
2026
+ &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2027
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2028
+ tokenizer->_reconsume_current_input = true;
2029
+ // If we get here, we know we'll eventually emit a doctype token, so now is
2030
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
2031
+ // since then they'll leak if ownership never gets transferred to the
2032
+ // doctype token.
2033
+ tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
2034
+ tokenizer->_doc_type_state.public_identifier =
2035
+ gumbo_copy_stringz(parser, "");
2036
+ tokenizer->_doc_type_state.system_identifier =
2037
+ gumbo_copy_stringz(parser, "");
2038
+ } else if (tokenizer->_is_current_node_foreign &&
2039
+ utf8iterator_maybe_consume_match(
2040
+ &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2041
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2042
+ tokenizer->_reconsume_current_input = true;
2043
+ } else {
2044
+ add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2045
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2046
+ tokenizer->_reconsume_current_input = true;
2047
+ clear_temporary_buffer(parser);
2048
+ }
2049
+ return NEXT_CHAR;
2050
+ }
2051
+
2052
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2053
+ static StateResult handle_comment_start_state(
2054
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2055
+ int c, GumboToken* output) {
2056
+ switch (c) {
2057
+ case '-':
2058
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2059
+ return NEXT_CHAR;
2060
+ case '\0':
2061
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2062
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2063
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2064
+ return NEXT_CHAR;
2065
+ case '>':
2066
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2067
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2068
+ emit_comment(parser, output);
2069
+ return RETURN_ERROR;
2070
+ case -1:
2071
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2072
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2073
+ emit_comment(parser, output);
2074
+ return RETURN_ERROR;
2075
+ default:
2076
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2077
+ append_char_to_temporary_buffer(parser, c);
2078
+ return NEXT_CHAR;
2079
+ }
2080
+ }
2081
+
2082
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2083
+ static StateResult handle_comment_start_dash_state(
2084
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2085
+ int c, GumboToken* output) {
2086
+ switch (c) {
2087
+ case '-':
2088
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2089
+ return NEXT_CHAR;
2090
+ case '\0':
2091
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2092
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2093
+ append_char_to_temporary_buffer(parser, '-');
2094
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2095
+ return NEXT_CHAR;
2096
+ case '>':
2097
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2098
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2099
+ emit_comment(parser, output);
2100
+ return RETURN_ERROR;
2101
+ case -1:
2102
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2103
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2104
+ emit_comment(parser, output);
2105
+ return RETURN_ERROR;
2106
+ default:
2107
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2108
+ append_char_to_temporary_buffer(parser, '-');
2109
+ append_char_to_temporary_buffer(parser, c);
2110
+ return NEXT_CHAR;
2111
+ }
2112
+ }
2113
+
2114
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2115
+ static StateResult handle_comment_state(
2116
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2117
+ int c, GumboToken* output) {
2118
+ switch (c) {
2119
+ case '-':
2120
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2121
+ return NEXT_CHAR;
2122
+ case '\0':
2123
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2124
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2125
+ return NEXT_CHAR;
2126
+ case -1:
2127
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2128
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2129
+ emit_comment(parser, output);
2130
+ return RETURN_ERROR;
2131
+ default:
2132
+ append_char_to_temporary_buffer(parser, c);
2133
+ return NEXT_CHAR;
2134
+ }
2135
+ }
2136
+
2137
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2138
+ static StateResult handle_comment_end_dash_state(
2139
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2140
+ int c, GumboToken* output) {
2141
+ switch (c) {
2142
+ case '-':
2143
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2144
+ return NEXT_CHAR;
2145
+ case '\0':
2146
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2147
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2148
+ append_char_to_temporary_buffer(parser, '-');
2149
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2150
+ return NEXT_CHAR;
2151
+ case -1:
2152
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2153
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2154
+ emit_comment(parser, output);
2155
+ return RETURN_ERROR;
2156
+ default:
2157
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2158
+ append_char_to_temporary_buffer(parser, '-');
2159
+ append_char_to_temporary_buffer(parser, c);
2160
+ return NEXT_CHAR;
2161
+ }
2162
+ }
2163
+
2164
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2165
+ static StateResult handle_comment_end_state(
2166
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2167
+ int c, GumboToken* output) {
2168
+ switch (c) {
2169
+ case '>':
2170
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2171
+ return emit_comment(parser, output);
2172
+ case '\0':
2173
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2174
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2175
+ append_char_to_temporary_buffer(parser, '-');
2176
+ append_char_to_temporary_buffer(parser, '-');
2177
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2178
+ return NEXT_CHAR;
2179
+ case '!':
2180
+ add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2181
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2182
+ return NEXT_CHAR;
2183
+ case '-':
2184
+ add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2185
+ append_char_to_temporary_buffer(parser, '-');
2186
+ return NEXT_CHAR;
2187
+ case -1:
2188
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2189
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2190
+ emit_comment(parser, output);
2191
+ return RETURN_ERROR;
2192
+ default:
2193
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2194
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2195
+ append_char_to_temporary_buffer(parser, '-');
2196
+ append_char_to_temporary_buffer(parser, '-');
2197
+ append_char_to_temporary_buffer(parser, c);
2198
+ return NEXT_CHAR;
2199
+ }
2200
+ }
2201
+
2202
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2203
+ static StateResult handle_comment_end_bang_state(
2204
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2205
+ int c, GumboToken* output) {
2206
+ switch (c) {
2207
+ case '-':
2208
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2209
+ append_char_to_temporary_buffer(parser, '-');
2210
+ append_char_to_temporary_buffer(parser, '-');
2211
+ append_char_to_temporary_buffer(parser, '!');
2212
+ return NEXT_CHAR;
2213
+ case '>':
2214
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2215
+ return emit_comment(parser, output);
2216
+ case '\0':
2217
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2218
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2219
+ append_char_to_temporary_buffer(parser, '-');
2220
+ append_char_to_temporary_buffer(parser, '-');
2221
+ append_char_to_temporary_buffer(parser, '!');
2222
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2223
+ return NEXT_CHAR;
2224
+ case -1:
2225
+ add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2226
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2227
+ emit_comment(parser, output);
2228
+ return RETURN_ERROR;
2229
+ default:
2230
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2231
+ append_char_to_temporary_buffer(parser, '-');
2232
+ append_char_to_temporary_buffer(parser, '-');
2233
+ append_char_to_temporary_buffer(parser, '!');
2234
+ append_char_to_temporary_buffer(parser, c);
2235
+ return NEXT_CHAR;
2236
+ }
2237
+ }
2238
+
2239
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2240
+ static StateResult handle_doctype_state(
2241
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2242
+ int c, GumboToken* output) {
2243
+ assert(!tokenizer->_temporary_buffer.length);
2244
+ switch (c) {
2245
+ case '\t':
2246
+ case '\n':
2247
+ case '\f':
2248
+ case ' ':
2249
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2250
+ return NEXT_CHAR;
2251
+ case -1:
2252
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2253
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2254
+ tokenizer->_doc_type_state.force_quirks = true;
2255
+ emit_doctype(parser, output);
2256
+ return RETURN_ERROR;
2257
+ default:
2258
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2259
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2260
+ tokenizer->_reconsume_current_input = true;
2261
+ tokenizer->_doc_type_state.force_quirks = true;
2262
+ return NEXT_CHAR;
2263
+ }
2264
+ }
2265
+
2266
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2267
+ static StateResult handle_before_doctype_name_state(
2268
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2269
+ int c, GumboToken* output) {
2270
+ switch (c) {
2271
+ case '\t':
2272
+ case '\n':
2273
+ case '\f':
2274
+ case ' ':
2275
+ return NEXT_CHAR;
2276
+ case '\0':
2277
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2279
+ tokenizer->_doc_type_state.force_quirks = true;
2280
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2281
+ return NEXT_CHAR;
2282
+ case '>':
2283
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2284
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2285
+ tokenizer->_doc_type_state.force_quirks = true;
2286
+ emit_doctype(parser, output);
2287
+ return RETURN_ERROR;
2288
+ case -1:
2289
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2290
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2291
+ tokenizer->_doc_type_state.force_quirks = true;
2292
+ emit_doctype(parser, output);
2293
+ return RETURN_ERROR;
2294
+ default:
2295
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2296
+ tokenizer->_doc_type_state.force_quirks = false;
2297
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2298
+ return NEXT_CHAR;
2299
+ }
2300
+ }
2301
+
2302
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2303
+ static StateResult handle_doctype_name_state(
2304
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2305
+ int c, GumboToken* output) {
2306
+ switch (c) {
2307
+ case '\t':
2308
+ case '\n':
2309
+ case '\f':
2310
+ case ' ':
2311
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2312
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2313
+ finish_temporary_buffer(
2314
+ parser, &tokenizer->_doc_type_state.name);
2315
+ return NEXT_CHAR;
2316
+ case '>':
2317
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2318
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2319
+ finish_temporary_buffer(
2320
+ parser, &tokenizer->_doc_type_state.name);
2321
+ emit_doctype(parser, output);
2322
+ return RETURN_SUCCESS;
2323
+ case '\0':
2324
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2325
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2326
+ return NEXT_CHAR;
2327
+ case -1:
2328
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2329
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2330
+ tokenizer->_doc_type_state.force_quirks = true;
2331
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2332
+ finish_temporary_buffer(
2333
+ parser, &tokenizer->_doc_type_state.name);
2334
+ emit_doctype(parser, output);
2335
+ return RETURN_ERROR;
2336
+ default:
2337
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2338
+ tokenizer->_doc_type_state.force_quirks = false;
2339
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2340
+ return NEXT_CHAR;
2341
+ }
2342
+ }
2343
+
2344
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2345
+ static StateResult handle_after_doctype_name_state(
2346
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2347
+ int c, GumboToken* output) {
2348
+ switch (c) {
2349
+ case '\t':
2350
+ case '\n':
2351
+ case '\f':
2352
+ case ' ':
2353
+ return NEXT_CHAR;
2354
+ case '>':
2355
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
+ emit_doctype(parser, output);
2357
+ return RETURN_SUCCESS;
2358
+ case -1:
2359
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2360
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2361
+ tokenizer->_doc_type_state.force_quirks = true;
2362
+ emit_doctype(parser, output);
2363
+ return RETURN_ERROR;
2364
+ default:
2365
+ if (utf8iterator_maybe_consume_match(
2366
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2367
+ gumbo_tokenizer_set_state(
2368
+ parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2369
+ tokenizer->_reconsume_current_input = true;
2370
+ } else if (utf8iterator_maybe_consume_match(
2371
+ &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
2372
+ gumbo_tokenizer_set_state(
2373
+ parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2374
+ tokenizer->_reconsume_current_input = true;
2375
+ } else {
2376
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2377
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2378
+ tokenizer->_doc_type_state.force_quirks = true;
2379
+ }
2380
+ return NEXT_CHAR;
2381
+ }
2382
+ }
2383
+
2384
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2385
+ static StateResult handle_after_doctype_public_keyword_state(
2386
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2387
+ int c, GumboToken* output) {
2388
+ switch (c) {
2389
+ case '\t':
2390
+ case '\n':
2391
+ case '\f':
2392
+ case ' ':
2393
+ gumbo_tokenizer_set_state(
2394
+ parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2395
+ return NEXT_CHAR;
2396
+ case '"':
2397
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2398
+ assert(temporary_buffer_equals(parser, ""));
2399
+ gumbo_tokenizer_set_state(
2400
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2401
+ return NEXT_CHAR;
2402
+ case '\'':
2403
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2404
+ assert(temporary_buffer_equals(parser, ""));
2405
+ gumbo_tokenizer_set_state(
2406
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2407
+ return NEXT_CHAR;
2408
+ case '>':
2409
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2410
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2411
+ tokenizer->_doc_type_state.force_quirks = true;
2412
+ emit_doctype(parser, output);
2413
+ return RETURN_ERROR;
2414
+ case -1:
2415
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2416
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2417
+ tokenizer->_doc_type_state.force_quirks = true;
2418
+ emit_doctype(parser, output);
2419
+ return RETURN_ERROR;
2420
+ default:
2421
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2422
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2423
+ tokenizer->_doc_type_state.force_quirks = true;
2424
+ emit_doctype(parser, output);
2425
+ return RETURN_ERROR;
2426
+ }
2427
+ }
2428
+
2429
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2430
+ static StateResult handle_before_doctype_public_id_state(
2431
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2432
+ int c, GumboToken* output) {
2433
+ switch (c) {
2434
+ case '\t':
2435
+ case '\n':
2436
+ case '\f':
2437
+ case ' ':
2438
+ return NEXT_CHAR;
2439
+ case '"':
2440
+ assert(temporary_buffer_equals(parser, ""));
2441
+ gumbo_tokenizer_set_state(
2442
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2443
+ return NEXT_CHAR;
2444
+ case '\'':
2445
+ assert(temporary_buffer_equals(parser, ""));
2446
+ gumbo_tokenizer_set_state(
2447
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2448
+ return NEXT_CHAR;
2449
+ case '>':
2450
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2451
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2452
+ tokenizer->_doc_type_state.force_quirks = true;
2453
+ emit_doctype(parser, output);
2454
+ return RETURN_ERROR;
2455
+ case -1:
2456
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2457
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2458
+ tokenizer->_doc_type_state.force_quirks = true;
2459
+ emit_doctype(parser, output);
2460
+ return RETURN_ERROR;
2461
+ default:
2462
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2463
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2464
+ tokenizer->_doc_type_state.force_quirks = true;
2465
+ emit_doctype(parser, output);
2466
+ return RETURN_ERROR;
2467
+ }
2468
+ }
2469
+
2470
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2471
+ static StateResult handle_doctype_public_id_double_quoted_state(
2472
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2473
+ int c, GumboToken* output) {
2474
+ switch (c) {
2475
+ case '"':
2476
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2477
+ finish_doctype_public_id(parser);
2478
+ return NEXT_CHAR;
2479
+ case '\0':
2480
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2481
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2482
+ return NEXT_CHAR;
2483
+ case '>':
2484
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2485
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2486
+ tokenizer->_doc_type_state.force_quirks = true;
2487
+ finish_doctype_public_id(parser);
2488
+ emit_doctype(parser, output);
2489
+ return RETURN_ERROR;
2490
+ case -1:
2491
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2492
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
+ tokenizer->_doc_type_state.force_quirks = true;
2494
+ finish_doctype_public_id(parser);
2495
+ emit_doctype(parser, output);
2496
+ return RETURN_ERROR;
2497
+ default:
2498
+ append_char_to_temporary_buffer(parser, c);
2499
+ return NEXT_CHAR;
2500
+ }
2501
+ }
2502
+
2503
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2504
+ static StateResult handle_doctype_public_id_single_quoted_state(
2505
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2506
+ int c, GumboToken* output) {
2507
+ switch (c) {
2508
+ case '\'':
2509
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2510
+ finish_doctype_public_id(parser);
2511
+ return NEXT_CHAR;
2512
+ case '\0':
2513
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2514
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2515
+ return NEXT_CHAR;
2516
+ case '>':
2517
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2518
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2519
+ tokenizer->_doc_type_state.force_quirks = true;
2520
+ finish_doctype_public_id(parser);
2521
+ emit_doctype(parser, output);
2522
+ return RETURN_ERROR;
2523
+ case -1:
2524
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2525
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2526
+ tokenizer->_doc_type_state.force_quirks = true;
2527
+ finish_doctype_public_id(parser);
2528
+ emit_doctype(parser, output);
2529
+ return RETURN_ERROR;
2530
+ default:
2531
+ append_char_to_temporary_buffer(parser, c);
2532
+ return NEXT_CHAR;
2533
+ }
2534
+ }
2535
+
2536
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2537
+ static StateResult handle_after_doctype_public_id_state(
2538
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2539
+ int c, GumboToken* output) {
2540
+ switch (c) {
2541
+ case '\t':
2542
+ case '\n':
2543
+ case '\f':
2544
+ case ' ':
2545
+ gumbo_tokenizer_set_state(
2546
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2547
+ return NEXT_CHAR;
2548
+ case '>':
2549
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2550
+ emit_doctype(parser, output);
2551
+ return RETURN_SUCCESS;
2552
+ case '"':
2553
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2554
+ assert(temporary_buffer_equals(parser, ""));
2555
+ gumbo_tokenizer_set_state(
2556
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2557
+ return NEXT_CHAR;
2558
+ case '\'':
2559
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560
+ assert(temporary_buffer_equals(parser, ""));
2561
+ gumbo_tokenizer_set_state(
2562
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2563
+ return NEXT_CHAR;
2564
+ case -1:
2565
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2566
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2567
+ tokenizer->_reconsume_current_input = true;
2568
+ tokenizer->_doc_type_state.force_quirks = true;
2569
+ return NEXT_CHAR;
2570
+ default:
2571
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2572
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2573
+ tokenizer->_doc_type_state.force_quirks = true;
2574
+ return NEXT_CHAR;
2575
+ }
2576
+ }
2577
+
2578
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2579
+ static StateResult handle_between_doctype_public_system_id_state(
2580
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2581
+ int c, GumboToken* output) {
2582
+ switch (c) {
2583
+ case '\t':
2584
+ case '\n':
2585
+ case '\f':
2586
+ case ' ':
2587
+ return NEXT_CHAR;
2588
+ case '>':
2589
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2590
+ emit_doctype(parser, output);
2591
+ return RETURN_SUCCESS;
2592
+ case '"':
2593
+ assert(temporary_buffer_equals(parser, ""));
2594
+ gumbo_tokenizer_set_state(
2595
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2596
+ return NEXT_CHAR;
2597
+ case '\'':
2598
+ assert(temporary_buffer_equals(parser, ""));
2599
+ gumbo_tokenizer_set_state(
2600
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2601
+ return NEXT_CHAR;
2602
+ case -1:
2603
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2604
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2605
+ tokenizer->_doc_type_state.force_quirks = true;
2606
+ emit_doctype(parser, output);
2607
+ return RETURN_ERROR;
2608
+ default:
2609
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2610
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2611
+ tokenizer->_doc_type_state.force_quirks = true;
2612
+ emit_doctype(parser, output);
2613
+ return RETURN_ERROR;
2614
+ }
2615
+ }
2616
+
2617
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2618
+ static StateResult handle_after_doctype_system_keyword_state(
2619
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2620
+ int c, GumboToken* output) {
2621
+ switch (c) {
2622
+ case '\t':
2623
+ case '\n':
2624
+ case '\f':
2625
+ case ' ':
2626
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2627
+ return NEXT_CHAR;
2628
+ case '"':
2629
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2630
+ assert(temporary_buffer_equals(parser, ""));
2631
+ gumbo_tokenizer_set_state(
2632
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2633
+ return NEXT_CHAR;
2634
+ case '\'':
2635
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2636
+ assert(temporary_buffer_equals(parser, ""));
2637
+ gumbo_tokenizer_set_state(
2638
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2639
+ return NEXT_CHAR;
2640
+ case '>':
2641
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2642
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2643
+ tokenizer->_doc_type_state.force_quirks = true;
2644
+ emit_doctype(parser, output);
2645
+ return RETURN_ERROR;
2646
+ case -1:
2647
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2649
+ tokenizer->_doc_type_state.force_quirks = true;
2650
+ emit_doctype(parser, output);
2651
+ return RETURN_ERROR;
2652
+ default:
2653
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2654
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2655
+ tokenizer->_doc_type_state.force_quirks = true;
2656
+ return NEXT_CHAR;
2657
+ }
2658
+ }
2659
+
2660
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2661
+ static StateResult handle_before_doctype_system_id_state(
2662
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2663
+ int c, GumboToken* output) {
2664
+ switch (c) {
2665
+ case '\t':
2666
+ case '\n':
2667
+ case '\f':
2668
+ case ' ':
2669
+ return NEXT_CHAR;
2670
+ case '"':
2671
+ assert(temporary_buffer_equals(parser, ""));
2672
+ gumbo_tokenizer_set_state(
2673
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2674
+ return NEXT_CHAR;
2675
+ case '\'':
2676
+ assert(temporary_buffer_equals(parser, ""));
2677
+ gumbo_tokenizer_set_state(
2678
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2679
+ return NEXT_CHAR;
2680
+ case '>':
2681
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2682
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2683
+ tokenizer->_doc_type_state.force_quirks = true;
2684
+ emit_doctype(parser, output);
2685
+ return RETURN_ERROR;
2686
+ case -1:
2687
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2688
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
+ tokenizer->_doc_type_state.force_quirks = true;
2690
+ emit_doctype(parser, output);
2691
+ return RETURN_ERROR;
2692
+ default:
2693
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2694
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2695
+ tokenizer->_doc_type_state.force_quirks = true;
2696
+ return NEXT_CHAR;
2697
+ }
2698
+ }
2699
+
2700
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2701
+ static StateResult handle_doctype_system_id_double_quoted_state(
2702
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2703
+ int c, GumboToken* output) {
2704
+ switch (c) {
2705
+ case '"':
2706
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2707
+ finish_doctype_system_id(parser);
2708
+ return NEXT_CHAR;
2709
+ case '\0':
2710
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2711
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2712
+ return NEXT_CHAR;
2713
+ case '>':
2714
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2715
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2716
+ tokenizer->_doc_type_state.force_quirks = true;
2717
+ finish_doctype_system_id(parser);
2718
+ emit_doctype(parser, output);
2719
+ return RETURN_ERROR;
2720
+ case -1:
2721
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2722
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2723
+ tokenizer->_doc_type_state.force_quirks = true;
2724
+ finish_doctype_system_id(parser);
2725
+ emit_doctype(parser, output);
2726
+ return RETURN_ERROR;
2727
+ default:
2728
+ append_char_to_temporary_buffer(parser, c);
2729
+ return NEXT_CHAR;
2730
+ }
2731
+ }
2732
+
2733
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2734
+ static StateResult handle_doctype_system_id_single_quoted_state(
2735
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2736
+ int c, GumboToken* output) {
2737
+ switch (c) {
2738
+ case '\'':
2739
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2740
+ finish_doctype_system_id(parser);
2741
+ return NEXT_CHAR;
2742
+ case '\0':
2743
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2744
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2745
+ return NEXT_CHAR;
2746
+ case '>':
2747
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2748
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2749
+ tokenizer->_doc_type_state.force_quirks = true;
2750
+ finish_doctype_system_id(parser);
2751
+ emit_doctype(parser, output);
2752
+ return RETURN_ERROR;
2753
+ case -1:
2754
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2755
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2756
+ tokenizer->_doc_type_state.force_quirks = true;
2757
+ finish_doctype_system_id(parser);
2758
+ emit_doctype(parser, output);
2759
+ return RETURN_ERROR;
2760
+ default:
2761
+ append_char_to_temporary_buffer(parser, c);
2762
+ return NEXT_CHAR;
2763
+ }
2764
+ }
2765
+
2766
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2767
+ static StateResult handle_after_doctype_system_id_state(
2768
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2769
+ int c, GumboToken* output) {
2770
+ switch (c) {
2771
+ case '\t':
2772
+ case '\n':
2773
+ case '\f':
2774
+ case ' ':
2775
+ return NEXT_CHAR;
2776
+ case '>':
2777
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2778
+ emit_doctype(parser, output);
2779
+ return RETURN_SUCCESS;
2780
+ case -1:
2781
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2782
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2783
+ tokenizer->_doc_type_state.force_quirks = true;
2784
+ emit_doctype(parser, output);
2785
+ return RETURN_ERROR;
2786
+ default:
2787
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2788
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2789
+ return NEXT_CHAR;
2790
+ }
2791
+ }
2792
+
2793
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2794
+ static StateResult handle_bogus_doctype_state(
2795
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2796
+ int c, GumboToken* output) {
2797
+ if (c == '>' || c == -1) {
2798
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2799
+ emit_doctype(parser, output);
2800
+ return RETURN_ERROR;
2801
+ }
2802
+ return NEXT_CHAR;
2803
+ }
2804
+
2805
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2806
+ static StateResult handle_cdata_state(
2807
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2808
+ int c, GumboToken* output) {
2809
+ if (c == -1 || utf8iterator_maybe_consume_match(
2810
+ &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2811
+ tokenizer->_reconsume_current_input = true;
2812
+ reset_token_start_point(tokenizer);
2813
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2814
+ return NEXT_CHAR;
2815
+ } else {
2816
+ return emit_current_char(parser, output);
2817
+ }
2818
+ }
2819
+
2820
+ typedef StateResult (*GumboLexerStateFunction)(
2821
+ GumboParser*, GumboTokenizerState*, int, GumboToken*);
2822
+
2823
+ static GumboLexerStateFunction dispatch_table[] = {
2824
+ handle_data_state,
2825
+ handle_char_ref_in_data_state,
2826
+ handle_rcdata_state,
2827
+ handle_char_ref_in_rcdata_state,
2828
+ handle_rawtext_state,
2829
+ handle_script_state,
2830
+ handle_plaintext_state,
2831
+ handle_tag_open_state,
2832
+ handle_end_tag_open_state,
2833
+ handle_tag_name_state,
2834
+ handle_rcdata_lt_state,
2835
+ handle_rcdata_end_tag_open_state,
2836
+ handle_rcdata_end_tag_name_state,
2837
+ handle_rawtext_lt_state,
2838
+ handle_rawtext_end_tag_open_state,
2839
+ handle_rawtext_end_tag_name_state,
2840
+ handle_script_lt_state,
2841
+ handle_script_end_tag_open_state,
2842
+ handle_script_end_tag_name_state,
2843
+ handle_script_escaped_start_state,
2844
+ handle_script_escaped_start_dash_state,
2845
+ handle_script_escaped_state,
2846
+ handle_script_escaped_dash_state,
2847
+ handle_script_escaped_dash_dash_state,
2848
+ handle_script_escaped_lt_state,
2849
+ handle_script_escaped_end_tag_open_state,
2850
+ handle_script_escaped_end_tag_name_state,
2851
+ handle_script_double_escaped_start_state,
2852
+ handle_script_double_escaped_state,
2853
+ handle_script_double_escaped_dash_state,
2854
+ handle_script_double_escaped_dash_dash_state,
2855
+ handle_script_double_escaped_lt_state,
2856
+ handle_script_double_escaped_end_state,
2857
+ handle_before_attr_name_state,
2858
+ handle_attr_name_state,
2859
+ handle_after_attr_name_state,
2860
+ handle_before_attr_value_state,
2861
+ handle_attr_value_double_quoted_state,
2862
+ handle_attr_value_single_quoted_state,
2863
+ handle_attr_value_unquoted_state,
2864
+ handle_char_ref_in_attr_value_state,
2865
+ handle_after_attr_value_quoted_state,
2866
+ handle_self_closing_start_tag_state,
2867
+ handle_bogus_comment_state,
2868
+ handle_markup_declaration_state,
2869
+ handle_comment_start_state,
2870
+ handle_comment_start_dash_state,
2871
+ handle_comment_state,
2872
+ handle_comment_end_dash_state,
2873
+ handle_comment_end_state,
2874
+ handle_comment_end_bang_state,
2875
+ handle_doctype_state,
2876
+ handle_before_doctype_name_state,
2877
+ handle_doctype_name_state,
2878
+ handle_after_doctype_name_state,
2879
+ handle_after_doctype_public_keyword_state,
2880
+ handle_before_doctype_public_id_state,
2881
+ handle_doctype_public_id_double_quoted_state,
2882
+ handle_doctype_public_id_single_quoted_state,
2883
+ handle_after_doctype_public_id_state,
2884
+ handle_between_doctype_public_system_id_state,
2885
+ handle_after_doctype_system_keyword_state,
2886
+ handle_before_doctype_system_id_state,
2887
+ handle_doctype_system_id_double_quoted_state,
2888
+ handle_doctype_system_id_single_quoted_state,
2889
+ handle_after_doctype_system_id_state,
2890
+ handle_bogus_doctype_state,
2891
+ handle_cdata_state
2892
+ };
2893
+
2894
+ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2895
+ // Because of the spec requirements that...
2896
+ //
2897
+ // 1. Tokens be handled immediately by the parser upon emission.
2898
+ // 2. Some states (eg. CDATA, or various error conditions) require the
2899
+ // emission of multiple tokens in the same states.
2900
+ // 3. The tokenizer often has to reconsume the same character in a different
2901
+ // state.
2902
+ //
2903
+ // ...all state must be held in the GumboTokenizer struct instead of in local
2904
+ // variables in this function. That allows us to return from this method with
2905
+ // a token, and then immediately jump back to the same state with the same
2906
+ // input if we need to return a different token. The various emit_* functions
2907
+ // are responsible for changing state (eg. flushing the chardata buffer,
2908
+ // reading the next input character) to avoid an infinite loop.
2909
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2910
+
2911
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2912
+ tokenizer->_reconsume_current_input = true;
2913
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
2914
+ // And now that we've avoided advancing the input, make sure we set
2915
+ // _reconsume_current_input back to false to make sure the *next* character
2916
+ // isn't consumed twice.
2917
+ tokenizer->_reconsume_current_input = false;
2918
+ tokenizer->_buffered_emit_char = kGumboNoChar;
2919
+ return true;
2920
+ }
2921
+
2922
+ if (maybe_emit_from_temporary_buffer(parser, output)) {
2923
+ return true;
2924
+ }
2925
+
2926
+ while (1) {
2927
+ assert(!tokenizer->_temporary_buffer_emit);
2928
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2929
+ int c = utf8iterator_current(&tokenizer->_input);
2930
+ gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2931
+ StateResult result =
2932
+ dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2933
+ // We need to clear reconsume_current_input before returning to prevent
2934
+ // certain infinite loop states.
2935
+ bool should_advance = !tokenizer->_reconsume_current_input;
2936
+ tokenizer->_reconsume_current_input = false;
2937
+
2938
+ if (result == RETURN_SUCCESS) {
2939
+ return true;
2940
+ } else if(result == RETURN_ERROR) {
2941
+ return false;
2942
+ }
2943
+
2944
+ if (should_advance) {
2945
+ utf8iterator_next(&tokenizer->_input);
2946
+ }
2947
+ }
2948
+ }
2949
+
2950
+ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2951
+ if (!token) return;
2952
+
2953
+ switch (token->type) {
2954
+ case GUMBO_TOKEN_DOCTYPE:
2955
+ gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2956
+ gumbo_parser_deallocate(
2957
+ parser, (void*) token->v.doc_type.public_identifier);
2958
+ gumbo_parser_deallocate(
2959
+ parser, (void*) token->v.doc_type.system_identifier);
2960
+ return;
2961
+ case GUMBO_TOKEN_START_TAG:
2962
+ for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2963
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2964
+ if (attr) {
2965
+ // May have been nulled out if this token was merged with another.
2966
+ gumbo_destroy_attribute(parser, attr);
2967
+ }
2968
+ }
2969
+ gumbo_parser_deallocate(
2970
+ parser, (void*) token->v.start_tag.attributes.data);
2971
+ return;
2972
+ case GUMBO_TOKEN_COMMENT:
2973
+ gumbo_parser_deallocate(parser, (void*) token->v.text);
2974
+ return;
2975
+ default:
2976
+ return;
2977
+ }
2978
+ }