nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2897 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Coding conventions specific to this file:
18
+ //
19
+ // 1. Functions that fill in a token should be named emit_*, and should be
20
+ // followed immediately by a return from the tokenizer (true if no error
21
+ // occurred, false if an error occurred). Sometimes the emit functions
22
+ // themselves return a boolean so that they can be combined with the return
23
+ // statement; in this case, they should match this convention.
24
+ // 2. Functions that shuffle data from temporaries to final API structures
25
+ // should be named finish_*, and be called just before the tokenizer exits the
26
+ // state that accumulates the temporary.
27
+ // 3. All internal data structures should be kept in an initialized state from
28
+ // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ // and reset, it should be deallocated and immediately reinitialized.
30
+ // 4. Make sure there are appropriate break statements following each state.
31
+ // 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ // good idea, and should go at the entry point of each state when added.
33
+ // 6. Statement order within states goes:
34
+ // 1. Add parse errors, if appropriate.
35
+ // 2. Call finish_* functions to build up tag state.
36
+ // 3. Switch to new state. Set _reconsume flag if appropriate.
37
+ // 4. Perform any other temporary buffer manipulation.
38
+ // 5. Emit tokens.
39
+ // 6. Return/break.
40
+ // This order ensures that we can verify that every emit is followed by a
41
+ // return, ensures that the correct state is recorded with any parse errors, and
42
+ // prevents parse error position from being messed up by possible mark/resets in
43
+ // temporary buffer manipulation.
44
+
45
+ #include "tokenizer.h"
46
+
47
+ #include <assert.h>
48
+ #include <stdbool.h>
49
+ #include <string.h>
50
+
51
+ #include "attribute.h"
52
+ #include "char_ref.h"
53
+ #include "error.h"
54
+ #include "gumbo.h"
55
+ #include "parser.h"
56
+ #include "string_buffer.h"
57
+ #include "string_piece.h"
58
+ #include "token_type.h"
59
+ #include "tokenizer_states.h"
60
+ #include "utf8.h"
61
+ #include "util.h"
62
+ #include "vector.h"
63
+
64
+ // Compared against _script_data_buffer to determine if we're in double-escaped
65
+ // script mode.
66
+ const GumboStringPiece kScriptTag = {"script", 6};
67
+
68
// Outcome reported by each individual lexer state.
typedef enum {
  // Return false (error) from the tokenizer.
  RETURN_ERROR = 0,
  // Return true (success) from the tokenizer.
  RETURN_SUCCESS = 1,
  // Proceed to the next character and continue lexing.
  NEXT_CHAR = 2
} StateResult;
74
+
75
+ // This is a struct containing state necessary to build up a tag token,
76
+ // character by character.
77
+ typedef struct GumboInternalTagState {
78
+ // A buffer to accumulate characters for various GumboStringPiece fields.
79
+ GumboStringBuffer _buffer;
80
+
81
+ // A pointer to the start of the original text corresponding to the contents
82
+ // of the buffer.
83
+ const char* _original_text;
84
+
85
+ // The current tag enum, computed once the tag name state has finished so that
86
+ // the buffer can be re-used for building up attributes.
87
+ GumboTag _tag;
88
+
89
+ // The starting location of the text in the buffer.
90
+ GumboSourcePosition _start_pos;
91
+
92
+ // The current list of attributes. This is copied (and ownership of its data
93
+ // transferred) to the GumboStartTag token upon completion of the tag. New
94
+ // attributes are added as soon as their attribute name state is complete, and
95
+ // values are filled in on _attributes.data[_attributes.length - 1].
96
+ GumboVector /* GumboAttribute */ _attributes;
97
+
98
+ // If true, the next attribute value to be finished should be dropped. This
99
+ // happens if a duplicate attribute name is encountered - we want to consume
100
+ // the attribute value, but shouldn't overwrite the existing value.
101
+ bool _drop_next_attr_value;
102
+
103
+ // The state that caused the tokenizer to switch into a character reference in
104
+ // attribute value state. This is used to set the additional allowed
105
+ // character, and is switched back to on completion. Initialized as the
106
+ // tokenizer enters the character reference state.
107
+ GumboTokenizerEnum _attr_value_state;
108
+
109
+ // The last start tag emitted by the tokenizer (recorded by emit_current_tag).
110
+ // This is necessary to check for appropriate end tags.
111
+ GumboTag _last_start_tag;
112
+
113
+ // If true, then this is a start tag. If false, it's an end tag. This is
114
+ // necessary to generate the appropriate token type at tag-closing time.
115
+ bool _is_start_tag;
116
+
117
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
118
+ bool _is_self_closing;
119
+ } GumboTagState;
120
+
121
+ // This is the main tokenizer state struct, containing all state used in
122
+ // tokenizing the input stream.
123
+ typedef struct GumboInternalTokenizerState {
124
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
125
+ GumboTokenizerEnum _state;
126
+
127
+ // A flag indicating whether the current input character needs to be reconsumed
128
+ // in another state, or whether the next input character should be read for
129
+ // the next iteration of the state loop. This is set when the spec reads
130
+ // "Reconsume the current input character in..."
131
+ bool _reconsume_current_input;
132
+
133
+ // A flag indicating whether the current node is a foreign element. This is
134
+ // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
135
+ // markup declaration state.
136
+ bool _is_current_node_foreign;
137
+
138
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
140
+ bool _is_in_cdata;
141
+
142
+ // Certain states (notably character references) may emit two character tokens
143
+ // at once, but the contract for lex() fills in only one token at a time. The
144
+ // extra character is buffered here, and then this is checked on entry to
145
+ // lex(). If a character is stored here, it's immediately emitted and control
146
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
147
+ // stored.'
148
+ //
149
+ // Note that characters emitted through this mechanism will have their source
150
+ // position marked as the character under the mark, i.e. multiple characters
151
+ // may be emitted with the same position. This is desirable for character
152
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
153
+ // mechanism if the buffered characters must have their original positions in
154
+ // the document.
155
+ int _buffered_emit_char;
156
+
157
+ // A temporary buffer to accumulate characters, as described by the "temporary
158
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
159
+ // way: we record the specific character to go into the buffer, which may
160
+ // sometimes be a lowercased version of the actual input character. However,
161
+ // we *also* use utf8iterator_mark() to record the position at tag start.
162
+ // When we start flushing the temporary buffer, we set _temporary_buffer_emit
163
+ // to the start of it, and then increment it for each call to the tokenizer.
164
+ // We also call utf8iterator_reset(), and utf8iterator_next() through the
165
+ // input stream, so that tokens emitted by emit_char have the correct position
166
+ // and original text.
167
+ GumboStringBuffer _temporary_buffer;
168
+
169
+ // The current cursor position we're emitting from within
170
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
171
+ const char* _temporary_buffer_emit;
172
+
173
+ // The temporary buffer is also used by the spec to check whether we should
174
+ // enter the script data double escaped state, but we can't use the same
175
+ // buffer for both because we have to flush out "<s" as emits while still
176
+ // maintaining the context that will eventually become "script". This is a
177
+ // separate buffer that's used in place of the temporary buffer for states
178
+ // that may enter the script data double escape start state.
179
+ GumboStringBuffer _script_data_buffer;
180
+
181
+ // Pointer to the beginning of the current token in the original buffer; used
182
+ // to record the original text.
183
+ const char* _token_start;
184
+
185
+ // GumboSourcePosition recording the source location of the start of the
186
+ // current token.
187
+ GumboSourcePosition _token_start_pos;
188
+
189
+ // Current tag state.
190
+ GumboTagState _tag_state;
191
+
192
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
193
+ // not used for anything else in the doctype states), and then freshly
194
+ // allocate the strings in the doctype token, then copy it over on emit.
195
+ GumboTokenDocType _doc_type_state;
196
+
197
+ // The UTF8Iterator over the tokenizer input.
198
+ Utf8Iterator _input;
199
+ } GumboTokenizerState;
200
+
201
+ // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
202
+ static void tokenizer_add_parse_error(
203
+ GumboParser* parser, GumboErrorType type) {
204
+ GumboError* error = gumbo_add_error(parser);
205
+ if (!error) {
206
+ return;
207
+ }
208
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209
+ utf8iterator_get_position(&tokenizer->_input, &error->position);
210
+ error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
211
+ error->type = type;
212
+ error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
213
+ switch (tokenizer->_state) {
214
+ case GUMBO_LEX_DATA:
215
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
216
+ break;
217
+ case GUMBO_LEX_CHAR_REF_IN_DATA:
218
+ case GUMBO_LEX_CHAR_REF_IN_RCDATA:
219
+ case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
220
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
221
+ break;
222
+ case GUMBO_LEX_RCDATA:
223
+ case GUMBO_LEX_RCDATA_LT:
224
+ case GUMBO_LEX_RCDATA_END_TAG_OPEN:
225
+ case GUMBO_LEX_RCDATA_END_TAG_NAME:
226
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
227
+ break;
228
+ case GUMBO_LEX_RAWTEXT:
229
+ case GUMBO_LEX_RAWTEXT_LT:
230
+ case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
231
+ case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
232
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
233
+ break;
234
+ case GUMBO_LEX_PLAINTEXT:
235
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
236
+ break;
237
+ case GUMBO_LEX_SCRIPT:
238
+ case GUMBO_LEX_SCRIPT_LT:
239
+ case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
240
+ case GUMBO_LEX_SCRIPT_END_TAG_NAME:
241
+ case GUMBO_LEX_SCRIPT_ESCAPED_START:
242
+ case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
243
+ case GUMBO_LEX_SCRIPT_ESCAPED:
244
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
245
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
246
+ case GUMBO_LEX_SCRIPT_ESCAPED_LT:
247
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
248
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
249
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
250
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
251
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
252
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
253
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
254
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
255
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
256
+ break;
257
+ case GUMBO_LEX_TAG_OPEN:
258
+ case GUMBO_LEX_END_TAG_OPEN:
259
+ case GUMBO_LEX_TAG_NAME:
260
+ case GUMBO_LEX_BEFORE_ATTR_NAME:
261
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
262
+ break;
263
+ case GUMBO_LEX_SELF_CLOSING_START_TAG:
264
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
265
+ break;
266
+ case GUMBO_LEX_ATTR_NAME:
267
+ case GUMBO_LEX_AFTER_ATTR_NAME:
268
+ case GUMBO_LEX_BEFORE_ATTR_VALUE:
269
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
270
+ break;
271
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
272
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
273
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
274
+ case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
275
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
276
+ break;
277
+ case GUMBO_LEX_BOGUS_COMMENT:
278
+ case GUMBO_LEX_COMMENT_START:
279
+ case GUMBO_LEX_COMMENT_START_DASH:
280
+ case GUMBO_LEX_COMMENT:
281
+ case GUMBO_LEX_COMMENT_END_DASH:
282
+ case GUMBO_LEX_COMMENT_END:
283
+ case GUMBO_LEX_COMMENT_END_BANG:
284
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
285
+ break;
286
+ case GUMBO_LEX_MARKUP_DECLARATION:
287
+ case GUMBO_LEX_DOCTYPE:
288
+ case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
289
+ case GUMBO_LEX_DOCTYPE_NAME:
290
+ case GUMBO_LEX_AFTER_DOCTYPE_NAME:
291
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
292
+ case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
293
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
294
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
295
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
296
+ case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
297
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
298
+ case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
299
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
300
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
301
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
302
+ case GUMBO_LEX_BOGUS_DOCTYPE:
303
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
304
+ break;
305
+ case GUMBO_LEX_CDATA:
306
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
307
+ break;
308
+ }
309
+ }
310
+
311
// Returns true if c is an ASCII letter ('A'-'Z' or 'a'-'z').
//
// The ISO C isupper/islower functions are deliberately avoided: they depend on
// the program's locale, whereas the HTML5 spec's behavior is independent of
// the locale the program runs in.
static bool is_alpha(int c) {
  const bool is_ascii_upper = c >= 'A' && c <= 'Z';
  const bool is_ascii_lower = c >= 'a' && c <= 'z';
  return is_ascii_upper || is_ascii_lower;
}
317
+
318
// Maps an ASCII uppercase letter to its lowercase counterpart; every other
// value (including non-ASCII code points and negative values) is returned
// unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + ('a' - 'A');
}
321
+
322
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323
+ if (is_in_cdata && c > 0) {
324
+ return GUMBO_TOKEN_CDATA;
325
+ }
326
+
327
+ switch (c) {
328
+ case '\t':
329
+ case '\n':
330
+ case '\r':
331
+ case '\f':
332
+ case ' ':
333
+ return GUMBO_TOKEN_WHITESPACE;
334
+ case 0:
335
+ gumbo_debug("Emitted null byte.\n");
336
+ return GUMBO_TOKEN_NULL;
337
+ case -1:
338
+ return GUMBO_TOKEN_EOF;
339
+ default:
340
+ return GUMBO_TOKEN_CHARACTER;
341
+ }
342
+ }
343
+
344
+ // Starts recording characters in the temporary buffer.
345
+ // Because this needs to reset the utf8iterator_mark to the beginning of the
346
+ // text that will eventually be emitted, it needs to be called a couple of
347
+ // states before the spec says "Set the temporary buffer to the empty string".
348
+ // In general, this should be called whenever there's a transition to a
349
+ // "less-than sign state". The initial < and possibly / then need to be
350
+ // appended to the temporary buffer, their presence needs to be accounted for in
351
+ // states that compare the temporary buffer against a literal value, and
352
+ // spec stanzas that say "emit a < and / character token along with a character
353
+ // token for each character in the temporary buffer" need to be adjusted to
354
+ // account for the presence of the < and / inside the temporary buffer.
355
+ static void clear_temporary_buffer(GumboParser* parser) {
356
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
+ assert(!tokenizer->_temporary_buffer_emit);
358
+ utf8iterator_mark(&tokenizer->_input);
359
+ gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
360
+ // The temporary buffer and script data buffer are the same object in the
361
+ // spec, so the script data buffer should be cleared as well.
362
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363
+ }
364
+
365
+ // Appends a codepoint to the temporary buffer.
366
+ static void append_char_to_temporary_buffer(
367
+ GumboParser* parser, int codepoint) {
368
+ gumbo_string_buffer_append_codepoint(
369
+ parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
370
+ }
371
+
372
+ // Checks to see if the temporary buffer equals a certain string.
373
+ // Make sure this remains side-effect free; it's used in assertions.
374
+ #ifndef NDEBUG
375
+ static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377
+ // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378
+ // it with an explicit sizeof(literal) if necessary. I don't think it will
379
+ // be, as this is only used in a couple of rare states.
380
+ int text_len = strlen(text);
381
+ return text_len == buffer->length &&
382
+ memcmp(buffer->data, text, text_len) == 0;
383
+ }
384
+ #endif
385
+
386
+ static void doc_type_state_init(GumboParser* parser) {
387
+ GumboTokenDocType* doc_type_state =
388
+ &parser->_tokenizer_state->_doc_type_state;
389
+ // We initialize these to NULL here so that we don't end up leaking memory if
390
+ // we never see a doctype token. When we do see a doctype token, we reset
391
+ // them to a freshly-allocated empty string so that we can present a uniform
392
+ // interface to client code and not make them check for null. Ownership is
393
+ // transferred to the doctype token when it's emitted.
394
+ doc_type_state->name = NULL;
395
+ doc_type_state->public_identifier = NULL;
396
+ doc_type_state->system_identifier = NULL;
397
+ doc_type_state->force_quirks = false;
398
+ doc_type_state->has_public_identifier = false;
399
+ doc_type_state->has_system_identifier = false;
400
+ }
401
+
402
+ // Sets the token original_text and position to the current iterator position.
403
+ // This is necessary because [CDATA[ sections may include text that is ignored
404
+ // by the tokenizer.
405
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
406
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
407
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
408
+ }
409
+
410
+ // Sets the tag buffer original text and start point to the current iterator
411
+ // position. This is necessary because attribute names & values may have
412
+ // whitespace preceeding them, and so we can't assume that the actual token
413
+ // starting point was the end of the last tag buffer usage.
414
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
415
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
416
+ GumboTagState* tag_state = &tokenizer->_tag_state;
417
+
418
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
419
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
420
+ }
421
+
422
+ // Moves the temporary buffer contents over to the specified output string,
423
+ // and clears the temporary buffer.
424
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426
+ *output =
427
+ gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
428
+ clear_temporary_buffer(parser);
429
+ }
430
+
431
+ // Advances the iterator past the end of the token, and then fills in the
432
+ // relevant position fields. It's assumed that after every emit, the tokenizer
433
+ // will immediately return (letting the tree-construction stage read the filled
434
+ // in Token). Thus, it's safe to advance the input stream here, since it will
435
+ // bypass the advance at the bottom of the state machine loop.
436
+ //
437
+ // Since this advances the iterator and resets the current input, make sure to
438
+ // call it after you've recorded any other data you need for the token.
439
+ static void finish_token(GumboParser* parser, GumboToken* token) {
440
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
441
+ if (!tokenizer->_reconsume_current_input) {
442
+ utf8iterator_next(&tokenizer->_input);
443
+ }
444
+
445
+ token->position = tokenizer->_token_start_pos;
446
+ token->original_text.data = tokenizer->_token_start;
447
+ reset_token_start_point(tokenizer);
448
+ token->original_text.length =
449
+ tokenizer->_token_start - token->original_text.data;
450
+ if (token->original_text.length > 0 &&
451
+ token->original_text.data[token->original_text.length - 1] == '\r') {
452
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
453
+ // means that the next token may start one past a \r character. The pointer
454
+ // arithmetic above results in that \r being appended to the original text
455
+ // of the preceding token, so we have to adjust its length here to chop the
456
+ // \r off.
457
+ --token->original_text.length;
458
+ }
459
+ }
460
+
461
+ // Records the doctype public ID, assumed to be in the temporary buffer.
462
+ // Convenience method that also sets has_public_identifier to true.
463
+ static void finish_doctype_public_id(GumboParser* parser) {
464
+ GumboTokenDocType* doc_type_state =
465
+ &parser->_tokenizer_state->_doc_type_state;
466
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
467
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468
+ doc_type_state->has_public_identifier = true;
469
+ }
470
+
471
+ // Records the doctype system ID, assumed to be in the temporary buffer.
472
+ // Convenience method that also sets has_system_identifier to true.
473
+ static void finish_doctype_system_id(GumboParser* parser) {
474
+ GumboTokenDocType* doc_type_state =
475
+ &parser->_tokenizer_state->_doc_type_state;
476
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
477
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478
+ doc_type_state->has_system_identifier = true;
479
+ }
480
+
481
+ // Writes a single specified character to the output token.
482
+ static void emit_char(GumboParser* parser, int c, GumboToken* output) {
483
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
484
+ output->v.character = c;
485
+ finish_token(parser, output);
486
+ }
487
+
488
+ // Writes a replacement character token and records a parse error.
489
+ // Always returns RETURN_ERROR, per gumbo_lex return value.
490
+ static StateResult emit_replacement_char(
491
+ GumboParser* parser, GumboToken* output) {
492
+ // In all cases, this is because of a null byte in the input stream.
493
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
494
+ emit_char(parser, kUtf8ReplacementChar, output);
495
+ return RETURN_ERROR;
496
+ }
497
+
498
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
499
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500
+ emit_char(parser, -1, output);
501
+ return RETURN_SUCCESS;
502
+ }
503
+
504
+ // Writes the current input character out as a character token.
505
+ // Always returns RETURN_SUCCESS.
506
+ static bool emit_current_char(GumboParser* parser, GumboToken* output) {
507
+ emit_char(
508
+ parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
509
+ return RETURN_SUCCESS;
510
+ }
511
+
512
+ // Writes out a doctype token, copying it from the tokenizer state.
513
+ static void emit_doctype(GumboParser* parser, GumboToken* output) {
514
+ output->type = GUMBO_TOKEN_DOCTYPE;
515
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
516
+ finish_token(parser, output);
517
+ doc_type_state_init(parser);
518
+ }
519
+
520
+ // Debug-only function that explicitly sets the attribute vector data to NULL so
521
+ // it can be asserted on tag creation, verifying that there are no memory leaks.
522
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
523
+ #ifndef NDEBUG
524
+ tag_state->_attributes = kGumboEmptyVector;
525
+ #endif
526
+ }
527
+
528
+ // Writes out the current tag as a start or end tag token.
529
+ // Always returns RETURN_SUCCESS.
530
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
531
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
532
+ if (tag_state->_is_start_tag) {
533
+ output->type = GUMBO_TOKEN_START_TAG;
534
+ output->v.start_tag.tag = tag_state->_tag;
535
+ output->v.start_tag.attributes = tag_state->_attributes;
536
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537
+ tag_state->_last_start_tag = tag_state->_tag;
538
+ mark_tag_state_as_empty(tag_state);
539
+ gumbo_debug(
540
+ "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541
+ } else {
542
+ output->type = GUMBO_TOKEN_END_TAG;
543
+ output->v.end_tag = tag_state->_tag;
544
+ // In end tags, ownership of the attributes vector is not transferred to the
545
+ // token, but it's still initialized as normal, so it must be manually
546
+ // deallocated. There may also be attributes to destroy, in certain broken
547
+ // cases like </div</th> (the "th" is an attribute there).
548
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
550
+ }
551
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
552
+ mark_tag_state_as_empty(tag_state);
553
+ gumbo_debug(
554
+ "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555
+ }
556
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
557
+ finish_token(parser, output);
558
+ gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559
+ output->original_text.data);
560
+ assert(output->original_text.length >= 2);
561
+ assert(output->original_text.data[0] == '<');
562
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
563
+ return RETURN_SUCCESS;
564
+ }
565
+
566
+ // In some states, we speculatively start a tag, but don't know whether it'll be
567
+ // emitted as tag token or as a series of character tokens until we finish it.
568
+ // We need to abandon the tag we'd started & free its memory in that case to
569
+ // avoid a memory leak.
570
+ static void abandon_current_tag(GumboParser* parser) {
571
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
574
+ }
575
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
576
+ mark_tag_state_as_empty(tag_state);
577
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
578
+ gumbo_debug("Abandoning current tag.\n");
579
+ }
580
+
581
+ // Wraps the consume_char_ref function to handle its output and make the
582
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
583
+ // error occurred, RETURN_SUCCESS otherwise.
584
+ static StateResult emit_char_ref(GumboParser* parser,
585
+ int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
586
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587
+ OneOrTwoCodepoints char_ref;
588
+ bool status = consume_char_ref(
589
+ parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
590
+ if (char_ref.first != kGumboNoChar) {
591
+ // consume_char_ref ends with the iterator pointing at the next character,
592
+ // so we need to be sure not advance it again before reading the next token.
593
+ tokenizer->_reconsume_current_input = true;
594
+ emit_char(parser, char_ref.first, output);
595
+ tokenizer->_buffered_emit_char = char_ref.second;
596
+ } else {
597
+ emit_char(parser, '&', output);
598
+ }
599
+ return status ? RETURN_SUCCESS : RETURN_ERROR;
600
+ }
601
+
602
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
603
+ // data, and then it's copied over and released to the 'text' field of the
604
+ // GumboToken union. Always returns RETURN_SUCCESS.
605
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606
+ output->type = GUMBO_TOKEN_COMMENT;
607
+ finish_temporary_buffer(parser, &output->v.text);
608
+ finish_token(parser, output);
609
+ return RETURN_SUCCESS;
610
+ }
611
+
612
+ // Checks to see we should be flushing accumulated characters in the temporary
613
+ // buffer, and fills the output token with the next output character if so.
614
+ // Returns true if a character has been emitted and the tokenizer should
615
+ // immediately return, false if we're at the end of the temporary buffer and
616
+ // should resume normal operation.
617
+ static bool maybe_emit_from_temporary_buffer(
618
+ GumboParser* parser, GumboToken* output) {
619
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
620
+ const char* c = tokenizer->_temporary_buffer_emit;
621
+ GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
622
+
623
+ if (!c || c >= buffer->data + buffer->length) {
624
+ tokenizer->_temporary_buffer_emit = NULL;
625
+ return false;
626
+ }
627
+
628
+ assert(*c == utf8iterator_current(&tokenizer->_input));
629
+ // emit_char also advances the input stream. We need to do some juggling of
630
+ // the _reconsume_current_input flag to get the proper behavior when emitting
631
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
632
+ // when emitting anything from the temporary buffer, since those characters
633
+ // have already been advanced past. However, it should be preserved so that
634
+ // when the *next* character is encountered again, the tokenizer knows not to
635
+ // advance past it.
636
+ bool saved_reconsume_state = tokenizer->_reconsume_current_input;
637
+ tokenizer->_reconsume_current_input = false;
638
+ emit_char(parser, *c, output);
639
+ ++tokenizer->_temporary_buffer_emit;
640
+ tokenizer->_reconsume_current_input = saved_reconsume_state;
641
+ return true;
642
+ }
643
+
644
+ // Sets up the tokenizer to begin flushing the temporary buffer.
645
+ // This resets the input iterator stream to the start of the last tag, sets up
646
+ // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647
+ // the first character in it. It returns true if a character was emitted, false
648
+ // otherwise.
649
+ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
651
+ assert(tokenizer->_temporary_buffer.data);
652
+ utf8iterator_reset(&tokenizer->_input);
653
+ tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
654
+ return maybe_emit_from_temporary_buffer(parser, output);
655
+ }
656
+
657
+ // Appends a codepoint to the current tag buffer. If
658
+ // reinitilize_position_on_first is set, this also initializes the tag buffer
659
+ // start point; the only time you would *not* want to pass true for this
660
+ // parameter is if you want the original_text to include character (like an
661
+ // opening quote) that doesn't appear in the value.
662
+ static void append_char_to_tag_buffer(
663
+ GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
664
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665
+ if (buffer->length == 0 && reinitilize_position_on_first) {
666
+ reset_tag_buffer_start_point(parser);
667
+ }
668
+ gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
669
+ }
670
+
671
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
672
+ // and _start_pos field to point to the current position.
673
+ static void initialize_tag_buffer(GumboParser* parser) {
674
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675
+ GumboTagState* tag_state = &tokenizer->_tag_state;
676
+
677
+ gumbo_string_buffer_init(parser, &tag_state->_buffer);
678
+ reset_tag_buffer_start_point(parser);
679
+ }
680
+
681
+ // Initializes the tag_state to start a new tag, keeping track of the opening
682
+ // positions and original text. Takes a boolean indicating whether this is a
683
+ // start or end tag.
684
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
686
+ GumboTagState* tag_state = &tokenizer->_tag_state;
687
+ int c = utf8iterator_current(&tokenizer->_input);
688
+ assert(is_alpha(c));
689
+ c = ensure_lowercase(c);
690
+ assert(is_alpha(c));
691
+
692
+ initialize_tag_buffer(parser);
693
+ gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
694
+
695
+ assert(tag_state->_attributes.data == NULL);
696
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700
+ gumbo_vector_init(parser, 1, &tag_state->_attributes);
701
+ tag_state->_drop_next_attr_value = false;
702
+ tag_state->_is_start_tag = is_start_tag;
703
+ tag_state->_is_self_closing = false;
704
+ gumbo_debug("Starting new tag.\n");
705
+ }
706
+
707
+ // Fills in the specified char* with the contents of the tag buffer.
708
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710
+ GumboTagState* tag_state = &tokenizer->_tag_state;
711
+ *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
712
+ }
713
+
714
+ // Fills in:
715
+ // * The original_text GumboStringPiece with the portion of the original
716
+ // buffer that corresponds to the tag buffer.
717
+ // * The start_pos GumboSourcePosition with the start position of the tag
718
+ // buffer.
719
+ // * The end_pos GumboSourcePosition with the current source position.
720
+ static void copy_over_original_tag_text(GumboParser* parser,
721
+ GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722
+ GumboSourcePosition* end_pos) {
723
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724
+ GumboTagState* tag_state = &tokenizer->_tag_state;
725
+
726
+ original_text->data = tag_state->_original_text;
727
+ original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
728
+ tag_state->_original_text;
729
+ if (original_text->data[original_text->length - 1] == '\r') {
730
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731
+ // appended to the end of original text even when it's really the first part
732
+ // of the next character. If we detect this situation, shrink the length of
733
+ // the original text by 1 to remove the carriage return.
734
+ --original_text->length;
735
+ }
736
+ *start_pos = tag_state->_start_pos;
737
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
738
+ }
739
+
740
+ // Releases and then re-initializes the tag buffer.
741
+ static void reinitialize_tag_buffer(GumboParser* parser) {
742
+ gumbo_parser_deallocate(
743
+ parser, parser->_tokenizer_state->_tag_state._buffer.data);
744
+ initialize_tag_buffer(parser);
745
+ }
746
+
747
+ // Moves some data from the temporary buffer over the the tag-based fields in
748
+ // TagState.
749
+ static void finish_tag_name(GumboParser* parser) {
750
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751
+ GumboTagState* tag_state = &tokenizer->_tag_state;
752
+
753
+ tag_state->_tag =
754
+ gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
755
+ reinitialize_tag_buffer(parser);
756
+ }
757
+
758
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
759
+ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760
+ int original_index, int new_index) {
761
+ GumboError* error = gumbo_add_error(parser);
762
+ if (!error) {
763
+ return;
764
+ }
765
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
766
+ error->type = GUMBO_ERR_DUPLICATE_ATTR;
767
+ error->position = tag_state->_start_pos;
768
+ error->original_text = tag_state->_original_text;
769
+ error->v.duplicate_attr.original_index = original_index;
770
+ error->v.duplicate_attr.new_index = new_index;
771
+ copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
772
+ reinitialize_tag_buffer(parser);
773
+ }
774
+
775
+ // Creates a new attribute in the current tag, copying the current tag buffer to
776
+ // the attribute's name. The attribute's value starts out as the empty string
777
+ // (following the "Boolean attributes" section of the spec) and is only
778
+ // overwritten on finish_attribute_value(). If the attribute has already been
779
+ // specified, the new attribute is dropped, a parse error is added, and the
780
+ // function returns false. Otherwise, this returns true.
781
+ static bool finish_attribute_name(GumboParser* parser) {
782
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783
+ GumboTagState* tag_state = &tokenizer->_tag_state;
784
+ // May've been set by a previous attribute without a value; reset it here.
785
+ tag_state->_drop_next_attr_value = false;
786
+ assert(tag_state->_attributes.data);
787
+ assert(tag_state->_attributes.capacity);
788
+
789
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790
+ for (unsigned int i = 0; i < attributes->length; ++i) {
791
+ GumboAttribute* attr = attributes->data[i];
792
+ if (strlen(attr->name) == tag_state->_buffer.length &&
793
+ memcmp(attr->name, tag_state->_buffer.data,
794
+ tag_state->_buffer.length) == 0) {
795
+ // Identical attribute; bail.
796
+ add_duplicate_attr_error(parser, attr->name, i, attributes->length);
797
+ tag_state->_drop_next_attr_value = true;
798
+ return false;
799
+ }
800
+ }
801
+
802
+ GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
803
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804
+ copy_over_tag_buffer(parser, &attr->name);
805
+ copy_over_original_tag_text(
806
+ parser, &attr->original_name, &attr->name_start, &attr->name_end);
807
+ attr->value = gumbo_copy_stringz(parser, "");
808
+ copy_over_original_tag_text(
809
+ parser, &attr->original_value, &attr->name_start, &attr->name_end);
810
+ gumbo_vector_add(parser, attr, attributes);
811
+ reinitialize_tag_buffer(parser);
812
+ return true;
813
+ }
814
+
815
+ // Finishes an attribute value. This sets the value of the most recently added
816
+ // attribute to the current contents of the tag buffer.
817
+ static void finish_attribute_value(GumboParser* parser) {
818
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
819
+ if (tag_state->_drop_next_attr_value) {
820
+ // Duplicate attribute name detected in an earlier state, so we have to
821
+ // ignore the value.
822
+ tag_state->_drop_next_attr_value = false;
823
+ reinitialize_tag_buffer(parser);
824
+ return;
825
+ }
826
+
827
+ GumboAttribute* attr =
828
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
829
+ gumbo_parser_deallocate(parser, (void*) attr->value);
830
+ copy_over_tag_buffer(parser, &attr->value);
831
+ copy_over_original_tag_text(
832
+ parser, &attr->original_value, &attr->value_start, &attr->value_end);
833
+ reinitialize_tag_buffer(parser);
834
+ }
835
+
836
+ // Returns true if the current end tag matches the last start tag emitted.
837
+ static bool is_appropriate_end_tag(GumboParser* parser) {
838
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
839
+ assert(!tag_state->_is_start_tag);
840
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
841
+ tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
842
+ tag_state->_buffer.length);
843
+ }
844
+
845
+ void gumbo_tokenizer_state_init(
846
+ GumboParser* parser, const char* text, size_t text_length) {
847
+ GumboTokenizerState* tokenizer =
848
+ gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
849
+ parser->_tokenizer_state = tokenizer;
850
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
+ tokenizer->_reconsume_current_input = false;
852
+ tokenizer->_is_current_node_foreign = false;
853
+ tokenizer->_is_in_cdata = false;
854
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
855
+
856
+ tokenizer->_buffered_emit_char = kGumboNoChar;
857
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
858
+ tokenizer->_temporary_buffer_emit = NULL;
859
+
860
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
861
+
862
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
863
+ tokenizer->_token_start = text;
864
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
865
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
866
+ doc_type_state_init(parser);
867
+ }
868
+
869
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
870
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
871
+ assert(tokenizer->_doc_type_state.name == NULL);
872
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
873
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
874
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
875
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
876
+ gumbo_parser_deallocate(parser, tokenizer);
877
+ }
878
+
879
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
880
+ parser->_tokenizer_state->_state = state;
881
+ }
882
+
883
+ void gumbo_tokenizer_set_is_current_node_foreign(
884
+ GumboParser* parser, bool is_foreign) {
885
+ if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
886
+ gumbo_debug("Toggling is_current_node_foreign to %s.\n",
887
+ is_foreign ? "true" : "false");
888
+ }
889
+ parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
890
+ }
891
+
892
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
893
+ static StateResult handle_data_state(GumboParser* parser,
894
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
895
+ switch (c) {
896
+ case '&':
897
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898
+ // The char_ref machinery expects to be on the & so it can mark that
899
+ // and return to it if the text isn't a char ref, so we need to
900
+ // reconsume it.
901
+ tokenizer->_reconsume_current_input = true;
902
+ return NEXT_CHAR;
903
+ case '<':
904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905
+ clear_temporary_buffer(parser);
906
+ append_char_to_temporary_buffer(parser, '<');
907
+ return NEXT_CHAR;
908
+ case '\0':
909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910
+ emit_char(parser, c, output);
911
+ return RETURN_ERROR;
912
+ default:
913
+ return emit_current_char(parser, output);
914
+ }
915
+ }
916
+
917
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
+ static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
920
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921
+ return emit_char_ref(parser, ' ', false, output);
922
+ }
923
+
924
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
925
+ static StateResult handle_rcdata_state(GumboParser* parser,
926
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
927
+ switch (c) {
928
+ case '&':
929
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930
+ tokenizer->_reconsume_current_input = true;
931
+ return NEXT_CHAR;
932
+ case '<':
933
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934
+ clear_temporary_buffer(parser);
935
+ append_char_to_temporary_buffer(parser, '<');
936
+ return NEXT_CHAR;
937
+ case '\0':
938
+ return emit_replacement_char(parser, output);
939
+ case -1:
940
+ return emit_eof(parser, output);
941
+ default:
942
+ return emit_current_char(parser, output);
943
+ }
944
+ }
945
+
946
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
+ static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
949
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950
+ return emit_char_ref(parser, ' ', false, output);
951
+ }
952
+
953
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
954
+ static StateResult handle_rawtext_state(GumboParser* parser,
955
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
956
+ switch (c) {
957
+ case '<':
958
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
959
+ clear_temporary_buffer(parser);
960
+ append_char_to_temporary_buffer(parser, '<');
961
+ return NEXT_CHAR;
962
+ case '\0':
963
+ return emit_replacement_char(parser, output);
964
+ case -1:
965
+ return emit_eof(parser, output);
966
+ default:
967
+ return emit_current_char(parser, output);
968
+ }
969
+ }
970
+
971
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
972
+ static StateResult handle_script_state(GumboParser* parser,
973
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
974
+ switch (c) {
975
+ case '<':
976
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
977
+ clear_temporary_buffer(parser);
978
+ append_char_to_temporary_buffer(parser, '<');
979
+ return NEXT_CHAR;
980
+ case '\0':
981
+ return emit_replacement_char(parser, output);
982
+ case -1:
983
+ return emit_eof(parser, output);
984
+ default:
985
+ return emit_current_char(parser, output);
986
+ }
987
+ }
988
+
989
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
990
+ static StateResult handle_plaintext_state(GumboParser* parser,
991
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
992
+ switch (c) {
993
+ case '\0':
994
+ return emit_replacement_char(parser, output);
995
+ case -1:
996
+ return emit_eof(parser, output);
997
+ default:
998
+ return emit_current_char(parser, output);
999
+ }
1000
+ }
1001
+
1002
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1003
+ static StateResult handle_tag_open_state(GumboParser* parser,
1004
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1005
+ assert(temporary_buffer_equals(parser, "<"));
1006
+ switch (c) {
1007
+ case '!':
1008
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1009
+ clear_temporary_buffer(parser);
1010
+ return NEXT_CHAR;
1011
+ case '/':
1012
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1013
+ append_char_to_temporary_buffer(parser, '/');
1014
+ return NEXT_CHAR;
1015
+ case '?':
1016
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1017
+ clear_temporary_buffer(parser);
1018
+ append_char_to_temporary_buffer(parser, '?');
1019
+ tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1020
+ return NEXT_CHAR;
1021
+ default:
1022
+ if (is_alpha(c)) {
1023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1024
+ start_new_tag(parser, true);
1025
+ return NEXT_CHAR;
1026
+ } else {
1027
+ tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1028
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1029
+ emit_temporary_buffer(parser, output);
1030
+ return RETURN_ERROR;
1031
+ }
1032
+ }
1033
+ }
1034
+
1035
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1036
+ static StateResult handle_end_tag_open_state(GumboParser* parser,
1037
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1038
+ assert(temporary_buffer_equals(parser, "</"));
1039
+ switch (c) {
1040
+ case '>':
1041
+ tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1042
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1043
+ return NEXT_CHAR;
1044
+ case -1:
1045
+ tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1046
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1047
+ return emit_temporary_buffer(parser, output);
1048
+ default:
1049
+ if (is_alpha(c)) {
1050
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1051
+ start_new_tag(parser, false);
1052
+ } else {
1053
+ tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1055
+ clear_temporary_buffer(parser);
1056
+ append_char_to_temporary_buffer(parser, c);
1057
+ }
1058
+ return NEXT_CHAR;
1059
+ }
1060
+ }
1061
+
1062
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1063
+ static StateResult handle_tag_name_state(GumboParser* parser,
1064
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1065
+ switch (c) {
1066
+ case '\t':
1067
+ case '\n':
1068
+ case '\f':
1069
+ case ' ':
1070
+ finish_tag_name(parser);
1071
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072
+ return NEXT_CHAR;
1073
+ case '/':
1074
+ finish_tag_name(parser);
1075
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076
+ return NEXT_CHAR;
1077
+ case '>':
1078
+ finish_tag_name(parser);
1079
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080
+ return emit_current_tag(parser, output);
1081
+ case '\0':
1082
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084
+ return NEXT_CHAR;
1085
+ case -1:
1086
+ tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087
+ abandon_current_tag(parser);
1088
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089
+ return NEXT_CHAR;
1090
+ default:
1091
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1092
+ return NEXT_CHAR;
1093
+ }
1094
+ }
1095
+
1096
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1097
+ static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099
+ assert(temporary_buffer_equals(parser, "<"));
1100
+ if (c == '/') {
1101
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102
+ append_char_to_temporary_buffer(parser, '/');
1103
+ return NEXT_CHAR;
1104
+ } else {
1105
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106
+ tokenizer->_reconsume_current_input = true;
1107
+ return emit_temporary_buffer(parser, output);
1108
+ }
1109
+ }
1110
+
1111
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1112
+ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114
+ assert(temporary_buffer_equals(parser, "</"));
1115
+ if (is_alpha(c)) {
1116
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1117
+ start_new_tag(parser, false);
1118
+ append_char_to_temporary_buffer(parser, c);
1119
+ return NEXT_CHAR;
1120
+ } else {
1121
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1122
+ return emit_temporary_buffer(parser, output);
1123
+ }
1124
+ return true;
1125
+ }
1126
+
1127
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1128
+ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1130
+ assert(tokenizer->_temporary_buffer.length >= 2);
1131
+ if (is_alpha(c)) {
1132
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1133
+ append_char_to_temporary_buffer(parser, c);
1134
+ return NEXT_CHAR;
1135
+ } else if (is_appropriate_end_tag(parser)) {
1136
+ switch (c) {
1137
+ case '\t':
1138
+ case '\n':
1139
+ case '\f':
1140
+ case ' ':
1141
+ finish_tag_name(parser);
1142
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1143
+ return NEXT_CHAR;
1144
+ case '/':
1145
+ finish_tag_name(parser);
1146
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1147
+ return NEXT_CHAR;
1148
+ case '>':
1149
+ finish_tag_name(parser);
1150
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1151
+ return emit_current_tag(parser, output);
1152
+ }
1153
+ }
1154
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1155
+ abandon_current_tag(parser);
1156
+ return emit_temporary_buffer(parser, output);
1157
+ }
1158
+
1159
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1160
+ static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1162
+ assert(temporary_buffer_equals(parser, "<"));
1163
+ if (c == '/') {
1164
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1165
+ append_char_to_temporary_buffer(parser, '/');
1166
+ return NEXT_CHAR;
1167
+ } else {
1168
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1169
+ tokenizer->_reconsume_current_input = true;
1170
+ return emit_temporary_buffer(parser, output);
1171
+ }
1172
+ }
1173
+
1174
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1175
+ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1177
+ assert(temporary_buffer_equals(parser, "</"));
1178
+ if (is_alpha(c)) {
1179
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1180
+ start_new_tag(parser, false);
1181
+ append_char_to_temporary_buffer(parser, c);
1182
+ return NEXT_CHAR;
1183
+ } else {
1184
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1185
+ return emit_temporary_buffer(parser, output);
1186
+ }
1187
+ }
1188
+
1189
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1190
+ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1192
+ assert(tokenizer->_temporary_buffer.length >= 2);
1193
+ gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194
+ tokenizer->_tag_state._buffer.data);
1195
+ if (is_alpha(c)) {
1196
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1197
+ append_char_to_temporary_buffer(parser, c);
1198
+ return NEXT_CHAR;
1199
+ } else if (is_appropriate_end_tag(parser)) {
1200
+ gumbo_debug("Is an appropriate end tag.\n");
1201
+ switch (c) {
1202
+ case '\t':
1203
+ case '\n':
1204
+ case '\f':
1205
+ case ' ':
1206
+ finish_tag_name(parser);
1207
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1208
+ return NEXT_CHAR;
1209
+ case '/':
1210
+ finish_tag_name(parser);
1211
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1212
+ return NEXT_CHAR;
1213
+ case '>':
1214
+ finish_tag_name(parser);
1215
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216
+ return emit_current_tag(parser, output);
1217
+ }
1218
+ }
1219
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1220
+ abandon_current_tag(parser);
1221
+ return emit_temporary_buffer(parser, output);
1222
+ }
1223
+
1224
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1225
+ static StateResult handle_script_lt_state(GumboParser* parser,
1226
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1227
+ assert(temporary_buffer_equals(parser, "<"));
1228
+ if (c == '/') {
1229
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1230
+ append_char_to_temporary_buffer(parser, '/');
1231
+ return NEXT_CHAR;
1232
+ } else if (c == '!') {
1233
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1234
+ append_char_to_temporary_buffer(parser, '!');
1235
+ return emit_temporary_buffer(parser, output);
1236
+ } else {
1237
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1238
+ tokenizer->_reconsume_current_input = true;
1239
+ return emit_temporary_buffer(parser, output);
1240
+ }
1241
+ }
1242
+
1243
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1244
+ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1246
+ assert(temporary_buffer_equals(parser, "</"));
1247
+ if (is_alpha(c)) {
1248
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1249
+ start_new_tag(parser, false);
1250
+ append_char_to_temporary_buffer(parser, c);
1251
+ return NEXT_CHAR;
1252
+ } else {
1253
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254
+ return emit_temporary_buffer(parser, output);
1255
+ }
1256
+ }
1257
+
1258
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1259
+ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1261
+ assert(tokenizer->_temporary_buffer.length >= 2);
1262
+ if (is_alpha(c)) {
1263
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1264
+ append_char_to_temporary_buffer(parser, c);
1265
+ return NEXT_CHAR;
1266
+ } else if (is_appropriate_end_tag(parser)) {
1267
+ switch (c) {
1268
+ case '\t':
1269
+ case '\n':
1270
+ case '\f':
1271
+ case ' ':
1272
+ finish_tag_name(parser);
1273
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1274
+ return NEXT_CHAR;
1275
+ case '/':
1276
+ finish_tag_name(parser);
1277
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1278
+ return NEXT_CHAR;
1279
+ case '>':
1280
+ finish_tag_name(parser);
1281
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1282
+ return emit_current_tag(parser, output);
1283
+ }
1284
+ }
1285
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1286
+ abandon_current_tag(parser);
1287
+ return emit_temporary_buffer(parser, output);
1288
+ }
1289
+
1290
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1291
+ static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1293
+ if (c == '-') {
1294
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295
+ return emit_current_char(parser, output);
1296
+ } else {
1297
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1298
+ tokenizer->_reconsume_current_input = true;
1299
+ return NEXT_CHAR;
1300
+ }
1301
+ }
1302
+
1303
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1304
+ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1306
+ if (c == '-') {
1307
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308
+ return emit_current_char(parser, output);
1309
+ } else {
1310
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1311
+ tokenizer->_reconsume_current_input = true;
1312
+ return NEXT_CHAR;
1313
+ }
1314
+ }
1315
+
1316
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1317
+ static StateResult handle_script_escaped_state(GumboParser* parser,
1318
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1319
+ switch (c) {
1320
+ case '-':
1321
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1322
+ return emit_current_char(parser, output);
1323
+ case '<':
1324
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1325
+ clear_temporary_buffer(parser);
1326
+ append_char_to_temporary_buffer(parser, c);
1327
+ return NEXT_CHAR;
1328
+ case '\0':
1329
+ return emit_replacement_char(parser, output);
1330
+ case -1:
1331
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1332
+ return emit_eof(parser, output);
1333
+ default:
1334
+ return emit_current_char(parser, output);
1335
+ }
1336
+ }
1337
+
1338
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1339
+ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1341
+ switch (c) {
1342
+ case '-':
1343
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1344
+ return emit_current_char(parser, output);
1345
+ case '<':
1346
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1347
+ clear_temporary_buffer(parser);
1348
+ append_char_to_temporary_buffer(parser, c);
1349
+ return NEXT_CHAR;
1350
+ case '\0':
1351
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1352
+ return emit_replacement_char(parser, output);
1353
+ case -1:
1354
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1355
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1356
+ return NEXT_CHAR;
1357
+ default:
1358
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1359
+ return emit_current_char(parser, output);
1360
+ }
1361
+ }
1362
+
1363
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1364
+ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1366
+ switch (c) {
1367
+ case '-':
1368
+ return emit_current_char(parser, output);
1369
+ case '<':
1370
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1371
+ clear_temporary_buffer(parser);
1372
+ append_char_to_temporary_buffer(parser, c);
1373
+ return NEXT_CHAR;
1374
+ case '>':
1375
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1376
+ return emit_current_char(parser, output);
1377
+ case '\0':
1378
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379
+ return emit_replacement_char(parser, output);
1380
+ case -1:
1381
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1382
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1383
+ return NEXT_CHAR;
1384
+ default:
1385
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1386
+ return emit_current_char(parser, output);
1387
+ }
1388
+ }
1389
+
1390
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1391
+ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1393
+ assert(temporary_buffer_equals(parser, "<"));
1394
+ assert(!tokenizer->_script_data_buffer.length);
1395
+ if (c == '/') {
1396
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1397
+ append_char_to_temporary_buffer(parser, c);
1398
+ return NEXT_CHAR;
1399
+ } else if (is_alpha(c)) {
1400
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401
+ append_char_to_temporary_buffer(parser, c);
1402
+ gumbo_string_buffer_append_codepoint(
1403
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1404
+ return emit_temporary_buffer(parser, output);
1405
+ } else {
1406
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407
+ return emit_temporary_buffer(parser, output);
1408
+ }
1409
+ }
1410
+
1411
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1412
+ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1414
+ assert(temporary_buffer_equals(parser, "</"));
1415
+ if (is_alpha(c)) {
1416
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1417
+ start_new_tag(parser, false);
1418
+ append_char_to_temporary_buffer(parser, c);
1419
+ return NEXT_CHAR;
1420
+ } else {
1421
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1422
+ return emit_temporary_buffer(parser, output);
1423
+ }
1424
+ }
1425
+
1426
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1427
+ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1429
+ assert(tokenizer->_temporary_buffer.length >= 2);
1430
+ if (is_alpha(c)) {
1431
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1432
+ append_char_to_temporary_buffer(parser, c);
1433
+ return NEXT_CHAR;
1434
+ } else if (is_appropriate_end_tag(parser)) {
1435
+ switch (c) {
1436
+ case '\t':
1437
+ case '\n':
1438
+ case '\f':
1439
+ case ' ':
1440
+ finish_tag_name(parser);
1441
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1442
+ return NEXT_CHAR;
1443
+ case '/':
1444
+ finish_tag_name(parser);
1445
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1446
+ return NEXT_CHAR;
1447
+ case '>':
1448
+ finish_tag_name(parser);
1449
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1450
+ return emit_current_tag(parser, output);
1451
+ }
1452
+ }
1453
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1454
+ abandon_current_tag(parser);
1455
+ return emit_temporary_buffer(parser, output);
1456
+ }
1457
+
1458
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1459
+ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1461
+ switch (c) {
1462
+ case '\t':
1463
+ case '\n':
1464
+ case '\f':
1465
+ case ' ':
1466
+ case '/':
1467
+ case '>':
1468
+ gumbo_tokenizer_set_state(
1469
+ parser, gumbo_string_equals(&kScriptTag,
1470
+ (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472
+ : GUMBO_LEX_SCRIPT_ESCAPED);
1473
+ return emit_current_char(parser, output);
1474
+ default:
1475
+ if (is_alpha(c)) {
1476
+ gumbo_string_buffer_append_codepoint(
1477
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1478
+ return emit_current_char(parser, output);
1479
+ } else {
1480
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1481
+ tokenizer->_reconsume_current_input = true;
1482
+ return NEXT_CHAR;
1483
+ }
1484
+ }
1485
+ }
1486
+
1487
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1488
+ static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1490
+ switch (c) {
1491
+ case '-':
1492
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1493
+ return emit_current_char(parser, output);
1494
+ case '<':
1495
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1496
+ return emit_current_char(parser, output);
1497
+ case '\0':
1498
+ return emit_replacement_char(parser, output);
1499
+ case -1:
1500
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1501
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1502
+ return NEXT_CHAR;
1503
+ default:
1504
+ return emit_current_char(parser, output);
1505
+ }
1506
+ }
1507
+
1508
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1509
+ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1511
+ switch (c) {
1512
+ case '-':
1513
+ gumbo_tokenizer_set_state(
1514
+ parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1515
+ return emit_current_char(parser, output);
1516
+ case '<':
1517
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1518
+ return emit_current_char(parser, output);
1519
+ case '\0':
1520
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1521
+ return emit_replacement_char(parser, output);
1522
+ case -1:
1523
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1524
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1525
+ return NEXT_CHAR;
1526
+ default:
1527
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1528
+ return emit_current_char(parser, output);
1529
+ }
1530
+ }
1531
+
1532
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1533
+ static StateResult handle_script_double_escaped_dash_dash_state(
1534
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535
+ GumboToken* output) {
1536
+ switch (c) {
1537
+ case '-':
1538
+ return emit_current_char(parser, output);
1539
+ case '<':
1540
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1541
+ return emit_current_char(parser, output);
1542
+ case '>':
1543
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
+ return emit_current_char(parser, output);
1545
+ case '\0':
1546
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1547
+ return emit_replacement_char(parser, output);
1548
+ case -1:
1549
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
+ return NEXT_CHAR;
1552
+ default:
1553
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1554
+ return emit_current_char(parser, output);
1555
+ }
1556
+ }
1557
+
1558
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1559
+ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1561
+ if (c == '/') {
1562
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1564
+ return emit_current_char(parser, output);
1565
+ } else {
1566
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1567
+ tokenizer->_reconsume_current_input = true;
1568
+ return NEXT_CHAR;
1569
+ }
1570
+ }
1571
+
1572
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1573
+ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1575
+ switch (c) {
1576
+ case '\t':
1577
+ case '\n':
1578
+ case '\f':
1579
+ case ' ':
1580
+ case '/':
1581
+ case '>':
1582
+ gumbo_tokenizer_set_state(
1583
+ parser, gumbo_string_equals(&kScriptTag,
1584
+ (GumboStringPiece*) &tokenizer->_script_data_buffer)
1585
+ ? GUMBO_LEX_SCRIPT_ESCAPED
1586
+ : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1587
+ return emit_current_char(parser, output);
1588
+ default:
1589
+ if (is_alpha(c)) {
1590
+ gumbo_string_buffer_append_codepoint(
1591
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1592
+ return emit_current_char(parser, output);
1593
+ } else {
1594
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1595
+ tokenizer->_reconsume_current_input = true;
1596
+ return NEXT_CHAR;
1597
+ }
1598
+ }
1599
+ }
1600
+
1601
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1602
+ static StateResult handle_before_attr_name_state(GumboParser* parser,
1603
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604
+ switch (c) {
1605
+ case '\t':
1606
+ case '\n':
1607
+ case '\f':
1608
+ case ' ':
1609
+ return NEXT_CHAR;
1610
+ case '/':
1611
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1612
+ return NEXT_CHAR;
1613
+ case '>':
1614
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1615
+ return emit_current_tag(parser, output);
1616
+ case '\0':
1617
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1618
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1619
+ append_char_to_temporary_buffer(parser, 0xfffd);
1620
+ return NEXT_CHAR;
1621
+ case -1:
1622
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1623
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1624
+ abandon_current_tag(parser);
1625
+ return NEXT_CHAR;
1626
+ case '"':
1627
+ case '\'':
1628
+ case '<':
1629
+ case '=':
1630
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1631
+ // Fall through.
1632
+ default:
1633
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1634
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1635
+ return NEXT_CHAR;
1636
+ }
1637
+ }
1638
+
1639
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1640
+ static StateResult handle_attr_name_state(GumboParser* parser,
1641
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1642
+ switch (c) {
1643
+ case '\t':
1644
+ case '\n':
1645
+ case '\f':
1646
+ case ' ':
1647
+ finish_attribute_name(parser);
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1649
+ return NEXT_CHAR;
1650
+ case '/':
1651
+ finish_attribute_name(parser);
1652
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1653
+ return NEXT_CHAR;
1654
+ case '=':
1655
+ finish_attribute_name(parser);
1656
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1657
+ return NEXT_CHAR;
1658
+ case '>':
1659
+ finish_attribute_name(parser);
1660
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1661
+ return emit_current_tag(parser, output);
1662
+ case '\0':
1663
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1664
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1665
+ return NEXT_CHAR;
1666
+ case -1:
1667
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1668
+ abandon_current_tag(parser);
1669
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1670
+ return NEXT_CHAR;
1671
+ case '"':
1672
+ case '\'':
1673
+ case '<':
1674
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1675
+ // Fall through.
1676
+ default:
1677
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1678
+ return NEXT_CHAR;
1679
+ }
1680
+ }
1681
+
1682
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1683
+ static StateResult handle_after_attr_name_state(GumboParser* parser,
1684
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1685
+ switch (c) {
1686
+ case '\t':
1687
+ case '\n':
1688
+ case '\f':
1689
+ case ' ':
1690
+ return NEXT_CHAR;
1691
+ case '/':
1692
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1693
+ return NEXT_CHAR;
1694
+ case '=':
1695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1696
+ return NEXT_CHAR;
1697
+ case '>':
1698
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
+ return emit_current_tag(parser, output);
1700
+ case '\0':
1701
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1702
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1703
+ append_char_to_temporary_buffer(parser, 0xfffd);
1704
+ return NEXT_CHAR;
1705
+ case -1:
1706
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1707
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1708
+ abandon_current_tag(parser);
1709
+ return NEXT_CHAR;
1710
+ case '"':
1711
+ case '\'':
1712
+ case '<':
1713
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1714
+ // Fall through.
1715
+ default:
1716
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1717
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1718
+ return NEXT_CHAR;
1719
+ }
1720
+ }
1721
+
1722
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1723
+ static StateResult handle_before_attr_value_state(GumboParser* parser,
1724
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1725
+ switch (c) {
1726
+ case '\t':
1727
+ case '\n':
1728
+ case '\f':
1729
+ case ' ':
1730
+ return NEXT_CHAR;
1731
+ case '"':
1732
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1733
+ reset_tag_buffer_start_point(parser);
1734
+ return NEXT_CHAR;
1735
+ case '&':
1736
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1737
+ tokenizer->_reconsume_current_input = true;
1738
+ return NEXT_CHAR;
1739
+ case '\'':
1740
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1741
+ reset_tag_buffer_start_point(parser);
1742
+ return NEXT_CHAR;
1743
+ case '\0':
1744
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1745
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1746
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1747
+ return NEXT_CHAR;
1748
+ case -1:
1749
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1750
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1751
+ abandon_current_tag(parser);
1752
+ tokenizer->_reconsume_current_input = true;
1753
+ return NEXT_CHAR;
1754
+ case '>':
1755
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1756
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1757
+ emit_current_tag(parser, output);
1758
+ return RETURN_ERROR;
1759
+ case '<':
1760
+ case '=':
1761
+ case '`':
1762
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1763
+ // Fall through.
1764
+ default:
1765
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1766
+ append_char_to_tag_buffer(parser, c, true);
1767
+ return NEXT_CHAR;
1768
+ }
1769
+ }
1770
+
1771
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1772
+ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1774
+ switch (c) {
1775
+ case '"':
1776
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1777
+ return NEXT_CHAR;
1778
+ case '&':
1779
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1780
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1781
+ tokenizer->_reconsume_current_input = true;
1782
+ return NEXT_CHAR;
1783
+ case '\0':
1784
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1786
+ return NEXT_CHAR;
1787
+ case -1:
1788
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1789
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790
+ abandon_current_tag(parser);
1791
+ tokenizer->_reconsume_current_input = true;
1792
+ return NEXT_CHAR;
1793
+ default:
1794
+ append_char_to_tag_buffer(parser, c, false);
1795
+ return NEXT_CHAR;
1796
+ }
1797
+ }
1798
+
1799
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1800
+ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1802
+ switch (c) {
1803
+ case '\'':
1804
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1805
+ return NEXT_CHAR;
1806
+ case '&':
1807
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1808
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1809
+ tokenizer->_reconsume_current_input = true;
1810
+ return NEXT_CHAR;
1811
+ case '\0':
1812
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1813
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1814
+ return NEXT_CHAR;
1815
+ case -1:
1816
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1817
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1818
+ abandon_current_tag(parser);
1819
+ tokenizer->_reconsume_current_input = true;
1820
+ return NEXT_CHAR;
1821
+ default:
1822
+ append_char_to_tag_buffer(parser, c, false);
1823
+ return NEXT_CHAR;
1824
+ }
1825
+ }
1826
+
1827
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1828
+ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1830
+ switch (c) {
1831
+ case '\t':
1832
+ case '\n':
1833
+ case '\f':
1834
+ case ' ':
1835
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1836
+ finish_attribute_value(parser);
1837
+ return NEXT_CHAR;
1838
+ case '&':
1839
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1840
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1841
+ tokenizer->_reconsume_current_input = true;
1842
+ return NEXT_CHAR;
1843
+ case '>':
1844
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1845
+ finish_attribute_value(parser);
1846
+ return emit_current_tag(parser, output);
1847
+ case '\0':
1848
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1849
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1850
+ return NEXT_CHAR;
1851
+ case -1:
1852
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1853
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1854
+ tokenizer->_reconsume_current_input = true;
1855
+ abandon_current_tag(parser);
1856
+ return NEXT_CHAR;
1857
+ case '<':
1858
+ case '=':
1859
+ case '"':
1860
+ case '\'':
1861
+ case '`':
1862
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1863
+ // Fall through.
1864
+ default:
1865
+ append_char_to_tag_buffer(parser, c, true);
1866
+ return NEXT_CHAR;
1867
+ }
1868
+ }
1869
+
1870
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1871
+ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1873
+ OneOrTwoCodepoints char_ref;
1874
+ int allowed_char;
1875
+ bool is_unquoted = false;
1876
+ switch (tokenizer->_tag_state._attr_value_state) {
1877
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1878
+ allowed_char = '"';
1879
+ break;
1880
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1881
+ allowed_char = '\'';
1882
+ break;
1883
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1884
+ allowed_char = '>';
1885
+ is_unquoted = true;
1886
+ break;
1887
+ default:
1888
+ // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1889
+ // get that the assert(0) means this codepath will never happen.
1890
+ allowed_char = ' ';
1891
+ assert(0);
1892
+ }
1893
+
1894
+ // Ignore the status, since we don't have a convenient way of signalling that
1895
+ // a parser error has occurred when the error occurs in the middle of a
1896
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1897
+ // but that's a low priority fix.
1898
+ consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1899
+ if (char_ref.first != kGumboNoChar) {
1900
+ tokenizer->_reconsume_current_input = true;
1901
+ append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1902
+ if (char_ref.second != kGumboNoChar) {
1903
+ append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1904
+ }
1905
+ } else {
1906
+ append_char_to_tag_buffer(parser, '&', is_unquoted);
1907
+ }
1908
+ gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1909
+ return NEXT_CHAR;
1910
+ }
1911
+
1912
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1913
+ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1915
+ finish_attribute_value(parser);
1916
+ switch (c) {
1917
+ case '\t':
1918
+ case '\n':
1919
+ case '\f':
1920
+ case ' ':
1921
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1922
+ return NEXT_CHAR;
1923
+ case '/':
1924
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1925
+ return NEXT_CHAR;
1926
+ case '>':
1927
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1928
+ return emit_current_tag(parser, output);
1929
+ case -1:
1930
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1931
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1932
+ abandon_current_tag(parser);
1933
+ tokenizer->_reconsume_current_input = true;
1934
+ return NEXT_CHAR;
1935
+ default:
1936
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1937
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1938
+ tokenizer->_reconsume_current_input = true;
1939
+ return NEXT_CHAR;
1940
+ }
1941
+ }
1942
+
1943
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1944
+ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1946
+ switch (c) {
1947
+ case '>':
1948
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1949
+ tokenizer->_tag_state._is_self_closing = true;
1950
+ return emit_current_tag(parser, output);
1951
+ case -1:
1952
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1953
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1954
+ abandon_current_tag(parser);
1955
+ return NEXT_CHAR;
1956
+ default:
1957
+ tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1958
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1959
+ tokenizer->_reconsume_current_input = true;
1960
+ return NEXT_CHAR;
1961
+ }
1962
+ }
1963
+
1964
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
1965
+ static StateResult handle_bogus_comment_state(GumboParser* parser,
1966
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1967
+ while (c != '>' && c != -1) {
1968
+ if (c == '\0') {
1969
+ c = 0xFFFD;
1970
+ }
1971
+ append_char_to_temporary_buffer(parser, c);
1972
+ utf8iterator_next(&tokenizer->_input);
1973
+ c = utf8iterator_current(&tokenizer->_input);
1974
+ }
1975
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1976
+ return emit_comment(parser, output);
1977
+ }
1978
+
1979
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
1980
+ static StateResult handle_markup_declaration_state(GumboParser* parser,
1981
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982
+ if (utf8iterator_maybe_consume_match(
1983
+ &tokenizer->_input, "--", sizeof("--") - 1, true)) {
1984
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985
+ tokenizer->_reconsume_current_input = true;
1986
+ } else if (utf8iterator_maybe_consume_match(
1987
+ &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
1988
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989
+ tokenizer->_reconsume_current_input = true;
1990
+ // If we get here, we know we'll eventually emit a doctype token, so now is
1991
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
1992
+ // since then they'll leak if ownership never gets transferred to the
1993
+ // doctype token.
1994
+ tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995
+ tokenizer->_doc_type_state.public_identifier =
1996
+ gumbo_copy_stringz(parser, "");
1997
+ tokenizer->_doc_type_state.system_identifier =
1998
+ gumbo_copy_stringz(parser, "");
1999
+ } else if (tokenizer->_is_current_node_foreign &&
2000
+ utf8iterator_maybe_consume_match(
2001
+ &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2002
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003
+ tokenizer->_is_in_cdata = true;
2004
+ tokenizer->_reconsume_current_input = true;
2005
+ } else {
2006
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2007
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2008
+ tokenizer->_reconsume_current_input = true;
2009
+ clear_temporary_buffer(parser);
2010
+ }
2011
+ return NEXT_CHAR;
2012
+ }
2013
+
2014
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2015
+ static StateResult handle_comment_start_state(GumboParser* parser,
2016
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2017
+ switch (c) {
2018
+ case '-':
2019
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2020
+ return NEXT_CHAR;
2021
+ case '\0':
2022
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2024
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2025
+ return NEXT_CHAR;
2026
+ case '>':
2027
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2028
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2029
+ emit_comment(parser, output);
2030
+ return RETURN_ERROR;
2031
+ case -1:
2032
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2033
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2034
+ emit_comment(parser, output);
2035
+ return RETURN_ERROR;
2036
+ default:
2037
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2038
+ append_char_to_temporary_buffer(parser, c);
2039
+ return NEXT_CHAR;
2040
+ }
2041
+ }
2042
+
2043
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2044
+ static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2046
+ switch (c) {
2047
+ case '-':
2048
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2049
+ return NEXT_CHAR;
2050
+ case '\0':
2051
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2052
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2053
+ append_char_to_temporary_buffer(parser, '-');
2054
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2055
+ return NEXT_CHAR;
2056
+ case '>':
2057
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2058
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2059
+ emit_comment(parser, output);
2060
+ return RETURN_ERROR;
2061
+ case -1:
2062
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2063
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2064
+ emit_comment(parser, output);
2065
+ return RETURN_ERROR;
2066
+ default:
2067
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2068
+ append_char_to_temporary_buffer(parser, '-');
2069
+ append_char_to_temporary_buffer(parser, c);
2070
+ return NEXT_CHAR;
2071
+ }
2072
+ }
2073
+
2074
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2075
+ static StateResult handle_comment_state(GumboParser* parser,
2076
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077
+ switch (c) {
2078
+ case '-':
2079
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2080
+ return NEXT_CHAR;
2081
+ case '\0':
2082
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2083
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2084
+ return NEXT_CHAR;
2085
+ case -1:
2086
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2087
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2088
+ emit_comment(parser, output);
2089
+ return RETURN_ERROR;
2090
+ default:
2091
+ append_char_to_temporary_buffer(parser, c);
2092
+ return NEXT_CHAR;
2093
+ }
2094
+ }
2095
+
2096
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2097
+ static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2099
+ switch (c) {
2100
+ case '-':
2101
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2102
+ return NEXT_CHAR;
2103
+ case '\0':
2104
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2105
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2106
+ append_char_to_temporary_buffer(parser, '-');
2107
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2108
+ return NEXT_CHAR;
2109
+ case -1:
2110
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2111
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2112
+ emit_comment(parser, output);
2113
+ return RETURN_ERROR;
2114
+ default:
2115
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2116
+ append_char_to_temporary_buffer(parser, '-');
2117
+ append_char_to_temporary_buffer(parser, c);
2118
+ return NEXT_CHAR;
2119
+ }
2120
+ }
2121
+
2122
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2123
+ static StateResult handle_comment_end_state(GumboParser* parser,
2124
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2125
+ switch (c) {
2126
+ case '>':
2127
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2128
+ return emit_comment(parser, output);
2129
+ case '\0':
2130
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2131
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2132
+ append_char_to_temporary_buffer(parser, '-');
2133
+ append_char_to_temporary_buffer(parser, '-');
2134
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2135
+ return NEXT_CHAR;
2136
+ case '!':
2137
+ tokenizer_add_parse_error(
2138
+ parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2139
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2140
+ return NEXT_CHAR;
2141
+ case '-':
2142
+ tokenizer_add_parse_error(
2143
+ parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2144
+ append_char_to_temporary_buffer(parser, '-');
2145
+ return NEXT_CHAR;
2146
+ case -1:
2147
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2148
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2149
+ emit_comment(parser, output);
2150
+ return RETURN_ERROR;
2151
+ default:
2152
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2153
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154
+ append_char_to_temporary_buffer(parser, '-');
2155
+ append_char_to_temporary_buffer(parser, '-');
2156
+ append_char_to_temporary_buffer(parser, c);
2157
+ return NEXT_CHAR;
2158
+ }
2159
+ }
2160
+
2161
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2162
+ static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164
+ switch (c) {
2165
+ case '-':
2166
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2167
+ append_char_to_temporary_buffer(parser, '-');
2168
+ append_char_to_temporary_buffer(parser, '-');
2169
+ append_char_to_temporary_buffer(parser, '!');
2170
+ return NEXT_CHAR;
2171
+ case '>':
2172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173
+ return emit_comment(parser, output);
2174
+ case '\0':
2175
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177
+ append_char_to_temporary_buffer(parser, '-');
2178
+ append_char_to_temporary_buffer(parser, '-');
2179
+ append_char_to_temporary_buffer(parser, '!');
2180
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2181
+ return NEXT_CHAR;
2182
+ case -1:
2183
+ tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2184
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2185
+ emit_comment(parser, output);
2186
+ return RETURN_ERROR;
2187
+ default:
2188
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189
+ append_char_to_temporary_buffer(parser, '-');
2190
+ append_char_to_temporary_buffer(parser, '-');
2191
+ append_char_to_temporary_buffer(parser, '!');
2192
+ append_char_to_temporary_buffer(parser, c);
2193
+ return NEXT_CHAR;
2194
+ }
2195
+ }
2196
+
2197
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2198
+ static StateResult handle_doctype_state(GumboParser* parser,
2199
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2200
+ assert(!tokenizer->_temporary_buffer.length);
2201
+ switch (c) {
2202
+ case '\t':
2203
+ case '\n':
2204
+ case '\f':
2205
+ case ' ':
2206
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2207
+ return NEXT_CHAR;
2208
+ case -1:
2209
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2210
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2211
+ tokenizer->_doc_type_state.force_quirks = true;
2212
+ emit_doctype(parser, output);
2213
+ return RETURN_ERROR;
2214
+ default:
2215
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2217
+ tokenizer->_reconsume_current_input = true;
2218
+ tokenizer->_doc_type_state.force_quirks = true;
2219
+ return NEXT_CHAR;
2220
+ }
2221
+ }
2222
+
2223
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2224
+ static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2226
+ switch (c) {
2227
+ case '\t':
2228
+ case '\n':
2229
+ case '\f':
2230
+ case ' ':
2231
+ return NEXT_CHAR;
2232
+ case '\0':
2233
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2235
+ tokenizer->_doc_type_state.force_quirks = true;
2236
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2237
+ return NEXT_CHAR;
2238
+ case '>':
2239
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2240
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
+ tokenizer->_doc_type_state.force_quirks = true;
2242
+ emit_doctype(parser, output);
2243
+ return RETURN_ERROR;
2244
+ case -1:
2245
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2246
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2247
+ tokenizer->_doc_type_state.force_quirks = true;
2248
+ emit_doctype(parser, output);
2249
+ return RETURN_ERROR;
2250
+ default:
2251
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2252
+ tokenizer->_doc_type_state.force_quirks = false;
2253
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2254
+ return NEXT_CHAR;
2255
+ }
2256
+ }
2257
+
2258
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2259
+ static StateResult handle_doctype_name_state(GumboParser* parser,
2260
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2261
+ switch (c) {
2262
+ case '\t':
2263
+ case '\n':
2264
+ case '\f':
2265
+ case ' ':
2266
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2268
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269
+ return NEXT_CHAR;
2270
+ case '>':
2271
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2273
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274
+ emit_doctype(parser, output);
2275
+ return RETURN_SUCCESS;
2276
+ case '\0':
2277
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2279
+ return NEXT_CHAR;
2280
+ case -1:
2281
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283
+ tokenizer->_doc_type_state.force_quirks = true;
2284
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2285
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286
+ emit_doctype(parser, output);
2287
+ return RETURN_ERROR;
2288
+ default:
2289
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2290
+ tokenizer->_doc_type_state.force_quirks = false;
2291
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2292
+ return NEXT_CHAR;
2293
+ }
2294
+ }
2295
+
2296
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2297
+ static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2299
+ switch (c) {
2300
+ case '\t':
2301
+ case '\n':
2302
+ case '\f':
2303
+ case ' ':
2304
+ return NEXT_CHAR;
2305
+ case '>':
2306
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2307
+ emit_doctype(parser, output);
2308
+ return RETURN_SUCCESS;
2309
+ case -1:
2310
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2311
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2312
+ tokenizer->_doc_type_state.force_quirks = true;
2313
+ emit_doctype(parser, output);
2314
+ return RETURN_ERROR;
2315
+ default:
2316
+ if (utf8iterator_maybe_consume_match(
2317
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2318
+ gumbo_tokenizer_set_state(
2319
+ parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2320
+ tokenizer->_reconsume_current_input = true;
2321
+ } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2322
+ sizeof("SYSTEM") - 1, false)) {
2323
+ gumbo_tokenizer_set_state(
2324
+ parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2325
+ tokenizer->_reconsume_current_input = true;
2326
+ } else {
2327
+ tokenizer_add_parse_error(
2328
+ parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2329
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2330
+ tokenizer->_doc_type_state.force_quirks = true;
2331
+ }
2332
+ return NEXT_CHAR;
2333
+ }
2334
+ }
2335
+
2336
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2337
+ static StateResult handle_after_doctype_public_keyword_state(
2338
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339
+ GumboToken* output) {
2340
+ switch (c) {
2341
+ case '\t':
2342
+ case '\n':
2343
+ case '\f':
2344
+ case ' ':
2345
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2346
+ return NEXT_CHAR;
2347
+ case '"':
2348
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349
+ assert(temporary_buffer_equals(parser, ""));
2350
+ gumbo_tokenizer_set_state(
2351
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352
+ return NEXT_CHAR;
2353
+ case '\'':
2354
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355
+ assert(temporary_buffer_equals(parser, ""));
2356
+ gumbo_tokenizer_set_state(
2357
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358
+ return NEXT_CHAR;
2359
+ case '>':
2360
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2361
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2362
+ tokenizer->_doc_type_state.force_quirks = true;
2363
+ emit_doctype(parser, output);
2364
+ return RETURN_ERROR;
2365
+ case -1:
2366
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2367
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368
+ tokenizer->_doc_type_state.force_quirks = true;
2369
+ emit_doctype(parser, output);
2370
+ return RETURN_ERROR;
2371
+ default:
2372
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2374
+ tokenizer->_doc_type_state.force_quirks = true;
2375
+ emit_doctype(parser, output);
2376
+ return RETURN_ERROR;
2377
+ }
2378
+ }
2379
+
2380
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2381
+ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2383
+ switch (c) {
2384
+ case '\t':
2385
+ case '\n':
2386
+ case '\f':
2387
+ case ' ':
2388
+ return NEXT_CHAR;
2389
+ case '"':
2390
+ assert(temporary_buffer_equals(parser, ""));
2391
+ gumbo_tokenizer_set_state(
2392
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393
+ return NEXT_CHAR;
2394
+ case '\'':
2395
+ assert(temporary_buffer_equals(parser, ""));
2396
+ gumbo_tokenizer_set_state(
2397
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398
+ return NEXT_CHAR;
2399
+ case '>':
2400
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2401
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2402
+ tokenizer->_doc_type_state.force_quirks = true;
2403
+ emit_doctype(parser, output);
2404
+ return RETURN_ERROR;
2405
+ case -1:
2406
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2407
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2408
+ tokenizer->_doc_type_state.force_quirks = true;
2409
+ emit_doctype(parser, output);
2410
+ return RETURN_ERROR;
2411
+ default:
2412
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2413
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2414
+ tokenizer->_doc_type_state.force_quirks = true;
2415
+ emit_doctype(parser, output);
2416
+ return RETURN_ERROR;
2417
+ }
2418
+ }
2419
+
2420
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2421
+ static StateResult handle_doctype_public_id_double_quoted_state(
2422
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423
+ GumboToken* output) {
2424
+ switch (c) {
2425
+ case '"':
2426
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2427
+ finish_doctype_public_id(parser);
2428
+ return NEXT_CHAR;
2429
+ case '\0':
2430
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2431
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2432
+ return NEXT_CHAR;
2433
+ case '>':
2434
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2435
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2436
+ tokenizer->_doc_type_state.force_quirks = true;
2437
+ finish_doctype_public_id(parser);
2438
+ emit_doctype(parser, output);
2439
+ return RETURN_ERROR;
2440
+ case -1:
2441
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2442
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443
+ tokenizer->_doc_type_state.force_quirks = true;
2444
+ finish_doctype_public_id(parser);
2445
+ emit_doctype(parser, output);
2446
+ return RETURN_ERROR;
2447
+ default:
2448
+ append_char_to_temporary_buffer(parser, c);
2449
+ return NEXT_CHAR;
2450
+ }
2451
+ }
2452
+
2453
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2454
+ static StateResult handle_doctype_public_id_single_quoted_state(
2455
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456
+ GumboToken* output) {
2457
+ switch (c) {
2458
+ case '\'':
2459
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2460
+ finish_doctype_public_id(parser);
2461
+ return NEXT_CHAR;
2462
+ case '\0':
2463
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2464
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2465
+ return NEXT_CHAR;
2466
+ case '>':
2467
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2468
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2469
+ tokenizer->_doc_type_state.force_quirks = true;
2470
+ finish_doctype_public_id(parser);
2471
+ emit_doctype(parser, output);
2472
+ return RETURN_ERROR;
2473
+ case -1:
2474
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2475
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2476
+ tokenizer->_doc_type_state.force_quirks = true;
2477
+ finish_doctype_public_id(parser);
2478
+ emit_doctype(parser, output);
2479
+ return RETURN_ERROR;
2480
+ default:
2481
+ append_char_to_temporary_buffer(parser, c);
2482
+ return NEXT_CHAR;
2483
+ }
2484
+ }
2485
+
2486
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2487
+ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2489
+ switch (c) {
2490
+ case '\t':
2491
+ case '\n':
2492
+ case '\f':
2493
+ case ' ':
2494
+ gumbo_tokenizer_set_state(
2495
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2496
+ return NEXT_CHAR;
2497
+ case '>':
2498
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2499
+ emit_doctype(parser, output);
2500
+ return RETURN_SUCCESS;
2501
+ case '"':
2502
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503
+ assert(temporary_buffer_equals(parser, ""));
2504
+ gumbo_tokenizer_set_state(
2505
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506
+ return NEXT_CHAR;
2507
+ case '\'':
2508
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509
+ assert(temporary_buffer_equals(parser, ""));
2510
+ gumbo_tokenizer_set_state(
2511
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512
+ return NEXT_CHAR;
2513
+ case -1:
2514
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2515
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2516
+ tokenizer->_reconsume_current_input = true;
2517
+ tokenizer->_doc_type_state.force_quirks = true;
2518
+ emit_doctype(parser, output);
2519
+ return RETURN_ERROR;
2520
+ default:
2521
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2522
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2523
+ tokenizer->_doc_type_state.force_quirks = true;
2524
+ return NEXT_CHAR;
2525
+ }
2526
+ }
2527
+
2528
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2529
+ static StateResult handle_between_doctype_public_system_id_state(
2530
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531
+ GumboToken* output) {
2532
+ switch (c) {
2533
+ case '\t':
2534
+ case '\n':
2535
+ case '\f':
2536
+ case ' ':
2537
+ return NEXT_CHAR;
2538
+ case '>':
2539
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2540
+ emit_doctype(parser, output);
2541
+ return RETURN_SUCCESS;
2542
+ case '"':
2543
+ assert(temporary_buffer_equals(parser, ""));
2544
+ gumbo_tokenizer_set_state(
2545
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546
+ return NEXT_CHAR;
2547
+ case '\'':
2548
+ assert(temporary_buffer_equals(parser, ""));
2549
+ gumbo_tokenizer_set_state(
2550
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551
+ return NEXT_CHAR;
2552
+ case -1:
2553
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2554
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2555
+ tokenizer->_doc_type_state.force_quirks = true;
2556
+ emit_doctype(parser, output);
2557
+ return RETURN_ERROR;
2558
+ default:
2559
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2561
+ tokenizer->_doc_type_state.force_quirks = true;
2562
+ emit_doctype(parser, output);
2563
+ return RETURN_ERROR;
2564
+ }
2565
+ }
2566
+
2567
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2568
+ static StateResult handle_after_doctype_system_keyword_state(
2569
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570
+ GumboToken* output) {
2571
+ switch (c) {
2572
+ case '\t':
2573
+ case '\n':
2574
+ case '\f':
2575
+ case ' ':
2576
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2577
+ return NEXT_CHAR;
2578
+ case '"':
2579
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580
+ assert(temporary_buffer_equals(parser, ""));
2581
+ gumbo_tokenizer_set_state(
2582
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583
+ return NEXT_CHAR;
2584
+ case '\'':
2585
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586
+ assert(temporary_buffer_equals(parser, ""));
2587
+ gumbo_tokenizer_set_state(
2588
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589
+ return NEXT_CHAR;
2590
+ case '>':
2591
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2592
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2593
+ tokenizer->_doc_type_state.force_quirks = true;
2594
+ emit_doctype(parser, output);
2595
+ return RETURN_ERROR;
2596
+ case -1:
2597
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2598
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2599
+ tokenizer->_doc_type_state.force_quirks = true;
2600
+ emit_doctype(parser, output);
2601
+ return RETURN_ERROR;
2602
+ default:
2603
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2604
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2605
+ tokenizer->_doc_type_state.force_quirks = true;
2606
+ return NEXT_CHAR;
2607
+ }
2608
+ }
2609
+
2610
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2611
+ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2613
+ switch (c) {
2614
+ case '\t':
2615
+ case '\n':
2616
+ case '\f':
2617
+ case ' ':
2618
+ return NEXT_CHAR;
2619
+ case '"':
2620
+ assert(temporary_buffer_equals(parser, ""));
2621
+ gumbo_tokenizer_set_state(
2622
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623
+ return NEXT_CHAR;
2624
+ case '\'':
2625
+ assert(temporary_buffer_equals(parser, ""));
2626
+ gumbo_tokenizer_set_state(
2627
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628
+ return NEXT_CHAR;
2629
+ case '>':
2630
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2631
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2632
+ tokenizer->_doc_type_state.force_quirks = true;
2633
+ emit_doctype(parser, output);
2634
+ return RETURN_ERROR;
2635
+ case -1:
2636
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2637
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2638
+ tokenizer->_doc_type_state.force_quirks = true;
2639
+ emit_doctype(parser, output);
2640
+ return RETURN_ERROR;
2641
+ default:
2642
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2643
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2644
+ tokenizer->_doc_type_state.force_quirks = true;
2645
+ return NEXT_CHAR;
2646
+ }
2647
+ }
2648
+
2649
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2650
+ static StateResult handle_doctype_system_id_double_quoted_state(
2651
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652
+ GumboToken* output) {
2653
+ switch (c) {
2654
+ case '"':
2655
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2656
+ finish_doctype_system_id(parser);
2657
+ return NEXT_CHAR;
2658
+ case '\0':
2659
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2660
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2661
+ return NEXT_CHAR;
2662
+ case '>':
2663
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2664
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2665
+ tokenizer->_doc_type_state.force_quirks = true;
2666
+ finish_doctype_system_id(parser);
2667
+ emit_doctype(parser, output);
2668
+ return RETURN_ERROR;
2669
+ case -1:
2670
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2671
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2672
+ tokenizer->_doc_type_state.force_quirks = true;
2673
+ finish_doctype_system_id(parser);
2674
+ emit_doctype(parser, output);
2675
+ return RETURN_ERROR;
2676
+ default:
2677
+ append_char_to_temporary_buffer(parser, c);
2678
+ return NEXT_CHAR;
2679
+ }
2680
+ }
2681
+
2682
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2683
+ static StateResult handle_doctype_system_id_single_quoted_state(
2684
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685
+ GumboToken* output) {
2686
+ switch (c) {
2687
+ case '\'':
2688
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2689
+ finish_doctype_system_id(parser);
2690
+ return NEXT_CHAR;
2691
+ case '\0':
2692
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2693
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2694
+ return NEXT_CHAR;
2695
+ case '>':
2696
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2697
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2698
+ tokenizer->_doc_type_state.force_quirks = true;
2699
+ finish_doctype_system_id(parser);
2700
+ emit_doctype(parser, output);
2701
+ return RETURN_ERROR;
2702
+ case -1:
2703
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2704
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2705
+ tokenizer->_doc_type_state.force_quirks = true;
2706
+ finish_doctype_system_id(parser);
2707
+ emit_doctype(parser, output);
2708
+ return RETURN_ERROR;
2709
+ default:
2710
+ append_char_to_temporary_buffer(parser, c);
2711
+ return NEXT_CHAR;
2712
+ }
2713
+ }
2714
+
2715
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2716
+ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2718
+ switch (c) {
2719
+ case '\t':
2720
+ case '\n':
2721
+ case '\f':
2722
+ case ' ':
2723
+ return NEXT_CHAR;
2724
+ case '>':
2725
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2726
+ emit_doctype(parser, output);
2727
+ return RETURN_SUCCESS;
2728
+ case -1:
2729
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2730
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2731
+ tokenizer->_doc_type_state.force_quirks = true;
2732
+ emit_doctype(parser, output);
2733
+ return RETURN_ERROR;
2734
+ default:
2735
+ tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2736
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2737
+ return NEXT_CHAR;
2738
+ }
2739
+ }
2740
+
2741
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2742
+ static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2744
+ if (c == '>' || c == -1) {
2745
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746
+ emit_doctype(parser, output);
2747
+ return RETURN_ERROR;
2748
+ }
2749
+ return NEXT_CHAR;
2750
+ }
2751
+
2752
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2753
+ static StateResult handle_cdata_state(GumboParser* parser,
2754
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755
+ if (c == -1 || utf8iterator_maybe_consume_match(
2756
+ &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757
+ tokenizer->_reconsume_current_input = true;
2758
+ reset_token_start_point(tokenizer);
2759
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2760
+ tokenizer->_is_in_cdata = false;
2761
+ return NEXT_CHAR;
2762
+ } else {
2763
+ return emit_current_char(parser, output);
2764
+ }
2765
+ }
2766
+
2767
+ typedef StateResult (*GumboLexerStateFunction)(
2768
+ GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769
+
2770
+ static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771
+ handle_char_ref_in_data_state, handle_rcdata_state,
2772
+ handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773
+ handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774
+ handle_tag_name_state, handle_rcdata_lt_state,
2775
+ handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776
+ handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777
+ handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778
+ handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779
+ handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780
+ handle_script_escaped_state, handle_script_escaped_dash_state,
2781
+ handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782
+ handle_script_escaped_end_tag_open_state,
2783
+ handle_script_escaped_end_tag_name_state,
2784
+ handle_script_double_escaped_start_state,
2785
+ handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786
+ handle_script_double_escaped_dash_dash_state,
2787
+ handle_script_double_escaped_lt_state,
2788
+ handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789
+ handle_attr_name_state, handle_after_attr_name_state,
2790
+ handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791
+ handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792
+ handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793
+ handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794
+ handle_markup_declaration_state, handle_comment_start_state,
2795
+ handle_comment_start_dash_state, handle_comment_state,
2796
+ handle_comment_end_dash_state, handle_comment_end_state,
2797
+ handle_comment_end_bang_state, handle_doctype_state,
2798
+ handle_before_doctype_name_state, handle_doctype_name_state,
2799
+ handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800
+ handle_before_doctype_public_id_state,
2801
+ handle_doctype_public_id_double_quoted_state,
2802
+ handle_doctype_public_id_single_quoted_state,
2803
+ handle_after_doctype_public_id_state,
2804
+ handle_between_doctype_public_system_id_state,
2805
+ handle_after_doctype_system_keyword_state,
2806
+ handle_before_doctype_system_id_state,
2807
+ handle_doctype_system_id_double_quoted_state,
2808
+ handle_doctype_system_id_single_quoted_state,
2809
+ handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810
+ handle_cdata_state};
2811
+
2812
+ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2813
+ // Because of the spec requirements that...
2814
+ //
2815
+ // 1. Tokens be handled immediately by the parser upon emission.
2816
+ // 2. Some states (eg. CDATA, or various error conditions) require the
2817
+ // emission of multiple tokens in the same states.
2818
+ // 3. The tokenizer often has to reconsume the same character in a different
2819
+ // state.
2820
+ //
2821
+ // ...all state must be held in the GumboTokenizer struct instead of in local
2822
+ // variables in this function. That allows us to return from this method with
2823
+ // a token, and then immediately jump back to the same state with the same
2824
+ // input if we need to return a different token. The various emit_* functions
2825
+ // are responsible for changing state (eg. flushing the chardata buffer,
2826
+ // reading the next input character) to avoid an infinite loop.
2827
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2828
+
2829
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2830
+ tokenizer->_reconsume_current_input = true;
2831
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
2832
+ // And now that we've avoided advancing the input, make sure we set
2833
+ // _reconsume_current_input back to false to make sure the *next* character
2834
+ // isn't consumed twice.
2835
+ tokenizer->_reconsume_current_input = false;
2836
+ tokenizer->_buffered_emit_char = kGumboNoChar;
2837
+ return true;
2838
+ }
2839
+
2840
+ if (maybe_emit_from_temporary_buffer(parser, output)) {
2841
+ return true;
2842
+ }
2843
+
2844
+ while (1) {
2845
+ assert(!tokenizer->_temporary_buffer_emit);
2846
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2847
+ int c = utf8iterator_current(&tokenizer->_input);
2848
+ gumbo_debug(
2849
+ "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
2850
+ StateResult result =
2851
+ dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2852
+ // We need to clear reconsume_current_input before returning to prevent
2853
+ // certain infinite loop states.
2854
+ bool should_advance = !tokenizer->_reconsume_current_input;
2855
+ tokenizer->_reconsume_current_input = false;
2856
+
2857
+ if (result == RETURN_SUCCESS) {
2858
+ return true;
2859
+ } else if (result == RETURN_ERROR) {
2860
+ return false;
2861
+ }
2862
+
2863
+ if (should_advance) {
2864
+ utf8iterator_next(&tokenizer->_input);
2865
+ }
2866
+ }
2867
+ }
2868
+
2869
+ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2870
+ if (!token) return;
2871
+
2872
+ switch (token->type) {
2873
+ case GUMBO_TOKEN_DOCTYPE:
2874
+ gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2875
+ gumbo_parser_deallocate(
2876
+ parser, (void*) token->v.doc_type.public_identifier);
2877
+ gumbo_parser_deallocate(
2878
+ parser, (void*) token->v.doc_type.system_identifier);
2879
+ return;
2880
+ case GUMBO_TOKEN_START_TAG:
2881
+ for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2882
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2883
+ if (attr) {
2884
+ // May have been nulled out if this token was merged with another.
2885
+ gumbo_destroy_attribute(parser, attr);
2886
+ }
2887
+ }
2888
+ gumbo_parser_deallocate(
2889
+ parser, (void*) token->v.start_tag.attributes.data);
2890
+ return;
2891
+ case GUMBO_TOKEN_COMMENT:
2892
+ gumbo_parser_deallocate(parser, (void*) token->v.text);
2893
+ return;
2894
+ default:
2895
+ return;
2896
+ }
2897
+ }