nokogumbo 1.4.7 → 1.4.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,2897 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Coding conventions specific to this file:
18
- //
19
- // 1. Functions that fill in a token should be named emit_*, and should be
20
- // followed immediately by a return from the tokenizer (true if no error
21
- // occurred, false if an error occurred). Sometimes the emit functions
22
- // themselves return a boolean so that they can be combined with the return
23
- // statement; in this case, they should match this convention.
24
- // 2. Functions that shuffle data from temporaries to final API structures
25
- // should be named finish_*, and be called just before the tokenizer exits the
26
- // state that accumulates the temporary.
27
- // 3. All internal data structures should be kept in an initialized state from
28
- // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
- // and reset, it should be deallocated and immediately reinitialized.
30
- // 4. Make sure there are appropriate break statements following each state.
31
- // 5. Assertions on the state of the temporary and tag buffers are usually a
32
- // good idea, and should go at the entry point of each state when added.
33
- // 6. Statement order within states goes:
34
- // 1. Add parse errors, if appropriate.
35
- // 2. Call finish_* functions to build up tag state.
36
- // 3. Switch to new state. Set _reconsume flag if appropriate.
37
- // 4. Perform any other temporary buffer manipulation.
38
- // 5. Emit tokens
39
- // 6. Return/break.
40
- // This order ensures that we can verify that every emit is followed by a
41
- // return, ensures that the correct state is recorded with any parse errors, and
42
- // prevents parse error position from being messed up by possible mark/resets in
43
- // temporary buffer manipulation.
44
-
45
- #include "tokenizer.h"
46
-
47
- #include <assert.h>
48
- #include <stdbool.h>
49
- #include <string.h>
50
-
51
- #include "attribute.h"
52
- #include "char_ref.h"
53
- #include "error.h"
54
- #include "gumbo.h"
55
- #include "parser.h"
56
- #include "string_buffer.h"
57
- #include "string_piece.h"
58
- #include "token_type.h"
59
- #include "tokenizer_states.h"
60
- #include "utf8.h"
61
- #include "util.h"
62
- #include "vector.h"
63
-
64
- // Compared against _script_data_buffer to determine if we're in double-escaped
65
- // script mode.
66
- const GumboStringPiece kScriptTag = {"script", 6};
67
-
68
// The outcome reported by each individual state handler.
typedef enum {
  RETURN_ERROR = 0,    // Return false (error) from the tokenizer.
  RETURN_SUCCESS = 1,  // Return true (success) from the tokenizer.
  NEXT_CHAR = 2        // Proceed to the next character and continue lexing.
} StateResult;
74
-
75
- // This is a struct containing state necessary to build up a tag token,
76
- // character by character.
77
- typedef struct GumboInternalTagState {
78
- // A buffer to accumulate characters for various GumboStringPiece fields.
79
- GumboStringBuffer _buffer;
80
-
81
- // A pointer to the start of the original text corresponding to the contents
82
- // of the buffer.
83
- const char* _original_text;
84
-
85
- // The current tag enum, computed once the tag name state has finished so that
86
- // the buffer can be re-used for building up attributes.
87
- GumboTag _tag;
88
-
89
- // The starting location of the text in the buffer.
90
- GumboSourcePosition _start_pos;
91
-
92
- // The current list of attributes. This is copied (and ownership of its data
93
- // transferred) to the GumboStartTag token upon completion of the tag. New
94
- // attributes are added as soon as their attribute name state is complete, and
95
- // values are filled in by operating on _attributes.data[attributes.length-1].
96
- GumboVector /* GumboAttribute */ _attributes;
97
-
98
- // If true, the next attribute value to be finished should be dropped. This
99
- // happens if a duplicate attribute name is encountered - we want to consume
100
- // the attribute value, but shouldn't overwrite the existing value.
101
- bool _drop_next_attr_value;
102
-
103
- // The state that caused the tokenizer to switch into a character reference in
104
- // attribute value state. This is used to set the additional allowed
105
- // character, and is switched back to on completion. Initialized as the
106
- // tokenizer enters the character reference state.
107
- GumboTokenizerEnum _attr_value_state;
108
-
109
- // The last start tag to have been emitted by the tokenizer. This is
110
- // necessary to check for appropriate end tags.
111
- GumboTag _last_start_tag;
112
-
113
- // If true, then this is a start tag. If false, it's an end tag. This is
114
- // necessary to generate the appropriate token type at tag-closing time.
115
- bool _is_start_tag;
116
-
117
- // If true, then this tag is "self-closing" and doesn't have an end tag.
118
- bool _is_self_closing;
119
- } GumboTagState;
120
-
121
- // This is the main tokenizer state struct, containing all state used by in
122
- // tokenizing the input stream.
123
- typedef struct GumboInternalTokenizerState {
124
- // The current lexer state. Starts in GUMBO_LEX_DATA.
125
- GumboTokenizerEnum _state;
126
-
127
- // A flag indicating whether the current input character needs to reconsumed
128
- // in another state, or whether the next input character should be read for
129
- // the next iteration of the state loop. This is set when the spec reads
130
- // "Reconsume the current input character in..."
131
- bool _reconsume_current_input;
132
-
133
- // A flag indicating whether the current node is a foreign element. This is
134
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
135
- // markup declaration state.
136
- bool _is_current_node_foreign;
137
-
138
- // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139
- // text tokens emitted will be GUMBO_TOKEN_CDATA.
140
- bool _is_in_cdata;
141
-
142
- // Certain states (notably character references) may emit two character tokens
143
- // at once, but the contract for lex() fills in only one token at a time. The
144
- // extra character is buffered here, and then this is checked on entry to
145
- // lex(). If a character is stored here, it's immediately emitted and control
146
- // returns from the lexer. kGumboNoChar is used to represent 'no character
147
- // stored.'
148
- //
149
- // Note that characters emitted through this mechanism will have their source
150
- // position marked as the character under the mark, i.e. multiple characters
151
- // may be emitted with the same position. This is desirable for character
152
- // references, but unsuitable for many other cases. Use the _temporary_buffer
153
- // mechanism if the buffered characters must have their original positions in
154
- // the document.
155
- int _buffered_emit_char;
156
-
157
- // A temporary buffer to accumulate characters, as described by the "temporary
158
- // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
159
- // way: we record the specific character to go into the buffer, which may
160
- // sometimes be a lowercased version of the actual input character. However,
161
- // we *also* use utf8iterator_mark() to record the position at tag start.
162
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
163
- // to the start of it, and then increment it for each call to the tokenizer.
164
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
165
- // input stream, so that tokens emitted by emit_char have the correct position
166
- // and original text.
167
- GumboStringBuffer _temporary_buffer;
168
-
169
- // The current cursor position we're emitting from within
170
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
171
- const char* _temporary_buffer_emit;
172
-
173
- // The temporary buffer is also used by the spec to check whether we should
174
- // enter the script data double escaped state, but we can't use the same
175
- // buffer for both because we have to flush out "<s" as emits while still
176
- // maintaining the context that will eventually become "script". This is a
177
- // separate buffer that's used in place of the temporary buffer for states
178
- // that may enter the script data double escape start state.
179
- GumboStringBuffer _script_data_buffer;
180
-
181
- // Pointer to the beginning of the current token in the original buffer; used
182
- // to record the original text.
183
- const char* _token_start;
184
-
185
- // GumboSourcePosition recording the source location of the start of the
186
- // current token.
187
- GumboSourcePosition _token_start_pos;
188
-
189
- // Current tag state.
190
- GumboTagState _tag_state;
191
-
192
- // Doctype state. We use the temporary buffer to accumulate characters (it's
193
- // not used for anything else in the doctype states), and then freshly
194
- // allocate the strings in the doctype token, then copy it over on emit.
195
- GumboTokenDocType _doc_type_state;
196
-
197
- // The UTF8Iterator over the tokenizer input.
198
- Utf8Iterator _input;
199
- } GumboTokenizerState;
200
-
201
- // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
202
- static void tokenizer_add_parse_error(
203
- GumboParser* parser, GumboErrorType type) {
204
- GumboError* error = gumbo_add_error(parser);
205
- if (!error) {
206
- return;
207
- }
208
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
209
- utf8iterator_get_position(&tokenizer->_input, &error->position);
210
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
211
- error->type = type;
212
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
213
- switch (tokenizer->_state) {
214
- case GUMBO_LEX_DATA:
215
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
216
- break;
217
- case GUMBO_LEX_CHAR_REF_IN_DATA:
218
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
219
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
220
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
221
- break;
222
- case GUMBO_LEX_RCDATA:
223
- case GUMBO_LEX_RCDATA_LT:
224
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
225
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
226
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
227
- break;
228
- case GUMBO_LEX_RAWTEXT:
229
- case GUMBO_LEX_RAWTEXT_LT:
230
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
231
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
232
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
233
- break;
234
- case GUMBO_LEX_PLAINTEXT:
235
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
236
- break;
237
- case GUMBO_LEX_SCRIPT:
238
- case GUMBO_LEX_SCRIPT_LT:
239
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
240
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
241
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
242
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
243
- case GUMBO_LEX_SCRIPT_ESCAPED:
244
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
245
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
246
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
247
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
248
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
249
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
250
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
251
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
252
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
253
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
254
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
255
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
256
- break;
257
- case GUMBO_LEX_TAG_OPEN:
258
- case GUMBO_LEX_END_TAG_OPEN:
259
- case GUMBO_LEX_TAG_NAME:
260
- case GUMBO_LEX_BEFORE_ATTR_NAME:
261
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
262
- break;
263
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
264
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
265
- break;
266
- case GUMBO_LEX_ATTR_NAME:
267
- case GUMBO_LEX_AFTER_ATTR_NAME:
268
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
269
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
270
- break;
271
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
272
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
273
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
274
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
275
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
276
- break;
277
- case GUMBO_LEX_BOGUS_COMMENT:
278
- case GUMBO_LEX_COMMENT_START:
279
- case GUMBO_LEX_COMMENT_START_DASH:
280
- case GUMBO_LEX_COMMENT:
281
- case GUMBO_LEX_COMMENT_END_DASH:
282
- case GUMBO_LEX_COMMENT_END:
283
- case GUMBO_LEX_COMMENT_END_BANG:
284
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
285
- break;
286
- case GUMBO_LEX_MARKUP_DECLARATION:
287
- case GUMBO_LEX_DOCTYPE:
288
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
289
- case GUMBO_LEX_DOCTYPE_NAME:
290
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
291
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
292
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
293
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
294
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
295
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
296
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
297
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
298
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
299
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
300
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
301
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
302
- case GUMBO_LEX_BOGUS_DOCTYPE:
303
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
304
- break;
305
- case GUMBO_LEX_CDATA:
306
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
307
- break;
308
- }
309
- }
310
-
311
// Returns true if c is an ASCII letter (A-Z or a-z).
// The ISO C isupper/islower functions are avoided on purpose: they depend on
// the program's locale, whereas the HTML5 spec behaves the same in every
// locale.
static bool is_alpha(int c) {
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return c >= 'A' && c <= 'Z';
}
317
-
318
// Maps ASCII uppercase letters to their lowercase form; every other value is
// returned unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + ('a' - 'A');
}
321
-
322
- static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323
- if (is_in_cdata && c > 0) {
324
- return GUMBO_TOKEN_CDATA;
325
- }
326
-
327
- switch (c) {
328
- case '\t':
329
- case '\n':
330
- case '\r':
331
- case '\f':
332
- case ' ':
333
- return GUMBO_TOKEN_WHITESPACE;
334
- case 0:
335
- gumbo_debug("Emitted null byte.\n");
336
- return GUMBO_TOKEN_NULL;
337
- case -1:
338
- return GUMBO_TOKEN_EOF;
339
- default:
340
- return GUMBO_TOKEN_CHARACTER;
341
- }
342
- }
343
-
344
- // Starts recording characters in the temporary buffer.
345
- // Because this needs to reset the utf8iterator_mark to the beginning of the
346
- // text that will eventually be emitted, it needs to be called a couple of
347
- // states before the spec says "Set the temporary buffer to the empty string".
348
- // In general, this should be called whenever there's a transition to a
349
- // "less-than sign state". The initial < and possibly / then need to be
350
- // appended to the temporary buffer, their presence needs to be accounted for in
351
- // states that compare the temporary buffer against a literal value, and
352
- // spec stanzas that say "emit a < and / character token along with a character
353
- // token for each character in the temporary buffer" need to be adjusted to
354
- // account for the presence of the < and / inside the temporary buffer.
355
- static void clear_temporary_buffer(GumboParser* parser) {
356
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
- assert(!tokenizer->_temporary_buffer_emit);
358
- utf8iterator_mark(&tokenizer->_input);
359
- gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
360
- // The temporary buffer and script data buffer are the same object in the
361
- // spec, so the script data buffer should be cleared as well.
362
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
363
- }
364
-
365
- // Appends a codepoint to the temporary buffer.
366
- static void append_char_to_temporary_buffer(
367
- GumboParser* parser, int codepoint) {
368
- gumbo_string_buffer_append_codepoint(
369
- parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
370
- }
371
-
372
- // Checks to see if the temporary buffer equals a certain string.
373
- // Make sure this remains side-effect free; it's used in assertions.
374
- #ifndef NDEBUG
375
- static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
376
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
377
- // TODO(jdtang): See if the extra strlen is a performance problem, and replace
378
- // it with an explicit sizeof(literal) if necessary. I don't think it will
379
- // be, as this is only used in a couple of rare states.
380
- int text_len = strlen(text);
381
- return text_len == buffer->length &&
382
- memcmp(buffer->data, text, text_len) == 0;
383
- }
384
- #endif
385
-
386
- static void doc_type_state_init(GumboParser* parser) {
387
- GumboTokenDocType* doc_type_state =
388
- &parser->_tokenizer_state->_doc_type_state;
389
- // We initialize these to NULL here so that we don't end up leaking memory if
390
- // we never see a doctype token. When we do see a doctype token, we reset
391
- // them to a freshly-allocated empty string so that we can present a uniform
392
- // interface to client code and not make them check for null. Ownership is
393
- // transferred to the doctype token when it's emitted.
394
- doc_type_state->name = NULL;
395
- doc_type_state->public_identifier = NULL;
396
- doc_type_state->system_identifier = NULL;
397
- doc_type_state->force_quirks = false;
398
- doc_type_state->has_public_identifier = false;
399
- doc_type_state->has_system_identifier = false;
400
- }
401
-
402
- // Sets the token original_text and position to the current iterator position.
403
- // This is necessary because [CDATA[ sections may include text that is ignored
404
- // by the tokenizer.
405
- static void reset_token_start_point(GumboTokenizerState* tokenizer) {
406
- tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
407
- utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
408
- }
409
-
410
- // Sets the tag buffer original text and start point to the current iterator
411
- // position. This is necessary because attribute names & values may have
412
- // whitespace preceeding them, and so we can't assume that the actual token
413
- // starting point was the end of the last tag buffer usage.
414
- static void reset_tag_buffer_start_point(GumboParser* parser) {
415
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
416
- GumboTagState* tag_state = &tokenizer->_tag_state;
417
-
418
- utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
419
- tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
420
- }
421
-
422
- // Moves the temporary buffer contents over to the specified output string,
423
- // and clears the temporary buffer.
424
- static void finish_temporary_buffer(GumboParser* parser, const char** output) {
425
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
426
- *output =
427
- gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
428
- clear_temporary_buffer(parser);
429
- }
430
-
431
- // Advances the iterator past the end of the token, and then fills in the
432
- // relevant position fields. It's assumed that after every emit, the tokenizer
433
- // will immediately return (letting the tree-construction stage read the filled
434
- // in Token). Thus, it's safe to advance the input stream here, since it will
435
- // bypass the advance at the bottom of the state machine loop.
436
- //
437
- // Since this advances the iterator and resets the current input, make sure to
438
- // call it after you've recorded any other data you need for the token.
439
- static void finish_token(GumboParser* parser, GumboToken* token) {
440
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
441
- if (!tokenizer->_reconsume_current_input) {
442
- utf8iterator_next(&tokenizer->_input);
443
- }
444
-
445
- token->position = tokenizer->_token_start_pos;
446
- token->original_text.data = tokenizer->_token_start;
447
- reset_token_start_point(tokenizer);
448
- token->original_text.length =
449
- tokenizer->_token_start - token->original_text.data;
450
- if (token->original_text.length > 0 &&
451
- token->original_text.data[token->original_text.length - 1] == '\r') {
452
- // The UTF8 iterator will ignore carriage returns in the input stream, which
453
- // means that the next token may start one past a \r character. The pointer
454
- // arithmetic above results in that \r being appended to the original text
455
- // of the preceding token, so we have to adjust its length here to chop the
456
- // \r off.
457
- --token->original_text.length;
458
- }
459
- }
460
-
461
- // Records the doctype public ID, assumed to be in the temporary buffer.
462
- // Convenience method that also sets has_public_identifier to true.
463
- static void finish_doctype_public_id(GumboParser* parser) {
464
- GumboTokenDocType* doc_type_state =
465
- &parser->_tokenizer_state->_doc_type_state;
466
- gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
467
- finish_temporary_buffer(parser, &doc_type_state->public_identifier);
468
- doc_type_state->has_public_identifier = true;
469
- }
470
-
471
- // Records the doctype system ID, assumed to be in the temporary buffer.
472
- // Convenience method that also sets has_system_identifier to true.
473
- static void finish_doctype_system_id(GumboParser* parser) {
474
- GumboTokenDocType* doc_type_state =
475
- &parser->_tokenizer_state->_doc_type_state;
476
- gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
477
- finish_temporary_buffer(parser, &doc_type_state->system_identifier);
478
- doc_type_state->has_system_identifier = true;
479
- }
480
-
481
- // Writes a single specified character to the output token.
482
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
483
- output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
484
- output->v.character = c;
485
- finish_token(parser, output);
486
- }
487
-
488
- // Writes a replacement character token and records a parse error.
489
- // Always returns RETURN_ERROR, per gumbo_lex return value.
490
- static StateResult emit_replacement_char(
491
- GumboParser* parser, GumboToken* output) {
492
- // In all cases, this is because of a null byte in the input stream.
493
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
494
- emit_char(parser, kUtf8ReplacementChar, output);
495
- return RETURN_ERROR;
496
- }
497
-
498
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
499
- static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
500
- emit_char(parser, -1, output);
501
- return RETURN_SUCCESS;
502
- }
503
-
504
- // Writes the current input character out as a character token.
505
- // Always returns RETURN_SUCCESS.
506
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
507
- emit_char(
508
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
509
- return RETURN_SUCCESS;
510
- }
511
-
512
- // Writes out a doctype token, copying it from the tokenizer state.
513
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
514
- output->type = GUMBO_TOKEN_DOCTYPE;
515
- output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
516
- finish_token(parser, output);
517
- doc_type_state_init(parser);
518
- }
519
-
520
- // Debug-only function that explicitly sets the attribute vector data to NULL so
521
- // it can be asserted on tag creation, verifying that there are no memory leaks.
522
- static void mark_tag_state_as_empty(GumboTagState* tag_state) {
523
- #ifndef NDEBUG
524
- tag_state->_attributes = kGumboEmptyVector;
525
- #endif
526
- }
527
-
528
- // Writes out the current tag as a start or end tag token.
529
- // Always returns RETURN_SUCCESS.
530
- static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
531
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
532
- if (tag_state->_is_start_tag) {
533
- output->type = GUMBO_TOKEN_START_TAG;
534
- output->v.start_tag.tag = tag_state->_tag;
535
- output->v.start_tag.attributes = tag_state->_attributes;
536
- output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
537
- tag_state->_last_start_tag = tag_state->_tag;
538
- mark_tag_state_as_empty(tag_state);
539
- gumbo_debug(
540
- "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
541
- } else {
542
- output->type = GUMBO_TOKEN_END_TAG;
543
- output->v.end_tag = tag_state->_tag;
544
- // In end tags, ownership of the attributes vector is not transferred to the
545
- // token, but it's still initialized as normal, so it must be manually
546
- // deallocated. There may also be attributes to destroy, in certain broken
547
- // cases like </div</th> (the "th" is an attribute there).
548
- for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
549
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
550
- }
551
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
552
- mark_tag_state_as_empty(tag_state);
553
- gumbo_debug(
554
- "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
555
- }
556
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
557
- finish_token(parser, output);
558
- gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559
- output->original_text.data);
560
- assert(output->original_text.length >= 2);
561
- assert(output->original_text.data[0] == '<');
562
- assert(output->original_text.data[output->original_text.length - 1] == '>');
563
- return RETURN_SUCCESS;
564
- }
565
-
566
- // In some states, we speculatively start a tag, but don't know whether it'll be
567
- // emitted as tag token or as a series of character tokens until we finish it.
568
- // We need to abandon the tag we'd started & free its memory in that case to
569
- // avoid a memory leak.
570
- static void abandon_current_tag(GumboParser* parser) {
571
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
572
- for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
573
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
574
- }
575
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
576
- mark_tag_state_as_empty(tag_state);
577
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
578
- gumbo_debug("Abandoning current tag.\n");
579
- }
580
-
581
- // Wraps the consume_char_ref function to handle its output and make the
582
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
583
- // error occurred, RETURN_SUCCESS otherwise.
584
- static StateResult emit_char_ref(GumboParser* parser,
585
- int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
586
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
587
- OneOrTwoCodepoints char_ref;
588
- bool status = consume_char_ref(
589
- parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
590
- if (char_ref.first != kGumboNoChar) {
591
- // consume_char_ref ends with the iterator pointing at the next character,
592
- // so we need to be sure not advance it again before reading the next token.
593
- tokenizer->_reconsume_current_input = true;
594
- emit_char(parser, char_ref.first, output);
595
- tokenizer->_buffered_emit_char = char_ref.second;
596
- } else {
597
- emit_char(parser, '&', output);
598
- }
599
- return status ? RETURN_SUCCESS : RETURN_ERROR;
600
- }
601
-
602
- // Emits a comment token. Comments use the temporary buffer to accumulate their
603
- // data, and then it's copied over and released to the 'text' field of the
604
- // GumboToken union. Always returns RETURN_SUCCESS.
605
- static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
606
- output->type = GUMBO_TOKEN_COMMENT;
607
- finish_temporary_buffer(parser, &output->v.text);
608
- finish_token(parser, output);
609
- return RETURN_SUCCESS;
610
- }
611
-
612
- // Checks to see we should be flushing accumulated characters in the temporary
613
- // buffer, and fills the output token with the next output character if so.
614
- // Returns true if a character has been emitted and the tokenizer should
615
- // immediately return, false if we're at the end of the temporary buffer and
616
- // should resume normal operation.
617
- static bool maybe_emit_from_temporary_buffer(
618
- GumboParser* parser, GumboToken* output) {
619
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
620
- const char* c = tokenizer->_temporary_buffer_emit;
621
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
622
-
623
- if (!c || c >= buffer->data + buffer->length) {
624
- tokenizer->_temporary_buffer_emit = NULL;
625
- return false;
626
- }
627
-
628
- assert(*c == utf8iterator_current(&tokenizer->_input));
629
- // emit_char also advances the input stream. We need to do some juggling of
630
- // the _reconsume_current_input flag to get the proper behavior when emitting
631
- // previous tokens. Basically, _reconsume_current_input should *never* be set
632
- // when emitting anything from the temporary buffer, since those characters
633
- // have already been advanced past. However, it should be preserved so that
634
- // when the *next* character is encountered again, the tokenizer knows not to
635
- // advance past it.
636
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
637
- tokenizer->_reconsume_current_input = false;
638
- emit_char(parser, *c, output);
639
- ++tokenizer->_temporary_buffer_emit;
640
- tokenizer->_reconsume_current_input = saved_reconsume_state;
641
- return true;
642
- }
643
-
644
- // Sets up the tokenizer to begin flushing the temporary buffer.
645
- // This resets the input iterator stream to the start of the last tag, sets up
646
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
647
- // the first character in it. It returns true if a character was emitted, false
648
- // otherwise.
649
- static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
650
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
651
- assert(tokenizer->_temporary_buffer.data);
652
- utf8iterator_reset(&tokenizer->_input);
653
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
654
- return maybe_emit_from_temporary_buffer(parser, output);
655
- }
656
-
657
- // Appends a codepoint to the current tag buffer. If
658
- // reinitilize_position_on_first is set, this also initializes the tag buffer
659
- // start point; the only time you would *not* want to pass true for this
660
- // parameter is if you want the original_text to include character (like an
661
- // opening quote) that doesn't appear in the value.
662
- static void append_char_to_tag_buffer(
663
- GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
664
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
665
- if (buffer->length == 0 && reinitilize_position_on_first) {
666
- reset_tag_buffer_start_point(parser);
667
- }
668
- gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
669
- }
670
-
671
- // (Re-)initialize the tag buffer. This also resets the original_text pointer
672
- // and _start_pos field to point to the current position.
673
- static void initialize_tag_buffer(GumboParser* parser) {
674
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
675
- GumboTagState* tag_state = &tokenizer->_tag_state;
676
-
677
- gumbo_string_buffer_init(parser, &tag_state->_buffer);
678
- reset_tag_buffer_start_point(parser);
679
- }
680
-
681
- // Initializes the tag_state to start a new tag, keeping track of the opening
682
- // positions and original text. Takes a boolean indicating whether this is a
683
- // start or end tag.
684
- static void start_new_tag(GumboParser* parser, bool is_start_tag) {
685
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
686
- GumboTagState* tag_state = &tokenizer->_tag_state;
687
- int c = utf8iterator_current(&tokenizer->_input);
688
- assert(is_alpha(c));
689
- c = ensure_lowercase(c);
690
- assert(is_alpha(c));
691
-
692
- initialize_tag_buffer(parser);
693
- gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
694
-
695
- assert(tag_state->_attributes.data == NULL);
696
- // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697
- // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698
- // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699
- // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700
- gumbo_vector_init(parser, 1, &tag_state->_attributes);
701
- tag_state->_drop_next_attr_value = false;
702
- tag_state->_is_start_tag = is_start_tag;
703
- tag_state->_is_self_closing = false;
704
- gumbo_debug("Starting new tag.\n");
705
- }
706
-
707
- // Fills in the specified char* with the contents of the tag buffer.
708
- static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
710
- GumboTagState* tag_state = &tokenizer->_tag_state;
711
- *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
712
- }
713
-
714
- // Fills in:
715
- // * The original_text GumboStringPiece with the portion of the original
716
- // buffer that corresponds to the tag buffer.
717
- // * The start_pos GumboSourcePosition with the start position of the tag
718
- // buffer.
719
- // * The end_pos GumboSourcePosition with the current source position.
720
- static void copy_over_original_tag_text(GumboParser* parser,
721
- GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722
- GumboSourcePosition* end_pos) {
723
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
724
- GumboTagState* tag_state = &tokenizer->_tag_state;
725
-
726
- original_text->data = tag_state->_original_text;
727
- original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
728
- tag_state->_original_text;
729
- if (original_text->data[original_text->length - 1] == '\r') {
730
- // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731
- // appended to the end of original text even when it's really the first part
732
- // of the next character. If we detect this situation, shrink the length of
733
- // the original text by 1 to remove the carriage return.
734
- --original_text->length;
735
- }
736
- *start_pos = tag_state->_start_pos;
737
- utf8iterator_get_position(&tokenizer->_input, end_pos);
738
- }
739
-
740
- // Releases and then re-initializes the tag buffer.
741
- static void reinitialize_tag_buffer(GumboParser* parser) {
742
- gumbo_parser_deallocate(
743
- parser, parser->_tokenizer_state->_tag_state._buffer.data);
744
- initialize_tag_buffer(parser);
745
- }
746
-
747
- // Moves some data from the temporary buffer over the the tag-based fields in
748
- // TagState.
749
- static void finish_tag_name(GumboParser* parser) {
750
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
751
- GumboTagState* tag_state = &tokenizer->_tag_state;
752
-
753
- tag_state->_tag =
754
- gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
755
- reinitialize_tag_buffer(parser);
756
- }
757
-
758
- // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
759
- static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
760
- int original_index, int new_index) {
761
- GumboError* error = gumbo_add_error(parser);
762
- if (!error) {
763
- return;
764
- }
765
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
766
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
767
- error->position = tag_state->_start_pos;
768
- error->original_text = tag_state->_original_text;
769
- error->v.duplicate_attr.original_index = original_index;
770
- error->v.duplicate_attr.new_index = new_index;
771
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
772
- reinitialize_tag_buffer(parser);
773
- }
774
-
775
- // Creates a new attribute in the current tag, copying the current tag buffer to
776
- // the attribute's name. The attribute's value starts out as the empty string
777
- // (following the "Boolean attributes" section of the spec) and is only
778
- // overwritten on finish_attribute_value(). If the attribute has already been
779
- // specified, the new attribute is dropped, a parse error is added, and the
780
- // function returns false. Otherwise, this returns true.
781
- static bool finish_attribute_name(GumboParser* parser) {
782
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
783
- GumboTagState* tag_state = &tokenizer->_tag_state;
784
- // May've been set by a previous attribute without a value; reset it here.
785
- tag_state->_drop_next_attr_value = false;
786
- assert(tag_state->_attributes.data);
787
- assert(tag_state->_attributes.capacity);
788
-
789
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
790
- for (unsigned int i = 0; i < attributes->length; ++i) {
791
- GumboAttribute* attr = attributes->data[i];
792
- if (strlen(attr->name) == tag_state->_buffer.length &&
793
- memcmp(attr->name, tag_state->_buffer.data,
794
- tag_state->_buffer.length) == 0) {
795
- // Identical attribute; bail.
796
- add_duplicate_attr_error(parser, attr->name, i, attributes->length);
797
- tag_state->_drop_next_attr_value = true;
798
- return false;
799
- }
800
- }
801
-
802
- GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
803
- attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
804
- copy_over_tag_buffer(parser, &attr->name);
805
- copy_over_original_tag_text(
806
- parser, &attr->original_name, &attr->name_start, &attr->name_end);
807
- attr->value = gumbo_copy_stringz(parser, "");
808
- copy_over_original_tag_text(
809
- parser, &attr->original_value, &attr->name_start, &attr->name_end);
810
- gumbo_vector_add(parser, attr, attributes);
811
- reinitialize_tag_buffer(parser);
812
- return true;
813
- }
814
-
815
- // Finishes an attribute value. This sets the value of the most recently added
816
- // attribute to the current contents of the tag buffer.
817
- static void finish_attribute_value(GumboParser* parser) {
818
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
819
- if (tag_state->_drop_next_attr_value) {
820
- // Duplicate attribute name detected in an earlier state, so we have to
821
- // ignore the value.
822
- tag_state->_drop_next_attr_value = false;
823
- reinitialize_tag_buffer(parser);
824
- return;
825
- }
826
-
827
- GumboAttribute* attr =
828
- tag_state->_attributes.data[tag_state->_attributes.length - 1];
829
- gumbo_parser_deallocate(parser, (void*) attr->value);
830
- copy_over_tag_buffer(parser, &attr->value);
831
- copy_over_original_tag_text(
832
- parser, &attr->original_value, &attr->value_start, &attr->value_end);
833
- reinitialize_tag_buffer(parser);
834
- }
835
-
836
- // Returns true if the current end tag matches the last start tag emitted.
837
- static bool is_appropriate_end_tag(GumboParser* parser) {
838
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
839
- assert(!tag_state->_is_start_tag);
840
- return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
841
- tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
842
- tag_state->_buffer.length);
843
- }
844
-
845
- void gumbo_tokenizer_state_init(
846
- GumboParser* parser, const char* text, size_t text_length) {
847
- GumboTokenizerState* tokenizer =
848
- gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
849
- parser->_tokenizer_state = tokenizer;
850
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
- tokenizer->_reconsume_current_input = false;
852
- tokenizer->_is_current_node_foreign = false;
853
- tokenizer->_is_in_cdata = false;
854
- tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
855
-
856
- tokenizer->_buffered_emit_char = kGumboNoChar;
857
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
858
- tokenizer->_temporary_buffer_emit = NULL;
859
-
860
- mark_tag_state_as_empty(&tokenizer->_tag_state);
861
-
862
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
863
- tokenizer->_token_start = text;
864
- utf8iterator_init(parser, text, text_length, &tokenizer->_input);
865
- utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
866
- doc_type_state_init(parser);
867
- }
868
-
869
- void gumbo_tokenizer_state_destroy(GumboParser* parser) {
870
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
871
- assert(tokenizer->_doc_type_state.name == NULL);
872
- assert(tokenizer->_doc_type_state.public_identifier == NULL);
873
- assert(tokenizer->_doc_type_state.system_identifier == NULL);
874
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
875
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
876
- gumbo_parser_deallocate(parser, tokenizer);
877
- }
878
-
879
- void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
880
- parser->_tokenizer_state->_state = state;
881
- }
882
-
883
- void gumbo_tokenizer_set_is_current_node_foreign(
884
- GumboParser* parser, bool is_foreign) {
885
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
886
- gumbo_debug("Toggling is_current_node_foreign to %s.\n",
887
- is_foreign ? "true" : "false");
888
- }
889
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
890
- }
891
-
892
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
893
- static StateResult handle_data_state(GumboParser* parser,
894
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
895
- switch (c) {
896
- case '&':
897
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898
- // The char_ref machinery expects to be on the & so it can mark that
899
- // and return to it if the text isn't a char ref, so we need to
900
- // reconsume it.
901
- tokenizer->_reconsume_current_input = true;
902
- return NEXT_CHAR;
903
- case '<':
904
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905
- clear_temporary_buffer(parser);
906
- append_char_to_temporary_buffer(parser, '<');
907
- return NEXT_CHAR;
908
- case '\0':
909
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910
- emit_char(parser, c, output);
911
- return RETURN_ERROR;
912
- default:
913
- return emit_current_char(parser, output);
914
- }
915
- }
916
-
917
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
- static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
920
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
921
- return emit_char_ref(parser, ' ', false, output);
922
- }
923
-
924
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
925
- static StateResult handle_rcdata_state(GumboParser* parser,
926
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
927
- switch (c) {
928
- case '&':
929
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930
- tokenizer->_reconsume_current_input = true;
931
- return NEXT_CHAR;
932
- case '<':
933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934
- clear_temporary_buffer(parser);
935
- append_char_to_temporary_buffer(parser, '<');
936
- return NEXT_CHAR;
937
- case '\0':
938
- return emit_replacement_char(parser, output);
939
- case -1:
940
- return emit_eof(parser, output);
941
- default:
942
- return emit_current_char(parser, output);
943
- }
944
- }
945
-
946
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
- static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
949
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
950
- return emit_char_ref(parser, ' ', false, output);
951
- }
952
-
953
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
954
- static StateResult handle_rawtext_state(GumboParser* parser,
955
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
956
- switch (c) {
957
- case '<':
958
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
959
- clear_temporary_buffer(parser);
960
- append_char_to_temporary_buffer(parser, '<');
961
- return NEXT_CHAR;
962
- case '\0':
963
- return emit_replacement_char(parser, output);
964
- case -1:
965
- return emit_eof(parser, output);
966
- default:
967
- return emit_current_char(parser, output);
968
- }
969
- }
970
-
971
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
972
- static StateResult handle_script_state(GumboParser* parser,
973
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
974
- switch (c) {
975
- case '<':
976
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
977
- clear_temporary_buffer(parser);
978
- append_char_to_temporary_buffer(parser, '<');
979
- return NEXT_CHAR;
980
- case '\0':
981
- return emit_replacement_char(parser, output);
982
- case -1:
983
- return emit_eof(parser, output);
984
- default:
985
- return emit_current_char(parser, output);
986
- }
987
- }
988
-
989
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
990
- static StateResult handle_plaintext_state(GumboParser* parser,
991
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
992
- switch (c) {
993
- case '\0':
994
- return emit_replacement_char(parser, output);
995
- case -1:
996
- return emit_eof(parser, output);
997
- default:
998
- return emit_current_char(parser, output);
999
- }
1000
- }
1001
-
1002
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1003
- static StateResult handle_tag_open_state(GumboParser* parser,
1004
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1005
- assert(temporary_buffer_equals(parser, "<"));
1006
- switch (c) {
1007
- case '!':
1008
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1009
- clear_temporary_buffer(parser);
1010
- return NEXT_CHAR;
1011
- case '/':
1012
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1013
- append_char_to_temporary_buffer(parser, '/');
1014
- return NEXT_CHAR;
1015
- case '?':
1016
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1017
- clear_temporary_buffer(parser);
1018
- append_char_to_temporary_buffer(parser, '?');
1019
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1020
- return NEXT_CHAR;
1021
- default:
1022
- if (is_alpha(c)) {
1023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1024
- start_new_tag(parser, true);
1025
- return NEXT_CHAR;
1026
- } else {
1027
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1028
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1029
- emit_temporary_buffer(parser, output);
1030
- return RETURN_ERROR;
1031
- }
1032
- }
1033
- }
1034
-
1035
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1036
- static StateResult handle_end_tag_open_state(GumboParser* parser,
1037
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1038
- assert(temporary_buffer_equals(parser, "</"));
1039
- switch (c) {
1040
- case '>':
1041
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1042
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1043
- return NEXT_CHAR;
1044
- case -1:
1045
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1046
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1047
- return emit_temporary_buffer(parser, output);
1048
- default:
1049
- if (is_alpha(c)) {
1050
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1051
- start_new_tag(parser, false);
1052
- } else {
1053
- tokenizer_add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1054
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1055
- clear_temporary_buffer(parser);
1056
- append_char_to_temporary_buffer(parser, c);
1057
- }
1058
- return NEXT_CHAR;
1059
- }
1060
- }
1061
-
1062
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1063
- static StateResult handle_tag_name_state(GumboParser* parser,
1064
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1065
- switch (c) {
1066
- case '\t':
1067
- case '\n':
1068
- case '\f':
1069
- case ' ':
1070
- finish_tag_name(parser);
1071
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1072
- return NEXT_CHAR;
1073
- case '/':
1074
- finish_tag_name(parser);
1075
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1076
- return NEXT_CHAR;
1077
- case '>':
1078
- finish_tag_name(parser);
1079
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1080
- return emit_current_tag(parser, output);
1081
- case '\0':
1082
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1083
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1084
- return NEXT_CHAR;
1085
- case -1:
1086
- tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1087
- abandon_current_tag(parser);
1088
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089
- return NEXT_CHAR;
1090
- default:
1091
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1092
- return NEXT_CHAR;
1093
- }
1094
- }
1095
-
1096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1097
- static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1099
- assert(temporary_buffer_equals(parser, "<"));
1100
- if (c == '/') {
1101
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1102
- append_char_to_temporary_buffer(parser, '/');
1103
- return NEXT_CHAR;
1104
- } else {
1105
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1106
- tokenizer->_reconsume_current_input = true;
1107
- return emit_temporary_buffer(parser, output);
1108
- }
1109
- }
1110
-
1111
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1112
- static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1114
- assert(temporary_buffer_equals(parser, "</"));
1115
- if (is_alpha(c)) {
1116
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1117
- start_new_tag(parser, false);
1118
- append_char_to_temporary_buffer(parser, c);
1119
- return NEXT_CHAR;
1120
- } else {
1121
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1122
- return emit_temporary_buffer(parser, output);
1123
- }
1124
- return true;
1125
- }
1126
-
1127
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1128
- static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1130
- assert(tokenizer->_temporary_buffer.length >= 2);
1131
- if (is_alpha(c)) {
1132
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1133
- append_char_to_temporary_buffer(parser, c);
1134
- return NEXT_CHAR;
1135
- } else if (is_appropriate_end_tag(parser)) {
1136
- switch (c) {
1137
- case '\t':
1138
- case '\n':
1139
- case '\f':
1140
- case ' ':
1141
- finish_tag_name(parser);
1142
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1143
- return NEXT_CHAR;
1144
- case '/':
1145
- finish_tag_name(parser);
1146
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1147
- return NEXT_CHAR;
1148
- case '>':
1149
- finish_tag_name(parser);
1150
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1151
- return emit_current_tag(parser, output);
1152
- }
1153
- }
1154
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1155
- abandon_current_tag(parser);
1156
- return emit_temporary_buffer(parser, output);
1157
- }
1158
-
1159
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1160
- static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1162
- assert(temporary_buffer_equals(parser, "<"));
1163
- if (c == '/') {
1164
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1165
- append_char_to_temporary_buffer(parser, '/');
1166
- return NEXT_CHAR;
1167
- } else {
1168
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1169
- tokenizer->_reconsume_current_input = true;
1170
- return emit_temporary_buffer(parser, output);
1171
- }
1172
- }
1173
-
1174
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1175
- static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1177
- assert(temporary_buffer_equals(parser, "</"));
1178
- if (is_alpha(c)) {
1179
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1180
- start_new_tag(parser, false);
1181
- append_char_to_temporary_buffer(parser, c);
1182
- return NEXT_CHAR;
1183
- } else {
1184
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1185
- return emit_temporary_buffer(parser, output);
1186
- }
1187
- }
1188
-
1189
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1190
- static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1192
- assert(tokenizer->_temporary_buffer.length >= 2);
1193
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1194
- tokenizer->_tag_state._buffer.data);
1195
- if (is_alpha(c)) {
1196
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1197
- append_char_to_temporary_buffer(parser, c);
1198
- return NEXT_CHAR;
1199
- } else if (is_appropriate_end_tag(parser)) {
1200
- gumbo_debug("Is an appropriate end tag.\n");
1201
- switch (c) {
1202
- case '\t':
1203
- case '\n':
1204
- case '\f':
1205
- case ' ':
1206
- finish_tag_name(parser);
1207
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1208
- return NEXT_CHAR;
1209
- case '/':
1210
- finish_tag_name(parser);
1211
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1212
- return NEXT_CHAR;
1213
- case '>':
1214
- finish_tag_name(parser);
1215
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216
- return emit_current_tag(parser, output);
1217
- }
1218
- }
1219
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1220
- abandon_current_tag(parser);
1221
- return emit_temporary_buffer(parser, output);
1222
- }
1223
-
1224
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1225
- static StateResult handle_script_lt_state(GumboParser* parser,
1226
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1227
- assert(temporary_buffer_equals(parser, "<"));
1228
- if (c == '/') {
1229
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1230
- append_char_to_temporary_buffer(parser, '/');
1231
- return NEXT_CHAR;
1232
- } else if (c == '!') {
1233
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1234
- append_char_to_temporary_buffer(parser, '!');
1235
- return emit_temporary_buffer(parser, output);
1236
- } else {
1237
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1238
- tokenizer->_reconsume_current_input = true;
1239
- return emit_temporary_buffer(parser, output);
1240
- }
1241
- }
1242
-
1243
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1244
- static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1246
- assert(temporary_buffer_equals(parser, "</"));
1247
- if (is_alpha(c)) {
1248
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1249
- start_new_tag(parser, false);
1250
- append_char_to_temporary_buffer(parser, c);
1251
- return NEXT_CHAR;
1252
- } else {
1253
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254
- return emit_temporary_buffer(parser, output);
1255
- }
1256
- }
1257
-
1258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1259
- static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1261
- assert(tokenizer->_temporary_buffer.length >= 2);
1262
- if (is_alpha(c)) {
1263
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1264
- append_char_to_temporary_buffer(parser, c);
1265
- return NEXT_CHAR;
1266
- } else if (is_appropriate_end_tag(parser)) {
1267
- switch (c) {
1268
- case '\t':
1269
- case '\n':
1270
- case '\f':
1271
- case ' ':
1272
- finish_tag_name(parser);
1273
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1274
- return NEXT_CHAR;
1275
- case '/':
1276
- finish_tag_name(parser);
1277
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1278
- return NEXT_CHAR;
1279
- case '>':
1280
- finish_tag_name(parser);
1281
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1282
- return emit_current_tag(parser, output);
1283
- }
1284
- }
1285
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1286
- abandon_current_tag(parser);
1287
- return emit_temporary_buffer(parser, output);
1288
- }
1289
-
1290
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1291
- static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1293
- if (c == '-') {
1294
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1295
- return emit_current_char(parser, output);
1296
- } else {
1297
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1298
- tokenizer->_reconsume_current_input = true;
1299
- return NEXT_CHAR;
1300
- }
1301
- }
1302
-
1303
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1304
- static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1306
- if (c == '-') {
1307
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1308
- return emit_current_char(parser, output);
1309
- } else {
1310
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1311
- tokenizer->_reconsume_current_input = true;
1312
- return NEXT_CHAR;
1313
- }
1314
- }
1315
-
1316
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1317
- static StateResult handle_script_escaped_state(GumboParser* parser,
1318
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1319
- switch (c) {
1320
- case '-':
1321
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1322
- return emit_current_char(parser, output);
1323
- case '<':
1324
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1325
- clear_temporary_buffer(parser);
1326
- append_char_to_temporary_buffer(parser, c);
1327
- return NEXT_CHAR;
1328
- case '\0':
1329
- return emit_replacement_char(parser, output);
1330
- case -1:
1331
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1332
- return emit_eof(parser, output);
1333
- default:
1334
- return emit_current_char(parser, output);
1335
- }
1336
- }
1337
-
1338
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1339
- static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1341
- switch (c) {
1342
- case '-':
1343
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1344
- return emit_current_char(parser, output);
1345
- case '<':
1346
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1347
- clear_temporary_buffer(parser);
1348
- append_char_to_temporary_buffer(parser, c);
1349
- return NEXT_CHAR;
1350
- case '\0':
1351
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1352
- return emit_replacement_char(parser, output);
1353
- case -1:
1354
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1356
- return NEXT_CHAR;
1357
- default:
1358
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1359
- return emit_current_char(parser, output);
1360
- }
1361
- }
1362
-
1363
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1364
- static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1366
- switch (c) {
1367
- case '-':
1368
- return emit_current_char(parser, output);
1369
- case '<':
1370
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1371
- clear_temporary_buffer(parser);
1372
- append_char_to_temporary_buffer(parser, c);
1373
- return NEXT_CHAR;
1374
- case '>':
1375
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1376
- return emit_current_char(parser, output);
1377
- case '\0':
1378
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379
- return emit_replacement_char(parser, output);
1380
- case -1:
1381
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1382
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1383
- return NEXT_CHAR;
1384
- default:
1385
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1386
- return emit_current_char(parser, output);
1387
- }
1388
- }
1389
-
1390
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1391
- static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1393
- assert(temporary_buffer_equals(parser, "<"));
1394
- assert(!tokenizer->_script_data_buffer.length);
1395
- if (c == '/') {
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1397
- append_char_to_temporary_buffer(parser, c);
1398
- return NEXT_CHAR;
1399
- } else if (is_alpha(c)) {
1400
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1401
- append_char_to_temporary_buffer(parser, c);
1402
- gumbo_string_buffer_append_codepoint(
1403
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1404
- return emit_temporary_buffer(parser, output);
1405
- } else {
1406
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407
- return emit_temporary_buffer(parser, output);
1408
- }
1409
- }
1410
-
1411
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1412
- static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1414
- assert(temporary_buffer_equals(parser, "</"));
1415
- if (is_alpha(c)) {
1416
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1417
- start_new_tag(parser, false);
1418
- append_char_to_temporary_buffer(parser, c);
1419
- return NEXT_CHAR;
1420
- } else {
1421
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1422
- return emit_temporary_buffer(parser, output);
1423
- }
1424
- }
1425
-
1426
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1427
- static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1429
- assert(tokenizer->_temporary_buffer.length >= 2);
1430
- if (is_alpha(c)) {
1431
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1432
- append_char_to_temporary_buffer(parser, c);
1433
- return NEXT_CHAR;
1434
- } else if (is_appropriate_end_tag(parser)) {
1435
- switch (c) {
1436
- case '\t':
1437
- case '\n':
1438
- case '\f':
1439
- case ' ':
1440
- finish_tag_name(parser);
1441
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1442
- return NEXT_CHAR;
1443
- case '/':
1444
- finish_tag_name(parser);
1445
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1446
- return NEXT_CHAR;
1447
- case '>':
1448
- finish_tag_name(parser);
1449
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1450
- return emit_current_tag(parser, output);
1451
- }
1452
- }
1453
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1454
- abandon_current_tag(parser);
1455
- return emit_temporary_buffer(parser, output);
1456
- }
1457
-
1458
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1459
- static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1461
- switch (c) {
1462
- case '\t':
1463
- case '\n':
1464
- case '\f':
1465
- case ' ':
1466
- case '/':
1467
- case '>':
1468
- gumbo_tokenizer_set_state(
1469
- parser, gumbo_string_equals(&kScriptTag,
1470
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472
- : GUMBO_LEX_SCRIPT_ESCAPED);
1473
- return emit_current_char(parser, output);
1474
- default:
1475
- if (is_alpha(c)) {
1476
- gumbo_string_buffer_append_codepoint(
1477
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1478
- return emit_current_char(parser, output);
1479
- } else {
1480
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1481
- tokenizer->_reconsume_current_input = true;
1482
- return NEXT_CHAR;
1483
- }
1484
- }
1485
- }
1486
-
1487
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1488
- static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1490
- switch (c) {
1491
- case '-':
1492
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1493
- return emit_current_char(parser, output);
1494
- case '<':
1495
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1496
- return emit_current_char(parser, output);
1497
- case '\0':
1498
- return emit_replacement_char(parser, output);
1499
- case -1:
1500
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1501
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1502
- return NEXT_CHAR;
1503
- default:
1504
- return emit_current_char(parser, output);
1505
- }
1506
- }
1507
-
1508
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1509
- static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1511
- switch (c) {
1512
- case '-':
1513
- gumbo_tokenizer_set_state(
1514
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1515
- return emit_current_char(parser, output);
1516
- case '<':
1517
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1518
- return emit_current_char(parser, output);
1519
- case '\0':
1520
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1521
- return emit_replacement_char(parser, output);
1522
- case -1:
1523
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1524
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1525
- return NEXT_CHAR;
1526
- default:
1527
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1528
- return emit_current_char(parser, output);
1529
- }
1530
- }
1531
-
1532
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1533
- static StateResult handle_script_double_escaped_dash_dash_state(
1534
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535
- GumboToken* output) {
1536
- switch (c) {
1537
- case '-':
1538
- return emit_current_char(parser, output);
1539
- case '<':
1540
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1541
- return emit_current_char(parser, output);
1542
- case '>':
1543
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1544
- return emit_current_char(parser, output);
1545
- case '\0':
1546
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1547
- return emit_replacement_char(parser, output);
1548
- case -1:
1549
- tokenizer_add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1550
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1551
- return NEXT_CHAR;
1552
- default:
1553
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1554
- return emit_current_char(parser, output);
1555
- }
1556
- }
1557
-
1558
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1559
- static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1561
- if (c == '/') {
1562
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1563
- gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1564
- return emit_current_char(parser, output);
1565
- } else {
1566
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1567
- tokenizer->_reconsume_current_input = true;
1568
- return NEXT_CHAR;
1569
- }
1570
- }
1571
-
1572
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1573
- static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1575
- switch (c) {
1576
- case '\t':
1577
- case '\n':
1578
- case '\f':
1579
- case ' ':
1580
- case '/':
1581
- case '>':
1582
- gumbo_tokenizer_set_state(
1583
- parser, gumbo_string_equals(&kScriptTag,
1584
- (GumboStringPiece*) &tokenizer->_script_data_buffer)
1585
- ? GUMBO_LEX_SCRIPT_ESCAPED
1586
- : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1587
- return emit_current_char(parser, output);
1588
- default:
1589
- if (is_alpha(c)) {
1590
- gumbo_string_buffer_append_codepoint(
1591
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1592
- return emit_current_char(parser, output);
1593
- } else {
1594
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1595
- tokenizer->_reconsume_current_input = true;
1596
- return NEXT_CHAR;
1597
- }
1598
- }
1599
- }
1600
-
1601
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1602
- static StateResult handle_before_attr_name_state(GumboParser* parser,
1603
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1604
- switch (c) {
1605
- case '\t':
1606
- case '\n':
1607
- case '\f':
1608
- case ' ':
1609
- return NEXT_CHAR;
1610
- case '/':
1611
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1612
- return NEXT_CHAR;
1613
- case '>':
1614
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1615
- return emit_current_tag(parser, output);
1616
- case '\0':
1617
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1618
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1619
- append_char_to_temporary_buffer(parser, 0xfffd);
1620
- return NEXT_CHAR;
1621
- case -1:
1622
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1623
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1624
- abandon_current_tag(parser);
1625
- return NEXT_CHAR;
1626
- case '"':
1627
- case '\'':
1628
- case '<':
1629
- case '=':
1630
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1631
- // Fall through.
1632
- default:
1633
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1634
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1635
- return NEXT_CHAR;
1636
- }
1637
- }
1638
-
1639
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1640
- static StateResult handle_attr_name_state(GumboParser* parser,
1641
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1642
- switch (c) {
1643
- case '\t':
1644
- case '\n':
1645
- case '\f':
1646
- case ' ':
1647
- finish_attribute_name(parser);
1648
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1649
- return NEXT_CHAR;
1650
- case '/':
1651
- finish_attribute_name(parser);
1652
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1653
- return NEXT_CHAR;
1654
- case '=':
1655
- finish_attribute_name(parser);
1656
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1657
- return NEXT_CHAR;
1658
- case '>':
1659
- finish_attribute_name(parser);
1660
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1661
- return emit_current_tag(parser, output);
1662
- case '\0':
1663
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1664
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1665
- return NEXT_CHAR;
1666
- case -1:
1667
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1668
- abandon_current_tag(parser);
1669
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1670
- return NEXT_CHAR;
1671
- case '"':
1672
- case '\'':
1673
- case '<':
1674
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1675
- // Fall through.
1676
- default:
1677
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1678
- return NEXT_CHAR;
1679
- }
1680
- }
1681
-
1682
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1683
- static StateResult handle_after_attr_name_state(GumboParser* parser,
1684
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1685
- switch (c) {
1686
- case '\t':
1687
- case '\n':
1688
- case '\f':
1689
- case ' ':
1690
- return NEXT_CHAR;
1691
- case '/':
1692
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1693
- return NEXT_CHAR;
1694
- case '=':
1695
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1696
- return NEXT_CHAR;
1697
- case '>':
1698
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
- return emit_current_tag(parser, output);
1700
- case '\0':
1701
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1702
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1703
- append_char_to_temporary_buffer(parser, 0xfffd);
1704
- return NEXT_CHAR;
1705
- case -1:
1706
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1707
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1708
- abandon_current_tag(parser);
1709
- return NEXT_CHAR;
1710
- case '"':
1711
- case '\'':
1712
- case '<':
1713
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1714
- // Fall through.
1715
- default:
1716
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1717
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1718
- return NEXT_CHAR;
1719
- }
1720
- }
1721
-
1722
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1723
- static StateResult handle_before_attr_value_state(GumboParser* parser,
1724
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1725
- switch (c) {
1726
- case '\t':
1727
- case '\n':
1728
- case '\f':
1729
- case ' ':
1730
- return NEXT_CHAR;
1731
- case '"':
1732
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1733
- reset_tag_buffer_start_point(parser);
1734
- return NEXT_CHAR;
1735
- case '&':
1736
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1737
- tokenizer->_reconsume_current_input = true;
1738
- return NEXT_CHAR;
1739
- case '\'':
1740
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1741
- reset_tag_buffer_start_point(parser);
1742
- return NEXT_CHAR;
1743
- case '\0':
1744
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1745
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1746
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1747
- return NEXT_CHAR;
1748
- case -1:
1749
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1750
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1751
- abandon_current_tag(parser);
1752
- tokenizer->_reconsume_current_input = true;
1753
- return NEXT_CHAR;
1754
- case '>':
1755
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1756
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1757
- emit_current_tag(parser, output);
1758
- return RETURN_ERROR;
1759
- case '<':
1760
- case '=':
1761
- case '`':
1762
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1763
- // Fall through.
1764
- default:
1765
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1766
- append_char_to_tag_buffer(parser, c, true);
1767
- return NEXT_CHAR;
1768
- }
1769
- }
1770
-
1771
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1772
- static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1774
- switch (c) {
1775
- case '"':
1776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1777
- return NEXT_CHAR;
1778
- case '&':
1779
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1780
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1781
- tokenizer->_reconsume_current_input = true;
1782
- return NEXT_CHAR;
1783
- case '\0':
1784
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1785
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1786
- return NEXT_CHAR;
1787
- case -1:
1788
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1789
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790
- abandon_current_tag(parser);
1791
- tokenizer->_reconsume_current_input = true;
1792
- return NEXT_CHAR;
1793
- default:
1794
- append_char_to_tag_buffer(parser, c, false);
1795
- return NEXT_CHAR;
1796
- }
1797
- }
1798
-
1799
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1800
- static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1802
- switch (c) {
1803
- case '\'':
1804
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1805
- return NEXT_CHAR;
1806
- case '&':
1807
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1809
- tokenizer->_reconsume_current_input = true;
1810
- return NEXT_CHAR;
1811
- case '\0':
1812
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1813
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1814
- return NEXT_CHAR;
1815
- case -1:
1816
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1817
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1818
- abandon_current_tag(parser);
1819
- tokenizer->_reconsume_current_input = true;
1820
- return NEXT_CHAR;
1821
- default:
1822
- append_char_to_tag_buffer(parser, c, false);
1823
- return NEXT_CHAR;
1824
- }
1825
- }
1826
-
1827
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1828
- static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1830
- switch (c) {
1831
- case '\t':
1832
- case '\n':
1833
- case '\f':
1834
- case ' ':
1835
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1836
- finish_attribute_value(parser);
1837
- return NEXT_CHAR;
1838
- case '&':
1839
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1840
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1841
- tokenizer->_reconsume_current_input = true;
1842
- return NEXT_CHAR;
1843
- case '>':
1844
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1845
- finish_attribute_value(parser);
1846
- return emit_current_tag(parser, output);
1847
- case '\0':
1848
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1849
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1850
- return NEXT_CHAR;
1851
- case -1:
1852
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1853
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1854
- tokenizer->_reconsume_current_input = true;
1855
- abandon_current_tag(parser);
1856
- return NEXT_CHAR;
1857
- case '<':
1858
- case '=':
1859
- case '"':
1860
- case '\'':
1861
- case '`':
1862
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1863
- // Fall through.
1864
- default:
1865
- append_char_to_tag_buffer(parser, c, true);
1866
- return NEXT_CHAR;
1867
- }
1868
- }
1869
-
1870
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1871
- static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1873
- OneOrTwoCodepoints char_ref;
1874
- int allowed_char;
1875
- bool is_unquoted = false;
1876
- switch (tokenizer->_tag_state._attr_value_state) {
1877
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1878
- allowed_char = '"';
1879
- break;
1880
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1881
- allowed_char = '\'';
1882
- break;
1883
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1884
- allowed_char = '>';
1885
- is_unquoted = true;
1886
- break;
1887
- default:
1888
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1889
- // get that the assert(0) means this codepath will never happen.
1890
- allowed_char = ' ';
1891
- assert(0);
1892
- }
1893
-
1894
- // Ignore the status, since we don't have a convenient way of signalling that
1895
- // a parser error has occurred when the error occurs in the middle of a
1896
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
1897
- // but that's a low priority fix.
1898
- consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1899
- if (char_ref.first != kGumboNoChar) {
1900
- tokenizer->_reconsume_current_input = true;
1901
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1902
- if (char_ref.second != kGumboNoChar) {
1903
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1904
- }
1905
- } else {
1906
- append_char_to_tag_buffer(parser, '&', is_unquoted);
1907
- }
1908
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1909
- return NEXT_CHAR;
1910
- }
1911
-
1912
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1913
- static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1915
- finish_attribute_value(parser);
1916
- switch (c) {
1917
- case '\t':
1918
- case '\n':
1919
- case '\f':
1920
- case ' ':
1921
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1922
- return NEXT_CHAR;
1923
- case '/':
1924
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1925
- return NEXT_CHAR;
1926
- case '>':
1927
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1928
- return emit_current_tag(parser, output);
1929
- case -1:
1930
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1931
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1932
- abandon_current_tag(parser);
1933
- tokenizer->_reconsume_current_input = true;
1934
- return NEXT_CHAR;
1935
- default:
1936
- tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1937
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1938
- tokenizer->_reconsume_current_input = true;
1939
- return NEXT_CHAR;
1940
- }
1941
- }
1942
-
1943
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1944
- static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1946
- switch (c) {
1947
- case '>':
1948
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1949
- tokenizer->_tag_state._is_self_closing = true;
1950
- return emit_current_tag(parser, output);
1951
- case -1:
1952
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1953
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1954
- abandon_current_tag(parser);
1955
- return NEXT_CHAR;
1956
- default:
1957
- tokenizer_add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1958
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1959
- tokenizer->_reconsume_current_input = true;
1960
- return NEXT_CHAR;
1961
- }
1962
- }
1963
-
1964
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
1965
- static StateResult handle_bogus_comment_state(GumboParser* parser,
1966
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1967
- while (c != '>' && c != -1) {
1968
- if (c == '\0') {
1969
- c = 0xFFFD;
1970
- }
1971
- append_char_to_temporary_buffer(parser, c);
1972
- utf8iterator_next(&tokenizer->_input);
1973
- c = utf8iterator_current(&tokenizer->_input);
1974
- }
1975
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1976
- return emit_comment(parser, output);
1977
- }
1978
-
1979
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
1980
- static StateResult handle_markup_declaration_state(GumboParser* parser,
1981
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1982
- if (utf8iterator_maybe_consume_match(
1983
- &tokenizer->_input, "--", sizeof("--") - 1, true)) {
1984
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
1985
- tokenizer->_reconsume_current_input = true;
1986
- } else if (utf8iterator_maybe_consume_match(
1987
- &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
1988
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
1989
- tokenizer->_reconsume_current_input = true;
1990
- // If we get here, we know we'll eventually emit a doctype token, so now is
1991
- // the time to initialize the doctype strings. (Not in doctype_state_init,
1992
- // since then they'll leak if ownership never gets transferred to the
1993
- // doctype token.
1994
- tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
1995
- tokenizer->_doc_type_state.public_identifier =
1996
- gumbo_copy_stringz(parser, "");
1997
- tokenizer->_doc_type_state.system_identifier =
1998
- gumbo_copy_stringz(parser, "");
1999
- } else if (tokenizer->_is_current_node_foreign &&
2000
- utf8iterator_maybe_consume_match(
2001
- &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2002
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003
- tokenizer->_is_in_cdata = true;
2004
- tokenizer->_reconsume_current_input = true;
2005
- } else {
2006
- tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2007
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2008
- tokenizer->_reconsume_current_input = true;
2009
- clear_temporary_buffer(parser);
2010
- }
2011
- return NEXT_CHAR;
2012
- }
2013
-
2014
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2015
- static StateResult handle_comment_start_state(GumboParser* parser,
2016
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2017
- switch (c) {
2018
- case '-':
2019
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2020
- return NEXT_CHAR;
2021
- case '\0':
2022
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2024
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2025
- return NEXT_CHAR;
2026
- case '>':
2027
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2028
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2029
- emit_comment(parser, output);
2030
- return RETURN_ERROR;
2031
- case -1:
2032
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2033
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2034
- emit_comment(parser, output);
2035
- return RETURN_ERROR;
2036
- default:
2037
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2038
- append_char_to_temporary_buffer(parser, c);
2039
- return NEXT_CHAR;
2040
- }
2041
- }
2042
-
2043
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2044
- static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2046
- switch (c) {
2047
- case '-':
2048
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2049
- return NEXT_CHAR;
2050
- case '\0':
2051
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2052
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2053
- append_char_to_temporary_buffer(parser, '-');
2054
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2055
- return NEXT_CHAR;
2056
- case '>':
2057
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2058
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2059
- emit_comment(parser, output);
2060
- return RETURN_ERROR;
2061
- case -1:
2062
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2063
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2064
- emit_comment(parser, output);
2065
- return RETURN_ERROR;
2066
- default:
2067
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2068
- append_char_to_temporary_buffer(parser, '-');
2069
- append_char_to_temporary_buffer(parser, c);
2070
- return NEXT_CHAR;
2071
- }
2072
- }
2073
-
2074
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2075
- static StateResult handle_comment_state(GumboParser* parser,
2076
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2077
- switch (c) {
2078
- case '-':
2079
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2080
- return NEXT_CHAR;
2081
- case '\0':
2082
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2083
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2084
- return NEXT_CHAR;
2085
- case -1:
2086
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2087
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2088
- emit_comment(parser, output);
2089
- return RETURN_ERROR;
2090
- default:
2091
- append_char_to_temporary_buffer(parser, c);
2092
- return NEXT_CHAR;
2093
- }
2094
- }
2095
-
2096
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2097
- static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2099
- switch (c) {
2100
- case '-':
2101
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2102
- return NEXT_CHAR;
2103
- case '\0':
2104
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2105
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2106
- append_char_to_temporary_buffer(parser, '-');
2107
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2108
- return NEXT_CHAR;
2109
- case -1:
2110
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2111
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2112
- emit_comment(parser, output);
2113
- return RETURN_ERROR;
2114
- default:
2115
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2116
- append_char_to_temporary_buffer(parser, '-');
2117
- append_char_to_temporary_buffer(parser, c);
2118
- return NEXT_CHAR;
2119
- }
2120
- }
2121
-
2122
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2123
- static StateResult handle_comment_end_state(GumboParser* parser,
2124
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2125
- switch (c) {
2126
- case '>':
2127
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2128
- return emit_comment(parser, output);
2129
- case '\0':
2130
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2131
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2132
- append_char_to_temporary_buffer(parser, '-');
2133
- append_char_to_temporary_buffer(parser, '-');
2134
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2135
- return NEXT_CHAR;
2136
- case '!':
2137
- tokenizer_add_parse_error(
2138
- parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2139
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2140
- return NEXT_CHAR;
2141
- case '-':
2142
- tokenizer_add_parse_error(
2143
- parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2144
- append_char_to_temporary_buffer(parser, '-');
2145
- return NEXT_CHAR;
2146
- case -1:
2147
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2148
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2149
- emit_comment(parser, output);
2150
- return RETURN_ERROR;
2151
- default:
2152
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2153
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2154
- append_char_to_temporary_buffer(parser, '-');
2155
- append_char_to_temporary_buffer(parser, '-');
2156
- append_char_to_temporary_buffer(parser, c);
2157
- return NEXT_CHAR;
2158
- }
2159
- }
2160
-
2161
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2162
- static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2164
- switch (c) {
2165
- case '-':
2166
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2167
- append_char_to_temporary_buffer(parser, '-');
2168
- append_char_to_temporary_buffer(parser, '-');
2169
- append_char_to_temporary_buffer(parser, '!');
2170
- return NEXT_CHAR;
2171
- case '>':
2172
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173
- return emit_comment(parser, output);
2174
- case '\0':
2175
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177
- append_char_to_temporary_buffer(parser, '-');
2178
- append_char_to_temporary_buffer(parser, '-');
2179
- append_char_to_temporary_buffer(parser, '!');
2180
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2181
- return NEXT_CHAR;
2182
- case -1:
2183
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2184
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2185
- emit_comment(parser, output);
2186
- return RETURN_ERROR;
2187
- default:
2188
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2189
- append_char_to_temporary_buffer(parser, '-');
2190
- append_char_to_temporary_buffer(parser, '-');
2191
- append_char_to_temporary_buffer(parser, '!');
2192
- append_char_to_temporary_buffer(parser, c);
2193
- return NEXT_CHAR;
2194
- }
2195
- }
2196
-
2197
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2198
- static StateResult handle_doctype_state(GumboParser* parser,
2199
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2200
- assert(!tokenizer->_temporary_buffer.length);
2201
- switch (c) {
2202
- case '\t':
2203
- case '\n':
2204
- case '\f':
2205
- case ' ':
2206
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2207
- return NEXT_CHAR;
2208
- case -1:
2209
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2210
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2211
- tokenizer->_doc_type_state.force_quirks = true;
2212
- emit_doctype(parser, output);
2213
- return RETURN_ERROR;
2214
- default:
2215
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2216
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2217
- tokenizer->_reconsume_current_input = true;
2218
- tokenizer->_doc_type_state.force_quirks = true;
2219
- return NEXT_CHAR;
2220
- }
2221
- }
2222
-
2223
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2224
- static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2226
- switch (c) {
2227
- case '\t':
2228
- case '\n':
2229
- case '\f':
2230
- case ' ':
2231
- return NEXT_CHAR;
2232
- case '\0':
2233
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2234
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2235
- tokenizer->_doc_type_state.force_quirks = true;
2236
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2237
- return NEXT_CHAR;
2238
- case '>':
2239
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2240
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2241
- tokenizer->_doc_type_state.force_quirks = true;
2242
- emit_doctype(parser, output);
2243
- return RETURN_ERROR;
2244
- case -1:
2245
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2246
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2247
- tokenizer->_doc_type_state.force_quirks = true;
2248
- emit_doctype(parser, output);
2249
- return RETURN_ERROR;
2250
- default:
2251
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2252
- tokenizer->_doc_type_state.force_quirks = false;
2253
- append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2254
- return NEXT_CHAR;
2255
- }
2256
- }
2257
-
2258
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2259
- static StateResult handle_doctype_name_state(GumboParser* parser,
2260
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2261
- switch (c) {
2262
- case '\t':
2263
- case '\n':
2264
- case '\f':
2265
- case ' ':
2266
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2267
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2268
- finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2269
- return NEXT_CHAR;
2270
- case '>':
2271
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2272
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2273
- finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2274
- emit_doctype(parser, output);
2275
- return RETURN_SUCCESS;
2276
- case '\0':
2277
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2279
- return NEXT_CHAR;
2280
- case -1:
2281
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2282
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2283
- tokenizer->_doc_type_state.force_quirks = true;
2284
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2285
- finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2286
- emit_doctype(parser, output);
2287
- return RETURN_ERROR;
2288
- default:
2289
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2290
- tokenizer->_doc_type_state.force_quirks = false;
2291
- append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2292
- return NEXT_CHAR;
2293
- }
2294
- }
2295
-
2296
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2297
- static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2299
- switch (c) {
2300
- case '\t':
2301
- case '\n':
2302
- case '\f':
2303
- case ' ':
2304
- return NEXT_CHAR;
2305
- case '>':
2306
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2307
- emit_doctype(parser, output);
2308
- return RETURN_SUCCESS;
2309
- case -1:
2310
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2312
- tokenizer->_doc_type_state.force_quirks = true;
2313
- emit_doctype(parser, output);
2314
- return RETURN_ERROR;
2315
- default:
2316
- if (utf8iterator_maybe_consume_match(
2317
- &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2318
- gumbo_tokenizer_set_state(
2319
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2320
- tokenizer->_reconsume_current_input = true;
2321
- } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2322
- sizeof("SYSTEM") - 1, false)) {
2323
- gumbo_tokenizer_set_state(
2324
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2325
- tokenizer->_reconsume_current_input = true;
2326
- } else {
2327
- tokenizer_add_parse_error(
2328
- parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2329
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2330
- tokenizer->_doc_type_state.force_quirks = true;
2331
- }
2332
- return NEXT_CHAR;
2333
- }
2334
- }
2335
-
2336
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2337
- static StateResult handle_after_doctype_public_keyword_state(
2338
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339
- GumboToken* output) {
2340
- switch (c) {
2341
- case '\t':
2342
- case '\n':
2343
- case '\f':
2344
- case ' ':
2345
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2346
- return NEXT_CHAR;
2347
- case '"':
2348
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2349
- assert(temporary_buffer_equals(parser, ""));
2350
- gumbo_tokenizer_set_state(
2351
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2352
- return NEXT_CHAR;
2353
- case '\'':
2354
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2355
- assert(temporary_buffer_equals(parser, ""));
2356
- gumbo_tokenizer_set_state(
2357
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2358
- return NEXT_CHAR;
2359
- case '>':
2360
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2361
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2362
- tokenizer->_doc_type_state.force_quirks = true;
2363
- emit_doctype(parser, output);
2364
- return RETURN_ERROR;
2365
- case -1:
2366
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2367
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2368
- tokenizer->_doc_type_state.force_quirks = true;
2369
- emit_doctype(parser, output);
2370
- return RETURN_ERROR;
2371
- default:
2372
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2373
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2374
- tokenizer->_doc_type_state.force_quirks = true;
2375
- emit_doctype(parser, output);
2376
- return RETURN_ERROR;
2377
- }
2378
- }
2379
-
2380
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2381
- static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2383
- switch (c) {
2384
- case '\t':
2385
- case '\n':
2386
- case '\f':
2387
- case ' ':
2388
- return NEXT_CHAR;
2389
- case '"':
2390
- assert(temporary_buffer_equals(parser, ""));
2391
- gumbo_tokenizer_set_state(
2392
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2393
- return NEXT_CHAR;
2394
- case '\'':
2395
- assert(temporary_buffer_equals(parser, ""));
2396
- gumbo_tokenizer_set_state(
2397
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2398
- return NEXT_CHAR;
2399
- case '>':
2400
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2401
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2402
- tokenizer->_doc_type_state.force_quirks = true;
2403
- emit_doctype(parser, output);
2404
- return RETURN_ERROR;
2405
- case -1:
2406
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2407
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2408
- tokenizer->_doc_type_state.force_quirks = true;
2409
- emit_doctype(parser, output);
2410
- return RETURN_ERROR;
2411
- default:
2412
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2413
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2414
- tokenizer->_doc_type_state.force_quirks = true;
2415
- emit_doctype(parser, output);
2416
- return RETURN_ERROR;
2417
- }
2418
- }
2419
-
2420
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2421
- static StateResult handle_doctype_public_id_double_quoted_state(
2422
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423
- GumboToken* output) {
2424
- switch (c) {
2425
- case '"':
2426
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2427
- finish_doctype_public_id(parser);
2428
- return NEXT_CHAR;
2429
- case '\0':
2430
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2431
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2432
- return NEXT_CHAR;
2433
- case '>':
2434
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2435
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2436
- tokenizer->_doc_type_state.force_quirks = true;
2437
- finish_doctype_public_id(parser);
2438
- emit_doctype(parser, output);
2439
- return RETURN_ERROR;
2440
- case -1:
2441
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2442
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2443
- tokenizer->_doc_type_state.force_quirks = true;
2444
- finish_doctype_public_id(parser);
2445
- emit_doctype(parser, output);
2446
- return RETURN_ERROR;
2447
- default:
2448
- append_char_to_temporary_buffer(parser, c);
2449
- return NEXT_CHAR;
2450
- }
2451
- }
2452
-
2453
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2454
- static StateResult handle_doctype_public_id_single_quoted_state(
2455
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456
- GumboToken* output) {
2457
- switch (c) {
2458
- case '\'':
2459
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2460
- finish_doctype_public_id(parser);
2461
- return NEXT_CHAR;
2462
- case '\0':
2463
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2464
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2465
- return NEXT_CHAR;
2466
- case '>':
2467
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2468
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2469
- tokenizer->_doc_type_state.force_quirks = true;
2470
- finish_doctype_public_id(parser);
2471
- emit_doctype(parser, output);
2472
- return RETURN_ERROR;
2473
- case -1:
2474
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2475
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2476
- tokenizer->_doc_type_state.force_quirks = true;
2477
- finish_doctype_public_id(parser);
2478
- emit_doctype(parser, output);
2479
- return RETURN_ERROR;
2480
- default:
2481
- append_char_to_temporary_buffer(parser, c);
2482
- return NEXT_CHAR;
2483
- }
2484
- }
2485
-
2486
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2487
- static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2489
- switch (c) {
2490
- case '\t':
2491
- case '\n':
2492
- case '\f':
2493
- case ' ':
2494
- gumbo_tokenizer_set_state(
2495
- parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2496
- return NEXT_CHAR;
2497
- case '>':
2498
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2499
- emit_doctype(parser, output);
2500
- return RETURN_SUCCESS;
2501
- case '"':
2502
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2503
- assert(temporary_buffer_equals(parser, ""));
2504
- gumbo_tokenizer_set_state(
2505
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2506
- return NEXT_CHAR;
2507
- case '\'':
2508
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2509
- assert(temporary_buffer_equals(parser, ""));
2510
- gumbo_tokenizer_set_state(
2511
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2512
- return NEXT_CHAR;
2513
- case -1:
2514
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2515
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2516
- tokenizer->_reconsume_current_input = true;
2517
- tokenizer->_doc_type_state.force_quirks = true;
2518
- emit_doctype(parser, output);
2519
- return RETURN_ERROR;
2520
- default:
2521
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2522
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2523
- tokenizer->_doc_type_state.force_quirks = true;
2524
- return NEXT_CHAR;
2525
- }
2526
- }
2527
-
2528
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2529
- static StateResult handle_between_doctype_public_system_id_state(
2530
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531
- GumboToken* output) {
2532
- switch (c) {
2533
- case '\t':
2534
- case '\n':
2535
- case '\f':
2536
- case ' ':
2537
- return NEXT_CHAR;
2538
- case '>':
2539
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2540
- emit_doctype(parser, output);
2541
- return RETURN_SUCCESS;
2542
- case '"':
2543
- assert(temporary_buffer_equals(parser, ""));
2544
- gumbo_tokenizer_set_state(
2545
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2546
- return NEXT_CHAR;
2547
- case '\'':
2548
- assert(temporary_buffer_equals(parser, ""));
2549
- gumbo_tokenizer_set_state(
2550
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2551
- return NEXT_CHAR;
2552
- case -1:
2553
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2554
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2555
- tokenizer->_doc_type_state.force_quirks = true;
2556
- emit_doctype(parser, output);
2557
- return RETURN_ERROR;
2558
- default:
2559
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2561
- tokenizer->_doc_type_state.force_quirks = true;
2562
- emit_doctype(parser, output);
2563
- return RETURN_ERROR;
2564
- }
2565
- }
2566
-
2567
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2568
- static StateResult handle_after_doctype_system_keyword_state(
2569
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570
- GumboToken* output) {
2571
- switch (c) {
2572
- case '\t':
2573
- case '\n':
2574
- case '\f':
2575
- case ' ':
2576
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2577
- return NEXT_CHAR;
2578
- case '"':
2579
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2580
- assert(temporary_buffer_equals(parser, ""));
2581
- gumbo_tokenizer_set_state(
2582
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2583
- return NEXT_CHAR;
2584
- case '\'':
2585
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2586
- assert(temporary_buffer_equals(parser, ""));
2587
- gumbo_tokenizer_set_state(
2588
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2589
- return NEXT_CHAR;
2590
- case '>':
2591
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2592
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2593
- tokenizer->_doc_type_state.force_quirks = true;
2594
- emit_doctype(parser, output);
2595
- return RETURN_ERROR;
2596
- case -1:
2597
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2598
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2599
- tokenizer->_doc_type_state.force_quirks = true;
2600
- emit_doctype(parser, output);
2601
- return RETURN_ERROR;
2602
- default:
2603
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2604
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2605
- tokenizer->_doc_type_state.force_quirks = true;
2606
- return NEXT_CHAR;
2607
- }
2608
- }
2609
-
2610
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2611
- static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2613
- switch (c) {
2614
- case '\t':
2615
- case '\n':
2616
- case '\f':
2617
- case ' ':
2618
- return NEXT_CHAR;
2619
- case '"':
2620
- assert(temporary_buffer_equals(parser, ""));
2621
- gumbo_tokenizer_set_state(
2622
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2623
- return NEXT_CHAR;
2624
- case '\'':
2625
- assert(temporary_buffer_equals(parser, ""));
2626
- gumbo_tokenizer_set_state(
2627
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2628
- return NEXT_CHAR;
2629
- case '>':
2630
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2631
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2632
- tokenizer->_doc_type_state.force_quirks = true;
2633
- emit_doctype(parser, output);
2634
- return RETURN_ERROR;
2635
- case -1:
2636
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2637
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2638
- tokenizer->_doc_type_state.force_quirks = true;
2639
- emit_doctype(parser, output);
2640
- return RETURN_ERROR;
2641
- default:
2642
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2643
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2644
- tokenizer->_doc_type_state.force_quirks = true;
2645
- return NEXT_CHAR;
2646
- }
2647
- }
2648
-
2649
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2650
- static StateResult handle_doctype_system_id_double_quoted_state(
2651
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652
- GumboToken* output) {
2653
- switch (c) {
2654
- case '"':
2655
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2656
- finish_doctype_system_id(parser);
2657
- return NEXT_CHAR;
2658
- case '\0':
2659
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2660
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2661
- return NEXT_CHAR;
2662
- case '>':
2663
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2664
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2665
- tokenizer->_doc_type_state.force_quirks = true;
2666
- finish_doctype_system_id(parser);
2667
- emit_doctype(parser, output);
2668
- return RETURN_ERROR;
2669
- case -1:
2670
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2671
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2672
- tokenizer->_doc_type_state.force_quirks = true;
2673
- finish_doctype_system_id(parser);
2674
- emit_doctype(parser, output);
2675
- return RETURN_ERROR;
2676
- default:
2677
- append_char_to_temporary_buffer(parser, c);
2678
- return NEXT_CHAR;
2679
- }
2680
- }
2681
-
2682
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2683
- static StateResult handle_doctype_system_id_single_quoted_state(
2684
- GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685
- GumboToken* output) {
2686
- switch (c) {
2687
- case '\'':
2688
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2689
- finish_doctype_system_id(parser);
2690
- return NEXT_CHAR;
2691
- case '\0':
2692
- tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2693
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2694
- return NEXT_CHAR;
2695
- case '>':
2696
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2697
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2698
- tokenizer->_doc_type_state.force_quirks = true;
2699
- finish_doctype_system_id(parser);
2700
- emit_doctype(parser, output);
2701
- return RETURN_ERROR;
2702
- case -1:
2703
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2704
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2705
- tokenizer->_doc_type_state.force_quirks = true;
2706
- finish_doctype_system_id(parser);
2707
- emit_doctype(parser, output);
2708
- return RETURN_ERROR;
2709
- default:
2710
- append_char_to_temporary_buffer(parser, c);
2711
- return NEXT_CHAR;
2712
- }
2713
- }
2714
-
2715
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2716
- static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2718
- switch (c) {
2719
- case '\t':
2720
- case '\n':
2721
- case '\f':
2722
- case ' ':
2723
- return NEXT_CHAR;
2724
- case '>':
2725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2726
- emit_doctype(parser, output);
2727
- return RETURN_SUCCESS;
2728
- case -1:
2729
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2730
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2731
- tokenizer->_doc_type_state.force_quirks = true;
2732
- emit_doctype(parser, output);
2733
- return RETURN_ERROR;
2734
- default:
2735
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2736
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2737
- return NEXT_CHAR;
2738
- }
2739
- }
2740
-
2741
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2742
- static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2744
- if (c == '>' || c == -1) {
2745
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2746
- emit_doctype(parser, output);
2747
- return RETURN_ERROR;
2748
- }
2749
- return NEXT_CHAR;
2750
- }
2751
-
2752
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2753
- static StateResult handle_cdata_state(GumboParser* parser,
2754
- GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2755
- if (c == -1 || utf8iterator_maybe_consume_match(
2756
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2757
- tokenizer->_reconsume_current_input = true;
2758
- reset_token_start_point(tokenizer);
2759
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2760
- tokenizer->_is_in_cdata = false;
2761
- return NEXT_CHAR;
2762
- } else {
2763
- return emit_current_char(parser, output);
2764
- }
2765
- }
2766
-
2767
- typedef StateResult (*GumboLexerStateFunction)(
2768
- GumboParser*, GumboTokenizerState*, int, GumboToken*);
2769
-
2770
- static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771
- handle_char_ref_in_data_state, handle_rcdata_state,
2772
- handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773
- handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774
- handle_tag_name_state, handle_rcdata_lt_state,
2775
- handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776
- handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777
- handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778
- handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779
- handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780
- handle_script_escaped_state, handle_script_escaped_dash_state,
2781
- handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782
- handle_script_escaped_end_tag_open_state,
2783
- handle_script_escaped_end_tag_name_state,
2784
- handle_script_double_escaped_start_state,
2785
- handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786
- handle_script_double_escaped_dash_dash_state,
2787
- handle_script_double_escaped_lt_state,
2788
- handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789
- handle_attr_name_state, handle_after_attr_name_state,
2790
- handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791
- handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792
- handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793
- handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794
- handle_markup_declaration_state, handle_comment_start_state,
2795
- handle_comment_start_dash_state, handle_comment_state,
2796
- handle_comment_end_dash_state, handle_comment_end_state,
2797
- handle_comment_end_bang_state, handle_doctype_state,
2798
- handle_before_doctype_name_state, handle_doctype_name_state,
2799
- handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800
- handle_before_doctype_public_id_state,
2801
- handle_doctype_public_id_double_quoted_state,
2802
- handle_doctype_public_id_single_quoted_state,
2803
- handle_after_doctype_public_id_state,
2804
- handle_between_doctype_public_system_id_state,
2805
- handle_after_doctype_system_keyword_state,
2806
- handle_before_doctype_system_id_state,
2807
- handle_doctype_system_id_double_quoted_state,
2808
- handle_doctype_system_id_single_quoted_state,
2809
- handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810
- handle_cdata_state};
2811
-
2812
- bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2813
- // Because of the spec requirements that...
2814
- //
2815
- // 1. Tokens be handled immediately by the parser upon emission.
2816
- // 2. Some states (eg. CDATA, or various error conditions) require the
2817
- // emission of multiple tokens in the same states.
2818
- // 3. The tokenizer often has to reconsume the same character in a different
2819
- // state.
2820
- //
2821
- // ...all state must be held in the GumboTokenizer struct instead of in local
2822
- // variables in this function. That allows us to return from this method with
2823
- // a token, and then immediately jump back to the same state with the same
2824
- // input if we need to return a different token. The various emit_* functions
2825
- // are responsible for changing state (eg. flushing the chardata buffer,
2826
- // reading the next input character) to avoid an infinite loop.
2827
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2828
-
2829
- if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2830
- tokenizer->_reconsume_current_input = true;
2831
- emit_char(parser, tokenizer->_buffered_emit_char, output);
2832
- // And now that we've avoided advancing the input, make sure we set
2833
- // _reconsume_current_input back to false to make sure the *next* character
2834
- // isn't consumed twice.
2835
- tokenizer->_reconsume_current_input = false;
2836
- tokenizer->_buffered_emit_char = kGumboNoChar;
2837
- return true;
2838
- }
2839
-
2840
- if (maybe_emit_from_temporary_buffer(parser, output)) {
2841
- return true;
2842
- }
2843
-
2844
- while (1) {
2845
- assert(!tokenizer->_temporary_buffer_emit);
2846
- assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2847
- int c = utf8iterator_current(&tokenizer->_input);
2848
- gumbo_debug(
2849
- "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
2850
- StateResult result =
2851
- dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2852
- // We need to clear reconsume_current_input before returning to prevent
2853
- // certain infinite loop states.
2854
- bool should_advance = !tokenizer->_reconsume_current_input;
2855
- tokenizer->_reconsume_current_input = false;
2856
-
2857
- if (result == RETURN_SUCCESS) {
2858
- return true;
2859
- } else if (result == RETURN_ERROR) {
2860
- return false;
2861
- }
2862
-
2863
- if (should_advance) {
2864
- utf8iterator_next(&tokenizer->_input);
2865
- }
2866
- }
2867
- }
2868
-
2869
- void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2870
- if (!token) return;
2871
-
2872
- switch (token->type) {
2873
- case GUMBO_TOKEN_DOCTYPE:
2874
- gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2875
- gumbo_parser_deallocate(
2876
- parser, (void*) token->v.doc_type.public_identifier);
2877
- gumbo_parser_deallocate(
2878
- parser, (void*) token->v.doc_type.system_identifier);
2879
- return;
2880
- case GUMBO_TOKEN_START_TAG:
2881
- for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2882
- GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2883
- if (attr) {
2884
- // May have been nulled out if this token was merged with another.
2885
- gumbo_destroy_attribute(parser, attr);
2886
- }
2887
- }
2888
- gumbo_parser_deallocate(
2889
- parser, (void*) token->v.start_tag.attributes.data);
2890
- return;
2891
- case GUMBO_TOKEN_COMMENT:
2892
- gumbo_parser_deallocate(parser, (void*) token->v.text);
2893
- return;
2894
- default:
2895
- return;
2896
- }
2897
- }