nokogumbo 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/work/token_type.h DELETED
@@ -1,40 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_TOKEN_TYPE_H_
18
- #define GUMBO_TOKEN_TYPE_H_
19
-
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
- // An enum representing the type of token.
25
- typedef enum _GumboTokenType {
26
- GUMBO_TOKEN_DOCTYPE,
27
- GUMBO_TOKEN_START_TAG,
28
- GUMBO_TOKEN_END_TAG,
29
- GUMBO_TOKEN_COMMENT,
30
- GUMBO_TOKEN_WHITESPACE,
31
- GUMBO_TOKEN_CHARACTER,
32
- GUMBO_TOKEN_NULL,
33
- GUMBO_TOKEN_EOF
34
- } GumboTokenType;
35
-
36
- #ifdef __cplusplus
37
- } // extern C
38
- #endif
39
-
40
- #endif // GUMBO_TOKEN_TYPE_H_
data/work/tokenizer.c DELETED
@@ -1,2978 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Coding conventions specific to this file:
18
- //
19
- // 1. Functions that fill in a token should be named emit_*, and should be
20
- // followed immediately by a return from the tokenizer (true if no error
21
- // occurred, false if an error occurred). Sometimes the emit functions
22
- // themselves return a boolean so that they can be combined with the return
23
- // statement; in this case, they should match this convention.
24
- // 2. Functions that shuffle data from temporaries to final API structures
25
- // should be named finish_*, and be called just before the tokenizer exits the
26
- // state that accumulates the temporary.
27
- // 3. All internal data structures should be kept in an initialized state from
28
- // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
- // and reset, it should be deallocated and immediately reinitialized.
30
- // 4. Make sure there are appropriate break statements following each state.
31
- // 5. Assertions on the state of the temporary and tag buffers are usually a
32
- // good idea, and should go at the entry point of each state when added.
33
- // 6. Statement order within states goes:
34
- // 1. Add parse errors, if appropriate.
35
- // 2. Call finish_* functions to build up tag state.
36
- // 3. Switch to new state. Set _reconsume flag if appropriate.
37
- // 4. Perform any other temporary buffer manipulation.
38
- // 5. Emit tokens
39
- // 6. Return/break.
40
- // This order ensures that we can verify that every emit is followed by a
41
- // return, ensures that the correct state is recorded with any parse errors, and
42
- // prevents parse error position from being messed up by possible mark/resets in
43
- // temporary buffer manipulation.
44
-
45
-
46
- #include "tokenizer.h"
47
-
48
- #include <assert.h>
49
- #include <stdbool.h>
50
- #include <string.h>
51
-
52
- #include "attribute.h"
53
- #include "char_ref.h"
54
- #include "error.h"
55
- #include "gumbo.h"
56
- #include "parser.h"
57
- #include "string_buffer.h"
58
- #include "string_piece.h"
59
- #include "token_type.h"
60
- #include "tokenizer_states.h"
61
- #include "utf8.h"
62
- #include "util.h"
63
- #include "vector.h"
64
-
65
- // Compared against _script_data_buffer to determine if we're in double-escaped
66
- // script mode.
67
- const GumboStringPiece kScriptTag = { "script", 6 };
68
-
69
- // An enum for the return value of each individual state.
70
- typedef enum {
71
- RETURN_ERROR, // Return false (error) from the tokenizer.
72
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
73
- NEXT_CHAR // Proceed to the next character and continue lexing.
74
- } StateResult;
75
-
76
- // This is a struct containing state necessary to build up a tag token,
77
- // character by character.
78
- typedef struct _GumboTagState {
79
- // A buffer to accumulate characters for various GumboStringPiece fields.
80
- GumboStringBuffer _buffer;
81
-
82
- // A pointer to the start of the original text corresponding to the contents
83
- // of the buffer.
84
- const char* _original_text;
85
-
86
- // The current tag enum, computed once the tag name state has finished so that
87
- // the buffer can be re-used for building up attributes.
88
- GumboTag _tag;
89
-
90
- // The starting location of the text in the buffer.
91
- GumboSourcePosition _start_pos;
92
-
93
- // The current list of attributes. This is copied (and ownership of its data
94
- // transferred) to the GumboStartTag token upon completion of the tag. New
95
- // attributes are added as soon as their attribute name state is complete, and
96
- // values are filled in by operating on _attributes.data[_attributes.length-1].
97
- GumboVector /* GumboAttribute */ _attributes;
98
-
99
- // If true, the next attribute value to be finished should be dropped. This
100
- // happens if a duplicate attribute name is encountered - we want to consume
101
- // the attribute value, but shouldn't overwrite the existing value.
102
- bool _drop_next_attr_value;
103
-
104
- // The state that caused the tokenizer to switch into a character reference in
105
- // attribute value state. This is used to set the additional allowed
106
- // character, and is switched back to on completion. Initialized as the
107
- // tokenizer enters the character reference state.
108
- GumboTokenizerEnum _attr_value_state;
109
-
110
- // The last start tag to have been emitted by the tokenizer. This is
111
- // necessary to check for appropriate end tags.
112
- GumboTag _last_start_tag;
113
-
114
- // If true, then this is a start tag. If false, it's an end tag. This is
115
- // necessary to generate the appropriate token type at tag-closing time.
116
- bool _is_start_tag;
117
-
118
- // If true, then this tag is "self-closing" and doesn't have an end tag.
119
- bool _is_self_closing;
120
- } GumboTagState;
121
-
122
- // This is the main tokenizer state struct, containing all state used in
123
- // tokenizing the input stream.
124
- typedef struct _GumboTokenizerState {
125
- // The current lexer state. Starts in GUMBO_LEX_DATA.
126
- GumboTokenizerEnum _state;
127
-
128
- // A flag indicating whether the current input character needs to be reconsumed
129
- // in another state, or whether the next input character should be read for
130
- // the next iteration of the state loop. This is set when the spec reads
131
- // "Reconsume the current input character in..."
132
- bool _reconsume_current_input;
133
-
134
- // A flag indicating whether the current node is a foreign element. This is
135
- // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
136
- // markup declaration state.
137
- bool _is_current_node_foreign;
138
-
139
- // Certain states (notably character references) may emit two character tokens
140
- // at once, but the contract for lex() fills in only one token at a time. The
141
- // extra character is buffered here, and then this is checked on entry to
142
- // lex(). If a character is stored here, it's immediately emitted and control
143
- // returns from the lexer. kGumboNoChar is used to represent 'no character
144
- // stored.'
145
- //
146
- // Note that characters emitted through this mechanism will have their source
147
- // position marked as the character under the mark, i.e. multiple characters
148
- // may be emitted with the same position. This is desirable for character
149
- // references, but unsuitable for many other cases. Use the _temporary_buffer
150
- // mechanism if the buffered characters must have their original positions in
151
- // the document.
152
- int _buffered_emit_char;
153
-
154
- // A temporary buffer to accumulate characters, as described by the "temporary
155
- // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
156
- // way: we record the specific character to go into the buffer, which may
157
- // sometimes be a lowercased version of the actual input character. However,
158
- // we *also* use utf8iterator_mark() to record the position at tag start.
159
- // When we start flushing the temporary buffer, we set _temporary_buffer_emit
160
- // to the start of it, and then increment it for each call to the tokenizer.
161
- // We also call utf8iterator_reset(), and utf8iterator_next() through the
162
- // input stream, so that tokens emitted by emit_char have the correct position
163
- // and original text.
164
- GumboStringBuffer _temporary_buffer;
165
-
166
- // The current cursor position we're emitting from within
167
- // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
168
- const char* _temporary_buffer_emit;
169
-
170
- // The temporary buffer is also used by the spec to check whether we should
171
- // enter the script data double escaped state, but we can't use the same
172
- // buffer for both because we have to flush out "<s" as emits while still
173
- // maintaining the context that will eventually become "script". This is a
174
- // separate buffer that's used in place of the temporary buffer for states
175
- // that may enter the script data double escape start state.
176
- GumboStringBuffer _script_data_buffer;
177
-
178
- // Pointer to the beginning of the current token in the original buffer; used
179
- // to record the original text.
180
- const char* _token_start;
181
-
182
- // GumboSourcePosition recording the source location of the start of the
183
- // current token.
184
- GumboSourcePosition _token_start_pos;
185
-
186
- // Current tag state.
187
- GumboTagState _tag_state;
188
-
189
- // Doctype state. We use the temporary buffer to accumulate characters (it's
190
- // not used for anything else in the doctype states), and then freshly
191
- // allocate the strings in the doctype token, then copy it over on emit.
192
- GumboTokenDocType _doc_type_state;
193
-
194
- // The UTF8Iterator over the tokenizer input.
195
- Utf8Iterator _input;
196
- } GumboTokenizerState;
197
-
198
- // Adds a tokenizer parse error of the given type to the parser's error struct.
199
- static void add_parse_error(GumboParser* parser, GumboErrorType type) {
200
- GumboError* error = gumbo_add_error(parser);
201
- if (!error) {
202
- return;
203
- }
204
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
205
- utf8iterator_get_position(&tokenizer->_input, &error->position);
206
- error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
207
- error->type = type;
208
- error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
209
- switch (tokenizer->_state) {
210
- case GUMBO_LEX_DATA:
211
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
212
- break;
213
- case GUMBO_LEX_CHAR_REF_IN_DATA:
214
- case GUMBO_LEX_CHAR_REF_IN_RCDATA:
215
- case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
216
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
217
- break;
218
- case GUMBO_LEX_RCDATA:
219
- case GUMBO_LEX_RCDATA_LT:
220
- case GUMBO_LEX_RCDATA_END_TAG_OPEN:
221
- case GUMBO_LEX_RCDATA_END_TAG_NAME:
222
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
223
- break;
224
- case GUMBO_LEX_RAWTEXT:
225
- case GUMBO_LEX_RAWTEXT_LT:
226
- case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
227
- case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
228
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
229
- break;
230
- case GUMBO_LEX_PLAINTEXT:
231
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
232
- break;
233
- case GUMBO_LEX_SCRIPT:
234
- case GUMBO_LEX_SCRIPT_LT:
235
- case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
236
- case GUMBO_LEX_SCRIPT_END_TAG_NAME:
237
- case GUMBO_LEX_SCRIPT_ESCAPED_START:
238
- case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
239
- case GUMBO_LEX_SCRIPT_ESCAPED:
240
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
241
- case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
242
- case GUMBO_LEX_SCRIPT_ESCAPED_LT:
243
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
244
- case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
245
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
246
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
247
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
248
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
249
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
250
- case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
251
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
252
- break;
253
- case GUMBO_LEX_TAG_OPEN:
254
- case GUMBO_LEX_END_TAG_OPEN:
255
- case GUMBO_LEX_TAG_NAME:
256
- case GUMBO_LEX_BEFORE_ATTR_NAME:
257
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
258
- break;
259
- case GUMBO_LEX_SELF_CLOSING_START_TAG:
260
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
261
- break;
262
- case GUMBO_LEX_ATTR_NAME:
263
- case GUMBO_LEX_AFTER_ATTR_NAME:
264
- case GUMBO_LEX_BEFORE_ATTR_VALUE:
265
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
266
- break;
267
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
268
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
269
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
270
- case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
271
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
272
- break;
273
- case GUMBO_LEX_BOGUS_COMMENT:
274
- case GUMBO_LEX_COMMENT_START:
275
- case GUMBO_LEX_COMMENT_START_DASH:
276
- case GUMBO_LEX_COMMENT:
277
- case GUMBO_LEX_COMMENT_END_DASH:
278
- case GUMBO_LEX_COMMENT_END:
279
- case GUMBO_LEX_COMMENT_END_BANG:
280
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
281
- break;
282
- case GUMBO_LEX_MARKUP_DECLARATION:
283
- case GUMBO_LEX_DOCTYPE:
284
- case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
285
- case GUMBO_LEX_DOCTYPE_NAME:
286
- case GUMBO_LEX_AFTER_DOCTYPE_NAME:
287
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
288
- case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
289
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
290
- case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
291
- case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
292
- case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
293
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
294
- case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
295
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
296
- case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
297
- case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
298
- case GUMBO_LEX_BOGUS_DOCTYPE:
299
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
300
- break;
301
- case GUMBO_LEX_CDATA:
302
- error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
303
- break;
304
- }
305
- }
306
-
307
- static bool is_alpha(int c) {
308
- // We don't use ISO C isupper/islower functions here because they
309
- // depend upon the program's locale, while the behavior of the HTML5 spec is
310
- // independent of which locale the program is run in.
311
- return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
312
- }
313
-
314
- static int ensure_lowercase(int c) {
315
- return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
316
- }
317
-
318
- static GumboTokenType get_char_token_type(int c) {
319
- switch (c) {
320
- case '\t':
321
- case '\n':
322
- case '\r':
323
- case '\f':
324
- case ' ':
325
- return GUMBO_TOKEN_WHITESPACE;
326
- case 0:
327
- gumbo_debug("Emitted null byte.\n");
328
- return GUMBO_TOKEN_NULL;
329
- case -1:
330
- return GUMBO_TOKEN_EOF;
331
- default:
332
- return GUMBO_TOKEN_CHARACTER;
333
- }
334
- }
335
-
336
- // Starts recording characters in the temporary buffer.
337
- // Because this needs to reset the utf8iterator_mark to the beginning of the
338
- // text that will eventually be emitted, it needs to be called a couple of
339
- // states before the spec says "Set the temporary buffer to the empty string".
340
- // In general, this should be called whenever there's a transition to a
341
- // "less-than sign state". The initial < and possibly / then need to be
342
- // appended to the temporary buffer, their presence needs to be accounted for in
343
- // states that compare the temporary buffer against a literal value, and
344
- // spec stanzas that say "emit a < and / character token along with a character
345
- // token for each character in the temporary buffer" need to be adjusted to
346
- // account for the presence of the < and / inside the temporary buffer.
347
- static void clear_temporary_buffer(GumboParser* parser) {
348
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
- assert(!tokenizer->_temporary_buffer_emit);
350
- utf8iterator_mark(&tokenizer->_input);
351
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
353
- // The temporary buffer and script data buffer are the same object in the
354
- // spec, so the script data buffer should be cleared as well.
355
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
357
- }
358
-
359
- // Appends a codepoint to the temporary buffer.
360
- static void append_char_to_temporary_buffer(
361
- GumboParser* parser, int codepoint) {
362
- gumbo_string_buffer_append_codepoint(
363
- parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
364
- }
365
-
366
- // Checks to see if the temporary buffer equals a certain string.
367
- // Make sure this remains side-effect free; it's used in assertions.
368
- #ifndef NDEBUG
369
- static bool temporary_buffer_equals(
370
- GumboParser* parser, const char* text) {
371
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
372
- // TODO(jdtang): See if the extra strlen is a performance problem, and replace
373
- // it with an explicit sizeof(literal) if necessary. I don't think it will
374
- // be, as this is only used in a couple of rare states.
375
- int text_len = strlen(text);
376
- return text_len == buffer->length &&
377
- memcmp(buffer->data, text, text_len) == 0;
378
- }
379
- #endif
380
-
381
- static void doc_type_state_init(GumboParser* parser) {
382
- GumboTokenDocType* doc_type_state =
383
- &parser->_tokenizer_state->_doc_type_state;
384
- // We initialize these to NULL here so that we don't end up leaking memory if
385
- // we never see a doctype token. When we do see a doctype token, we reset
386
- // them to a freshly-allocated empty string so that we can present a uniform
387
- // interface to client code and not make them check for null. Ownership is
388
- // transferred to the doctype token when it's emitted.
389
- doc_type_state->name = NULL;
390
- doc_type_state->public_identifier = NULL;
391
- doc_type_state->system_identifier = NULL;
392
- doc_type_state->force_quirks = false;
393
- doc_type_state->has_public_identifier = false;
394
- doc_type_state->has_system_identifier = false;
395
- }
396
-
397
- // Sets the token original_text and position to the current iterator position.
398
- // This is necessary because [CDATA[ sections may include text that is ignored
399
- // by the tokenizer.
400
- static void reset_token_start_point(GumboTokenizerState* tokenizer) {
401
- tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
402
- utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
403
- }
404
-
405
- // Sets the tag buffer original text and start point to the current iterator
406
- // position. This is necessary because attribute names & values may have
407
- // whitespace preceding them, and so we can't assume that the actual token
408
- // starting point was the end of the last tag buffer usage.
409
- static void reset_tag_buffer_start_point(GumboParser* parser) {
410
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
411
- GumboTagState* tag_state = &tokenizer->_tag_state;
412
-
413
- utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
414
- tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
415
- }
416
-
417
- // Moves the temporary buffer contents over to the specified output string,
418
- // and clears the temporary buffer.
419
- static void finish_temporary_buffer(GumboParser* parser, const char** output) {
420
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
421
- *output =
422
- gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
423
- clear_temporary_buffer(parser);
424
- }
425
-
426
- // Advances the iterator past the end of the token, and then fills in the
427
- // relevant position fields. It's assumed that after every emit, the tokenizer
428
- // will immediately return (letting the tree-construction stage read the filled
429
- // in Token). Thus, it's safe to advance the input stream here, since it will
430
- // bypass the advance at the bottom of the state machine loop.
431
- //
432
- // Since this advances the iterator and resets the current input, make sure to
433
- // call it after you've recorded any other data you need for the token.
434
- static void finish_token(GumboParser* parser, GumboToken* token) {
435
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
436
- if (!tokenizer->_reconsume_current_input) {
437
- utf8iterator_next(&tokenizer->_input);
438
- }
439
-
440
- token->position = tokenizer->_token_start_pos;
441
- token->original_text.data = tokenizer->_token_start;
442
- reset_token_start_point(tokenizer);
443
- token->original_text.length =
444
- tokenizer->_token_start - token->original_text.data;
445
- if (token->original_text.data[token->original_text.length - 1] == '\r') {
446
- // The UTF8 iterator will ignore carriage returns in the input stream, which
447
- // means that the next token may start one past a \r character. The pointer
448
- // arithmetic above results in that \r being appended to the original text
449
- // of the preceding token, so we have to adjust its length here to chop the
450
- // \r off.
451
- --token->original_text.length;
452
- }
453
- }
454
-
455
- // Records the doctype public ID, assumed to be in the temporary buffer.
456
- // Convenience method that also sets has_public_identifier to true.
457
- static void finish_doctype_public_id(GumboParser* parser) {
458
- GumboTokenDocType* doc_type_state =
459
- &parser->_tokenizer_state->_doc_type_state;
460
- gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
461
- finish_temporary_buffer(parser, &doc_type_state->public_identifier);
462
- doc_type_state->has_public_identifier = true;
463
- }
464
-
465
- // Records the doctype system ID, assumed to be in the temporary buffer.
466
- // Convenience method that also sets has_system_identifier to true.
467
- static void finish_doctype_system_id(GumboParser* parser) {
468
- GumboTokenDocType* doc_type_state =
469
- &parser->_tokenizer_state->_doc_type_state;
470
- gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
471
- finish_temporary_buffer(parser, &doc_type_state->system_identifier);
472
- doc_type_state->has_system_identifier = true;
473
- }
474
-
475
- // Writes a single specified character to the output token.
476
- static void emit_char(GumboParser* parser, int c, GumboToken* output) {
477
- output->type = get_char_token_type(c);
478
- output->v.character = c;
479
- finish_token(parser, output);
480
- }
481
-
482
- // Writes a replacement character token and records a parse error.
483
- // Always returns RETURN_ERROR, per gumbo_lex return value.
484
- static StateResult emit_replacement_char(
485
- GumboParser* parser, GumboToken* output) {
486
- // In all cases, this is because of a null byte in the input stream.
487
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
488
- emit_char(parser, kUtf8ReplacementChar, output);
489
- return RETURN_ERROR;
490
- }
491
-
492
- // Writes an EOF character token. Always returns RETURN_SUCCESS.
493
- static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
494
- emit_char(parser, -1, output);
495
- return RETURN_SUCCESS;
496
- }
497
-
498
- // Writes the current input character out as a character token.
499
- // Always returns RETURN_SUCCESS.
500
- static bool emit_current_char(GumboParser* parser, GumboToken* output) {
501
- emit_char(
502
- parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
503
- return RETURN_SUCCESS;
504
- }
505
-
506
- // Writes out a doctype token, copying it from the tokenizer state.
507
- static void emit_doctype(GumboParser* parser, GumboToken* output) {
508
- output->type = GUMBO_TOKEN_DOCTYPE;
509
- output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
510
- finish_token(parser, output);
511
- doc_type_state_init(parser);
512
- }
513
-
514
- // Debug-only function that explicitly sets the attribute vector data to NULL so
515
- // it can be asserted on tag creation, verifying that there are no memory leaks.
516
- static void mark_tag_state_as_empty(GumboTagState* tag_state) {
517
- #ifndef NDEBUG
518
- tag_state->_attributes = kGumboEmptyVector;
519
- #endif
520
- }
521
-
522
- // Writes out the current tag as a start or end tag token.
523
- // Always returns RETURN_SUCCESS.
524
- static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
525
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
526
- if (tag_state->_is_start_tag) {
527
- output->type = GUMBO_TOKEN_START_TAG;
528
- output->v.start_tag.tag = tag_state->_tag;
529
- output->v.start_tag.attributes = tag_state->_attributes;
530
- output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
531
- tag_state->_last_start_tag = tag_state->_tag;
532
- mark_tag_state_as_empty(tag_state);
533
- gumbo_debug("Emitted start tag %s.\n",
534
- gumbo_normalized_tagname(tag_state->_tag));
535
- } else {
536
- output->type = GUMBO_TOKEN_END_TAG;
537
- output->v.end_tag = tag_state->_tag;
538
- // In end tags, ownership of the attributes vector is not transferred to the
539
- // token, but it's still initialized as normal, so it must be manually
540
- // deallocated. There may also be attributes to destroy, in certain broken
541
- // cases like </div</th> (the "th" is an attribute there).
542
- for (int i = 0; i < tag_state->_attributes.length; ++i) {
543
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
544
- }
545
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
546
- mark_tag_state_as_empty(tag_state);
547
- gumbo_debug("Emitted end tag %s.\n",
548
- gumbo_normalized_tagname(tag_state->_tag));
549
- }
550
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
551
- finish_token(parser, output);
552
- gumbo_debug("Original text = %.*s.\n", output->original_text.length, output->original_text.data);
553
- assert(output->original_text.length >= 2);
554
- assert(output->original_text.data[0] == '<');
555
- assert(output->original_text.data[output->original_text.length - 1] == '>');
556
- return RETURN_SUCCESS;
557
- }
558
-
559
- // In some states, we speculatively start a tag, but don't know whether it'll be
560
- // emitted as tag token or as a series of character tokens until we finish it.
561
- // We need to abandon the tag we'd started & free its memory in that case to
562
- // avoid a memory leak.
563
- static void abandon_current_tag(GumboParser* parser) {
564
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
565
- for (int i = 0; i < tag_state->_attributes.length; ++i) {
566
- gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
567
- }
568
- gumbo_parser_deallocate(parser, tag_state->_attributes.data);
569
- mark_tag_state_as_empty(tag_state);
570
- gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
571
- gumbo_debug("Abandoning current tag.\n");
572
- }
573
-
574
- // Wraps the consume_char_ref function to handle its output and make the
575
- // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
576
- // error occurred, RETURN_SUCCESS otherwise.
577
- static StateResult emit_char_ref(
578
- GumboParser* parser, int additional_allowed_char,
579
- bool is_in_attribute, GumboToken* output) {
580
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
581
- OneOrTwoCodepoints char_ref;
582
- bool status = consume_char_ref(
583
- parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
584
- if (char_ref.first != kGumboNoChar) {
585
- // consume_char_ref ends with the iterator pointing at the next character,
586
- // so we need to be sure not to advance it again before reading the next token.
587
- tokenizer->_reconsume_current_input = true;
588
- emit_char(parser, char_ref.first, output);
589
- tokenizer->_buffered_emit_char = char_ref.second;
590
- } else {
591
- emit_char(parser, '&', output);
592
- }
593
- return status ? RETURN_SUCCESS : RETURN_ERROR;
594
- }
595
-
596
- // Emits a comment token. Comments use the temporary buffer to accumulate their
597
- // data, and then it's copied over and released to the 'text' field of the
598
- // GumboToken union. Always returns RETURN_SUCCESS.
599
- static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
600
- output->type = GUMBO_TOKEN_COMMENT;
601
- finish_temporary_buffer(parser, &output->v.text);
602
- finish_token(parser, output);
603
- return RETURN_SUCCESS;
604
- }
605
-
606
- // Checks to see if we should be flushing accumulated characters in the temporary
607
- // buffer, and fills the output token with the next output character if so.
608
- // Returns true if a character has been emitted and the tokenizer should
609
- // immediately return, false if we're at the end of the temporary buffer and
610
- // should resume normal operation.
611
- static bool maybe_emit_from_temporary_buffer(
612
- GumboParser* parser, GumboToken* output) {
613
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
614
- const char* c = tokenizer->_temporary_buffer_emit;
615
- GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
616
-
617
- if (!c || c >= buffer->data + buffer->length) {
618
- tokenizer->_temporary_buffer_emit = NULL;
619
- return false;
620
- }
621
-
622
- assert(*c == utf8iterator_current(&tokenizer->_input));
623
- // emit_char also advances the input stream. We need to do some juggling of
624
- // the _reconsume_current_input flag to get the proper behavior when emitting
625
- // previous tokens. Basically, _reconsume_current_input should *never* be set
626
- // when emitting anything from the temporary buffer, since those characters
627
- // have already been advanced past. However, it should be preserved so that
628
- // when the *next* character is encountered again, the tokenizer knows not to
629
- // advance past it.
630
- bool saved_reconsume_state = tokenizer->_reconsume_current_input;
631
- tokenizer->_reconsume_current_input = false;
632
- emit_char(parser, *c, output);
633
- ++tokenizer->_temporary_buffer_emit;
634
- tokenizer->_reconsume_current_input = saved_reconsume_state;
635
- return true;
636
- }
637
-
638
- // Sets up the tokenizer to begin flushing the temporary buffer.
639
- // This resets the input iterator stream to the start of the last tag, sets up
640
- // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
641
- // the first character in it. It returns true if a character was emitted, false
642
- // otherwise.
643
- static bool emit_temporary_buffer(
644
- GumboParser* parser, GumboToken* output) {
645
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
646
- assert(tokenizer->_temporary_buffer.data);
647
- utf8iterator_reset(&tokenizer->_input);
648
- tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
649
- return maybe_emit_from_temporary_buffer(parser, output);
650
- }
651
-
652
- // Appends a codepoint to the current tag buffer. If
653
- // reinitilize_position_on_first is set, this also initializes the tag buffer
654
- // start point; the only time you would *not* want to pass true for this
655
- // parameter is if you want the original_text to include character (like an
656
- // opening quote) that doesn't appear in the value.
657
- static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
658
- bool reinitilize_position_on_first) {
659
- GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
660
- if (buffer->length == 0 && reinitilize_position_on_first) {
661
- reset_tag_buffer_start_point(parser);
662
- }
663
- gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
664
- }
665
-
666
- // (Re-)initialize the tag buffer. This also resets the original_text pointer
667
- // and _start_pos field to point to the current position.
668
- static void initialize_tag_buffer(GumboParser* parser) {
669
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
670
- GumboTagState* tag_state = &tokenizer->_tag_state;
671
-
672
- gumbo_string_buffer_init(parser, &tag_state->_buffer);
673
- reset_tag_buffer_start_point(parser);
674
- }
675
-
676
- // Initializes the tag_state to start a new tag, keeping track of the opening
677
- // positions and original text. Takes a boolean indicating whether this is a
678
- // start or end tag.
679
- static void start_new_tag(GumboParser* parser, bool is_start_tag) {
680
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
681
- GumboTagState* tag_state = &tokenizer->_tag_state;
682
- int c = utf8iterator_current(&tokenizer->_input);
683
- assert(is_alpha(c));
684
- c = ensure_lowercase(c);
685
- assert(is_alpha(c));
686
-
687
- initialize_tag_buffer(parser);
688
- gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
689
-
690
- assert(tag_state->_attributes.data == NULL);
691
- gumbo_vector_init(parser, 4, &tag_state->_attributes);
692
- tag_state->_drop_next_attr_value = false;
693
- tag_state->_is_start_tag = is_start_tag;
694
- tag_state->_is_self_closing = false;
695
- gumbo_debug("Starting new tag.\n");
696
- }
697
-
698
- // Fills in the specified char* with the contents of the tag buffer.
699
- static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
700
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
701
- GumboTagState* tag_state = &tokenizer->_tag_state;
702
- *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
703
- }
704
-
705
- // Fills in:
706
- // * The original_text GumboStringPiece with the portion of the original
707
- // buffer that corresponds to the tag buffer.
708
- // * The start_pos GumboSourcePosition with the start position of the tag
709
- // buffer.
710
- // * The end_pos GumboSourcePosition with the current source position.
711
- static void copy_over_original_tag_text(
712
- GumboParser* parser, GumboStringPiece* original_text,
713
- GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
714
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
715
- GumboTagState* tag_state = &tokenizer->_tag_state;
716
-
717
- original_text->data = tag_state->_original_text;
718
- original_text->length =
719
- utf8iterator_get_char_pointer(&tokenizer->_input) -
720
- tag_state->_original_text;
721
- if (original_text->data[original_text->length - 1] == '\r') {
722
- // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
723
- // appended to the end of original text even when it's really the first part
724
- // of the next character. If we detect this situation, shrink the length of
725
- // the original text by 1 to remove the carriage return.
726
- --original_text->length;
727
- }
728
- *start_pos = tag_state->_start_pos;
729
- utf8iterator_get_position(&tokenizer->_input, end_pos);
730
- }
731
-
732
- // Releases and then re-initializes the tag buffer.
733
- static void reinitialize_tag_buffer(GumboParser* parser) {
734
- gumbo_parser_deallocate(
735
- parser, parser->_tokenizer_state->_tag_state._buffer.data);
736
- initialize_tag_buffer(parser);
737
- }
738
-
739
- // Moves some data from the temporary buffer over the the tag-based fields in
740
- // TagState.
741
- static void finish_tag_name(GumboParser* parser) {
742
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
743
- GumboTagState* tag_state = &tokenizer->_tag_state;
744
-
745
- const char* temp;
746
- copy_over_tag_buffer(parser, &temp);
747
- tag_state->_tag = gumbo_tag_enum(temp);
748
- reinitialize_tag_buffer(parser);
749
- gumbo_parser_deallocate(parser, (void*) temp);
750
- }
751
-
752
- // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
753
- static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
754
- int original_index, int new_index) {
755
- GumboError* error = gumbo_add_error(parser);
756
- if (!error) {
757
- return;
758
- }
759
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
760
- error->type = GUMBO_ERR_DUPLICATE_ATTR;
761
- error->position = tag_state->_start_pos;
762
- error->original_text = tag_state->_original_text;
763
- error->v.duplicate_attr.original_index = original_index;
764
- error->v.duplicate_attr.new_index = new_index;
765
- copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
766
- reinitialize_tag_buffer(parser);
767
- }
768
-
769
- // Creates a new attribute in the current tag, copying the current tag buffer to
770
- // the attribute's name. The attribute's value starts out as the empty string
771
- // (following the "Boolean attributes" section of the spec) and is only
772
- // overwritten on finish_attribute_value(). If the attribute has already been
773
- // specified, the new attribute is dropped, a parse error is added, and the
774
- // function returns false. Otherwise, this returns true.
775
- static bool finish_attribute_name(GumboParser* parser) {
776
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
777
- GumboTagState* tag_state = &tokenizer->_tag_state;
778
- // May've been set by a previous attribute without a value; reset it here.
779
- tag_state->_drop_next_attr_value = false;
780
- assert(tag_state->_attributes.data);
781
- assert(tag_state->_attributes.capacity);
782
-
783
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
784
- for (int i = 0; i < attributes->length; ++i) {
785
- GumboAttribute* attr = attributes->data[i];
786
- if (strlen(attr->name) == tag_state->_buffer.length &&
787
- memcmp(attr->name, tag_state->_buffer.data,
788
- tag_state->_buffer.length) == 0) {
789
- // Identical attribute; bail.
790
- add_duplicate_attr_error(
791
- parser, attr->name, i, attributes->length);
792
- tag_state->_drop_next_attr_value = true;
793
- return false;
794
- }
795
- }
796
-
797
- GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
798
- attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
799
- copy_over_tag_buffer(parser, &attr->name);
800
- copy_over_original_tag_text(parser, &attr->original_name,
801
- &attr->name_start, &attr->name_end);
802
- attr->value = gumbo_copy_stringz(parser, "");
803
- copy_over_original_tag_text(parser, &attr->original_value,
804
- &attr->name_start, &attr->name_end);
805
- gumbo_vector_add(parser, attr, attributes);
806
- reinitialize_tag_buffer(parser);
807
- return true;
808
- }
809
-
810
- // Finishes an attribute value. This sets the value of the most recently added
811
- // attribute to the current contents of the tag buffer.
812
- static void finish_attribute_value(GumboParser* parser) {
813
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
814
- if (tag_state->_drop_next_attr_value) {
815
- // Duplicate attribute name detected in an earlier state, so we have to
816
- // ignore the value.
817
- tag_state->_drop_next_attr_value = false;
818
- return;
819
- }
820
-
821
- GumboAttribute* attr =
822
- tag_state->_attributes.data[tag_state->_attributes.length - 1];
823
- gumbo_parser_deallocate(parser, (void*) attr->value);
824
- copy_over_tag_buffer(parser, &attr->value);
825
- copy_over_original_tag_text(parser, &attr->original_value,
826
- &attr->value_start, &attr->value_end);
827
- reinitialize_tag_buffer(parser);
828
- }
829
-
830
- // Returns true if the current end tag matches the last start tag emitted.
831
- static bool is_appropriate_end_tag(GumboParser* parser) {
832
- GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
833
- assert(!tag_state->_is_start_tag);
834
- // Null terminate the current string buffer, so it can be passed to
835
- // gumbo_tag_enum, but don't increment the length in case we need to dump the
836
- // buffer as character tokens.
837
- gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
838
- --tag_state->_buffer.length;
839
- return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
840
- tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
841
- }
842
-
843
- void gumbo_tokenizer_state_init(
844
- GumboParser* parser, const char* text, size_t text_length) {
845
- GumboTokenizerState* tokenizer =
846
- gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
847
- parser->_tokenizer_state = tokenizer;
848
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
849
- tokenizer->_reconsume_current_input = false;
850
- tokenizer->_is_current_node_foreign = false;
851
- tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
852
-
853
- tokenizer->_buffered_emit_char = kGumboNoChar;
854
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
855
- tokenizer->_temporary_buffer_emit = NULL;
856
-
857
- mark_tag_state_as_empty(&tokenizer->_tag_state);
858
-
859
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
860
- tokenizer->_token_start = text;
861
- utf8iterator_init(parser, text, text_length, &tokenizer->_input);
862
- utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
863
- doc_type_state_init(parser);
864
- }
865
-
866
- void gumbo_tokenizer_state_destroy(GumboParser* parser) {
867
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
868
- assert(tokenizer->_doc_type_state.name == NULL);
869
- assert(tokenizer->_doc_type_state.public_identifier == NULL);
870
- assert(tokenizer->_doc_type_state.system_identifier == NULL);
871
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
872
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
873
- gumbo_parser_deallocate(parser, tokenizer);
874
- }
875
-
876
- void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
877
- parser->_tokenizer_state->_state = state;
878
- }
879
-
880
- void gumbo_tokenizer_set_is_current_node_foreign(
881
- GumboParser* parser, bool is_foreign) {
882
- if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
883
- gumbo_debug("Toggling is_current_node_foreign to %s.\n",
884
- is_foreign ? "true" : "false");
885
- }
886
- parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
887
- }
888
-
889
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
890
- static StateResult handle_data_state(
891
- GumboParser* parser, GumboTokenizerState* tokenizer,
892
- int c, GumboToken* output) {
893
- switch (c) {
894
- case '&':
895
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
896
- // The char_ref machinery expects to be on the & so it can mark that
897
- // and return to it if the text isn't a char ref, so we need to
898
- // reconsume it.
899
- tokenizer->_reconsume_current_input = true;
900
- return NEXT_CHAR;
901
- case '<':
902
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
903
- clear_temporary_buffer(parser);
904
- append_char_to_temporary_buffer(parser, '<');
905
- return NEXT_CHAR;
906
- case '\0':
907
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
908
- emit_char(parser, c, output);
909
- return RETURN_ERROR;
910
- default:
911
- return emit_current_char(parser, output);
912
- }
913
- }
914
-
915
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
916
- static StateResult handle_char_ref_in_data_state(
917
- GumboParser* parser, GumboTokenizerState* tokenizer,
918
- int c, GumboToken* output) {
919
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
920
- return emit_char_ref(parser, ' ', false, output);
921
- }
922
-
923
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
924
- static StateResult handle_rcdata_state(
925
- GumboParser* parser, GumboTokenizerState* tokenizer,
926
- int c, GumboToken* output) {
927
- switch (c) {
928
- case '&':
929
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
930
- tokenizer->_reconsume_current_input = true;
931
- return NEXT_CHAR;
932
- case '<':
933
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
934
- clear_temporary_buffer(parser);
935
- append_char_to_temporary_buffer(parser, '<');
936
- return NEXT_CHAR;
937
- case '\0':
938
- return emit_replacement_char(parser, output);
939
- case -1:
940
- return emit_eof(parser, output);
941
- default:
942
- return emit_current_char(parser, output);
943
- }
944
- }
945
-
946
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
947
- static StateResult handle_char_ref_in_rcdata_state(
948
- GumboParser* parser, GumboTokenizerState* tokenizer,
949
- int c, GumboToken* output) {
950
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
951
- return emit_char_ref(parser, ' ', false, output);
952
- }
953
-
954
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
955
- static StateResult handle_rawtext_state(
956
- GumboParser* parser, GumboTokenizerState* tokenizer,
957
- int c, GumboToken* output) {
958
- switch (c) {
959
- case '<':
960
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
961
- clear_temporary_buffer(parser);
962
- append_char_to_temporary_buffer(parser, '<');
963
- return NEXT_CHAR;
964
- case '\0':
965
- return emit_replacement_char(parser, output);
966
- case -1:
967
- return emit_eof(parser, output);
968
- default:
969
- return emit_current_char(parser, output);
970
- }
971
- }
972
-
973
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
974
- static StateResult handle_script_state(
975
- GumboParser* parser, GumboTokenizerState* tokenizer,
976
- int c, GumboToken* output) {
977
- switch (c) {
978
- case '<':
979
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
980
- clear_temporary_buffer(parser);
981
- append_char_to_temporary_buffer(parser, '<');
982
- return NEXT_CHAR;
983
- case '\0':
984
- return emit_replacement_char(parser, output);
985
- case -1:
986
- return emit_eof(parser, output);
987
- default:
988
- return emit_current_char(parser, output);
989
- }
990
- }
991
-
992
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
993
- static StateResult handle_plaintext_state(
994
- GumboParser* parser, GumboTokenizerState* tokenizer,
995
- int c, GumboToken* output) {
996
- switch (c) {
997
- case '\0':
998
- return emit_replacement_char(parser, output);
999
- case -1:
1000
- return emit_eof(parser, output);
1001
- default:
1002
- return emit_current_char(parser, output);
1003
- }
1004
- }
1005
-
1006
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1007
- static StateResult handle_tag_open_state(
1008
- GumboParser* parser, GumboTokenizerState* tokenizer,
1009
- int c, GumboToken* output) {
1010
- assert(temporary_buffer_equals(parser, "<"));
1011
- switch (c) {
1012
- case '!':
1013
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1014
- clear_temporary_buffer(parser);
1015
- return NEXT_CHAR;
1016
- case '/':
1017
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1018
- append_char_to_temporary_buffer(parser, '/');
1019
- return NEXT_CHAR;
1020
- case '?':
1021
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1022
- clear_temporary_buffer(parser);
1023
- append_char_to_temporary_buffer(parser, '?');
1024
- add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1025
- return NEXT_CHAR;
1026
- default:
1027
- if (is_alpha(c)) {
1028
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1029
- start_new_tag(parser, true);
1030
- return NEXT_CHAR;
1031
- } else {
1032
- add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1033
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1034
- emit_temporary_buffer(parser, output);
1035
- return RETURN_ERROR;
1036
- }
1037
- }
1038
- }
1039
-
1040
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1041
- static StateResult handle_end_tag_open_state(
1042
- GumboParser* parser, GumboTokenizerState* tokenizer,
1043
- int c, GumboToken* output) {
1044
- assert(temporary_buffer_equals(parser, "</"));
1045
- switch (c) {
1046
- case '>':
1047
- add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1048
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1049
- return NEXT_CHAR;
1050
- case -1:
1051
- add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1052
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1053
- return emit_temporary_buffer(parser, output);
1054
- default:
1055
- if (is_alpha(c)) {
1056
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1057
- start_new_tag(parser, false);
1058
- } else {
1059
- add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1060
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1061
- clear_temporary_buffer(parser);
1062
- append_char_to_temporary_buffer(parser, c);
1063
- }
1064
- return NEXT_CHAR;
1065
- }
1066
- }
1067
-
1068
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1069
- static StateResult handle_tag_name_state(
1070
- GumboParser* parser, GumboTokenizerState* tokenizer,
1071
- int c, GumboToken* output) {
1072
- switch (c) {
1073
- case '\t':
1074
- case '\n':
1075
- case '\f':
1076
- case ' ':
1077
- finish_tag_name(parser);
1078
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1079
- return NEXT_CHAR;
1080
- case '/':
1081
- finish_tag_name(parser);
1082
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1083
- return NEXT_CHAR;
1084
- case '>':
1085
- finish_tag_name(parser);
1086
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1087
- return emit_current_tag(parser, output);
1088
- case '\0':
1089
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1090
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1091
- return NEXT_CHAR;
1092
- case -1:
1093
- add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1094
- abandon_current_tag(parser);
1095
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1096
- return NEXT_CHAR;
1097
- default:
1098
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1099
- return NEXT_CHAR;
1100
- }
1101
- }
1102
-
1103
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1104
- static StateResult handle_rcdata_lt_state(
1105
- GumboParser* parser, GumboTokenizerState* tokenizer,
1106
- int c, GumboToken* output) {
1107
- assert(temporary_buffer_equals(parser, "<"));
1108
- if (c == '/') {
1109
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1110
- append_char_to_temporary_buffer(parser, '/');
1111
- return NEXT_CHAR;
1112
- } else {
1113
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1114
- tokenizer->_reconsume_current_input = true;
1115
- return emit_temporary_buffer(parser, output);
1116
- }
1117
- }
1118
-
1119
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1120
- static StateResult handle_rcdata_end_tag_open_state(
1121
- GumboParser* parser, GumboTokenizerState* tokenizer,
1122
- int c, GumboToken* output) {
1123
- assert(temporary_buffer_equals(parser, "</"));
1124
- if (is_alpha(c)) {
1125
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1126
- start_new_tag(parser, false);
1127
- append_char_to_temporary_buffer(parser, c);
1128
- return NEXT_CHAR;
1129
- } else {
1130
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1131
- return emit_temporary_buffer(parser, output);
1132
- }
1133
- return true;
1134
- }
1135
-
1136
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1137
- static StateResult handle_rcdata_end_tag_name_state(
1138
- GumboParser* parser, GumboTokenizerState* tokenizer,
1139
- int c, GumboToken* output) {
1140
- assert(tokenizer->_temporary_buffer.length >= 2);
1141
- if (is_alpha(c)) {
1142
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1143
- append_char_to_temporary_buffer(parser, c);
1144
- return NEXT_CHAR;
1145
- } else if (is_appropriate_end_tag(parser)) {
1146
- switch (c) {
1147
- case '\t':
1148
- case '\n':
1149
- case '\f':
1150
- case ' ':
1151
- finish_tag_name(parser);
1152
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1153
- return NEXT_CHAR;
1154
- case '/':
1155
- finish_tag_name(parser);
1156
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1157
- return NEXT_CHAR;
1158
- case '>':
1159
- finish_tag_name(parser);
1160
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1161
- return emit_current_tag(parser, output);
1162
- }
1163
- }
1164
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1165
- abandon_current_tag(parser);
1166
- return emit_temporary_buffer(parser, output);
1167
- }
1168
-
1169
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1170
- static StateResult handle_rawtext_lt_state(
1171
- GumboParser* parser, GumboTokenizerState* tokenizer,
1172
- int c, GumboToken* output) {
1173
- assert(temporary_buffer_equals(parser, "<"));
1174
- if (c == '/') {
1175
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1176
- append_char_to_temporary_buffer(parser, '/');
1177
- return NEXT_CHAR;
1178
- } else {
1179
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1180
- tokenizer->_reconsume_current_input = true;
1181
- return emit_temporary_buffer(parser, output);
1182
- }
1183
- }
1184
-
1185
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1186
- static StateResult handle_rawtext_end_tag_open_state(
1187
- GumboParser* parser, GumboTokenizerState* tokenizer,
1188
- int c, GumboToken* output) {
1189
- assert(temporary_buffer_equals(parser, "</"));
1190
- if (is_alpha(c)) {
1191
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1192
- start_new_tag(parser, false);
1193
- append_char_to_temporary_buffer(parser, c);
1194
- return NEXT_CHAR;
1195
- } else {
1196
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1197
- return emit_temporary_buffer(parser, output);
1198
- }
1199
- }
1200
-
1201
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1202
- static StateResult handle_rawtext_end_tag_name_state(
1203
- GumboParser* parser, GumboTokenizerState* tokenizer,
1204
- int c, GumboToken* output) {
1205
- assert(tokenizer->_temporary_buffer.length >= 2);
1206
- gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1207
- tokenizer->_tag_state._buffer.data);
1208
- if (is_alpha(c)) {
1209
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1210
- append_char_to_temporary_buffer(parser, c);
1211
- return NEXT_CHAR;
1212
- } else if (is_appropriate_end_tag(parser)) {
1213
- gumbo_debug("Is an appropriate end tag.\n");
1214
- switch (c) {
1215
- case '\t':
1216
- case '\n':
1217
- case '\f':
1218
- case ' ':
1219
- finish_tag_name(parser);
1220
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1221
- return NEXT_CHAR;
1222
- case '/':
1223
- finish_tag_name(parser);
1224
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1225
- return NEXT_CHAR;
1226
- case '>':
1227
- finish_tag_name(parser);
1228
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1229
- return emit_current_tag(parser, output);
1230
- }
1231
- }
1232
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1233
- abandon_current_tag(parser);
1234
- return emit_temporary_buffer(parser, output);
1235
- }
1236
-
1237
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1238
- static StateResult handle_script_lt_state(
1239
- GumboParser* parser, GumboTokenizerState* tokenizer,
1240
- int c, GumboToken* output) {
1241
- assert(temporary_buffer_equals(parser, "<"));
1242
- if (c == '/') {
1243
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1244
- append_char_to_temporary_buffer(parser, '/');
1245
- return NEXT_CHAR;
1246
- } else if (c == '!') {
1247
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1248
- append_char_to_temporary_buffer(parser, '!');
1249
- return emit_temporary_buffer(parser, output);
1250
- } else {
1251
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1252
- tokenizer->_reconsume_current_input = true;
1253
- return emit_temporary_buffer(parser, output);
1254
- }
1255
- }
1256
-
1257
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1258
- static StateResult handle_script_end_tag_open_state(
1259
- GumboParser* parser, GumboTokenizerState* tokenizer,
1260
- int c, GumboToken* output) {
1261
- assert(temporary_buffer_equals(parser, "</"));
1262
- if (is_alpha(c)) {
1263
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1264
- start_new_tag(parser, false);
1265
- append_char_to_temporary_buffer(parser, c);
1266
- return NEXT_CHAR;
1267
- } else {
1268
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1269
- return emit_temporary_buffer(parser, output);
1270
- }
1271
- }
1272
-
1273
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1274
- static StateResult handle_script_end_tag_name_state(
1275
- GumboParser* parser, GumboTokenizerState* tokenizer,
1276
- int c, GumboToken* output) {
1277
- assert(tokenizer->_temporary_buffer.length >= 2);
1278
- if (is_alpha(c)) {
1279
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1280
- append_char_to_temporary_buffer(parser, c);
1281
- return NEXT_CHAR;
1282
- } else if (is_appropriate_end_tag(parser)) {
1283
- switch (c) {
1284
- case '\t':
1285
- case '\n':
1286
- case '\f':
1287
- case ' ':
1288
- finish_tag_name(parser);
1289
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1290
- return NEXT_CHAR;
1291
- case '/':
1292
- finish_tag_name(parser);
1293
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1294
- return NEXT_CHAR;
1295
- case '>':
1296
- finish_tag_name(parser);
1297
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1298
- return emit_current_tag(parser, output);
1299
- }
1300
- }
1301
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1302
- abandon_current_tag(parser);
1303
- return emit_temporary_buffer(parser, output);
1304
- }
1305
-
1306
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1307
- static StateResult handle_script_escaped_start_state(
1308
- GumboParser* parser, GumboTokenizerState* tokenizer,
1309
- int c, GumboToken* output) {
1310
- if (c == '-') {
1311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1312
- return emit_current_char(parser, output);
1313
- } else {
1314
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1315
- tokenizer->_reconsume_current_input = true;
1316
- return NEXT_CHAR;
1317
- }
1318
- }
1319
-
1320
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1321
- static StateResult handle_script_escaped_start_dash_state(
1322
- GumboParser* parser, GumboTokenizerState* tokenizer,
1323
- int c, GumboToken* output) {
1324
- if (c == '-') {
1325
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1326
- return emit_current_char(parser, output);
1327
- } else {
1328
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1329
- tokenizer->_reconsume_current_input = true;
1330
- return NEXT_CHAR;
1331
- }
1332
- }
1333
-
1334
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1335
- static StateResult handle_script_escaped_state(
1336
- GumboParser* parser, GumboTokenizerState* tokenizer,
1337
- int c, GumboToken* output) {
1338
- switch (c) {
1339
- case '-':
1340
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1341
- return emit_current_char(parser, output);
1342
- case '<':
1343
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1344
- clear_temporary_buffer(parser);
1345
- append_char_to_temporary_buffer(parser, c);
1346
- return NEXT_CHAR;
1347
- case '\0':
1348
- return emit_replacement_char(parser, output);
1349
- case -1:
1350
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1351
- return emit_eof(parser, output);
1352
- default:
1353
- return emit_current_char(parser, output);
1354
- }
1355
- }
1356
-
1357
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1358
- static StateResult handle_script_escaped_dash_state(
1359
- GumboParser* parser, GumboTokenizerState* tokenizer,
1360
- int c, GumboToken* output) {
1361
- switch (c) {
1362
- case '-':
1363
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1364
- return emit_current_char(parser, output);
1365
- case '<':
1366
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1367
- clear_temporary_buffer(parser);
1368
- append_char_to_temporary_buffer(parser, c);
1369
- return NEXT_CHAR;
1370
- case '\0':
1371
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1372
- return emit_replacement_char(parser, output);
1373
- case -1:
1374
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1375
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1376
- return NEXT_CHAR;
1377
- default:
1378
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1379
- return emit_current_char(parser, output);
1380
- }
1381
- }
1382
-
1383
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1384
- static StateResult handle_script_escaped_dash_dash_state(
1385
- GumboParser* parser, GumboTokenizerState* tokenizer,
1386
- int c, GumboToken* output) {
1387
- switch (c) {
1388
- case '-':
1389
- return emit_current_char(parser, output);
1390
- case '<':
1391
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1392
- clear_temporary_buffer(parser);
1393
- append_char_to_temporary_buffer(parser, c);
1394
- return NEXT_CHAR;
1395
- case '>':
1396
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1397
- return emit_current_char(parser, output);
1398
- case '\0':
1399
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1400
- return emit_replacement_char(parser, output);
1401
- case -1:
1402
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1403
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1404
- return NEXT_CHAR;
1405
- default:
1406
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1407
- return emit_current_char(parser, output);
1408
- }
1409
- }
1410
-
1411
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1412
- static StateResult handle_script_escaped_lt_state(
1413
- GumboParser* parser, GumboTokenizerState* tokenizer,
1414
- int c, GumboToken* output) {
1415
- assert(temporary_buffer_equals(parser, "<"));
1416
- assert(!tokenizer->_script_data_buffer.length);
1417
- if (c == '/') {
1418
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1419
- append_char_to_temporary_buffer(parser, c);
1420
- return NEXT_CHAR;
1421
- } else if (is_alpha(c)) {
1422
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1423
- append_char_to_temporary_buffer(parser, c);
1424
- gumbo_string_buffer_append_codepoint(
1425
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1426
- return emit_temporary_buffer(parser, output);
1427
- } else {
1428
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1429
- return emit_temporary_buffer(parser, output);
1430
- }
1431
- }
1432
-
1433
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1434
- static StateResult handle_script_escaped_end_tag_open_state(
1435
- GumboParser* parser, GumboTokenizerState* tokenizer,
1436
- int c, GumboToken* output) {
1437
- assert(temporary_buffer_equals(parser, "</"));
1438
- if (is_alpha(c)) {
1439
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1440
- start_new_tag(parser, false);
1441
- append_char_to_temporary_buffer(parser, c);
1442
- return NEXT_CHAR;
1443
- } else {
1444
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1445
- return emit_temporary_buffer(parser, output);
1446
- }
1447
- }
1448
-
1449
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1450
- static StateResult handle_script_escaped_end_tag_name_state(
1451
- GumboParser* parser, GumboTokenizerState* tokenizer,
1452
- int c, GumboToken* output) {
1453
- assert(tokenizer->_temporary_buffer.length >= 2);
1454
- if (is_alpha(c)) {
1455
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1456
- append_char_to_temporary_buffer(parser, c);
1457
- return NEXT_CHAR;
1458
- } else if (is_appropriate_end_tag(parser)) {
1459
- switch (c) {
1460
- case '\t':
1461
- case '\n':
1462
- case '\f':
1463
- case ' ':
1464
- finish_tag_name(parser);
1465
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1466
- return NEXT_CHAR;
1467
- case '/':
1468
- finish_tag_name(parser);
1469
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1470
- return NEXT_CHAR;
1471
- case '>':
1472
- finish_tag_name(parser);
1473
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1474
- return emit_current_tag(parser, output);
1475
- }
1476
- }
1477
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1478
- abandon_current_tag(parser);
1479
- return emit_temporary_buffer(parser, output);
1480
- }
1481
-
1482
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1483
- static StateResult handle_script_double_escaped_start_state(
1484
- GumboParser* parser, GumboTokenizerState* tokenizer,
1485
- int c, GumboToken* output) {
1486
- switch (c) {
1487
- case '\t':
1488
- case '\n':
1489
- case '\f':
1490
- case ' ':
1491
- case '/':
1492
- case '>':
1493
- gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1494
- &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1495
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
1496
- return emit_current_char(parser, output);
1497
- default:
1498
- if (is_alpha(c)) {
1499
- gumbo_string_buffer_append_codepoint(
1500
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1501
- return emit_current_char(parser, output);
1502
- } else {
1503
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1504
- tokenizer->_reconsume_current_input = true;
1505
- return NEXT_CHAR;
1506
- }
1507
- }
1508
- }
1509
-
1510
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1511
- static StateResult handle_script_double_escaped_state(
1512
- GumboParser* parser, GumboTokenizerState* tokenizer,
1513
- int c, GumboToken* output) {
1514
- switch (c) {
1515
- case '-':
1516
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1517
- return emit_current_char(parser, output);
1518
- case '<':
1519
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1520
- return emit_current_char(parser, output);
1521
- case '\0':
1522
- return emit_replacement_char(parser, output);
1523
- case -1:
1524
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1525
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1526
- return NEXT_CHAR;
1527
- default:
1528
- return emit_current_char(parser, output);
1529
- }
1530
- }
1531
-
1532
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1533
- static StateResult handle_script_double_escaped_dash_state(
1534
- GumboParser* parser, GumboTokenizerState* tokenizer,
1535
- int c, GumboToken* output) {
1536
- switch (c) {
1537
- case '-':
1538
- gumbo_tokenizer_set_state(
1539
- parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1540
- return emit_current_char(parser, output);
1541
- case '<':
1542
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1543
- return emit_current_char(parser, output);
1544
- case '\0':
1545
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1546
- return emit_replacement_char(parser, output);
1547
- case -1:
1548
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1549
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1550
- return NEXT_CHAR;
1551
- default:
1552
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1553
- return emit_current_char(parser, output);
1554
- }
1555
- }
1556
-
1557
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1558
- static StateResult handle_script_double_escaped_dash_dash_state(
1559
- GumboParser* parser, GumboTokenizerState* tokenizer,
1560
- int c, GumboToken* output) {
1561
- switch (c) {
1562
- case '-':
1563
- return emit_current_char(parser, output);
1564
- case '<':
1565
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1566
- return emit_current_char(parser, output);
1567
- case '>':
1568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1569
- return emit_current_char(parser, output);
1570
- case '\0':
1571
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1572
- return emit_replacement_char(parser, output);
1573
- case -1:
1574
- add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1575
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1576
- return NEXT_CHAR;
1577
- default:
1578
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1579
- return emit_current_char(parser, output);
1580
- }
1581
- }
1582
-
1583
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1584
- static StateResult handle_script_double_escaped_lt_state(
1585
- GumboParser* parser, GumboTokenizerState* tokenizer,
1586
- int c, GumboToken* output) {
1587
- if (c == '/') {
1588
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1589
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1590
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1591
- return emit_current_char(parser, output);
1592
- } else {
1593
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1594
- tokenizer->_reconsume_current_input = true;
1595
- return NEXT_CHAR;
1596
- }
1597
-
1598
- }
1599
-
1600
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1601
- static StateResult handle_script_double_escaped_end_state(
1602
- GumboParser* parser, GumboTokenizerState* tokenizer,
1603
- int c, GumboToken* output) {
1604
- switch (c) {
1605
- case '\t':
1606
- case '\n':
1607
- case '\f':
1608
- case ' ':
1609
- case '/':
1610
- case '>':
1611
- gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1612
- &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1613
- ? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1614
- return emit_current_char(parser, output);
1615
- default:
1616
- if (is_alpha(c)) {
1617
- gumbo_string_buffer_append_codepoint(
1618
- parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1619
- return emit_current_char(parser, output);
1620
- } else {
1621
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1622
- tokenizer->_reconsume_current_input = true;
1623
- return NEXT_CHAR;
1624
- }
1625
- }
1626
- }
1627
-
1628
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1629
- static StateResult handle_before_attr_name_state(
1630
- GumboParser* parser, GumboTokenizerState* tokenizer,
1631
- int c, GumboToken* output) {
1632
- switch (c) {
1633
- case '\t':
1634
- case '\n':
1635
- case '\f':
1636
- case ' ':
1637
- return NEXT_CHAR;
1638
- case '/':
1639
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1640
- return NEXT_CHAR;
1641
- case '>':
1642
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1643
- return emit_current_tag(parser, output);
1644
- case '\0':
1645
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1646
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1647
- append_char_to_temporary_buffer(parser, 0xfffd);
1648
- return NEXT_CHAR;
1649
- case -1:
1650
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1651
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1652
- abandon_current_tag(parser);
1653
- return NEXT_CHAR;
1654
- case '"':
1655
- case '\'':
1656
- case '<':
1657
- case '=':
1658
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1659
- // Fall through.
1660
- default:
1661
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1662
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1663
- return NEXT_CHAR;
1664
- }
1665
- }
1666
-
1667
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1668
- static StateResult handle_attr_name_state(
1669
- GumboParser* parser, GumboTokenizerState* tokenizer,
1670
- int c, GumboToken* output) {
1671
- switch (c) {
1672
- case '\t':
1673
- case '\n':
1674
- case '\f':
1675
- case ' ':
1676
- finish_attribute_name(parser);
1677
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1678
- return NEXT_CHAR;
1679
- case '/':
1680
- finish_attribute_name(parser);
1681
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1682
- return NEXT_CHAR;
1683
- case '=':
1684
- finish_attribute_name(parser);
1685
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1686
- return NEXT_CHAR;
1687
- case '>':
1688
- finish_attribute_name(parser);
1689
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1690
- return emit_current_tag(parser, output);
1691
- case '\0':
1692
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1693
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1694
- return NEXT_CHAR;
1695
- case -1:
1696
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1697
- abandon_current_tag(parser);
1698
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1699
- return NEXT_CHAR;
1700
- case '"':
1701
- case '\'':
1702
- case '<':
1703
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1704
- // Fall through.
1705
- default:
1706
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1707
- return NEXT_CHAR;
1708
- }
1709
- }
1710
-
1711
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1712
- static StateResult handle_after_attr_name_state(
1713
- GumboParser* parser, GumboTokenizerState* tokenizer,
1714
- int c, GumboToken* output) {
1715
- switch (c) {
1716
- case '\t':
1717
- case '\n':
1718
- case '\f':
1719
- case ' ':
1720
- return NEXT_CHAR;
1721
- case '/':
1722
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1723
- return NEXT_CHAR;
1724
- case '=':
1725
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1726
- return NEXT_CHAR;
1727
- case '>':
1728
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1729
- return emit_current_tag(parser, output);
1730
- case '\0':
1731
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1732
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1733
- append_char_to_temporary_buffer(parser, 0xfffd);
1734
- return NEXT_CHAR;
1735
- case -1:
1736
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1737
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1738
- abandon_current_tag(parser);
1739
- return NEXT_CHAR;
1740
- case '"':
1741
- case '\'':
1742
- case '<':
1743
- add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1744
- // Fall through.
1745
- default:
1746
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1747
- append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1748
- return NEXT_CHAR;
1749
- }
1750
- }
1751
-
1752
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1753
- static StateResult handle_before_attr_value_state(
1754
- GumboParser* parser, GumboTokenizerState* tokenizer,
1755
- int c, GumboToken* output) {
1756
- switch (c) {
1757
- case '\t':
1758
- case '\n':
1759
- case '\f':
1760
- case ' ':
1761
- return NEXT_CHAR;
1762
- case '"':
1763
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1764
- reset_tag_buffer_start_point(parser);
1765
- return NEXT_CHAR;
1766
- case '&':
1767
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1768
- tokenizer->_reconsume_current_input = true;
1769
- return NEXT_CHAR;
1770
- case '\'':
1771
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1772
- reset_tag_buffer_start_point(parser);
1773
- return NEXT_CHAR;
1774
- case '\0':
1775
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1776
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1777
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1778
- return NEXT_CHAR;
1779
- case -1:
1780
- add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1781
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1782
- abandon_current_tag(parser);
1783
- tokenizer->_reconsume_current_input = true;
1784
- return NEXT_CHAR;
1785
- case '>':
1786
- add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1787
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1788
- emit_current_tag(parser, output);
1789
- return RETURN_ERROR;
1790
- case '<':
1791
- case '=':
1792
- case '`':
1793
- add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1794
- // Fall through.
1795
- default:
1796
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1797
- append_char_to_tag_buffer(parser, c, true);
1798
- return NEXT_CHAR;
1799
- }
1800
- }
1801
-
1802
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1803
- static StateResult handle_attr_value_double_quoted_state(
1804
- GumboParser* parser, GumboTokenizerState* tokenizer,
1805
- int c, GumboToken* output) {
1806
- switch (c) {
1807
- case '"':
1808
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1809
- return NEXT_CHAR;
1810
- case '&':
1811
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1812
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1813
- tokenizer->_reconsume_current_input = true;
1814
- return NEXT_CHAR;
1815
- case '\0':
1816
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1817
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1818
- return NEXT_CHAR;
1819
- case -1:
1820
- add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1821
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1822
- abandon_current_tag(parser);
1823
- tokenizer->_reconsume_current_input = true;
1824
- return NEXT_CHAR;
1825
- default:
1826
- append_char_to_tag_buffer(parser, c, false);
1827
- return NEXT_CHAR;
1828
- }
1829
- }
1830
-
1831
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1832
- static StateResult handle_attr_value_single_quoted_state(
1833
- GumboParser* parser, GumboTokenizerState* tokenizer,
1834
- int c, GumboToken* output) {
1835
- switch (c) {
1836
- case '\'':
1837
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1838
- return NEXT_CHAR;
1839
- case '&':
1840
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1841
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1842
- tokenizer->_reconsume_current_input = true;
1843
- return NEXT_CHAR;
1844
- case '\0':
1845
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1846
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1847
- return NEXT_CHAR;
1848
- case -1:
1849
- add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1850
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1851
- abandon_current_tag(parser);
1852
- tokenizer->_reconsume_current_input = true;
1853
- return NEXT_CHAR;
1854
- default:
1855
- append_char_to_tag_buffer(parser, c, false);
1856
- return NEXT_CHAR;
1857
- }
1858
- }
1859
-
1860
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1861
- static StateResult handle_attr_value_unquoted_state(
1862
- GumboParser* parser, GumboTokenizerState* tokenizer,
1863
- int c, GumboToken* output) {
1864
- switch (c) {
1865
- case '\t':
1866
- case '\n':
1867
- case '\f':
1868
- case ' ':
1869
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1870
- finish_attribute_value(parser);
1871
- return NEXT_CHAR;
1872
- case '&':
1873
- tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1874
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1875
- tokenizer->_reconsume_current_input = true;
1876
- return NEXT_CHAR;
1877
- case '>':
1878
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1879
- finish_attribute_value(parser);
1880
- return emit_current_tag(parser, output);
1881
- case '\0':
1882
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1883
- append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1884
- return NEXT_CHAR;
1885
- case -1:
1886
- add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1887
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1888
- tokenizer->_reconsume_current_input = true;
1889
- abandon_current_tag(parser);
1890
- return NEXT_CHAR;
1891
- case '<':
1892
- case '=':
1893
- case '"':
1894
- case '\'':
1895
- case '`':
1896
- add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1897
- // Fall through.
1898
- default:
1899
- append_char_to_tag_buffer(parser, c, true);
1900
- return NEXT_CHAR;
1901
- }
1902
- }
1903
-
1904
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1905
- static StateResult handle_char_ref_in_attr_value_state(
1906
- GumboParser* parser, GumboTokenizerState* tokenizer,
1907
- int c, GumboToken* output) {
1908
- OneOrTwoCodepoints char_ref;
1909
- int allowed_char;
1910
- bool is_unquoted = false;
1911
- switch (tokenizer->_tag_state._attr_value_state) {
1912
- case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1913
- allowed_char = '"';
1914
- break;
1915
- case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1916
- allowed_char = '\'';
1917
- break;
1918
- case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1919
- allowed_char = '>';
1920
- is_unquoted = true;
1921
- break;
1922
- default:
1923
- // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1924
- // get that the assert(0) means this codepath will never happen.
1925
- allowed_char = ' ';
1926
- assert(0);
1927
- }
1928
-
1929
- // Ignore the status, since we don't have a convenient way of signalling that
1930
- // a parser error has occurred when the error occurs in the middle of a
1931
- // multi-state token. We'd need a flag inside the TokenizerState to do this,
1932
- // but that's a low priority fix.
1933
- consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1934
- if (char_ref.first != kGumboNoChar) {
1935
- tokenizer->_reconsume_current_input = true;
1936
- append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1937
- if (char_ref.second != kGumboNoChar) {
1938
- append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1939
- }
1940
- } else {
1941
- append_char_to_tag_buffer(parser, '&', is_unquoted);
1942
- }
1943
- gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1944
- return NEXT_CHAR;
1945
- }
1946
-
1947
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1948
- static StateResult handle_after_attr_value_quoted_state(
1949
- GumboParser* parser, GumboTokenizerState* tokenizer,
1950
- int c, GumboToken* output) {
1951
- finish_attribute_value(parser);
1952
- switch (c) {
1953
- case '\t':
1954
- case '\n':
1955
- case '\f':
1956
- case ' ':
1957
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1958
- return NEXT_CHAR;
1959
- case '/':
1960
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1961
- return NEXT_CHAR;
1962
- case '>':
1963
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1964
- return emit_current_tag(parser, output);
1965
- case -1:
1966
- add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1967
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1968
- abandon_current_tag(parser);
1969
- tokenizer->_reconsume_current_input = true;
1970
- return NEXT_CHAR;
1971
- default:
1972
- add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1973
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1974
- tokenizer->_reconsume_current_input = true;
1975
- return NEXT_CHAR;
1976
- }
1977
- }
1978
-
1979
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1980
- static StateResult handle_self_closing_start_tag_state(
1981
- GumboParser* parser, GumboTokenizerState* tokenizer,
1982
- int c, GumboToken* output) {
1983
- switch (c) {
1984
- case '>':
1985
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1986
- tokenizer->_tag_state._is_self_closing = true;
1987
- return emit_current_tag(parser, output);
1988
- case -1:
1989
- add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1990
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1991
- abandon_current_tag(parser);
1992
- return NEXT_CHAR;
1993
- default:
1994
- add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1995
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1996
- tokenizer->_reconsume_current_input = true;
1997
- return NEXT_CHAR;
1998
- }
1999
- }
2000
-
2001
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
2002
- static StateResult handle_bogus_comment_state(
2003
- GumboParser* parser, GumboTokenizerState* tokenizer,
2004
- int c, GumboToken* output) {
2005
- while (c != '>' && c != -1) {
2006
- if (c == '\0') {
2007
- c = 0xFFFD;
2008
- }
2009
- append_char_to_temporary_buffer(parser, c);
2010
- utf8iterator_next(&tokenizer->_input);
2011
- c = utf8iterator_current(&tokenizer->_input);
2012
- }
2013
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2014
- return emit_comment(parser, output);
2015
- }
2016
-
2017
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
2018
- static StateResult handle_markup_declaration_state(
2019
- GumboParser* parser, GumboTokenizerState* tokenizer,
2020
- int c, GumboToken* output) {
2021
- if (utf8iterator_maybe_consume_match(
2022
- &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2023
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2024
- tokenizer->_reconsume_current_input = true;
2025
- } else if (utf8iterator_maybe_consume_match(
2026
- &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2027
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2028
- tokenizer->_reconsume_current_input = true;
2029
- // If we get here, we know we'll eventually emit a doctype token, so now is
2030
- // the time to initialize the doctype strings. (Not in doctype_state_init,
2031
- // since then they'll leak if ownership never gets transferred to the
2032
- // doctype token.
2033
- tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
2034
- tokenizer->_doc_type_state.public_identifier =
2035
- gumbo_copy_stringz(parser, "");
2036
- tokenizer->_doc_type_state.system_identifier =
2037
- gumbo_copy_stringz(parser, "");
2038
- } else if (tokenizer->_is_current_node_foreign &&
2039
- utf8iterator_maybe_consume_match(
2040
- &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2041
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2042
- tokenizer->_reconsume_current_input = true;
2043
- } else {
2044
- add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2045
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2046
- tokenizer->_reconsume_current_input = true;
2047
- clear_temporary_buffer(parser);
2048
- }
2049
- return NEXT_CHAR;
2050
- }
2051
-
2052
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2053
- static StateResult handle_comment_start_state(
2054
- GumboParser* parser, GumboTokenizerState* tokenizer,
2055
- int c, GumboToken* output) {
2056
- switch (c) {
2057
- case '-':
2058
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2059
- return NEXT_CHAR;
2060
- case '\0':
2061
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2062
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2063
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2064
- return NEXT_CHAR;
2065
- case '>':
2066
- add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2067
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2068
- emit_comment(parser, output);
2069
- return RETURN_ERROR;
2070
- case -1:
2071
- add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2072
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2073
- emit_comment(parser, output);
2074
- return RETURN_ERROR;
2075
- default:
2076
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2077
- append_char_to_temporary_buffer(parser, c);
2078
- return NEXT_CHAR;
2079
- }
2080
- }
2081
-
2082
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2083
- static StateResult handle_comment_start_dash_state(
2084
- GumboParser* parser, GumboTokenizerState* tokenizer,
2085
- int c, GumboToken* output) {
2086
- switch (c) {
2087
- case '-':
2088
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2089
- return NEXT_CHAR;
2090
- case '\0':
2091
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2092
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2093
- append_char_to_temporary_buffer(parser, '-');
2094
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2095
- return NEXT_CHAR;
2096
- case '>':
2097
- add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2098
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2099
- emit_comment(parser, output);
2100
- return RETURN_ERROR;
2101
- case -1:
2102
- add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2103
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2104
- emit_comment(parser, output);
2105
- return RETURN_ERROR;
2106
- default:
2107
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2108
- append_char_to_temporary_buffer(parser, '-');
2109
- append_char_to_temporary_buffer(parser, c);
2110
- return NEXT_CHAR;
2111
- }
2112
- }
2113
-
2114
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2115
- static StateResult handle_comment_state(
2116
- GumboParser* parser, GumboTokenizerState* tokenizer,
2117
- int c, GumboToken* output) {
2118
- switch (c) {
2119
- case '-':
2120
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2121
- return NEXT_CHAR;
2122
- case '\0':
2123
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2124
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2125
- return NEXT_CHAR;
2126
- case -1:
2127
- add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2128
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2129
- emit_comment(parser, output);
2130
- return RETURN_ERROR;
2131
- default:
2132
- append_char_to_temporary_buffer(parser, c);
2133
- return NEXT_CHAR;
2134
- }
2135
- }
2136
-
2137
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2138
- static StateResult handle_comment_end_dash_state(
2139
- GumboParser* parser, GumboTokenizerState* tokenizer,
2140
- int c, GumboToken* output) {
2141
- switch (c) {
2142
- case '-':
2143
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2144
- return NEXT_CHAR;
2145
- case '\0':
2146
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2147
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2148
- append_char_to_temporary_buffer(parser, '-');
2149
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2150
- return NEXT_CHAR;
2151
- case -1:
2152
- add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2153
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2154
- emit_comment(parser, output);
2155
- return RETURN_ERROR;
2156
- default:
2157
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2158
- append_char_to_temporary_buffer(parser, '-');
2159
- append_char_to_temporary_buffer(parser, c);
2160
- return NEXT_CHAR;
2161
- }
2162
- }
2163
-
2164
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2165
- static StateResult handle_comment_end_state(
2166
- GumboParser* parser, GumboTokenizerState* tokenizer,
2167
- int c, GumboToken* output) {
2168
- switch (c) {
2169
- case '>':
2170
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2171
- return emit_comment(parser, output);
2172
- case '\0':
2173
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2174
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2175
- append_char_to_temporary_buffer(parser, '-');
2176
- append_char_to_temporary_buffer(parser, '-');
2177
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2178
- return NEXT_CHAR;
2179
- case '!':
2180
- add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2181
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2182
- return NEXT_CHAR;
2183
- case '-':
2184
- add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2185
- append_char_to_temporary_buffer(parser, '-');
2186
- return NEXT_CHAR;
2187
- case -1:
2188
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2189
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2190
- emit_comment(parser, output);
2191
- return RETURN_ERROR;
2192
- default:
2193
- add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2194
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2195
- append_char_to_temporary_buffer(parser, '-');
2196
- append_char_to_temporary_buffer(parser, '-');
2197
- append_char_to_temporary_buffer(parser, c);
2198
- return NEXT_CHAR;
2199
- }
2200
- }
2201
-
2202
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2203
- static StateResult handle_comment_end_bang_state(
2204
- GumboParser* parser, GumboTokenizerState* tokenizer,
2205
- int c, GumboToken* output) {
2206
- switch (c) {
2207
- case '-':
2208
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2209
- append_char_to_temporary_buffer(parser, '-');
2210
- append_char_to_temporary_buffer(parser, '-');
2211
- append_char_to_temporary_buffer(parser, '!');
2212
- return NEXT_CHAR;
2213
- case '>':
2214
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2215
- return emit_comment(parser, output);
2216
- case '\0':
2217
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2218
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2219
- append_char_to_temporary_buffer(parser, '-');
2220
- append_char_to_temporary_buffer(parser, '-');
2221
- append_char_to_temporary_buffer(parser, '!');
2222
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2223
- return NEXT_CHAR;
2224
- case -1:
2225
- add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2226
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2227
- emit_comment(parser, output);
2228
- return RETURN_ERROR;
2229
- default:
2230
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2231
- append_char_to_temporary_buffer(parser, '-');
2232
- append_char_to_temporary_buffer(parser, '-');
2233
- append_char_to_temporary_buffer(parser, '!');
2234
- append_char_to_temporary_buffer(parser, c);
2235
- return NEXT_CHAR;
2236
- }
2237
- }
2238
-
2239
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2240
- static StateResult handle_doctype_state(
2241
- GumboParser* parser, GumboTokenizerState* tokenizer,
2242
- int c, GumboToken* output) {
2243
- assert(!tokenizer->_temporary_buffer.length);
2244
- switch (c) {
2245
- case '\t':
2246
- case '\n':
2247
- case '\f':
2248
- case ' ':
2249
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2250
- return NEXT_CHAR;
2251
- case -1:
2252
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2253
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2254
- tokenizer->_doc_type_state.force_quirks = true;
2255
- emit_doctype(parser, output);
2256
- return RETURN_ERROR;
2257
- default:
2258
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2259
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2260
- tokenizer->_reconsume_current_input = true;
2261
- tokenizer->_doc_type_state.force_quirks = true;
2262
- return NEXT_CHAR;
2263
- }
2264
- }
2265
-
2266
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2267
- static StateResult handle_before_doctype_name_state(
2268
- GumboParser* parser, GumboTokenizerState* tokenizer,
2269
- int c, GumboToken* output) {
2270
- switch (c) {
2271
- case '\t':
2272
- case '\n':
2273
- case '\f':
2274
- case ' ':
2275
- return NEXT_CHAR;
2276
- case '\0':
2277
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2278
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2279
- tokenizer->_doc_type_state.force_quirks = true;
2280
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2281
- return NEXT_CHAR;
2282
- case '>':
2283
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2284
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2285
- tokenizer->_doc_type_state.force_quirks = true;
2286
- emit_doctype(parser, output);
2287
- return RETURN_ERROR;
2288
- case -1:
2289
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2290
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2291
- tokenizer->_doc_type_state.force_quirks = true;
2292
- emit_doctype(parser, output);
2293
- return RETURN_ERROR;
2294
- default:
2295
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2296
- tokenizer->_doc_type_state.force_quirks = false;
2297
- append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2298
- return NEXT_CHAR;
2299
- }
2300
- }
2301
-
2302
- // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2303
- static StateResult handle_doctype_name_state(
2304
- GumboParser* parser, GumboTokenizerState* tokenizer,
2305
- int c, GumboToken* output) {
2306
- switch (c) {
2307
- case '\t':
2308
- case '\n':
2309
- case '\f':
2310
- case ' ':
2311
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2312
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2313
- finish_temporary_buffer(
2314
- parser, &tokenizer->_doc_type_state.name);
2315
- return NEXT_CHAR;
2316
- case '>':
2317
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2318
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2319
- finish_temporary_buffer(
2320
- parser, &tokenizer->_doc_type_state.name);
2321
- emit_doctype(parser, output);
2322
- return RETURN_SUCCESS;
2323
- case '\0':
2324
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2325
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2326
- return NEXT_CHAR;
2327
- case -1:
2328
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2329
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2330
- tokenizer->_doc_type_state.force_quirks = true;
2331
- gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2332
- finish_temporary_buffer(
2333
- parser, &tokenizer->_doc_type_state.name);
2334
- emit_doctype(parser, output);
2335
- return RETURN_ERROR;
2336
- default:
2337
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2338
- tokenizer->_doc_type_state.force_quirks = false;
2339
- append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2340
- return NEXT_CHAR;
2341
- }
2342
- }
2343
-
2344
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2345
- static StateResult handle_after_doctype_name_state(
2346
- GumboParser* parser, GumboTokenizerState* tokenizer,
2347
- int c, GumboToken* output) {
2348
- switch (c) {
2349
- case '\t':
2350
- case '\n':
2351
- case '\f':
2352
- case ' ':
2353
- return NEXT_CHAR;
2354
- case '>':
2355
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2356
- emit_doctype(parser, output);
2357
- return RETURN_SUCCESS;
2358
- case -1:
2359
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2360
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2361
- tokenizer->_doc_type_state.force_quirks = true;
2362
- emit_doctype(parser, output);
2363
- return RETURN_ERROR;
2364
- default:
2365
- if (utf8iterator_maybe_consume_match(
2366
- &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2367
- gumbo_tokenizer_set_state(
2368
- parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2369
- tokenizer->_reconsume_current_input = true;
2370
- } else if (utf8iterator_maybe_consume_match(
2371
- &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
2372
- gumbo_tokenizer_set_state(
2373
- parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2374
- tokenizer->_reconsume_current_input = true;
2375
- } else {
2376
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2377
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2378
- tokenizer->_doc_type_state.force_quirks = true;
2379
- }
2380
- return NEXT_CHAR;
2381
- }
2382
- }
2383
-
2384
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2385
- static StateResult handle_after_doctype_public_keyword_state(
2386
- GumboParser* parser, GumboTokenizerState* tokenizer,
2387
- int c, GumboToken* output) {
2388
- switch (c) {
2389
- case '\t':
2390
- case '\n':
2391
- case '\f':
2392
- case ' ':
2393
- gumbo_tokenizer_set_state(
2394
- parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2395
- return NEXT_CHAR;
2396
- case '"':
2397
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2398
- assert(temporary_buffer_equals(parser, ""));
2399
- gumbo_tokenizer_set_state(
2400
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2401
- return NEXT_CHAR;
2402
- case '\'':
2403
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2404
- assert(temporary_buffer_equals(parser, ""));
2405
- gumbo_tokenizer_set_state(
2406
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2407
- return NEXT_CHAR;
2408
- case '>':
2409
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2410
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2411
- tokenizer->_doc_type_state.force_quirks = true;
2412
- emit_doctype(parser, output);
2413
- return RETURN_ERROR;
2414
- case -1:
2415
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2416
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2417
- tokenizer->_doc_type_state.force_quirks = true;
2418
- emit_doctype(parser, output);
2419
- return RETURN_ERROR;
2420
- default:
2421
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2422
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2423
- tokenizer->_doc_type_state.force_quirks = true;
2424
- emit_doctype(parser, output);
2425
- return RETURN_ERROR;
2426
- }
2427
- }
2428
-
2429
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2430
- static StateResult handle_before_doctype_public_id_state(
2431
- GumboParser* parser, GumboTokenizerState* tokenizer,
2432
- int c, GumboToken* output) {
2433
- switch (c) {
2434
- case '\t':
2435
- case '\n':
2436
- case '\f':
2437
- case ' ':
2438
- return NEXT_CHAR;
2439
- case '"':
2440
- assert(temporary_buffer_equals(parser, ""));
2441
- gumbo_tokenizer_set_state(
2442
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2443
- return NEXT_CHAR;
2444
- case '\'':
2445
- assert(temporary_buffer_equals(parser, ""));
2446
- gumbo_tokenizer_set_state(
2447
- parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2448
- return NEXT_CHAR;
2449
- case '>':
2450
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2451
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2452
- tokenizer->_doc_type_state.force_quirks = true;
2453
- emit_doctype(parser, output);
2454
- return RETURN_ERROR;
2455
- case -1:
2456
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2457
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2458
- tokenizer->_doc_type_state.force_quirks = true;
2459
- emit_doctype(parser, output);
2460
- return RETURN_ERROR;
2461
- default:
2462
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2463
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2464
- tokenizer->_doc_type_state.force_quirks = true;
2465
- emit_doctype(parser, output);
2466
- return RETURN_ERROR;
2467
- }
2468
- }
2469
-
2470
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2471
- static StateResult handle_doctype_public_id_double_quoted_state(
2472
- GumboParser* parser, GumboTokenizerState* tokenizer,
2473
- int c, GumboToken* output) {
2474
- switch (c) {
2475
- case '"':
2476
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2477
- finish_doctype_public_id(parser);
2478
- return NEXT_CHAR;
2479
- case '\0':
2480
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2481
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2482
- return NEXT_CHAR;
2483
- case '>':
2484
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2485
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2486
- tokenizer->_doc_type_state.force_quirks = true;
2487
- finish_doctype_public_id(parser);
2488
- emit_doctype(parser, output);
2489
- return RETURN_ERROR;
2490
- case -1:
2491
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2492
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2493
- tokenizer->_doc_type_state.force_quirks = true;
2494
- finish_doctype_public_id(parser);
2495
- emit_doctype(parser, output);
2496
- return RETURN_ERROR;
2497
- default:
2498
- append_char_to_temporary_buffer(parser, c);
2499
- return NEXT_CHAR;
2500
- }
2501
- }
2502
-
2503
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2504
- static StateResult handle_doctype_public_id_single_quoted_state(
2505
- GumboParser* parser, GumboTokenizerState* tokenizer,
2506
- int c, GumboToken* output) {
2507
- switch (c) {
2508
- case '\'':
2509
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2510
- finish_doctype_public_id(parser);
2511
- return NEXT_CHAR;
2512
- case '\0':
2513
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2514
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2515
- return NEXT_CHAR;
2516
- case '>':
2517
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2518
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2519
- tokenizer->_doc_type_state.force_quirks = true;
2520
- finish_doctype_public_id(parser);
2521
- emit_doctype(parser, output);
2522
- return RETURN_ERROR;
2523
- case -1:
2524
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2525
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2526
- tokenizer->_doc_type_state.force_quirks = true;
2527
- finish_doctype_public_id(parser);
2528
- emit_doctype(parser, output);
2529
- return RETURN_ERROR;
2530
- default:
2531
- append_char_to_temporary_buffer(parser, c);
2532
- return NEXT_CHAR;
2533
- }
2534
- }
2535
-
2536
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2537
- static StateResult handle_after_doctype_public_id_state(
2538
- GumboParser* parser, GumboTokenizerState* tokenizer,
2539
- int c, GumboToken* output) {
2540
- switch (c) {
2541
- case '\t':
2542
- case '\n':
2543
- case '\f':
2544
- case ' ':
2545
- gumbo_tokenizer_set_state(
2546
- parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2547
- return NEXT_CHAR;
2548
- case '>':
2549
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2550
- emit_doctype(parser, output);
2551
- return RETURN_SUCCESS;
2552
- case '"':
2553
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2554
- assert(temporary_buffer_equals(parser, ""));
2555
- gumbo_tokenizer_set_state(
2556
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2557
- return NEXT_CHAR;
2558
- case '\'':
2559
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2560
- assert(temporary_buffer_equals(parser, ""));
2561
- gumbo_tokenizer_set_state(
2562
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2563
- return NEXT_CHAR;
2564
- case -1:
2565
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2566
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2567
- tokenizer->_reconsume_current_input = true;
2568
- tokenizer->_doc_type_state.force_quirks = true;
2569
- return NEXT_CHAR;
2570
- default:
2571
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2572
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2573
- tokenizer->_doc_type_state.force_quirks = true;
2574
- return NEXT_CHAR;
2575
- }
2576
- }
2577
-
2578
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2579
- static StateResult handle_between_doctype_public_system_id_state(
2580
- GumboParser* parser, GumboTokenizerState* tokenizer,
2581
- int c, GumboToken* output) {
2582
- switch (c) {
2583
- case '\t':
2584
- case '\n':
2585
- case '\f':
2586
- case ' ':
2587
- return NEXT_CHAR;
2588
- case '>':
2589
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2590
- emit_doctype(parser, output);
2591
- return RETURN_SUCCESS;
2592
- case '"':
2593
- assert(temporary_buffer_equals(parser, ""));
2594
- gumbo_tokenizer_set_state(
2595
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2596
- return NEXT_CHAR;
2597
- case '\'':
2598
- assert(temporary_buffer_equals(parser, ""));
2599
- gumbo_tokenizer_set_state(
2600
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2601
- return NEXT_CHAR;
2602
- case -1:
2603
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2604
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2605
- tokenizer->_doc_type_state.force_quirks = true;
2606
- emit_doctype(parser, output);
2607
- return RETURN_ERROR;
2608
- default:
2609
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2610
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2611
- tokenizer->_doc_type_state.force_quirks = true;
2612
- emit_doctype(parser, output);
2613
- return RETURN_ERROR;
2614
- }
2615
- }
2616
-
2617
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2618
- static StateResult handle_after_doctype_system_keyword_state(
2619
- GumboParser* parser, GumboTokenizerState* tokenizer,
2620
- int c, GumboToken* output) {
2621
- switch (c) {
2622
- case '\t':
2623
- case '\n':
2624
- case '\f':
2625
- case ' ':
2626
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2627
- return NEXT_CHAR;
2628
- case '"':
2629
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2630
- assert(temporary_buffer_equals(parser, ""));
2631
- gumbo_tokenizer_set_state(
2632
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2633
- return NEXT_CHAR;
2634
- case '\'':
2635
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2636
- assert(temporary_buffer_equals(parser, ""));
2637
- gumbo_tokenizer_set_state(
2638
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2639
- return NEXT_CHAR;
2640
- case '>':
2641
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2642
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2643
- tokenizer->_doc_type_state.force_quirks = true;
2644
- emit_doctype(parser, output);
2645
- return RETURN_ERROR;
2646
- case -1:
2647
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2648
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2649
- tokenizer->_doc_type_state.force_quirks = true;
2650
- emit_doctype(parser, output);
2651
- return RETURN_ERROR;
2652
- default:
2653
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2654
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2655
- tokenizer->_doc_type_state.force_quirks = true;
2656
- return NEXT_CHAR;
2657
- }
2658
- }
2659
-
2660
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2661
- static StateResult handle_before_doctype_system_id_state(
2662
- GumboParser* parser, GumboTokenizerState* tokenizer,
2663
- int c, GumboToken* output) {
2664
- switch (c) {
2665
- case '\t':
2666
- case '\n':
2667
- case '\f':
2668
- case ' ':
2669
- return NEXT_CHAR;
2670
- case '"':
2671
- assert(temporary_buffer_equals(parser, ""));
2672
- gumbo_tokenizer_set_state(
2673
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2674
- return NEXT_CHAR;
2675
- case '\'':
2676
- assert(temporary_buffer_equals(parser, ""));
2677
- gumbo_tokenizer_set_state(
2678
- parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2679
- return NEXT_CHAR;
2680
- case '>':
2681
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2682
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2683
- tokenizer->_doc_type_state.force_quirks = true;
2684
- emit_doctype(parser, output);
2685
- return RETURN_ERROR;
2686
- case -1:
2687
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2688
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2689
- tokenizer->_doc_type_state.force_quirks = true;
2690
- emit_doctype(parser, output);
2691
- return RETURN_ERROR;
2692
- default:
2693
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2694
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2695
- tokenizer->_doc_type_state.force_quirks = true;
2696
- return NEXT_CHAR;
2697
- }
2698
- }
2699
-
2700
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2701
- static StateResult handle_doctype_system_id_double_quoted_state(
2702
- GumboParser* parser, GumboTokenizerState* tokenizer,
2703
- int c, GumboToken* output) {
2704
- switch (c) {
2705
- case '"':
2706
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2707
- finish_doctype_system_id(parser);
2708
- return NEXT_CHAR;
2709
- case '\0':
2710
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2711
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2712
- return NEXT_CHAR;
2713
- case '>':
2714
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2715
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2716
- tokenizer->_doc_type_state.force_quirks = true;
2717
- finish_doctype_system_id(parser);
2718
- emit_doctype(parser, output);
2719
- return RETURN_ERROR;
2720
- case -1:
2721
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2722
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2723
- tokenizer->_doc_type_state.force_quirks = true;
2724
- finish_doctype_system_id(parser);
2725
- emit_doctype(parser, output);
2726
- return RETURN_ERROR;
2727
- default:
2728
- append_char_to_temporary_buffer(parser, c);
2729
- return NEXT_CHAR;
2730
- }
2731
- }
2732
-
2733
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2734
- static StateResult handle_doctype_system_id_single_quoted_state(
2735
- GumboParser* parser, GumboTokenizerState* tokenizer,
2736
- int c, GumboToken* output) {
2737
- switch (c) {
2738
- case '\'':
2739
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2740
- finish_doctype_system_id(parser);
2741
- return NEXT_CHAR;
2742
- case '\0':
2743
- add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2744
- append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2745
- return NEXT_CHAR;
2746
- case '>':
2747
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2748
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2749
- tokenizer->_doc_type_state.force_quirks = true;
2750
- finish_doctype_system_id(parser);
2751
- emit_doctype(parser, output);
2752
- return RETURN_ERROR;
2753
- case -1:
2754
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2755
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2756
- tokenizer->_doc_type_state.force_quirks = true;
2757
- finish_doctype_system_id(parser);
2758
- emit_doctype(parser, output);
2759
- return RETURN_ERROR;
2760
- default:
2761
- append_char_to_temporary_buffer(parser, c);
2762
- return NEXT_CHAR;
2763
- }
2764
- }
2765
-
2766
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2767
- static StateResult handle_after_doctype_system_id_state(
2768
- GumboParser* parser, GumboTokenizerState* tokenizer,
2769
- int c, GumboToken* output) {
2770
- switch (c) {
2771
- case '\t':
2772
- case '\n':
2773
- case '\f':
2774
- case ' ':
2775
- return NEXT_CHAR;
2776
- case '>':
2777
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2778
- emit_doctype(parser, output);
2779
- return RETURN_SUCCESS;
2780
- case -1:
2781
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2782
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2783
- tokenizer->_doc_type_state.force_quirks = true;
2784
- emit_doctype(parser, output);
2785
- return RETURN_ERROR;
2786
- default:
2787
- add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2788
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2789
- return NEXT_CHAR;
2790
- }
2791
- }
2792
-
2793
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2794
- static StateResult handle_bogus_doctype_state(
2795
- GumboParser* parser, GumboTokenizerState* tokenizer,
2796
- int c, GumboToken* output) {
2797
- if (c == '>' || c == -1) {
2798
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2799
- emit_doctype(parser, output);
2800
- return RETURN_ERROR;
2801
- }
2802
- return NEXT_CHAR;
2803
- }
2804
-
2805
- // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2806
- static StateResult handle_cdata_state(
2807
- GumboParser* parser, GumboTokenizerState* tokenizer,
2808
- int c, GumboToken* output) {
2809
- if (c == -1 || utf8iterator_maybe_consume_match(
2810
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2811
- tokenizer->_reconsume_current_input = true;
2812
- reset_token_start_point(tokenizer);
2813
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2814
- return NEXT_CHAR;
2815
- } else {
2816
- return emit_current_char(parser, output);
2817
- }
2818
- }
2819
-
2820
- typedef StateResult (*GumboLexerStateFunction)(
2821
- GumboParser*, GumboTokenizerState*, int, GumboToken*);
2822
-
2823
- static GumboLexerStateFunction dispatch_table[] = {
2824
- handle_data_state,
2825
- handle_char_ref_in_data_state,
2826
- handle_rcdata_state,
2827
- handle_char_ref_in_rcdata_state,
2828
- handle_rawtext_state,
2829
- handle_script_state,
2830
- handle_plaintext_state,
2831
- handle_tag_open_state,
2832
- handle_end_tag_open_state,
2833
- handle_tag_name_state,
2834
- handle_rcdata_lt_state,
2835
- handle_rcdata_end_tag_open_state,
2836
- handle_rcdata_end_tag_name_state,
2837
- handle_rawtext_lt_state,
2838
- handle_rawtext_end_tag_open_state,
2839
- handle_rawtext_end_tag_name_state,
2840
- handle_script_lt_state,
2841
- handle_script_end_tag_open_state,
2842
- handle_script_end_tag_name_state,
2843
- handle_script_escaped_start_state,
2844
- handle_script_escaped_start_dash_state,
2845
- handle_script_escaped_state,
2846
- handle_script_escaped_dash_state,
2847
- handle_script_escaped_dash_dash_state,
2848
- handle_script_escaped_lt_state,
2849
- handle_script_escaped_end_tag_open_state,
2850
- handle_script_escaped_end_tag_name_state,
2851
- handle_script_double_escaped_start_state,
2852
- handle_script_double_escaped_state,
2853
- handle_script_double_escaped_dash_state,
2854
- handle_script_double_escaped_dash_dash_state,
2855
- handle_script_double_escaped_lt_state,
2856
- handle_script_double_escaped_end_state,
2857
- handle_before_attr_name_state,
2858
- handle_attr_name_state,
2859
- handle_after_attr_name_state,
2860
- handle_before_attr_value_state,
2861
- handle_attr_value_double_quoted_state,
2862
- handle_attr_value_single_quoted_state,
2863
- handle_attr_value_unquoted_state,
2864
- handle_char_ref_in_attr_value_state,
2865
- handle_after_attr_value_quoted_state,
2866
- handle_self_closing_start_tag_state,
2867
- handle_bogus_comment_state,
2868
- handle_markup_declaration_state,
2869
- handle_comment_start_state,
2870
- handle_comment_start_dash_state,
2871
- handle_comment_state,
2872
- handle_comment_end_dash_state,
2873
- handle_comment_end_state,
2874
- handle_comment_end_bang_state,
2875
- handle_doctype_state,
2876
- handle_before_doctype_name_state,
2877
- handle_doctype_name_state,
2878
- handle_after_doctype_name_state,
2879
- handle_after_doctype_public_keyword_state,
2880
- handle_before_doctype_public_id_state,
2881
- handle_doctype_public_id_double_quoted_state,
2882
- handle_doctype_public_id_single_quoted_state,
2883
- handle_after_doctype_public_id_state,
2884
- handle_between_doctype_public_system_id_state,
2885
- handle_after_doctype_system_keyword_state,
2886
- handle_before_doctype_system_id_state,
2887
- handle_doctype_system_id_double_quoted_state,
2888
- handle_doctype_system_id_single_quoted_state,
2889
- handle_after_doctype_system_id_state,
2890
- handle_bogus_doctype_state,
2891
- handle_cdata_state
2892
- };
2893
-
2894
- bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2895
- // Because of the spec requirements that...
2896
- //
2897
- // 1. Tokens be handled immediately by the parser upon emission.
2898
- // 2. Some states (eg. CDATA, or various error conditions) require the
2899
- // emission of multiple tokens in the same states.
2900
- // 3. The tokenizer often has to reconsume the same character in a different
2901
- // state.
2902
- //
2903
- // ...all state must be held in the GumboTokenizer struct instead of in local
2904
- // variables in this function. That allows us to return from this method with
2905
- // a token, and then immediately jump back to the same state with the same
2906
- // input if we need to return a different token. The various emit_* functions
2907
- // are responsible for changing state (eg. flushing the chardata buffer,
2908
- // reading the next input character) to avoid an infinite loop.
2909
- GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2910
-
2911
- if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2912
- tokenizer->_reconsume_current_input = true;
2913
- emit_char(parser, tokenizer->_buffered_emit_char, output);
2914
- // And now that we've avoided advancing the input, make sure we set
2915
- // _reconsume_current_input back to false to make sure the *next* character
2916
- // isn't consumed twice.
2917
- tokenizer->_reconsume_current_input = false;
2918
- tokenizer->_buffered_emit_char = kGumboNoChar;
2919
- return true;
2920
- }
2921
-
2922
- if (maybe_emit_from_temporary_buffer(parser, output)) {
2923
- return true;
2924
- }
2925
-
2926
- while (1) {
2927
- assert(!tokenizer->_temporary_buffer_emit);
2928
- assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2929
- int c = utf8iterator_current(&tokenizer->_input);
2930
- gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2931
- StateResult result =
2932
- dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2933
- // We need to clear reconsume_current_input before returning to prevent
2934
- // certain infinite loop states.
2935
- bool should_advance = !tokenizer->_reconsume_current_input;
2936
- tokenizer->_reconsume_current_input = false;
2937
-
2938
- if (result == RETURN_SUCCESS) {
2939
- return true;
2940
- } else if(result == RETURN_ERROR) {
2941
- return false;
2942
- }
2943
-
2944
- if (should_advance) {
2945
- utf8iterator_next(&tokenizer->_input);
2946
- }
2947
- }
2948
- }
2949
-
2950
- void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2951
- if (!token) return;
2952
-
2953
- switch (token->type) {
2954
- case GUMBO_TOKEN_DOCTYPE:
2955
- gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2956
- gumbo_parser_deallocate(
2957
- parser, (void*) token->v.doc_type.public_identifier);
2958
- gumbo_parser_deallocate(
2959
- parser, (void*) token->v.doc_type.system_identifier);
2960
- return;
2961
- case GUMBO_TOKEN_START_TAG:
2962
- for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2963
- GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2964
- if (attr) {
2965
- // May have been nulled out if this token was merged with another.
2966
- gumbo_destroy_attribute(parser, attr);
2967
- }
2968
- }
2969
- gumbo_parser_deallocate(
2970
- parser, (void*) token->v.start_tag.attributes.data);
2971
- return;
2972
- case GUMBO_TOKEN_COMMENT:
2973
- gumbo_parser_deallocate(parser, (void*) token->v.text);
2974
- return;
2975
- default:
2976
- return;
2977
- }
2978
- }