ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,40 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_TOKEN_TYPE_H_
18
+ #define GUMBO_TOKEN_TYPE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
// An enum representing the type of token.
typedef enum {
  GUMBO_TOKEN_DOCTYPE,     // A doctype declaration.
  GUMBO_TOKEN_START_TAG,   // A start tag.
  GUMBO_TOKEN_END_TAG,     // An end tag.
  GUMBO_TOKEN_COMMENT,     // A comment.
  GUMBO_TOKEN_WHITESPACE,  // A tab, LF, CR, FF, or space character.
  GUMBO_TOKEN_CHARACTER,   // Any character not covered by another type.
  GUMBO_TOKEN_NULL,        // A character whose code point is 0.
  GUMBO_TOKEN_EOF          // End of input; the tokenizer uses character -1.
} GumboTokenType;
35
+
36
+ #ifdef __cplusplus
37
+ } // extern C
38
+ #endif
39
+
40
+ #endif // GUMBO_TOKEN_TYPE_H_
@@ -0,0 +1,2980 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Coding conventions specific to this file:
18
+ //
19
+ // 1. Functions that fill in a token should be named emit_*, and should be
20
+ // followed immediately by a return from the tokenizer (true if no error
21
+ // occurred, false if an error occurred). Sometimes the emit functions
22
+ // themselves return a boolean so that they can be combined with the return
23
+ // statement; in this case, they should match this convention.
24
+ // 2. Functions that shuffle data from temporaries to final API structures
25
+ // should be named finish_*, and be called just before the tokenizer exits the
26
+ // state that accumulates the temporary.
27
+ // 3. All internal data structures should be kept in an initialized state from
28
+ // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ // and reset, it should be deallocated and immediately reinitialized.
30
+ // 4. Make sure there are appropriate break statements following each state.
31
+ // 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ // good idea, and should go at the entry point of each state when added.
33
+ // 6. Statement order within states goes:
34
+ // 1. Add parse errors, if appropriate.
35
+ // 2. Call finish_* functions to build up tag state.
36
+ // 3. Switch to new state. Set _reconsume flag if appropriate.
36
+ // 4. Perform any other temporary buffer manipulation.
37
+ // 5. Emit tokens.
38
+ // 6. Return/break.
40
+ // This order ensures that we can verify that every emit is followed by a
41
+ // return, ensures that the correct state is recorded with any parse errors, and
42
+ // prevents parse error position from being messed up by possible mark/resets in
43
+ // temporary buffer manipulation.
44
+
45
+
46
+ #include "tokenizer.h"
47
+
48
+ #include <assert.h>
49
+ #include <stdbool.h>
50
+ #include <string.h>
51
+
52
+ #include "attribute.h"
53
+ #include "char_ref.h"
54
+ #include "error.h"
55
+ #include "gumbo.h"
56
+ #include "parser.h"
57
+ #include "string_buffer.h"
58
+ #include "string_piece.h"
59
+ #include "token_type.h"
60
+ #include "tokenizer_states.h"
61
+ #include "utf8.h"
62
+ #include "util.h"
63
+ #include "vector.h"
64
+
65
+ // Compared against _script_data_buffer to determine if we're in double-escaped
66
+ // script mode.
67
+ const GumboStringPiece kScriptTag = { "script", 6 };
68
+
69
// An enum for the return value of each individual state.
// The first two enumerators have the values 0 and 1, so they convert to the
// false/true results named in their comments.
typedef enum {
  RETURN_ERROR,    // Return false (error) from the tokenizer.
  RETURN_SUCCESS,  // Return true (success) from the tokenizer.
  NEXT_CHAR        // Proceed to the next character and continue lexing.
} StateResult;
75
+
76
// This is a struct containing state necessary to build up a tag token,
// character by character.
typedef struct GumboInternalTagState {
  // A buffer to accumulate characters for various GumboStringPiece fields.
  GumboStringBuffer _buffer;

  // A pointer to the start of the original text corresponding to the contents
  // of the buffer.
  const char* _original_text;

  // The current tag enum, computed once the tag name state has finished so that
  // the buffer can be re-used for building up attributes.
  GumboTag _tag;

  // The starting location of the text in the buffer.
  GumboSourcePosition _start_pos;

  // The current list of attributes.  This is copied (and ownership of its data
  // transferred) to the GumboStartTag token upon completion of the tag.  New
  // attributes are added as soon as their attribute name state is complete, and
  // values are filled in by operating on
  // _attributes.data[_attributes.length - 1].
  GumboVector /* GumboAttribute */ _attributes;

  // If true, the next attribute value to be finished should be dropped.  This
  // happens if a duplicate attribute name is encountered - we want to consume
  // the attribute value, but shouldn't overwrite the existing value.
  bool _drop_next_attr_value;

  // The state that caused the tokenizer to switch into a character reference in
  // attribute value state.  This is used to set the additional allowed
  // character, and is switched back to on completion.  Initialized as the
  // tokenizer enters the character reference state.
  GumboTokenizerEnum _attr_value_state;

  // The last start tag to have been emitted by the tokenizer.  This is
  // necessary to check for appropriate end tags.
  GumboTag _last_start_tag;

  // If true, then this is a start tag.  If false, it's an end tag.  This is
  // necessary to generate the appropriate token type at tag-closing time.
  bool _is_start_tag;

  // If true, then this tag is "self-closing" and doesn't have an end tag.
  bool _is_self_closing;
} GumboTagState;
121
+
122
// This is the main tokenizer state struct, containing all state used in
// tokenizing the input stream.
typedef struct GumboInternalTokenizerState {
  // The current lexer state.  Starts in GUMBO_LEX_DATA.
  GumboTokenizerEnum _state;

  // A flag indicating whether the current input character needs to be
  // reconsumed in another state, or whether the next input character should be
  // read for the next iteration of the state loop.  This is set when the spec
  // reads "Reconsume the current input character in..."
  bool _reconsume_current_input;

  // A flag indicating whether the current node is a foreign element.  This is
  // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
  // markup declaration state.
  bool _is_current_node_foreign;

  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time.  The
  // extra character is buffered here, and then this is checked on entry to
  // lex().  If a character is stored here, it's immediately emitted and control
  // returns from the lexer.  kGumboNoChar is used to represent 'no character
  // stored.'
  //
  // Note that characters emitted through this mechanism will have their source
  // position marked as the character under the mark, i.e. multiple characters
  // may be emitted with the same position.  This is desirable for character
  // references, but unsuitable for many other cases.  Use the _temporary_buffer
  // mechanism if the buffered characters must have their original positions in
  // the document.
  int _buffered_emit_char;

  // A temporary buffer to accumulate characters, as described by the "temporary
  // buffer" phrase in the tokenizer spec.  We use this in a somewhat unorthodox
  // way: we record the specific character to go into the buffer, which may
  // sometimes be a lowercased version of the actual input character.  However,
  // we *also* use utf8iterator_mark() to record the position at tag start.
  // When we start flushing the temporary buffer, we set _temporary_buffer_emit
  // to the start of it, and then increment it for each call to the tokenizer.
  // We also call utf8iterator_reset(), and utf8iterator_next() through the
  // input stream, so that tokens emitted by emit_char have the correct position
  // and original text.
  GumboStringBuffer _temporary_buffer;

  // The current cursor position we're emitting from within
  // _temporary_buffer.data.  NULL whenever we're not flushing the buffer.
  const char* _temporary_buffer_emit;

  // The temporary buffer is also used by the spec to check whether we should
  // enter the script data double escaped state, but we can't use the same
  // buffer for both because we have to flush out "<s" as emits while still
  // maintaining the context that will eventually become "script".  This is a
  // separate buffer that's used in place of the temporary buffer for states
  // that may enter the script data double escape start state.
  GumboStringBuffer _script_data_buffer;

  // Pointer to the beginning of the current token in the original buffer; used
  // to record the original text.
  const char* _token_start;

  // GumboSourcePosition recording the source location of the start of the
  // current token.
  GumboSourcePosition _token_start_pos;

  // Current tag state.
  GumboTagState _tag_state;

  // Doctype state.  We use the temporary buffer to accumulate characters (it's
  // not used for anything else in the doctype states), and then freshly
  // allocate the strings in the doctype token, then copy it over on emit.
  GumboTokenDocType _doc_type_state;

  // The UTF8Iterator over the tokenizer input.
  Utf8Iterator _input;
} GumboTokenizerState;
197
+
198
+ // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
199
+ static void add_parse_error(GumboParser* parser, GumboErrorType type) {
200
+ GumboError* error = gumbo_add_error(parser);
201
+ if (!error) {
202
+ return;
203
+ }
204
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
205
+ utf8iterator_get_position(&tokenizer->_input, &error->position);
206
+ error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
207
+ error->type = type;
208
+ error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
209
+ switch (tokenizer->_state) {
210
+ case GUMBO_LEX_DATA:
211
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
212
+ break;
213
+ case GUMBO_LEX_CHAR_REF_IN_DATA:
214
+ case GUMBO_LEX_CHAR_REF_IN_RCDATA:
215
+ case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
216
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
217
+ break;
218
+ case GUMBO_LEX_RCDATA:
219
+ case GUMBO_LEX_RCDATA_LT:
220
+ case GUMBO_LEX_RCDATA_END_TAG_OPEN:
221
+ case GUMBO_LEX_RCDATA_END_TAG_NAME:
222
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
223
+ break;
224
+ case GUMBO_LEX_RAWTEXT:
225
+ case GUMBO_LEX_RAWTEXT_LT:
226
+ case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
227
+ case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
228
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
229
+ break;
230
+ case GUMBO_LEX_PLAINTEXT:
231
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
232
+ break;
233
+ case GUMBO_LEX_SCRIPT:
234
+ case GUMBO_LEX_SCRIPT_LT:
235
+ case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
236
+ case GUMBO_LEX_SCRIPT_END_TAG_NAME:
237
+ case GUMBO_LEX_SCRIPT_ESCAPED_START:
238
+ case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
239
+ case GUMBO_LEX_SCRIPT_ESCAPED:
240
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
241
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
242
+ case GUMBO_LEX_SCRIPT_ESCAPED_LT:
243
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
244
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
245
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
246
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
247
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
248
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
249
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
250
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
251
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
252
+ break;
253
+ case GUMBO_LEX_TAG_OPEN:
254
+ case GUMBO_LEX_END_TAG_OPEN:
255
+ case GUMBO_LEX_TAG_NAME:
256
+ case GUMBO_LEX_BEFORE_ATTR_NAME:
257
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
258
+ break;
259
+ case GUMBO_LEX_SELF_CLOSING_START_TAG:
260
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
261
+ break;
262
+ case GUMBO_LEX_ATTR_NAME:
263
+ case GUMBO_LEX_AFTER_ATTR_NAME:
264
+ case GUMBO_LEX_BEFORE_ATTR_VALUE:
265
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
266
+ break;
267
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
268
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
269
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
270
+ case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
271
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
272
+ break;
273
+ case GUMBO_LEX_BOGUS_COMMENT:
274
+ case GUMBO_LEX_COMMENT_START:
275
+ case GUMBO_LEX_COMMENT_START_DASH:
276
+ case GUMBO_LEX_COMMENT:
277
+ case GUMBO_LEX_COMMENT_END_DASH:
278
+ case GUMBO_LEX_COMMENT_END:
279
+ case GUMBO_LEX_COMMENT_END_BANG:
280
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
281
+ break;
282
+ case GUMBO_LEX_MARKUP_DECLARATION:
283
+ case GUMBO_LEX_DOCTYPE:
284
+ case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
285
+ case GUMBO_LEX_DOCTYPE_NAME:
286
+ case GUMBO_LEX_AFTER_DOCTYPE_NAME:
287
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
288
+ case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
289
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
290
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
291
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
292
+ case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
293
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
294
+ case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
295
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
296
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
297
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
298
+ case GUMBO_LEX_BOGUS_DOCTYPE:
299
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
300
+ break;
301
+ case GUMBO_LEX_CDATA:
302
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
303
+ break;
304
+ }
305
+ }
306
+
307
// Returns true iff c is an ASCII letter (A-Z or a-z).
//
// The ISO C isupper/islower functions are deliberately avoided: they depend on
// the program's locale, whereas the HTML5 spec's behavior must not vary with
// the locale the program runs in.
static bool is_alpha(int c) {
  if (c >= 'a' && c <= 'z') {
    return true;
  }
  return c >= 'A' && c <= 'Z';
}
313
+
314
// Maps ASCII uppercase letters (A-Z) to their lowercase counterparts and
// returns every other value unchanged.
static int ensure_lowercase(int c) {
  if (c < 'A' || c > 'Z') {
    return c;
  }
  return c + 0x20;
}
317
+
318
+ static GumboTokenType get_char_token_type(int c) {
319
+ switch (c) {
320
+ case '\t':
321
+ case '\n':
322
+ case '\r':
323
+ case '\f':
324
+ case ' ':
325
+ return GUMBO_TOKEN_WHITESPACE;
326
+ case 0:
327
+ gumbo_debug("Emitted null byte.\n");
328
+ return GUMBO_TOKEN_NULL;
329
+ case -1:
330
+ return GUMBO_TOKEN_EOF;
331
+ default:
332
+ return GUMBO_TOKEN_CHARACTER;
333
+ }
334
+ }
335
+
336
+ // Starts recording characters in the temporary buffer.
337
+ // Because this needs to reset the utf8iterator_mark to the beginning of the
338
+ // text that will eventually be emitted, it needs to be called a couple of
339
+ // states before the spec says "Set the temporary buffer to the empty string".
340
+ // In general, this should be called whenever there's a transition to a
341
+ // "less-than sign state". The initial < and possibly / then need to be
342
+ // appended to the temporary buffer, their presence needs to be accounted for in
343
+ // states that compare the temporary buffer against a literal value, and
344
+ // spec stanzas that say "emit a < and / character token along with a character
345
+ // token for each character in the temporary buffer" need to be adjusted to
346
+ // account for the presence of the < and / inside the temporary buffer.
347
+ static void clear_temporary_buffer(GumboParser* parser) {
348
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
+ assert(!tokenizer->_temporary_buffer_emit);
350
+ utf8iterator_mark(&tokenizer->_input);
351
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
353
+ // The temporary buffer and script data buffer are the same object in the
354
+ // spec, so the script data buffer should be cleared as well.
355
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
357
+ }
358
+
359
+ // Appends a codepoint to the temporary buffer.
360
+ static void append_char_to_temporary_buffer(
361
+ GumboParser* parser, int codepoint) {
362
+ gumbo_string_buffer_append_codepoint(
363
+ parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
364
+ }
365
+
366
+ // Checks to see if the temporary buffer equals a certain string.
367
+ // Make sure this remains side-effect free; it's used in assertions.
368
+ #ifndef NDEBUG
369
+ static bool temporary_buffer_equals(
370
+ GumboParser* parser, const char* text) {
371
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
372
+ // TODO(jdtang): See if the extra strlen is a performance problem, and replace
373
+ // it with an explicit sizeof(literal) if necessary. I don't think it will
374
+ // be, as this is only used in a couple of rare states.
375
+ int text_len = strlen(text);
376
+ return text_len == buffer->length &&
377
+ memcmp(buffer->data, text, text_len) == 0;
378
+ }
379
+ #endif
380
+
381
+ static void doc_type_state_init(GumboParser* parser) {
382
+ GumboTokenDocType* doc_type_state =
383
+ &parser->_tokenizer_state->_doc_type_state;
384
+ // We initialize these to NULL here so that we don't end up leaking memory if
385
+ // we never see a doctype token. When we do see a doctype token, we reset
386
+ // them to a freshly-allocated empty string so that we can present a uniform
387
+ // interface to client code and not make them check for null. Ownership is
388
+ // transferred to the doctype token when it's emitted.
389
+ doc_type_state->name = NULL;
390
+ doc_type_state->public_identifier = NULL;
391
+ doc_type_state->system_identifier = NULL;
392
+ doc_type_state->force_quirks = false;
393
+ doc_type_state->has_public_identifier = false;
394
+ doc_type_state->has_system_identifier = false;
395
+ }
396
+
397
+ // Sets the token original_text and position to the current iterator position.
398
+ // This is necessary because [CDATA[ sections may include text that is ignored
399
+ // by the tokenizer.
400
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
401
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
402
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
403
+ }
404
+
405
+ // Sets the tag buffer original text and start point to the current iterator
406
+ // position. This is necessary because attribute names & values may have
407
+ // whitespace preceeding them, and so we can't assume that the actual token
408
+ // starting point was the end of the last tag buffer usage.
409
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
410
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
411
+ GumboTagState* tag_state = &tokenizer->_tag_state;
412
+
413
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
414
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
415
+ }
416
+
417
+ // Moves the temporary buffer contents over to the specified output string,
418
+ // and clears the temporary buffer.
419
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
420
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
421
+ *output =
422
+ gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
423
+ clear_temporary_buffer(parser);
424
+ }
425
+
426
+ // Advances the iterator past the end of the token, and then fills in the
427
+ // relevant position fields. It's assumed that after every emit, the tokenizer
428
+ // will immediately return (letting the tree-construction stage read the filled
429
+ // in Token). Thus, it's safe to advance the input stream here, since it will
430
+ // bypass the advance at the bottom of the state machine loop.
431
+ //
432
+ // Since this advances the iterator and resets the current input, make sure to
433
+ // call it after you've recorded any other data you need for the token.
434
+ static void finish_token(GumboParser* parser, GumboToken* token) {
435
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
436
+ if (!tokenizer->_reconsume_current_input) {
437
+ utf8iterator_next(&tokenizer->_input);
438
+ }
439
+
440
+ token->position = tokenizer->_token_start_pos;
441
+ token->original_text.data = tokenizer->_token_start;
442
+ reset_token_start_point(tokenizer);
443
+ token->original_text.length =
444
+ tokenizer->_token_start - token->original_text.data;
445
+ if (token->original_text.length > 0 &&
446
+ token->original_text.data[token->original_text.length - 1] == '\r') {
447
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
448
+ // means that the next token may start one past a \r character. The pointer
449
+ // arithmetic above results in that \r being appended to the original text
450
+ // of the preceding token, so we have to adjust its length here to chop the
451
+ // \r off.
452
+ --token->original_text.length;
453
+ }
454
+ }
455
+
456
+ // Records the doctype public ID, assumed to be in the temporary buffer.
457
+ // Convenience method that also sets has_public_identifier to true.
458
+ static void finish_doctype_public_id(GumboParser* parser) {
459
+ GumboTokenDocType* doc_type_state =
460
+ &parser->_tokenizer_state->_doc_type_state;
461
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
462
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
463
+ doc_type_state->has_public_identifier = true;
464
+ }
465
+
466
+ // Records the doctype system ID, assumed to be in the temporary buffer.
467
+ // Convenience method that also sets has_system_identifier to true.
468
+ static void finish_doctype_system_id(GumboParser* parser) {
469
+ GumboTokenDocType* doc_type_state =
470
+ &parser->_tokenizer_state->_doc_type_state;
471
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
472
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
473
+ doc_type_state->has_system_identifier = true;
474
+ }
475
+
476
+ // Writes a single specified character to the output token.
477
+ static void emit_char(GumboParser* parser, int c, GumboToken* output) {
478
+ output->type = get_char_token_type(c);
479
+ output->v.character = c;
480
+ finish_token(parser, output);
481
+ }
482
+
483
+ // Writes a replacement character token and records a parse error.
484
+ // Always returns RETURN_ERROR, per gumbo_lex return value.
485
+ static StateResult emit_replacement_char(
486
+ GumboParser* parser, GumboToken* output) {
487
+ // In all cases, this is because of a null byte in the input stream.
488
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
489
+ emit_char(parser, kUtf8ReplacementChar, output);
490
+ return RETURN_ERROR;
491
+ }
492
+
493
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
494
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
495
+ emit_char(parser, -1, output);
496
+ return RETURN_SUCCESS;
497
+ }
498
+
499
+ // Writes the current input character out as a character token.
500
+ // Always returns RETURN_SUCCESS.
501
+ static bool emit_current_char(GumboParser* parser, GumboToken* output) {
502
+ emit_char(
503
+ parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
504
+ return RETURN_SUCCESS;
505
+ }
506
+
507
+ // Writes out a doctype token, copying it from the tokenizer state.
508
+ static void emit_doctype(GumboParser* parser, GumboToken* output) {
509
+ output->type = GUMBO_TOKEN_DOCTYPE;
510
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
511
+ finish_token(parser, output);
512
+ doc_type_state_init(parser);
513
+ }
514
+
515
+ // Debug-only function that explicitly sets the attribute vector data to NULL so
516
+ // it can be asserted on tag creation, verifying that there are no memory leaks.
517
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
518
+ #ifndef NDEBUG
519
+ tag_state->_attributes = kGumboEmptyVector;
520
+ #endif
521
+ }
522
+
523
+ // Writes out the current tag as a start or end tag token.
524
+ // Always returns RETURN_SUCCESS.
525
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
526
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
527
+ if (tag_state->_is_start_tag) {
528
+ output->type = GUMBO_TOKEN_START_TAG;
529
+ output->v.start_tag.tag = tag_state->_tag;
530
+ output->v.start_tag.attributes = tag_state->_attributes;
531
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
532
+ tag_state->_last_start_tag = tag_state->_tag;
533
+ mark_tag_state_as_empty(tag_state);
534
+ gumbo_debug("Emitted start tag %s.\n",
535
+ gumbo_normalized_tagname(tag_state->_tag));
536
+ } else {
537
+ output->type = GUMBO_TOKEN_END_TAG;
538
+ output->v.end_tag = tag_state->_tag;
539
+ // In end tags, ownership of the attributes vector is not transferred to the
540
+ // token, but it's still initialized as normal, so it must be manually
541
+ // deallocated. There may also be attributes to destroy, in certain broken
542
+ // cases like </div</th> (the "th" is an attribute there).
543
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
544
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
545
+ }
546
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
547
+ mark_tag_state_as_empty(tag_state);
548
+ gumbo_debug("Emitted end tag %s.\n",
549
+ gumbo_normalized_tagname(tag_state->_tag));
550
+ }
551
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
552
+ finish_token(parser, output);
553
+ gumbo_debug("Original text = %.*s.\n", output->original_text.length, output->original_text.data);
554
+ assert(output->original_text.length >= 2);
555
+ assert(output->original_text.data[0] == '<');
556
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
557
+ return RETURN_SUCCESS;
558
+ }
559
+
560
+ // In some states, we speculatively start a tag, but don't know whether it'll be
561
+ // emitted as tag token or as a series of character tokens until we finish it.
562
+ // We need to abandon the tag we'd started & free its memory in that case to
563
+ // avoid a memory leak.
564
+ static void abandon_current_tag(GumboParser* parser) {
565
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
566
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
567
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
568
+ }
569
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
570
+ mark_tag_state_as_empty(tag_state);
571
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
572
+ gumbo_debug("Abandoning current tag.\n");
573
+ }
574
+
575
+ // Wraps the consume_char_ref function to handle its output and make the
576
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
577
+ // error occurred, RETURN_SUCCESS otherwise.
578
+ static StateResult emit_char_ref(
579
+ GumboParser* parser, int additional_allowed_char,
580
+ bool is_in_attribute, GumboToken* output) {
581
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
582
+ OneOrTwoCodepoints char_ref;
583
+ bool status = consume_char_ref(
584
+ parser, &tokenizer->_input, additional_allowed_char, false, &char_ref);
585
+ if (char_ref.first != kGumboNoChar) {
586
+ // consume_char_ref ends with the iterator pointing at the next character,
587
+ // so we need to be sure not advance it again before reading the next token.
588
+ tokenizer->_reconsume_current_input = true;
589
+ emit_char(parser, char_ref.first, output);
590
+ tokenizer->_buffered_emit_char = char_ref.second;
591
+ } else {
592
+ emit_char(parser, '&', output);
593
+ }
594
+ return status ? RETURN_SUCCESS : RETURN_ERROR;
595
+ }
596
+
597
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
598
+ // data, and then it's copied over and released to the 'text' field of the
599
+ // GumboToken union. Always returns RETURN_SUCCESS.
600
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
601
+ output->type = GUMBO_TOKEN_COMMENT;
602
+ finish_temporary_buffer(parser, &output->v.text);
603
+ finish_token(parser, output);
604
+ return RETURN_SUCCESS;
605
+ }
606
+
607
+ // Checks to see we should be flushing accumulated characters in the temporary
608
+ // buffer, and fills the output token with the next output character if so.
609
+ // Returns true if a character has been emitted and the tokenizer should
610
+ // immediately return, false if we're at the end of the temporary buffer and
611
+ // should resume normal operation.
612
+ static bool maybe_emit_from_temporary_buffer(
613
+ GumboParser* parser, GumboToken* output) {
614
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
615
+ const char* c = tokenizer->_temporary_buffer_emit;
616
+ GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
617
+
618
+ if (!c || c >= buffer->data + buffer->length) {
619
+ tokenizer->_temporary_buffer_emit = NULL;
620
+ return false;
621
+ }
622
+
623
+ assert(*c == utf8iterator_current(&tokenizer->_input));
624
+ // emit_char also advances the input stream. We need to do some juggling of
625
+ // the _reconsume_current_input flag to get the proper behavior when emitting
626
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
627
+ // when emitting anything from the temporary buffer, since those characters
628
+ // have already been advanced past. However, it should be preserved so that
629
+ // when the *next* character is encountered again, the tokenizer knows not to
630
+ // advance past it.
631
+ bool saved_reconsume_state = tokenizer->_reconsume_current_input;
632
+ tokenizer->_reconsume_current_input = false;
633
+ emit_char(parser, *c, output);
634
+ ++tokenizer->_temporary_buffer_emit;
635
+ tokenizer->_reconsume_current_input = saved_reconsume_state;
636
+ return true;
637
+ }
638
+
639
+ // Sets up the tokenizer to begin flushing the temporary buffer.
640
+ // This resets the input iterator stream to the start of the last tag, sets up
641
+ // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
642
+ // the first character in it. It returns true if a character was emitted, false
643
+ // otherwise.
644
+ static bool emit_temporary_buffer(
645
+ GumboParser* parser, GumboToken* output) {
646
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
647
+ assert(tokenizer->_temporary_buffer.data);
648
+ utf8iterator_reset(&tokenizer->_input);
649
+ tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
650
+ return maybe_emit_from_temporary_buffer(parser, output);
651
+ }
652
+
653
+ // Appends a codepoint to the current tag buffer. If
654
+ // reinitilize_position_on_first is set, this also initializes the tag buffer
655
+ // start point; the only time you would *not* want to pass true for this
656
+ // parameter is if you want the original_text to include character (like an
657
+ // opening quote) that doesn't appear in the value.
658
+ static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
659
+ bool reinitilize_position_on_first) {
660
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
661
+ if (buffer->length == 0 && reinitilize_position_on_first) {
662
+ reset_tag_buffer_start_point(parser);
663
+ }
664
+ gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
665
+ }
666
+
667
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
668
+ // and _start_pos field to point to the current position.
669
+ static void initialize_tag_buffer(GumboParser* parser) {
670
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
671
+ GumboTagState* tag_state = &tokenizer->_tag_state;
672
+
673
+ gumbo_string_buffer_init(parser, &tag_state->_buffer);
674
+ reset_tag_buffer_start_point(parser);
675
+ }
676
+
677
+ // Initializes the tag_state to start a new tag, keeping track of the opening
678
+ // positions and original text. Takes a boolean indicating whether this is a
679
+ // start or end tag.
680
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
681
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
682
+ GumboTagState* tag_state = &tokenizer->_tag_state;
683
+ int c = utf8iterator_current(&tokenizer->_input);
684
+ assert(is_alpha(c));
685
+ c = ensure_lowercase(c);
686
+ assert(is_alpha(c));
687
+
688
+ initialize_tag_buffer(parser);
689
+ gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
690
+
691
+ assert(tag_state->_attributes.data == NULL);
692
+ gumbo_vector_init(parser, 4, &tag_state->_attributes);
693
+ tag_state->_drop_next_attr_value = false;
694
+ tag_state->_is_start_tag = is_start_tag;
695
+ tag_state->_is_self_closing = false;
696
+ gumbo_debug("Starting new tag.\n");
697
+ }
698
+
699
+ // Fills in the specified char* with the contents of the tag buffer.
700
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
701
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
702
+ GumboTagState* tag_state = &tokenizer->_tag_state;
703
+ *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
704
+ }
705
+
706
+ // Fills in:
707
+ // * The original_text GumboStringPiece with the portion of the original
708
+ // buffer that corresponds to the tag buffer.
709
+ // * The start_pos GumboSourcePosition with the start position of the tag
710
+ // buffer.
711
+ // * The end_pos GumboSourcePosition with the current source position.
712
+ static void copy_over_original_tag_text(
713
+ GumboParser* parser, GumboStringPiece* original_text,
714
+ GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
715
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
716
+ GumboTagState* tag_state = &tokenizer->_tag_state;
717
+
718
+ original_text->data = tag_state->_original_text;
719
+ original_text->length =
720
+ utf8iterator_get_char_pointer(&tokenizer->_input) -
721
+ tag_state->_original_text;
722
+ if (original_text->data[original_text->length - 1] == '\r') {
723
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
724
+ // appended to the end of original text even when it's really the first part
725
+ // of the next character. If we detect this situation, shrink the length of
726
+ // the original text by 1 to remove the carriage return.
727
+ --original_text->length;
728
+ }
729
+ *start_pos = tag_state->_start_pos;
730
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
731
+ }
732
+
733
+ // Releases and then re-initializes the tag buffer.
734
+ static void reinitialize_tag_buffer(GumboParser* parser) {
735
+ gumbo_parser_deallocate(
736
+ parser, parser->_tokenizer_state->_tag_state._buffer.data);
737
+ initialize_tag_buffer(parser);
738
+ }
739
+
740
+ // Moves some data from the temporary buffer over the the tag-based fields in
741
+ // TagState.
742
+ static void finish_tag_name(GumboParser* parser) {
743
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
744
+ GumboTagState* tag_state = &tokenizer->_tag_state;
745
+
746
+ const char* temp;
747
+ copy_over_tag_buffer(parser, &temp);
748
+ tag_state->_tag = gumbo_tag_enum(temp);
749
+ reinitialize_tag_buffer(parser);
750
+ gumbo_parser_deallocate(parser, (void*) temp);
751
+ }
752
+
753
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
754
+ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
755
+ int original_index, int new_index) {
756
+ GumboError* error = gumbo_add_error(parser);
757
+ if (!error) {
758
+ return;
759
+ }
760
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
761
+ error->type = GUMBO_ERR_DUPLICATE_ATTR;
762
+ error->position = tag_state->_start_pos;
763
+ error->original_text = tag_state->_original_text;
764
+ error->v.duplicate_attr.original_index = original_index;
765
+ error->v.duplicate_attr.new_index = new_index;
766
+ copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
767
+ reinitialize_tag_buffer(parser);
768
+ }
769
+
770
+ // Creates a new attribute in the current tag, copying the current tag buffer to
771
+ // the attribute's name. The attribute's value starts out as the empty string
772
+ // (following the "Boolean attributes" section of the spec) and is only
773
+ // overwritten on finish_attribute_value(). If the attribute has already been
774
+ // specified, the new attribute is dropped, a parse error is added, and the
775
+ // function returns false. Otherwise, this returns true.
776
+ static bool finish_attribute_name(GumboParser* parser) {
777
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
778
+ GumboTagState* tag_state = &tokenizer->_tag_state;
779
+ // May've been set by a previous attribute without a value; reset it here.
780
+ tag_state->_drop_next_attr_value = false;
781
+ assert(tag_state->_attributes.data);
782
+ assert(tag_state->_attributes.capacity);
783
+
784
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
785
+ for (int i = 0; i < attributes->length; ++i) {
786
+ GumboAttribute* attr = attributes->data[i];
787
+ if (strlen(attr->name) == tag_state->_buffer.length &&
788
+ memcmp(attr->name, tag_state->_buffer.data,
789
+ tag_state->_buffer.length) == 0) {
790
+ // Identical attribute; bail.
791
+ add_duplicate_attr_error(
792
+ parser, attr->name, i, attributes->length);
793
+ tag_state->_drop_next_attr_value = true;
794
+ return false;
795
+ }
796
+ }
797
+
798
+ GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
799
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
800
+ copy_over_tag_buffer(parser, &attr->name);
801
+ copy_over_original_tag_text(parser, &attr->original_name,
802
+ &attr->name_start, &attr->name_end);
803
+ attr->value = gumbo_copy_stringz(parser, "");
804
+ copy_over_original_tag_text(parser, &attr->original_value,
805
+ &attr->name_start, &attr->name_end);
806
+ gumbo_vector_add(parser, attr, attributes);
807
+ reinitialize_tag_buffer(parser);
808
+ return true;
809
+ }
810
+
811
+ // Finishes an attribute value. This sets the value of the most recently added
812
+ // attribute to the current contents of the tag buffer.
813
+ static void finish_attribute_value(GumboParser* parser) {
814
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
815
+ if (tag_state->_drop_next_attr_value) {
816
+ // Duplicate attribute name detected in an earlier state, so we have to
817
+ // ignore the value.
818
+ tag_state->_drop_next_attr_value = false;
819
+ reinitialize_tag_buffer(parser);
820
+ return;
821
+ }
822
+
823
+ GumboAttribute* attr =
824
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
825
+ gumbo_parser_deallocate(parser, (void*) attr->value);
826
+ copy_over_tag_buffer(parser, &attr->value);
827
+ copy_over_original_tag_text(parser, &attr->original_value,
828
+ &attr->value_start, &attr->value_end);
829
+ reinitialize_tag_buffer(parser);
830
+ }
831
+
832
+ // Returns true if the current end tag matches the last start tag emitted.
833
+ static bool is_appropriate_end_tag(GumboParser* parser) {
834
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
835
+ assert(!tag_state->_is_start_tag);
836
+ // Null terminate the current string buffer, so it can be passed to
837
+ // gumbo_tag_enum, but don't increment the length in case we need to dump the
838
+ // buffer as character tokens.
839
+ gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
840
+ --tag_state->_buffer.length;
841
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
842
+ tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
843
+ }
844
+
845
+ void gumbo_tokenizer_state_init(
846
+ GumboParser* parser, const char* text, size_t text_length) {
847
+ GumboTokenizerState* tokenizer =
848
+ gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
849
+ parser->_tokenizer_state = tokenizer;
850
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
+ tokenizer->_reconsume_current_input = false;
852
+ tokenizer->_is_current_node_foreign = false;
853
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
854
+
855
+ tokenizer->_buffered_emit_char = kGumboNoChar;
856
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
857
+ tokenizer->_temporary_buffer_emit = NULL;
858
+
859
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
860
+
861
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
862
+ tokenizer->_token_start = text;
863
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
864
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
865
+ doc_type_state_init(parser);
866
+ }
867
+
868
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
869
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
870
+ assert(tokenizer->_doc_type_state.name == NULL);
871
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
872
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
873
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
874
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
875
+ gumbo_parser_deallocate(parser, tokenizer);
876
+ }
877
+
878
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
879
+ parser->_tokenizer_state->_state = state;
880
+ }
881
+
882
+ void gumbo_tokenizer_set_is_current_node_foreign(
883
+ GumboParser* parser, bool is_foreign) {
884
+ if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
885
+ gumbo_debug("Toggling is_current_node_foreign to %s.\n",
886
+ is_foreign ? "true" : "false");
887
+ }
888
+ parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
889
+ }
890
+
891
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
892
+ static StateResult handle_data_state(
893
+ GumboParser* parser, GumboTokenizerState* tokenizer,
894
+ int c, GumboToken* output) {
895
+ switch (c) {
896
+ case '&':
897
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898
+ // The char_ref machinery expects to be on the & so it can mark that
899
+ // and return to it if the text isn't a char ref, so we need to
900
+ // reconsume it.
901
+ tokenizer->_reconsume_current_input = true;
902
+ return NEXT_CHAR;
903
+ case '<':
904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905
+ clear_temporary_buffer(parser);
906
+ append_char_to_temporary_buffer(parser, '<');
907
+ return NEXT_CHAR;
908
+ case '\0':
909
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910
+ emit_char(parser, c, output);
911
+ return RETURN_ERROR;
912
+ default:
913
+ return emit_current_char(parser, output);
914
+ }
915
+ }
916
+
917
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
+ static StateResult handle_char_ref_in_data_state(
919
+ GumboParser* parser, GumboTokenizerState* tokenizer,
920
+ int c, GumboToken* output) {
921
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
922
+ return emit_char_ref(parser, ' ', false, output);
923
+ }
924
+
925
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
926
+ static StateResult handle_rcdata_state(
927
+ GumboParser* parser, GumboTokenizerState* tokenizer,
928
+ int c, GumboToken* output) {
929
+ switch (c) {
930
+ case '&':
931
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
932
+ tokenizer->_reconsume_current_input = true;
933
+ return NEXT_CHAR;
934
+ case '<':
935
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
936
+ clear_temporary_buffer(parser);
937
+ append_char_to_temporary_buffer(parser, '<');
938
+ return NEXT_CHAR;
939
+ case '\0':
940
+ return emit_replacement_char(parser, output);
941
+ case -1:
942
+ return emit_eof(parser, output);
943
+ default:
944
+ return emit_current_char(parser, output);
945
+ }
946
+ }
947
+
948
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
949
+ static StateResult handle_char_ref_in_rcdata_state(
950
+ GumboParser* parser, GumboTokenizerState* tokenizer,
951
+ int c, GumboToken* output) {
952
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
953
+ return emit_char_ref(parser, ' ', false, output);
954
+ }
955
+
956
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
957
+ static StateResult handle_rawtext_state(
958
+ GumboParser* parser, GumboTokenizerState* tokenizer,
959
+ int c, GumboToken* output) {
960
+ switch (c) {
961
+ case '<':
962
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
963
+ clear_temporary_buffer(parser);
964
+ append_char_to_temporary_buffer(parser, '<');
965
+ return NEXT_CHAR;
966
+ case '\0':
967
+ return emit_replacement_char(parser, output);
968
+ case -1:
969
+ return emit_eof(parser, output);
970
+ default:
971
+ return emit_current_char(parser, output);
972
+ }
973
+ }
974
+
975
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
976
+ static StateResult handle_script_state(
977
+ GumboParser* parser, GumboTokenizerState* tokenizer,
978
+ int c, GumboToken* output) {
979
+ switch (c) {
980
+ case '<':
981
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
982
+ clear_temporary_buffer(parser);
983
+ append_char_to_temporary_buffer(parser, '<');
984
+ return NEXT_CHAR;
985
+ case '\0':
986
+ return emit_replacement_char(parser, output);
987
+ case -1:
988
+ return emit_eof(parser, output);
989
+ default:
990
+ return emit_current_char(parser, output);
991
+ }
992
+ }
993
+
994
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
995
+ static StateResult handle_plaintext_state(
996
+ GumboParser* parser, GumboTokenizerState* tokenizer,
997
+ int c, GumboToken* output) {
998
+ switch (c) {
999
+ case '\0':
1000
+ return emit_replacement_char(parser, output);
1001
+ case -1:
1002
+ return emit_eof(parser, output);
1003
+ default:
1004
+ return emit_current_char(parser, output);
1005
+ }
1006
+ }
1007
+
1008
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1009
+ static StateResult handle_tag_open_state(
1010
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1011
+ int c, GumboToken* output) {
1012
+ assert(temporary_buffer_equals(parser, "<"));
1013
+ switch (c) {
1014
+ case '!':
1015
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1016
+ clear_temporary_buffer(parser);
1017
+ return NEXT_CHAR;
1018
+ case '/':
1019
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1020
+ append_char_to_temporary_buffer(parser, '/');
1021
+ return NEXT_CHAR;
1022
+ case '?':
1023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1024
+ clear_temporary_buffer(parser);
1025
+ append_char_to_temporary_buffer(parser, '?');
1026
+ add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1027
+ return NEXT_CHAR;
1028
+ default:
1029
+ if (is_alpha(c)) {
1030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1031
+ start_new_tag(parser, true);
1032
+ return NEXT_CHAR;
1033
+ } else {
1034
+ add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1035
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1036
+ emit_temporary_buffer(parser, output);
1037
+ return RETURN_ERROR;
1038
+ }
1039
+ }
1040
+ }
1041
+
1042
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1043
+ static StateResult handle_end_tag_open_state(
1044
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1045
+ int c, GumboToken* output) {
1046
+ assert(temporary_buffer_equals(parser, "</"));
1047
+ switch (c) {
1048
+ case '>':
1049
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1050
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1051
+ return NEXT_CHAR;
1052
+ case -1:
1053
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1055
+ return emit_temporary_buffer(parser, output);
1056
+ default:
1057
+ if (is_alpha(c)) {
1058
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1059
+ start_new_tag(parser, false);
1060
+ } else {
1061
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1062
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1063
+ clear_temporary_buffer(parser);
1064
+ append_char_to_temporary_buffer(parser, c);
1065
+ }
1066
+ return NEXT_CHAR;
1067
+ }
1068
+ }
1069
+
1070
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1071
+ static StateResult handle_tag_name_state(
1072
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1073
+ int c, GumboToken* output) {
1074
+ switch (c) {
1075
+ case '\t':
1076
+ case '\n':
1077
+ case '\f':
1078
+ case ' ':
1079
+ finish_tag_name(parser);
1080
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1081
+ return NEXT_CHAR;
1082
+ case '/':
1083
+ finish_tag_name(parser);
1084
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1085
+ return NEXT_CHAR;
1086
+ case '>':
1087
+ finish_tag_name(parser);
1088
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089
+ return emit_current_tag(parser, output);
1090
+ case '\0':
1091
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1092
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1093
+ return NEXT_CHAR;
1094
+ case -1:
1095
+ add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1096
+ abandon_current_tag(parser);
1097
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1098
+ return NEXT_CHAR;
1099
+ default:
1100
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1101
+ return NEXT_CHAR;
1102
+ }
1103
+ }
1104
+
1105
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1106
+ static StateResult handle_rcdata_lt_state(
1107
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1108
+ int c, GumboToken* output) {
1109
+ assert(temporary_buffer_equals(parser, "<"));
1110
+ if (c == '/') {
1111
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1112
+ append_char_to_temporary_buffer(parser, '/');
1113
+ return NEXT_CHAR;
1114
+ } else {
1115
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1116
+ tokenizer->_reconsume_current_input = true;
1117
+ return emit_temporary_buffer(parser, output);
1118
+ }
1119
+ }
1120
+
1121
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1122
+ static StateResult handle_rcdata_end_tag_open_state(
1123
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1124
+ int c, GumboToken* output) {
1125
+ assert(temporary_buffer_equals(parser, "</"));
1126
+ if (is_alpha(c)) {
1127
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1128
+ start_new_tag(parser, false);
1129
+ append_char_to_temporary_buffer(parser, c);
1130
+ return NEXT_CHAR;
1131
+ } else {
1132
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1133
+ return emit_temporary_buffer(parser, output);
1134
+ }
1135
+ return true;
1136
+ }
1137
+
1138
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1139
+ static StateResult handle_rcdata_end_tag_name_state(
1140
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1141
+ int c, GumboToken* output) {
1142
+ assert(tokenizer->_temporary_buffer.length >= 2);
1143
+ if (is_alpha(c)) {
1144
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1145
+ append_char_to_temporary_buffer(parser, c);
1146
+ return NEXT_CHAR;
1147
+ } else if (is_appropriate_end_tag(parser)) {
1148
+ switch (c) {
1149
+ case '\t':
1150
+ case '\n':
1151
+ case '\f':
1152
+ case ' ':
1153
+ finish_tag_name(parser);
1154
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1155
+ return NEXT_CHAR;
1156
+ case '/':
1157
+ finish_tag_name(parser);
1158
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1159
+ return NEXT_CHAR;
1160
+ case '>':
1161
+ finish_tag_name(parser);
1162
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1163
+ return emit_current_tag(parser, output);
1164
+ }
1165
+ }
1166
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1167
+ abandon_current_tag(parser);
1168
+ return emit_temporary_buffer(parser, output);
1169
+ }
1170
+
1171
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1172
+ static StateResult handle_rawtext_lt_state(
1173
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1174
+ int c, GumboToken* output) {
1175
+ assert(temporary_buffer_equals(parser, "<"));
1176
+ if (c == '/') {
1177
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1178
+ append_char_to_temporary_buffer(parser, '/');
1179
+ return NEXT_CHAR;
1180
+ } else {
1181
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1182
+ tokenizer->_reconsume_current_input = true;
1183
+ return emit_temporary_buffer(parser, output);
1184
+ }
1185
+ }
1186
+
1187
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1188
+ static StateResult handle_rawtext_end_tag_open_state(
1189
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1190
+ int c, GumboToken* output) {
1191
+ assert(temporary_buffer_equals(parser, "</"));
1192
+ if (is_alpha(c)) {
1193
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1194
+ start_new_tag(parser, false);
1195
+ append_char_to_temporary_buffer(parser, c);
1196
+ return NEXT_CHAR;
1197
+ } else {
1198
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1199
+ return emit_temporary_buffer(parser, output);
1200
+ }
1201
+ }
1202
+
1203
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1204
+ static StateResult handle_rawtext_end_tag_name_state(
1205
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1206
+ int c, GumboToken* output) {
1207
+ assert(tokenizer->_temporary_buffer.length >= 2);
1208
+ gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1209
+ tokenizer->_tag_state._buffer.data);
1210
+ if (is_alpha(c)) {
1211
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1212
+ append_char_to_temporary_buffer(parser, c);
1213
+ return NEXT_CHAR;
1214
+ } else if (is_appropriate_end_tag(parser)) {
1215
+ gumbo_debug("Is an appropriate end tag.\n");
1216
+ switch (c) {
1217
+ case '\t':
1218
+ case '\n':
1219
+ case '\f':
1220
+ case ' ':
1221
+ finish_tag_name(parser);
1222
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1223
+ return NEXT_CHAR;
1224
+ case '/':
1225
+ finish_tag_name(parser);
1226
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1227
+ return NEXT_CHAR;
1228
+ case '>':
1229
+ finish_tag_name(parser);
1230
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1231
+ return emit_current_tag(parser, output);
1232
+ }
1233
+ }
1234
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1235
+ abandon_current_tag(parser);
1236
+ return emit_temporary_buffer(parser, output);
1237
+ }
1238
+
1239
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1240
+ static StateResult handle_script_lt_state(
1241
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1242
+ int c, GumboToken* output) {
1243
+ assert(temporary_buffer_equals(parser, "<"));
1244
+ if (c == '/') {
1245
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1246
+ append_char_to_temporary_buffer(parser, '/');
1247
+ return NEXT_CHAR;
1248
+ } else if (c == '!') {
1249
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1250
+ append_char_to_temporary_buffer(parser, '!');
1251
+ return emit_temporary_buffer(parser, output);
1252
+ } else {
1253
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254
+ tokenizer->_reconsume_current_input = true;
1255
+ return emit_temporary_buffer(parser, output);
1256
+ }
1257
+ }
1258
+
1259
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1260
+ static StateResult handle_script_end_tag_open_state(
1261
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1262
+ int c, GumboToken* output) {
1263
+ assert(temporary_buffer_equals(parser, "</"));
1264
+ if (is_alpha(c)) {
1265
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1266
+ start_new_tag(parser, false);
1267
+ append_char_to_temporary_buffer(parser, c);
1268
+ return NEXT_CHAR;
1269
+ } else {
1270
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1271
+ return emit_temporary_buffer(parser, output);
1272
+ }
1273
+ }
1274
+
1275
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1276
+ static StateResult handle_script_end_tag_name_state(
1277
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1278
+ int c, GumboToken* output) {
1279
+ assert(tokenizer->_temporary_buffer.length >= 2);
1280
+ if (is_alpha(c)) {
1281
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1282
+ append_char_to_temporary_buffer(parser, c);
1283
+ return NEXT_CHAR;
1284
+ } else if (is_appropriate_end_tag(parser)) {
1285
+ switch (c) {
1286
+ case '\t':
1287
+ case '\n':
1288
+ case '\f':
1289
+ case ' ':
1290
+ finish_tag_name(parser);
1291
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1292
+ return NEXT_CHAR;
1293
+ case '/':
1294
+ finish_tag_name(parser);
1295
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1296
+ return NEXT_CHAR;
1297
+ case '>':
1298
+ finish_tag_name(parser);
1299
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1300
+ return emit_current_tag(parser, output);
1301
+ }
1302
+ }
1303
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1304
+ abandon_current_tag(parser);
1305
+ return emit_temporary_buffer(parser, output);
1306
+ }
1307
+
1308
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1309
+ static StateResult handle_script_escaped_start_state(
1310
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1311
+ int c, GumboToken* output) {
1312
+ if (c == '-') {
1313
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1314
+ return emit_current_char(parser, output);
1315
+ } else {
1316
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1317
+ tokenizer->_reconsume_current_input = true;
1318
+ return NEXT_CHAR;
1319
+ }
1320
+ }
1321
+
1322
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1323
+ static StateResult handle_script_escaped_start_dash_state(
1324
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1325
+ int c, GumboToken* output) {
1326
+ if (c == '-') {
1327
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1328
+ return emit_current_char(parser, output);
1329
+ } else {
1330
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1331
+ tokenizer->_reconsume_current_input = true;
1332
+ return NEXT_CHAR;
1333
+ }
1334
+ }
1335
+
1336
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1337
+ static StateResult handle_script_escaped_state(
1338
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1339
+ int c, GumboToken* output) {
1340
+ switch (c) {
1341
+ case '-':
1342
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1343
+ return emit_current_char(parser, output);
1344
+ case '<':
1345
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1346
+ clear_temporary_buffer(parser);
1347
+ append_char_to_temporary_buffer(parser, c);
1348
+ return NEXT_CHAR;
1349
+ case '\0':
1350
+ return emit_replacement_char(parser, output);
1351
+ case -1:
1352
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1353
+ return emit_eof(parser, output);
1354
+ default:
1355
+ return emit_current_char(parser, output);
1356
+ }
1357
+ }
1358
+
1359
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1360
+ static StateResult handle_script_escaped_dash_state(
1361
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1362
+ int c, GumboToken* output) {
1363
+ switch (c) {
1364
+ case '-':
1365
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1366
+ return emit_current_char(parser, output);
1367
+ case '<':
1368
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1369
+ clear_temporary_buffer(parser);
1370
+ append_char_to_temporary_buffer(parser, c);
1371
+ return NEXT_CHAR;
1372
+ case '\0':
1373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1374
+ return emit_replacement_char(parser, output);
1375
+ case -1:
1376
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1377
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1378
+ return NEXT_CHAR;
1379
+ default:
1380
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1381
+ return emit_current_char(parser, output);
1382
+ }
1383
+ }
1384
+
1385
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1386
+ static StateResult handle_script_escaped_dash_dash_state(
1387
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1388
+ int c, GumboToken* output) {
1389
+ switch (c) {
1390
+ case '-':
1391
+ return emit_current_char(parser, output);
1392
+ case '<':
1393
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1394
+ clear_temporary_buffer(parser);
1395
+ append_char_to_temporary_buffer(parser, c);
1396
+ return NEXT_CHAR;
1397
+ case '>':
1398
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1399
+ return emit_current_char(parser, output);
1400
+ case '\0':
1401
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1402
+ return emit_replacement_char(parser, output);
1403
+ case -1:
1404
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1405
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1406
+ return NEXT_CHAR;
1407
+ default:
1408
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1409
+ return emit_current_char(parser, output);
1410
+ }
1411
+ }
1412
+
1413
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1414
+ static StateResult handle_script_escaped_lt_state(
1415
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1416
+ int c, GumboToken* output) {
1417
+ assert(temporary_buffer_equals(parser, "<"));
1418
+ assert(!tokenizer->_script_data_buffer.length);
1419
+ if (c == '/') {
1420
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1421
+ append_char_to_temporary_buffer(parser, c);
1422
+ return NEXT_CHAR;
1423
+ } else if (is_alpha(c)) {
1424
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1425
+ append_char_to_temporary_buffer(parser, c);
1426
+ gumbo_string_buffer_append_codepoint(
1427
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1428
+ return emit_temporary_buffer(parser, output);
1429
+ } else {
1430
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1431
+ return emit_temporary_buffer(parser, output);
1432
+ }
1433
+ }
1434
+
1435
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1436
+ static StateResult handle_script_escaped_end_tag_open_state(
1437
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1438
+ int c, GumboToken* output) {
1439
+ assert(temporary_buffer_equals(parser, "</"));
1440
+ if (is_alpha(c)) {
1441
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1442
+ start_new_tag(parser, false);
1443
+ append_char_to_temporary_buffer(parser, c);
1444
+ return NEXT_CHAR;
1445
+ } else {
1446
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1447
+ return emit_temporary_buffer(parser, output);
1448
+ }
1449
+ }
1450
+
1451
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1452
+ static StateResult handle_script_escaped_end_tag_name_state(
1453
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1454
+ int c, GumboToken* output) {
1455
+ assert(tokenizer->_temporary_buffer.length >= 2);
1456
+ if (is_alpha(c)) {
1457
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1458
+ append_char_to_temporary_buffer(parser, c);
1459
+ return NEXT_CHAR;
1460
+ } else if (is_appropriate_end_tag(parser)) {
1461
+ switch (c) {
1462
+ case '\t':
1463
+ case '\n':
1464
+ case '\f':
1465
+ case ' ':
1466
+ finish_tag_name(parser);
1467
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1468
+ return NEXT_CHAR;
1469
+ case '/':
1470
+ finish_tag_name(parser);
1471
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1472
+ return NEXT_CHAR;
1473
+ case '>':
1474
+ finish_tag_name(parser);
1475
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1476
+ return emit_current_tag(parser, output);
1477
+ }
1478
+ }
1479
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1480
+ abandon_current_tag(parser);
1481
+ return emit_temporary_buffer(parser, output);
1482
+ }
1483
+
1484
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1485
+ static StateResult handle_script_double_escaped_start_state(
1486
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1487
+ int c, GumboToken* output) {
1488
+ switch (c) {
1489
+ case '\t':
1490
+ case '\n':
1491
+ case '\f':
1492
+ case ' ':
1493
+ case '/':
1494
+ case '>':
1495
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1496
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1497
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
1498
+ return emit_current_char(parser, output);
1499
+ default:
1500
+ if (is_alpha(c)) {
1501
+ gumbo_string_buffer_append_codepoint(
1502
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1503
+ return emit_current_char(parser, output);
1504
+ } else {
1505
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1506
+ tokenizer->_reconsume_current_input = true;
1507
+ return NEXT_CHAR;
1508
+ }
1509
+ }
1510
+ }
1511
+
1512
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1513
+ static StateResult handle_script_double_escaped_state(
1514
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1515
+ int c, GumboToken* output) {
1516
+ switch (c) {
1517
+ case '-':
1518
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1519
+ return emit_current_char(parser, output);
1520
+ case '<':
1521
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1522
+ return emit_current_char(parser, output);
1523
+ case '\0':
1524
+ return emit_replacement_char(parser, output);
1525
+ case -1:
1526
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1527
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1528
+ return NEXT_CHAR;
1529
+ default:
1530
+ return emit_current_char(parser, output);
1531
+ }
1532
+ }
1533
+
1534
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1535
+ static StateResult handle_script_double_escaped_dash_state(
1536
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1537
+ int c, GumboToken* output) {
1538
+ switch (c) {
1539
+ case '-':
1540
+ gumbo_tokenizer_set_state(
1541
+ parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1542
+ return emit_current_char(parser, output);
1543
+ case '<':
1544
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1545
+ return emit_current_char(parser, output);
1546
+ case '\0':
1547
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1548
+ return emit_replacement_char(parser, output);
1549
+ case -1:
1550
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1552
+ return NEXT_CHAR;
1553
+ default:
1554
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1555
+ return emit_current_char(parser, output);
1556
+ }
1557
+ }
1558
+
1559
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1560
+ static StateResult handle_script_double_escaped_dash_dash_state(
1561
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1562
+ int c, GumboToken* output) {
1563
+ switch (c) {
1564
+ case '-':
1565
+ return emit_current_char(parser, output);
1566
+ case '<':
1567
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1568
+ return emit_current_char(parser, output);
1569
+ case '>':
1570
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1571
+ return emit_current_char(parser, output);
1572
+ case '\0':
1573
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1574
+ return emit_replacement_char(parser, output);
1575
+ case -1:
1576
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1577
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1578
+ return NEXT_CHAR;
1579
+ default:
1580
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1581
+ return emit_current_char(parser, output);
1582
+ }
1583
+ }
1584
+
1585
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1586
+ static StateResult handle_script_double_escaped_lt_state(
1587
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1588
+ int c, GumboToken* output) {
1589
+ if (c == '/') {
1590
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1591
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1592
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1593
+ return emit_current_char(parser, output);
1594
+ } else {
1595
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1596
+ tokenizer->_reconsume_current_input = true;
1597
+ return NEXT_CHAR;
1598
+ }
1599
+
1600
+ }
1601
+
1602
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1603
+ static StateResult handle_script_double_escaped_end_state(
1604
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1605
+ int c, GumboToken* output) {
1606
+ switch (c) {
1607
+ case '\t':
1608
+ case '\n':
1609
+ case '\f':
1610
+ case ' ':
1611
+ case '/':
1612
+ case '>':
1613
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1614
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1615
+ ? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1616
+ return emit_current_char(parser, output);
1617
+ default:
1618
+ if (is_alpha(c)) {
1619
+ gumbo_string_buffer_append_codepoint(
1620
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1621
+ return emit_current_char(parser, output);
1622
+ } else {
1623
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1624
+ tokenizer->_reconsume_current_input = true;
1625
+ return NEXT_CHAR;
1626
+ }
1627
+ }
1628
+ }
1629
+
1630
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1631
+ static StateResult handle_before_attr_name_state(
1632
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1633
+ int c, GumboToken* output) {
1634
+ switch (c) {
1635
+ case '\t':
1636
+ case '\n':
1637
+ case '\f':
1638
+ case ' ':
1639
+ return NEXT_CHAR;
1640
+ case '/':
1641
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1642
+ return NEXT_CHAR;
1643
+ case '>':
1644
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1645
+ return emit_current_tag(parser, output);
1646
+ case '\0':
1647
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1649
+ append_char_to_temporary_buffer(parser, 0xfffd);
1650
+ return NEXT_CHAR;
1651
+ case -1:
1652
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1653
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1654
+ abandon_current_tag(parser);
1655
+ return NEXT_CHAR;
1656
+ case '"':
1657
+ case '\'':
1658
+ case '<':
1659
+ case '=':
1660
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1661
+ // Fall through.
1662
+ default:
1663
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1664
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1665
+ return NEXT_CHAR;
1666
+ }
1667
+ }
1668
+
1669
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1670
+ static StateResult handle_attr_name_state(
1671
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1672
+ int c, GumboToken* output) {
1673
+ switch (c) {
1674
+ case '\t':
1675
+ case '\n':
1676
+ case '\f':
1677
+ case ' ':
1678
+ finish_attribute_name(parser);
1679
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1680
+ return NEXT_CHAR;
1681
+ case '/':
1682
+ finish_attribute_name(parser);
1683
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1684
+ return NEXT_CHAR;
1685
+ case '=':
1686
+ finish_attribute_name(parser);
1687
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1688
+ return NEXT_CHAR;
1689
+ case '>':
1690
+ finish_attribute_name(parser);
1691
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1692
+ return emit_current_tag(parser, output);
1693
+ case '\0':
1694
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1695
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1696
+ return NEXT_CHAR;
1697
+ case -1:
1698
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
+ abandon_current_tag(parser);
1700
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1701
+ return NEXT_CHAR;
1702
+ case '"':
1703
+ case '\'':
1704
+ case '<':
1705
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1706
+ // Fall through.
1707
+ default:
1708
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1709
+ return NEXT_CHAR;
1710
+ }
1711
+ }
1712
+
1713
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1714
+ static StateResult handle_after_attr_name_state(
1715
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1716
+ int c, GumboToken* output) {
1717
+ switch (c) {
1718
+ case '\t':
1719
+ case '\n':
1720
+ case '\f':
1721
+ case ' ':
1722
+ return NEXT_CHAR;
1723
+ case '/':
1724
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1725
+ return NEXT_CHAR;
1726
+ case '=':
1727
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1728
+ return NEXT_CHAR;
1729
+ case '>':
1730
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1731
+ return emit_current_tag(parser, output);
1732
+ case '\0':
1733
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1734
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1735
+ append_char_to_temporary_buffer(parser, 0xfffd);
1736
+ return NEXT_CHAR;
1737
+ case -1:
1738
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1739
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1740
+ abandon_current_tag(parser);
1741
+ return NEXT_CHAR;
1742
+ case '"':
1743
+ case '\'':
1744
+ case '<':
1745
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1746
+ // Fall through.
1747
+ default:
1748
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1749
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1750
+ return NEXT_CHAR;
1751
+ }
1752
+ }
1753
+
1754
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1755
+ static StateResult handle_before_attr_value_state(
1756
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1757
+ int c, GumboToken* output) {
1758
+ switch (c) {
1759
+ case '\t':
1760
+ case '\n':
1761
+ case '\f':
1762
+ case ' ':
1763
+ return NEXT_CHAR;
1764
+ case '"':
1765
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1766
+ reset_tag_buffer_start_point(parser);
1767
+ return NEXT_CHAR;
1768
+ case '&':
1769
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1770
+ tokenizer->_reconsume_current_input = true;
1771
+ return NEXT_CHAR;
1772
+ case '\'':
1773
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1774
+ reset_tag_buffer_start_point(parser);
1775
+ return NEXT_CHAR;
1776
+ case '\0':
1777
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1778
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1779
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1780
+ return NEXT_CHAR;
1781
+ case -1:
1782
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1783
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1784
+ abandon_current_tag(parser);
1785
+ tokenizer->_reconsume_current_input = true;
1786
+ return NEXT_CHAR;
1787
+ case '>':
1788
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1789
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790
+ emit_current_tag(parser, output);
1791
+ return RETURN_ERROR;
1792
+ case '<':
1793
+ case '=':
1794
+ case '`':
1795
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1796
+ // Fall through.
1797
+ default:
1798
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1799
+ append_char_to_tag_buffer(parser, c, true);
1800
+ return NEXT_CHAR;
1801
+ }
1802
+ }
1803
+
1804
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1805
+ static StateResult handle_attr_value_double_quoted_state(
1806
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1807
+ int c, GumboToken* output) {
1808
+ switch (c) {
1809
+ case '"':
1810
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1811
+ return NEXT_CHAR;
1812
+ case '&':
1813
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1814
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1815
+ tokenizer->_reconsume_current_input = true;
1816
+ return NEXT_CHAR;
1817
+ case '\0':
1818
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1819
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1820
+ return NEXT_CHAR;
1821
+ case -1:
1822
+ add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1823
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1824
+ abandon_current_tag(parser);
1825
+ tokenizer->_reconsume_current_input = true;
1826
+ return NEXT_CHAR;
1827
+ default:
1828
+ append_char_to_tag_buffer(parser, c, false);
1829
+ return NEXT_CHAR;
1830
+ }
1831
+ }
1832
+
1833
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1834
+ static StateResult handle_attr_value_single_quoted_state(
1835
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1836
+ int c, GumboToken* output) {
1837
+ switch (c) {
1838
+ case '\'':
1839
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1840
+ return NEXT_CHAR;
1841
+ case '&':
1842
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1843
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1844
+ tokenizer->_reconsume_current_input = true;
1845
+ return NEXT_CHAR;
1846
+ case '\0':
1847
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1848
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1849
+ return NEXT_CHAR;
1850
+ case -1:
1851
+ add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1852
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1853
+ abandon_current_tag(parser);
1854
+ tokenizer->_reconsume_current_input = true;
1855
+ return NEXT_CHAR;
1856
+ default:
1857
+ append_char_to_tag_buffer(parser, c, false);
1858
+ return NEXT_CHAR;
1859
+ }
1860
+ }
1861
+
1862
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1863
+ static StateResult handle_attr_value_unquoted_state(
1864
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1865
+ int c, GumboToken* output) {
1866
+ switch (c) {
1867
+ case '\t':
1868
+ case '\n':
1869
+ case '\f':
1870
+ case ' ':
1871
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1872
+ finish_attribute_value(parser);
1873
+ return NEXT_CHAR;
1874
+ case '&':
1875
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1876
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1877
+ tokenizer->_reconsume_current_input = true;
1878
+ return NEXT_CHAR;
1879
+ case '>':
1880
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1881
+ finish_attribute_value(parser);
1882
+ return emit_current_tag(parser, output);
1883
+ case '\0':
1884
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1885
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1886
+ return NEXT_CHAR;
1887
+ case -1:
1888
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1889
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
+ tokenizer->_reconsume_current_input = true;
1891
+ abandon_current_tag(parser);
1892
+ return NEXT_CHAR;
1893
+ case '<':
1894
+ case '=':
1895
+ case '"':
1896
+ case '\'':
1897
+ case '`':
1898
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1899
+ // Fall through.
1900
+ default:
1901
+ append_char_to_tag_buffer(parser, c, true);
1902
+ return NEXT_CHAR;
1903
+ }
1904
+ }
1905
+
1906
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1907
+ static StateResult handle_char_ref_in_attr_value_state(
1908
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1909
+ int c, GumboToken* output) {
1910
+ OneOrTwoCodepoints char_ref;
1911
+ int allowed_char;
1912
+ bool is_unquoted = false;
1913
+ switch (tokenizer->_tag_state._attr_value_state) {
1914
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1915
+ allowed_char = '"';
1916
+ break;
1917
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1918
+ allowed_char = '\'';
1919
+ break;
1920
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1921
+ allowed_char = '>';
1922
+ is_unquoted = true;
1923
+ break;
1924
+ default:
1925
+ // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1926
+ // get that the assert(0) means this codepath will never happen.
1927
+ allowed_char = ' ';
1928
+ assert(0);
1929
+ }
1930
+
1931
+ // Ignore the status, since we don't have a convenient way of signalling that
1932
+ // a parser error has occurred when the error occurs in the middle of a
1933
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1934
+ // but that's a low priority fix.
1935
+ consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1936
+ if (char_ref.first != kGumboNoChar) {
1937
+ tokenizer->_reconsume_current_input = true;
1938
+ append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1939
+ if (char_ref.second != kGumboNoChar) {
1940
+ append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1941
+ }
1942
+ } else {
1943
+ append_char_to_tag_buffer(parser, '&', is_unquoted);
1944
+ }
1945
+ gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1946
+ return NEXT_CHAR;
1947
+ }
1948
+
1949
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1950
+ static StateResult handle_after_attr_value_quoted_state(
1951
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1952
+ int c, GumboToken* output) {
1953
+ finish_attribute_value(parser);
1954
+ switch (c) {
1955
+ case '\t':
1956
+ case '\n':
1957
+ case '\f':
1958
+ case ' ':
1959
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1960
+ return NEXT_CHAR;
1961
+ case '/':
1962
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1963
+ return NEXT_CHAR;
1964
+ case '>':
1965
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1966
+ return emit_current_tag(parser, output);
1967
+ case -1:
1968
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1969
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1970
+ abandon_current_tag(parser);
1971
+ tokenizer->_reconsume_current_input = true;
1972
+ return NEXT_CHAR;
1973
+ default:
1974
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1975
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1976
+ tokenizer->_reconsume_current_input = true;
1977
+ return NEXT_CHAR;
1978
+ }
1979
+ }
1980
+
1981
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1982
+ static StateResult handle_self_closing_start_tag_state(
1983
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1984
+ int c, GumboToken* output) {
1985
+ switch (c) {
1986
+ case '>':
1987
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1988
+ tokenizer->_tag_state._is_self_closing = true;
1989
+ return emit_current_tag(parser, output);
1990
+ case -1:
1991
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1992
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1993
+ abandon_current_tag(parser);
1994
+ return NEXT_CHAR;
1995
+ default:
1996
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1997
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1998
+ tokenizer->_reconsume_current_input = true;
1999
+ return NEXT_CHAR;
2000
+ }
2001
+ }
2002
+
2003
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
2004
+ static StateResult handle_bogus_comment_state(
2005
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2006
+ int c, GumboToken* output) {
2007
+ while (c != '>' && c != -1) {
2008
+ if (c == '\0') {
2009
+ c = 0xFFFD;
2010
+ }
2011
+ append_char_to_temporary_buffer(parser, c);
2012
+ utf8iterator_next(&tokenizer->_input);
2013
+ c = utf8iterator_current(&tokenizer->_input);
2014
+ }
2015
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2016
+ return emit_comment(parser, output);
2017
+ }
2018
+
2019
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
2020
+ static StateResult handle_markup_declaration_state(
2021
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2022
+ int c, GumboToken* output) {
2023
+ if (utf8iterator_maybe_consume_match(
2024
+ &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2025
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2026
+ tokenizer->_reconsume_current_input = true;
2027
+ } else if (utf8iterator_maybe_consume_match(
2028
+ &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2029
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2030
+ tokenizer->_reconsume_current_input = true;
2031
+ // If we get here, we know we'll eventually emit a doctype token, so now is
2032
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
2033
+ // since then they'll leak if ownership never gets transferred to the
2034
+ // doctype token.
2035
+ tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
2036
+ tokenizer->_doc_type_state.public_identifier =
2037
+ gumbo_copy_stringz(parser, "");
2038
+ tokenizer->_doc_type_state.system_identifier =
2039
+ gumbo_copy_stringz(parser, "");
2040
+ } else if (tokenizer->_is_current_node_foreign &&
2041
+ utf8iterator_maybe_consume_match(
2042
+ &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2043
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2044
+ tokenizer->_reconsume_current_input = true;
2045
+ } else {
2046
+ add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2047
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2048
+ tokenizer->_reconsume_current_input = true;
2049
+ clear_temporary_buffer(parser);
2050
+ }
2051
+ return NEXT_CHAR;
2052
+ }
2053
+
2054
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2055
+ static StateResult handle_comment_start_state(
2056
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2057
+ int c, GumboToken* output) {
2058
+ switch (c) {
2059
+ case '-':
2060
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2061
+ return NEXT_CHAR;
2062
+ case '\0':
2063
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2064
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2065
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2066
+ return NEXT_CHAR;
2067
+ case '>':
2068
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2069
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2070
+ emit_comment(parser, output);
2071
+ return RETURN_ERROR;
2072
+ case -1:
2073
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2074
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2075
+ emit_comment(parser, output);
2076
+ return RETURN_ERROR;
2077
+ default:
2078
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2079
+ append_char_to_temporary_buffer(parser, c);
2080
+ return NEXT_CHAR;
2081
+ }
2082
+ }
2083
+
2084
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2085
+ static StateResult handle_comment_start_dash_state(
2086
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2087
+ int c, GumboToken* output) {
2088
+ switch (c) {
2089
+ case '-':
2090
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2091
+ return NEXT_CHAR;
2092
+ case '\0':
2093
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2094
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2095
+ append_char_to_temporary_buffer(parser, '-');
2096
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2097
+ return NEXT_CHAR;
2098
+ case '>':
2099
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2100
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2101
+ emit_comment(parser, output);
2102
+ return RETURN_ERROR;
2103
+ case -1:
2104
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2105
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2106
+ emit_comment(parser, output);
2107
+ return RETURN_ERROR;
2108
+ default:
2109
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2110
+ append_char_to_temporary_buffer(parser, '-');
2111
+ append_char_to_temporary_buffer(parser, c);
2112
+ return NEXT_CHAR;
2113
+ }
2114
+ }
2115
+
2116
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2117
+ static StateResult handle_comment_state(
2118
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2119
+ int c, GumboToken* output) {
2120
+ switch (c) {
2121
+ case '-':
2122
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2123
+ return NEXT_CHAR;
2124
+ case '\0':
2125
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2126
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2127
+ return NEXT_CHAR;
2128
+ case -1:
2129
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2130
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2131
+ emit_comment(parser, output);
2132
+ return RETURN_ERROR;
2133
+ default:
2134
+ append_char_to_temporary_buffer(parser, c);
2135
+ return NEXT_CHAR;
2136
+ }
2137
+ }
2138
+
2139
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2140
+ static StateResult handle_comment_end_dash_state(
2141
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2142
+ int c, GumboToken* output) {
2143
+ switch (c) {
2144
+ case '-':
2145
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2146
+ return NEXT_CHAR;
2147
+ case '\0':
2148
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2149
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2150
+ append_char_to_temporary_buffer(parser, '-');
2151
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2152
+ return NEXT_CHAR;
2153
+ case -1:
2154
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2155
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2156
+ emit_comment(parser, output);
2157
+ return RETURN_ERROR;
2158
+ default:
2159
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2160
+ append_char_to_temporary_buffer(parser, '-');
2161
+ append_char_to_temporary_buffer(parser, c);
2162
+ return NEXT_CHAR;
2163
+ }
2164
+ }
2165
+
2166
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2167
+ static StateResult handle_comment_end_state(
2168
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2169
+ int c, GumboToken* output) {
2170
+ switch (c) {
2171
+ case '>':
2172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173
+ return emit_comment(parser, output);
2174
+ case '\0':
2175
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177
+ append_char_to_temporary_buffer(parser, '-');
2178
+ append_char_to_temporary_buffer(parser, '-');
2179
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2180
+ return NEXT_CHAR;
2181
+ case '!':
2182
+ add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2183
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2184
+ return NEXT_CHAR;
2185
+ case '-':
2186
+ add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2187
+ append_char_to_temporary_buffer(parser, '-');
2188
+ return NEXT_CHAR;
2189
+ case -1:
2190
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2191
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2192
+ emit_comment(parser, output);
2193
+ return RETURN_ERROR;
2194
+ default:
2195
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2196
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2197
+ append_char_to_temporary_buffer(parser, '-');
2198
+ append_char_to_temporary_buffer(parser, '-');
2199
+ append_char_to_temporary_buffer(parser, c);
2200
+ return NEXT_CHAR;
2201
+ }
2202
+ }
2203
+
2204
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2205
+ static StateResult handle_comment_end_bang_state(
2206
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2207
+ int c, GumboToken* output) {
2208
+ switch (c) {
2209
+ case '-':
2210
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2211
+ append_char_to_temporary_buffer(parser, '-');
2212
+ append_char_to_temporary_buffer(parser, '-');
2213
+ append_char_to_temporary_buffer(parser, '!');
2214
+ return NEXT_CHAR;
2215
+ case '>':
2216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2217
+ return emit_comment(parser, output);
2218
+ case '\0':
2219
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2221
+ append_char_to_temporary_buffer(parser, '-');
2222
+ append_char_to_temporary_buffer(parser, '-');
2223
+ append_char_to_temporary_buffer(parser, '!');
2224
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2225
+ return NEXT_CHAR;
2226
+ case -1:
2227
+ add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2228
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2229
+ emit_comment(parser, output);
2230
+ return RETURN_ERROR;
2231
+ default:
2232
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2233
+ append_char_to_temporary_buffer(parser, '-');
2234
+ append_char_to_temporary_buffer(parser, '-');
2235
+ append_char_to_temporary_buffer(parser, '!');
2236
+ append_char_to_temporary_buffer(parser, c);
2237
+ return NEXT_CHAR;
2238
+ }
2239
+ }
2240
+
2241
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2242
+ static StateResult handle_doctype_state(
2243
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2244
+ int c, GumboToken* output) {
2245
+ assert(!tokenizer->_temporary_buffer.length);
2246
+ switch (c) {
2247
+ case '\t':
2248
+ case '\n':
2249
+ case '\f':
2250
+ case ' ':
2251
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2252
+ return NEXT_CHAR;
2253
+ case -1:
2254
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2255
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2256
+ tokenizer->_doc_type_state.force_quirks = true;
2257
+ emit_doctype(parser, output);
2258
+ return RETURN_ERROR;
2259
+ default:
2260
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2261
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2262
+ tokenizer->_reconsume_current_input = true;
2263
+ tokenizer->_doc_type_state.force_quirks = true;
2264
+ return NEXT_CHAR;
2265
+ }
2266
+ }
2267
+
2268
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2269
+ static StateResult handle_before_doctype_name_state(
2270
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2271
+ int c, GumboToken* output) {
2272
+ switch (c) {
2273
+ case '\t':
2274
+ case '\n':
2275
+ case '\f':
2276
+ case ' ':
2277
+ return NEXT_CHAR;
2278
+ case '\0':
2279
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2280
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2281
+ tokenizer->_doc_type_state.force_quirks = true;
2282
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2283
+ return NEXT_CHAR;
2284
+ case '>':
2285
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2286
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2287
+ tokenizer->_doc_type_state.force_quirks = true;
2288
+ emit_doctype(parser, output);
2289
+ return RETURN_ERROR;
2290
+ case -1:
2291
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2292
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2293
+ tokenizer->_doc_type_state.force_quirks = true;
2294
+ emit_doctype(parser, output);
2295
+ return RETURN_ERROR;
2296
+ default:
2297
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2298
+ tokenizer->_doc_type_state.force_quirks = false;
2299
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2300
+ return NEXT_CHAR;
2301
+ }
2302
+ }
2303
+
2304
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2305
+ static StateResult handle_doctype_name_state(
2306
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2307
+ int c, GumboToken* output) {
2308
+ switch (c) {
2309
+ case '\t':
2310
+ case '\n':
2311
+ case '\f':
2312
+ case ' ':
2313
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2314
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2315
+ finish_temporary_buffer(
2316
+ parser, &tokenizer->_doc_type_state.name);
2317
+ return NEXT_CHAR;
2318
+ case '>':
2319
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2320
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2321
+ finish_temporary_buffer(
2322
+ parser, &tokenizer->_doc_type_state.name);
2323
+ emit_doctype(parser, output);
2324
+ return RETURN_SUCCESS;
2325
+ case '\0':
2326
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2327
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2328
+ return NEXT_CHAR;
2329
+ case -1:
2330
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2331
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2332
+ tokenizer->_doc_type_state.force_quirks = true;
2333
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2334
+ finish_temporary_buffer(
2335
+ parser, &tokenizer->_doc_type_state.name);
2336
+ emit_doctype(parser, output);
2337
+ return RETURN_ERROR;
2338
+ default:
2339
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2340
+ tokenizer->_doc_type_state.force_quirks = false;
2341
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2342
+ return NEXT_CHAR;
2343
+ }
2344
+ }
2345
+
2346
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2347
+ static StateResult handle_after_doctype_name_state(
2348
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2349
+ int c, GumboToken* output) {
2350
+ switch (c) {
2351
+ case '\t':
2352
+ case '\n':
2353
+ case '\f':
2354
+ case ' ':
2355
+ return NEXT_CHAR;
2356
+ case '>':
2357
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2358
+ emit_doctype(parser, output);
2359
+ return RETURN_SUCCESS;
2360
+ case -1:
2361
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2362
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2363
+ tokenizer->_doc_type_state.force_quirks = true;
2364
+ emit_doctype(parser, output);
2365
+ return RETURN_ERROR;
2366
+ default:
2367
+ if (utf8iterator_maybe_consume_match(
2368
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2369
+ gumbo_tokenizer_set_state(
2370
+ parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2371
+ tokenizer->_reconsume_current_input = true;
2372
+ } else if (utf8iterator_maybe_consume_match(
2373
+ &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
2374
+ gumbo_tokenizer_set_state(
2375
+ parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2376
+ tokenizer->_reconsume_current_input = true;
2377
+ } else {
2378
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2379
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2380
+ tokenizer->_doc_type_state.force_quirks = true;
2381
+ }
2382
+ return NEXT_CHAR;
2383
+ }
2384
+ }
2385
+
2386
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2387
+ static StateResult handle_after_doctype_public_keyword_state(
2388
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2389
+ int c, GumboToken* output) {
2390
+ switch (c) {
2391
+ case '\t':
2392
+ case '\n':
2393
+ case '\f':
2394
+ case ' ':
2395
+ gumbo_tokenizer_set_state(
2396
+ parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2397
+ return NEXT_CHAR;
2398
+ case '"':
2399
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2400
+ assert(temporary_buffer_equals(parser, ""));
2401
+ gumbo_tokenizer_set_state(
2402
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2403
+ return NEXT_CHAR;
2404
+ case '\'':
2405
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2406
+ assert(temporary_buffer_equals(parser, ""));
2407
+ gumbo_tokenizer_set_state(
2408
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2409
+ return NEXT_CHAR;
2410
+ case '>':
2411
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2412
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2413
+ tokenizer->_doc_type_state.force_quirks = true;
2414
+ emit_doctype(parser, output);
2415
+ return RETURN_ERROR;
2416
+ case -1:
2417
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2418
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2419
+ tokenizer->_doc_type_state.force_quirks = true;
2420
+ emit_doctype(parser, output);
2421
+ return RETURN_ERROR;
2422
+ default:
2423
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2424
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2425
+ tokenizer->_doc_type_state.force_quirks = true;
2426
+ emit_doctype(parser, output);
2427
+ return RETURN_ERROR;
2428
+ }
2429
+ }
2430
+
2431
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2432
+ static StateResult handle_before_doctype_public_id_state(
2433
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2434
+ int c, GumboToken* output) {
2435
+ switch (c) {
2436
+ case '\t':
2437
+ case '\n':
2438
+ case '\f':
2439
+ case ' ':
2440
+ return NEXT_CHAR;
2441
+ case '"':
2442
+ assert(temporary_buffer_equals(parser, ""));
2443
+ gumbo_tokenizer_set_state(
2444
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2445
+ return NEXT_CHAR;
2446
+ case '\'':
2447
+ assert(temporary_buffer_equals(parser, ""));
2448
+ gumbo_tokenizer_set_state(
2449
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2450
+ return NEXT_CHAR;
2451
+ case '>':
2452
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2453
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2454
+ tokenizer->_doc_type_state.force_quirks = true;
2455
+ emit_doctype(parser, output);
2456
+ return RETURN_ERROR;
2457
+ case -1:
2458
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2459
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2460
+ tokenizer->_doc_type_state.force_quirks = true;
2461
+ emit_doctype(parser, output);
2462
+ return RETURN_ERROR;
2463
+ default:
2464
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2465
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2466
+ tokenizer->_doc_type_state.force_quirks = true;
2467
+ emit_doctype(parser, output);
2468
+ return RETURN_ERROR;
2469
+ }
2470
+ }
2471
+
2472
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2473
+ static StateResult handle_doctype_public_id_double_quoted_state(
2474
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2475
+ int c, GumboToken* output) {
2476
+ switch (c) {
2477
+ case '"':
2478
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2479
+ finish_doctype_public_id(parser);
2480
+ return NEXT_CHAR;
2481
+ case '\0':
2482
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2483
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2484
+ return NEXT_CHAR;
2485
+ case '>':
2486
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2487
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2488
+ tokenizer->_doc_type_state.force_quirks = true;
2489
+ finish_doctype_public_id(parser);
2490
+ emit_doctype(parser, output);
2491
+ return RETURN_ERROR;
2492
+ case -1:
2493
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2494
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2495
+ tokenizer->_doc_type_state.force_quirks = true;
2496
+ finish_doctype_public_id(parser);
2497
+ emit_doctype(parser, output);
2498
+ return RETURN_ERROR;
2499
+ default:
2500
+ append_char_to_temporary_buffer(parser, c);
2501
+ return NEXT_CHAR;
2502
+ }
2503
+ }
2504
+
2505
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2506
+ static StateResult handle_doctype_public_id_single_quoted_state(
2507
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2508
+ int c, GumboToken* output) {
2509
+ switch (c) {
2510
+ case '\'':
2511
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2512
+ finish_doctype_public_id(parser);
2513
+ return NEXT_CHAR;
2514
+ case '\0':
2515
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2516
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2517
+ return NEXT_CHAR;
2518
+ case '>':
2519
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2520
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2521
+ tokenizer->_doc_type_state.force_quirks = true;
2522
+ finish_doctype_public_id(parser);
2523
+ emit_doctype(parser, output);
2524
+ return RETURN_ERROR;
2525
+ case -1:
2526
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2527
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2528
+ tokenizer->_doc_type_state.force_quirks = true;
2529
+ finish_doctype_public_id(parser);
2530
+ emit_doctype(parser, output);
2531
+ return RETURN_ERROR;
2532
+ default:
2533
+ append_char_to_temporary_buffer(parser, c);
2534
+ return NEXT_CHAR;
2535
+ }
2536
+ }
2537
+
2538
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2539
+ static StateResult handle_after_doctype_public_id_state(
2540
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2541
+ int c, GumboToken* output) {
2542
+ switch (c) {
2543
+ case '\t':
2544
+ case '\n':
2545
+ case '\f':
2546
+ case ' ':
2547
+ gumbo_tokenizer_set_state(
2548
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2549
+ return NEXT_CHAR;
2550
+ case '>':
2551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2552
+ emit_doctype(parser, output);
2553
+ return RETURN_SUCCESS;
2554
+ case '"':
2555
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2556
+ assert(temporary_buffer_equals(parser, ""));
2557
+ gumbo_tokenizer_set_state(
2558
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2559
+ return NEXT_CHAR;
2560
+ case '\'':
2561
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2562
+ assert(temporary_buffer_equals(parser, ""));
2563
+ gumbo_tokenizer_set_state(
2564
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2565
+ return NEXT_CHAR;
2566
+ case -1:
2567
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2568
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2569
+ tokenizer->_reconsume_current_input = true;
2570
+ tokenizer->_doc_type_state.force_quirks = true;
2571
+ return NEXT_CHAR;
2572
+ default:
2573
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2574
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2575
+ tokenizer->_doc_type_state.force_quirks = true;
2576
+ return NEXT_CHAR;
2577
+ }
2578
+ }
2579
+
2580
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2581
+ static StateResult handle_between_doctype_public_system_id_state(
2582
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2583
+ int c, GumboToken* output) {
2584
+ switch (c) {
2585
+ case '\t':
2586
+ case '\n':
2587
+ case '\f':
2588
+ case ' ':
2589
+ return NEXT_CHAR;
2590
+ case '>':
2591
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
+ emit_doctype(parser, output);
2593
+ return RETURN_SUCCESS;
2594
+ case '"':
2595
+ assert(temporary_buffer_equals(parser, ""));
2596
+ gumbo_tokenizer_set_state(
2597
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2598
+ return NEXT_CHAR;
2599
+ case '\'':
2600
+ assert(temporary_buffer_equals(parser, ""));
2601
+ gumbo_tokenizer_set_state(
2602
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2603
+ return NEXT_CHAR;
2604
+ case -1:
2605
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2606
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2607
+ tokenizer->_doc_type_state.force_quirks = true;
2608
+ emit_doctype(parser, output);
2609
+ return RETURN_ERROR;
2610
+ default:
2611
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2612
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2613
+ tokenizer->_doc_type_state.force_quirks = true;
2614
+ emit_doctype(parser, output);
2615
+ return RETURN_ERROR;
2616
+ }
2617
+ }
2618
+
2619
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2620
+ static StateResult handle_after_doctype_system_keyword_state(
2621
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2622
+ int c, GumboToken* output) {
2623
+ switch (c) {
2624
+ case '\t':
2625
+ case '\n':
2626
+ case '\f':
2627
+ case ' ':
2628
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2629
+ return NEXT_CHAR;
2630
+ case '"':
2631
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2632
+ assert(temporary_buffer_equals(parser, ""));
2633
+ gumbo_tokenizer_set_state(
2634
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2635
+ return NEXT_CHAR;
2636
+ case '\'':
2637
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2638
+ assert(temporary_buffer_equals(parser, ""));
2639
+ gumbo_tokenizer_set_state(
2640
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2641
+ return NEXT_CHAR;
2642
+ case '>':
2643
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2644
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2645
+ tokenizer->_doc_type_state.force_quirks = true;
2646
+ emit_doctype(parser, output);
2647
+ return RETURN_ERROR;
2648
+ case -1:
2649
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2650
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2651
+ tokenizer->_doc_type_state.force_quirks = true;
2652
+ emit_doctype(parser, output);
2653
+ return RETURN_ERROR;
2654
+ default:
2655
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2656
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2657
+ tokenizer->_doc_type_state.force_quirks = true;
2658
+ return NEXT_CHAR;
2659
+ }
2660
+ }
2661
+
2662
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2663
+ static StateResult handle_before_doctype_system_id_state(
2664
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2665
+ int c, GumboToken* output) {
2666
+ switch (c) {
2667
+ case '\t':
2668
+ case '\n':
2669
+ case '\f':
2670
+ case ' ':
2671
+ return NEXT_CHAR;
2672
+ case '"':
2673
+ assert(temporary_buffer_equals(parser, ""));
2674
+ gumbo_tokenizer_set_state(
2675
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2676
+ return NEXT_CHAR;
2677
+ case '\'':
2678
+ assert(temporary_buffer_equals(parser, ""));
2679
+ gumbo_tokenizer_set_state(
2680
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2681
+ return NEXT_CHAR;
2682
+ case '>':
2683
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2684
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2685
+ tokenizer->_doc_type_state.force_quirks = true;
2686
+ emit_doctype(parser, output);
2687
+ return RETURN_ERROR;
2688
+ case -1:
2689
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2690
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2691
+ tokenizer->_doc_type_state.force_quirks = true;
2692
+ emit_doctype(parser, output);
2693
+ return RETURN_ERROR;
2694
+ default:
2695
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2697
+ tokenizer->_doc_type_state.force_quirks = true;
2698
+ return NEXT_CHAR;
2699
+ }
2700
+ }
2701
+
2702
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2703
+ static StateResult handle_doctype_system_id_double_quoted_state(
2704
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2705
+ int c, GumboToken* output) {
2706
+ switch (c) {
2707
+ case '"':
2708
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2709
+ finish_doctype_system_id(parser);
2710
+ return NEXT_CHAR;
2711
+ case '\0':
2712
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2713
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2714
+ return NEXT_CHAR;
2715
+ case '>':
2716
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2717
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2718
+ tokenizer->_doc_type_state.force_quirks = true;
2719
+ finish_doctype_system_id(parser);
2720
+ emit_doctype(parser, output);
2721
+ return RETURN_ERROR;
2722
+ case -1:
2723
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2724
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2725
+ tokenizer->_doc_type_state.force_quirks = true;
2726
+ finish_doctype_system_id(parser);
2727
+ emit_doctype(parser, output);
2728
+ return RETURN_ERROR;
2729
+ default:
2730
+ append_char_to_temporary_buffer(parser, c);
2731
+ return NEXT_CHAR;
2732
+ }
2733
+ }
2734
+
2735
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2736
+ static StateResult handle_doctype_system_id_single_quoted_state(
2737
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2738
+ int c, GumboToken* output) {
2739
+ switch (c) {
2740
+ case '\'':
2741
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2742
+ finish_doctype_system_id(parser);
2743
+ return NEXT_CHAR;
2744
+ case '\0':
2745
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2746
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2747
+ return NEXT_CHAR;
2748
+ case '>':
2749
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2750
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2751
+ tokenizer->_doc_type_state.force_quirks = true;
2752
+ finish_doctype_system_id(parser);
2753
+ emit_doctype(parser, output);
2754
+ return RETURN_ERROR;
2755
+ case -1:
2756
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2757
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2758
+ tokenizer->_doc_type_state.force_quirks = true;
2759
+ finish_doctype_system_id(parser);
2760
+ emit_doctype(parser, output);
2761
+ return RETURN_ERROR;
2762
+ default:
2763
+ append_char_to_temporary_buffer(parser, c);
2764
+ return NEXT_CHAR;
2765
+ }
2766
+ }
2767
+
2768
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2769
+ static StateResult handle_after_doctype_system_id_state(
2770
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2771
+ int c, GumboToken* output) {
2772
+ switch (c) {
2773
+ case '\t':
2774
+ case '\n':
2775
+ case '\f':
2776
+ case ' ':
2777
+ return NEXT_CHAR;
2778
+ case '>':
2779
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2780
+ emit_doctype(parser, output);
2781
+ return RETURN_SUCCESS;
2782
+ case -1:
2783
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2784
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2785
+ tokenizer->_doc_type_state.force_quirks = true;
2786
+ emit_doctype(parser, output);
2787
+ return RETURN_ERROR;
2788
+ default:
2789
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2790
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2791
+ return NEXT_CHAR;
2792
+ }
2793
+ }
2794
+
2795
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2796
+ static StateResult handle_bogus_doctype_state(
2797
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2798
+ int c, GumboToken* output) {
2799
+ if (c == '>' || c == -1) {
2800
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2801
+ emit_doctype(parser, output);
2802
+ return RETURN_ERROR;
2803
+ }
2804
+ return NEXT_CHAR;
2805
+ }
2806
+
2807
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2808
+ static StateResult handle_cdata_state(
2809
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2810
+ int c, GumboToken* output) {
2811
+ if (c == -1 || utf8iterator_maybe_consume_match(
2812
+ &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2813
+ tokenizer->_reconsume_current_input = true;
2814
+ reset_token_start_point(tokenizer);
2815
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2816
+ return NEXT_CHAR;
2817
+ } else {
2818
+ return emit_current_char(parser, output);
2819
+ }
2820
+ }
2821
+
2822
+ typedef StateResult (*GumboLexerStateFunction)(
2823
+ GumboParser*, GumboTokenizerState*, int, GumboToken*);
2824
+
2825
+ static GumboLexerStateFunction dispatch_table[] = {
2826
+ handle_data_state,
2827
+ handle_char_ref_in_data_state,
2828
+ handle_rcdata_state,
2829
+ handle_char_ref_in_rcdata_state,
2830
+ handle_rawtext_state,
2831
+ handle_script_state,
2832
+ handle_plaintext_state,
2833
+ handle_tag_open_state,
2834
+ handle_end_tag_open_state,
2835
+ handle_tag_name_state,
2836
+ handle_rcdata_lt_state,
2837
+ handle_rcdata_end_tag_open_state,
2838
+ handle_rcdata_end_tag_name_state,
2839
+ handle_rawtext_lt_state,
2840
+ handle_rawtext_end_tag_open_state,
2841
+ handle_rawtext_end_tag_name_state,
2842
+ handle_script_lt_state,
2843
+ handle_script_end_tag_open_state,
2844
+ handle_script_end_tag_name_state,
2845
+ handle_script_escaped_start_state,
2846
+ handle_script_escaped_start_dash_state,
2847
+ handle_script_escaped_state,
2848
+ handle_script_escaped_dash_state,
2849
+ handle_script_escaped_dash_dash_state,
2850
+ handle_script_escaped_lt_state,
2851
+ handle_script_escaped_end_tag_open_state,
2852
+ handle_script_escaped_end_tag_name_state,
2853
+ handle_script_double_escaped_start_state,
2854
+ handle_script_double_escaped_state,
2855
+ handle_script_double_escaped_dash_state,
2856
+ handle_script_double_escaped_dash_dash_state,
2857
+ handle_script_double_escaped_lt_state,
2858
+ handle_script_double_escaped_end_state,
2859
+ handle_before_attr_name_state,
2860
+ handle_attr_name_state,
2861
+ handle_after_attr_name_state,
2862
+ handle_before_attr_value_state,
2863
+ handle_attr_value_double_quoted_state,
2864
+ handle_attr_value_single_quoted_state,
2865
+ handle_attr_value_unquoted_state,
2866
+ handle_char_ref_in_attr_value_state,
2867
+ handle_after_attr_value_quoted_state,
2868
+ handle_self_closing_start_tag_state,
2869
+ handle_bogus_comment_state,
2870
+ handle_markup_declaration_state,
2871
+ handle_comment_start_state,
2872
+ handle_comment_start_dash_state,
2873
+ handle_comment_state,
2874
+ handle_comment_end_dash_state,
2875
+ handle_comment_end_state,
2876
+ handle_comment_end_bang_state,
2877
+ handle_doctype_state,
2878
+ handle_before_doctype_name_state,
2879
+ handle_doctype_name_state,
2880
+ handle_after_doctype_name_state,
2881
+ handle_after_doctype_public_keyword_state,
2882
+ handle_before_doctype_public_id_state,
2883
+ handle_doctype_public_id_double_quoted_state,
2884
+ handle_doctype_public_id_single_quoted_state,
2885
+ handle_after_doctype_public_id_state,
2886
+ handle_between_doctype_public_system_id_state,
2887
+ handle_after_doctype_system_keyword_state,
2888
+ handle_before_doctype_system_id_state,
2889
+ handle_doctype_system_id_double_quoted_state,
2890
+ handle_doctype_system_id_single_quoted_state,
2891
+ handle_after_doctype_system_id_state,
2892
+ handle_bogus_doctype_state,
2893
+ handle_cdata_state
2894
+ };
2895
+
2896
+ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2897
+ // Because of the spec requirements that...
2898
+ //
2899
+ // 1. Tokens be handled immediately by the parser upon emission.
2900
+ // 2. Some states (eg. CDATA, or various error conditions) require the
2901
+ // emission of multiple tokens in the same states.
2902
+ // 3. The tokenizer often has to reconsume the same character in a different
2903
+ // state.
2904
+ //
2905
+ // ...all state must be held in the GumboTokenizer struct instead of in local
2906
+ // variables in this function. That allows us to return from this method with
2907
+ // a token, and then immediately jump back to the same state with the same
2908
+ // input if we need to return a different token. The various emit_* functions
2909
+ // are responsible for changing state (eg. flushing the chardata buffer,
2910
+ // reading the next input character) to avoid an infinite loop.
2911
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2912
+
2913
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2914
+ tokenizer->_reconsume_current_input = true;
2915
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
2916
+ // And now that we've avoided advancing the input, make sure we set
2917
+ // _reconsume_current_input back to false to make sure the *next* character
2918
+ // isn't consumed twice.
2919
+ tokenizer->_reconsume_current_input = false;
2920
+ tokenizer->_buffered_emit_char = kGumboNoChar;
2921
+ return true;
2922
+ }
2923
+
2924
+ if (maybe_emit_from_temporary_buffer(parser, output)) {
2925
+ return true;
2926
+ }
2927
+
2928
+ while (1) {
2929
+ assert(!tokenizer->_temporary_buffer_emit);
2930
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2931
+ int c = utf8iterator_current(&tokenizer->_input);
2932
+ gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2933
+ StateResult result =
2934
+ dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2935
+ // We need to clear reconsume_current_input before returning to prevent
2936
+ // certain infinite loop states.
2937
+ bool should_advance = !tokenizer->_reconsume_current_input;
2938
+ tokenizer->_reconsume_current_input = false;
2939
+
2940
+ if (result == RETURN_SUCCESS) {
2941
+ return true;
2942
+ } else if(result == RETURN_ERROR) {
2943
+ return false;
2944
+ }
2945
+
2946
+ if (should_advance) {
2947
+ utf8iterator_next(&tokenizer->_input);
2948
+ }
2949
+ }
2950
+ }
2951
+
2952
+ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2953
+ if (!token) return;
2954
+
2955
+ switch (token->type) {
2956
+ case GUMBO_TOKEN_DOCTYPE:
2957
+ gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2958
+ gumbo_parser_deallocate(
2959
+ parser, (void*) token->v.doc_type.public_identifier);
2960
+ gumbo_parser_deallocate(
2961
+ parser, (void*) token->v.doc_type.system_identifier);
2962
+ return;
2963
+ case GUMBO_TOKEN_START_TAG:
2964
+ for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2965
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2966
+ if (attr) {
2967
+ // May have been nulled out if this token was merged with another.
2968
+ gumbo_destroy_attribute(parser, attr);
2969
+ }
2970
+ }
2971
+ gumbo_parser_deallocate(
2972
+ parser, (void*) token->v.start_tag.attributes.data);
2973
+ return;
2974
+ case GUMBO_TOKEN_COMMENT:
2975
+ gumbo_parser_deallocate(parser, (void*) token->v.text);
2976
+ return;
2977
+ default:
2978
+ return;
2979
+ }
2980
+ }