ruby-gumbo 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,40 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_TOKEN_TYPE_H_
18
+ #define GUMBO_TOKEN_TYPE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ // An enum representing the type of token.
25
+ typedef enum {
26
+ GUMBO_TOKEN_DOCTYPE,
27
+ GUMBO_TOKEN_START_TAG,
28
+ GUMBO_TOKEN_END_TAG,
29
+ GUMBO_TOKEN_COMMENT,
30
+ GUMBO_TOKEN_WHITESPACE,
31
+ GUMBO_TOKEN_CHARACTER,
32
+ GUMBO_TOKEN_NULL,
33
+ GUMBO_TOKEN_EOF
34
+ } GumboTokenType;
35
+
36
+ #ifdef __cplusplus
37
+ } // extern C
38
+ #endif
39
+
40
+ #endif // GUMBO_TOKEN_TYPE_H_
@@ -0,0 +1,2980 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Coding conventions specific to this file:
18
+ //
19
+ // 1. Functions that fill in a token should be named emit_*, and should be
20
+ // followed immediately by a return from the tokenizer (true if no error
21
+ // occurred, false if an error occurred). Sometimes the emit functions
22
+ // themselves return a boolean so that they can be combined with the return
23
+ // statement; in this case, they should match this convention.
24
+ // 2. Functions that shuffle data from temporaries to final API structures
25
+ // should be named finish_*, and be called just before the tokenizer exits the
26
+ // state that accumulates the temporary.
27
+ // 3. All internal data structures should be kept in an initialized state from
28
+ // tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ // and reset, it should be deallocated and immediately reinitialized.
30
+ // 4. Make sure there are appropriate break statements following each state.
31
+ // 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ // good idea, and should go at the entry point of each state when added.
33
+ // 6. Statement order within states goes:
34
+ // 1. Add parse errors, if appropriate.
35
+ // 2. Call finish_* functions to build up tag state.
36
+ // 3. Switch to new state. Set _reconsume flag if appropriate.
36
+ // 4. Perform any other temporary buffer manipulation.
37
+ // 5. Emit tokens
38
+ // 6. Return/break.
40
+ // This order ensures that we can verify that every emit is followed by a
41
+ // return, ensures that the correct state is recorded with any parse errors, and
42
+ // prevents parse error position from being messed up by possible mark/resets in
43
+ // temporary buffer manipulation.
44
+
45
+
46
+ #include "tokenizer.h"
47
+
48
+ #include <assert.h>
49
+ #include <stdbool.h>
50
+ #include <string.h>
51
+
52
+ #include "attribute.h"
53
+ #include "char_ref.h"
54
+ #include "error.h"
55
+ #include "gumbo.h"
56
+ #include "parser.h"
57
+ #include "string_buffer.h"
58
+ #include "string_piece.h"
59
+ #include "token_type.h"
60
+ #include "tokenizer_states.h"
61
+ #include "utf8.h"
62
+ #include "util.h"
63
+ #include "vector.h"
64
+
65
+ // Compared against _script_data_buffer to determine if we're in double-escaped
66
+ // script mode.
67
+ const GumboStringPiece kScriptTag = { "script", 6 };
68
+
69
+ // An enum for the return value of each individual state.
70
+ typedef enum {
71
+ RETURN_ERROR, // Return false (error) from the tokenizer.
72
+ RETURN_SUCCESS, // Return true (success) from the tokenizer.
73
+ NEXT_CHAR // Proceed to the next character and continue lexing.
74
+ } StateResult;
75
+
76
+ // This is a struct containing state necessary to build up a tag token,
77
+ // character by character.
78
+ typedef struct GumboInternalTagState {
79
+ // A buffer to accumulate characters for various GumboStringPiece fields.
80
+ GumboStringBuffer _buffer;
81
+
82
+ // A pointer to the start of the original text corresponding to the contents
83
+ // of the buffer.
84
+ const char* _original_text;
85
+
86
+ // The current tag enum, computed once the tag name state has finished so that
87
+ // the buffer can be re-used for building up attributes.
88
+ GumboTag _tag;
89
+
90
+ // The starting location of the text in the buffer.
91
+ GumboSourcePosition _start_pos;
92
+
93
+ // The current list of attributes. This is copied (and ownership of its data
94
+ // transferred) to the GumboStartTag token upon completion of the tag. New
95
+ // attributes are added as soon as their attribute name state is complete, and
96
+ // values are filled in by operating on _attributes.data[_attributes.length - 1].
97
+ GumboVector /* GumboAttribute */ _attributes;
98
+
99
+ // If true, the next attribute value to be finished should be dropped. This
100
+ // happens if a duplicate attribute name is encountered - we want to consume
101
+ // the attribute value, but shouldn't overwrite the existing value.
102
+ bool _drop_next_attr_value;
103
+
104
+ // The state that caused the tokenizer to switch into a character reference in
105
+ // attribute value state. This is used to set the additional allowed
106
+ // character, and is switched back to on completion. Initialized as the
107
+ // tokenizer enters the character reference state.
108
+ GumboTokenizerEnum _attr_value_state;
109
+
110
+ // The last start tag to have been emitted by the tokenizer. This is
111
+ // necessary to check for appropriate end tags.
112
+ GumboTag _last_start_tag;
113
+
114
+ // If true, then this is a start tag. If false, it's an end tag. This is
115
+ // necessary to generate the appropriate token type at tag-closing time.
116
+ bool _is_start_tag;
117
+
118
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
119
+ bool _is_self_closing;
120
+ } GumboTagState;
121
+
122
+ // This is the main tokenizer state struct, containing all state used in
123
+ // tokenizing the input stream.
124
+ typedef struct GumboInternalTokenizerState {
125
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
126
+ GumboTokenizerEnum _state;
127
+
128
+ // A flag indicating whether the current input character needs to be reconsumed
129
+ // in another state, or whether the next input character should be read for
130
+ // the next iteration of the state loop. This is set when the spec reads
131
+ // "Reconsume the current input character in..."
132
+ bool _reconsume_current_input;
133
+
134
+ // A flag indicating whether the current node is a foreign element. This is
135
+ // set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
136
+ // markup declaration state.
137
+ bool _is_current_node_foreign;
138
+
139
+ // Certain states (notably character references) may emit two character tokens
140
+ // at once, but the contract for lex() fills in only one token at a time. The
141
+ // extra character is buffered here, and then this is checked on entry to
142
+ // lex(). If a character is stored here, it's immediately emitted and control
143
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
144
+ // stored.'
145
+ //
146
+ // Note that characters emitted through this mechanism will have their source
147
+ // position marked as the character under the mark, i.e. multiple characters
148
+ // may be emitted with the same position. This is desirable for character
149
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
150
+ // mechanism if the buffered characters must have their original positions in
151
+ // the document.
152
+ int _buffered_emit_char;
153
+
154
+ // A temporary buffer to accumulate characters, as described by the "temporary
155
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
156
+ // way: we record the specific character to go into the buffer, which may
157
+ // sometimes be a lowercased version of the actual input character. However,
158
+ // we *also* use utf8iterator_mark() to record the position at tag start.
159
+ // When we start flushing the temporary buffer, we set _temporary_buffer_emit
160
+ // to the start of it, and then increment it for each call to the tokenizer.
161
+ // We also call utf8iterator_reset(), and utf8iterator_next() through the
162
+ // input stream, so that tokens emitted by emit_char have the correct position
163
+ // and original text.
164
+ GumboStringBuffer _temporary_buffer;
165
+
166
+ // The current cursor position we're emitting from within
167
+ // _temporary_buffer.data. NULL whenever we're not flushing the buffer.
168
+ const char* _temporary_buffer_emit;
169
+
170
+ // The temporary buffer is also used by the spec to check whether we should
171
+ // enter the script data double escaped state, but we can't use the same
172
+ // buffer for both because we have to flush out "<s" as emits while still
173
+ // maintaining the context that will eventually become "script". This is a
174
+ // separate buffer that's used in place of the temporary buffer for states
175
+ // that may enter the script data double escape start state.
176
+ GumboStringBuffer _script_data_buffer;
177
+
178
+ // Pointer to the beginning of the current token in the original buffer; used
179
+ // to record the original text.
180
+ const char* _token_start;
181
+
182
+ // GumboSourcePosition recording the source location of the start of the
183
+ // current token.
184
+ GumboSourcePosition _token_start_pos;
185
+
186
+ // Current tag state.
187
+ GumboTagState _tag_state;
188
+
189
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
190
+ // not used for anything else in the doctype states), and then freshly
191
+ // allocate the strings in the doctype token, then copy it over on emit.
192
+ GumboTokenDocType _doc_type_state;
193
+
194
+ // The UTF8Iterator over the tokenizer input.
195
+ Utf8Iterator _input;
196
+ } GumboTokenizerState;
197
+
198
+ // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
199
+ static void add_parse_error(GumboParser* parser, GumboErrorType type) {
200
+ GumboError* error = gumbo_add_error(parser);
201
+ if (!error) {
202
+ return;
203
+ }
204
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
205
+ utf8iterator_get_position(&tokenizer->_input, &error->position);
206
+ error->original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
207
+ error->type = type;
208
+ error->v.tokenizer.codepoint = utf8iterator_current(&tokenizer->_input);
209
+ switch (tokenizer->_state) {
210
+ case GUMBO_LEX_DATA:
211
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DATA;
212
+ break;
213
+ case GUMBO_LEX_CHAR_REF_IN_DATA:
214
+ case GUMBO_LEX_CHAR_REF_IN_RCDATA:
215
+ case GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE:
216
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CHAR_REF;
217
+ break;
218
+ case GUMBO_LEX_RCDATA:
219
+ case GUMBO_LEX_RCDATA_LT:
220
+ case GUMBO_LEX_RCDATA_END_TAG_OPEN:
221
+ case GUMBO_LEX_RCDATA_END_TAG_NAME:
222
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RCDATA;
223
+ break;
224
+ case GUMBO_LEX_RAWTEXT:
225
+ case GUMBO_LEX_RAWTEXT_LT:
226
+ case GUMBO_LEX_RAWTEXT_END_TAG_OPEN:
227
+ case GUMBO_LEX_RAWTEXT_END_TAG_NAME:
228
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_RAWTEXT;
229
+ break;
230
+ case GUMBO_LEX_PLAINTEXT:
231
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_PLAINTEXT;
232
+ break;
233
+ case GUMBO_LEX_SCRIPT:
234
+ case GUMBO_LEX_SCRIPT_LT:
235
+ case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
236
+ case GUMBO_LEX_SCRIPT_END_TAG_NAME:
237
+ case GUMBO_LEX_SCRIPT_ESCAPED_START:
238
+ case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
239
+ case GUMBO_LEX_SCRIPT_ESCAPED:
240
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
241
+ case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
242
+ case GUMBO_LEX_SCRIPT_ESCAPED_LT:
243
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
244
+ case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
245
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
246
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
247
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
248
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
249
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
250
+ case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
251
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
252
+ break;
253
+ case GUMBO_LEX_TAG_OPEN:
254
+ case GUMBO_LEX_END_TAG_OPEN:
255
+ case GUMBO_LEX_TAG_NAME:
256
+ case GUMBO_LEX_BEFORE_ATTR_NAME:
257
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
258
+ break;
259
+ case GUMBO_LEX_SELF_CLOSING_START_TAG:
260
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
261
+ break;
262
+ case GUMBO_LEX_ATTR_NAME:
263
+ case GUMBO_LEX_AFTER_ATTR_NAME:
264
+ case GUMBO_LEX_BEFORE_ATTR_VALUE:
265
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
266
+ break;
267
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
268
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
269
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
270
+ case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
271
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
272
+ break;
273
+ case GUMBO_LEX_BOGUS_COMMENT:
274
+ case GUMBO_LEX_COMMENT_START:
275
+ case GUMBO_LEX_COMMENT_START_DASH:
276
+ case GUMBO_LEX_COMMENT:
277
+ case GUMBO_LEX_COMMENT_END_DASH:
278
+ case GUMBO_LEX_COMMENT_END:
279
+ case GUMBO_LEX_COMMENT_END_BANG:
280
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
281
+ break;
282
+ case GUMBO_LEX_MARKUP_DECLARATION:
283
+ case GUMBO_LEX_DOCTYPE:
284
+ case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
285
+ case GUMBO_LEX_DOCTYPE_NAME:
286
+ case GUMBO_LEX_AFTER_DOCTYPE_NAME:
287
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
288
+ case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
289
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
290
+ case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
291
+ case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
292
+ case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
293
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
294
+ case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
295
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
296
+ case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
297
+ case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
298
+ case GUMBO_LEX_BOGUS_DOCTYPE:
299
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
300
+ break;
301
+ case GUMBO_LEX_CDATA:
302
+ error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
303
+ break;
304
+ }
305
+ }
306
+
307
+ static bool is_alpha(int c) {
308
+ // We don't use ISO C isupper/islower functions here because they
309
+ // depend upon the program's locale, while the behavior of the HTML5 spec is
310
+ // independent of which locale the program is run in.
311
+ return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
312
+ }
313
+
314
+ static int ensure_lowercase(int c) {
315
+ return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
316
+ }
317
+
318
+ static GumboTokenType get_char_token_type(int c) {
319
+ switch (c) {
320
+ case '\t':
321
+ case '\n':
322
+ case '\r':
323
+ case '\f':
324
+ case ' ':
325
+ return GUMBO_TOKEN_WHITESPACE;
326
+ case 0:
327
+ gumbo_debug("Emitted null byte.\n");
328
+ return GUMBO_TOKEN_NULL;
329
+ case -1:
330
+ return GUMBO_TOKEN_EOF;
331
+ default:
332
+ return GUMBO_TOKEN_CHARACTER;
333
+ }
334
+ }
335
+
336
+ // Starts recording characters in the temporary buffer.
337
+ // Because this needs to reset the utf8iterator_mark to the beginning of the
338
+ // text that will eventually be emitted, it needs to be called a couple of
339
+ // states before the spec says "Set the temporary buffer to the empty string".
340
+ // In general, this should be called whenever there's a transition to a
341
+ // "less-than sign state". The initial < and possibly / then need to be
342
+ // appended to the temporary buffer, their presence needs to be accounted for in
343
+ // states that compare the temporary buffer against a literal value, and
344
+ // spec stanzas that say "emit a < and / character token along with a character
345
+ // token for each character in the temporary buffer" need to be adjusted to
346
+ // account for the presence of the < and / inside the temporary buffer.
347
+ static void clear_temporary_buffer(GumboParser* parser) {
348
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
+ assert(!tokenizer->_temporary_buffer_emit);
350
+ utf8iterator_mark(&tokenizer->_input);
351
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
353
+ // The temporary buffer and script data buffer are the same object in the
354
+ // spec, so the script data buffer should be cleared as well.
355
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
357
+ }
358
+
359
+ // Appends a codepoint to the temporary buffer.
360
+ static void append_char_to_temporary_buffer(
361
+ GumboParser* parser, int codepoint) {
362
+ gumbo_string_buffer_append_codepoint(
363
+ parser, codepoint, &parser->_tokenizer_state->_temporary_buffer);
364
+ }
365
+
366
+ // Checks to see if the temporary buffer equals a certain string.
367
+ // Make sure this remains side-effect free; it's used in assertions.
368
+ #ifndef NDEBUG
369
+ static bool temporary_buffer_equals(
370
+ GumboParser* parser, const char* text) {
371
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
372
+ // TODO(jdtang): See if the extra strlen is a performance problem, and replace
373
+ // it with an explicit sizeof(literal) if necessary. I don't think it will
374
+ // be, as this is only used in a couple of rare states.
375
+ size_t text_len = strlen(text);
376
+ return text_len == buffer->length &&
377
+ memcmp(buffer->data, text, text_len) == 0;
378
+ }
379
+ #endif
380
+
381
+ static void doc_type_state_init(GumboParser* parser) {
382
+ GumboTokenDocType* doc_type_state =
383
+ &parser->_tokenizer_state->_doc_type_state;
384
+ // We initialize these to NULL here so that we don't end up leaking memory if
385
+ // we never see a doctype token. When we do see a doctype token, we reset
386
+ // them to a freshly-allocated empty string so that we can present a uniform
387
+ // interface to client code and not make them check for null. Ownership is
388
+ // transferred to the doctype token when it's emitted.
389
+ doc_type_state->name = NULL;
390
+ doc_type_state->public_identifier = NULL;
391
+ doc_type_state->system_identifier = NULL;
392
+ doc_type_state->force_quirks = false;
393
+ doc_type_state->has_public_identifier = false;
394
+ doc_type_state->has_system_identifier = false;
395
+ }
396
+
397
+ // Sets the token original_text and position to the current iterator position.
398
+ // This is necessary because [CDATA[ sections may include text that is ignored
399
+ // by the tokenizer.
400
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
401
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
402
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
403
+ }
404
+
405
+ // Sets the tag buffer original text and start point to the current iterator
406
+ // position. This is necessary because attribute names & values may have
407
+ // whitespace preceding them, and so we can't assume that the actual token
408
+ // starting point was the end of the last tag buffer usage.
409
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
410
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
411
+ GumboTagState* tag_state = &tokenizer->_tag_state;
412
+
413
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
414
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
415
+ }
416
+
417
+ // Moves the temporary buffer contents over to the specified output string,
418
+ // and clears the temporary buffer.
419
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
420
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
421
+ *output =
422
+ gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
423
+ clear_temporary_buffer(parser);
424
+ }
425
+
426
+ // Advances the iterator past the end of the token, and then fills in the
427
+ // relevant position fields. It's assumed that after every emit, the tokenizer
428
+ // will immediately return (letting the tree-construction stage read the filled
429
+ // in Token). Thus, it's safe to advance the input stream here, since it will
430
+ // bypass the advance at the bottom of the state machine loop.
431
+ //
432
+ // Since this advances the iterator and resets the current input, make sure to
433
+ // call it after you've recorded any other data you need for the token.
434
+ static void finish_token(GumboParser* parser, GumboToken* token) {
435
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
436
+ if (!tokenizer->_reconsume_current_input) {
437
+ utf8iterator_next(&tokenizer->_input);
438
+ }
439
+
440
+ token->position = tokenizer->_token_start_pos;
441
+ token->original_text.data = tokenizer->_token_start;
442
+ reset_token_start_point(tokenizer);
443
+ token->original_text.length =
444
+ tokenizer->_token_start - token->original_text.data;
445
+ if (token->original_text.length > 0 &&
446
+ token->original_text.data[token->original_text.length - 1] == '\r') {
447
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
448
+ // means that the next token may start one past a \r character. The pointer
449
+ // arithmetic above results in that \r being appended to the original text
450
+ // of the preceding token, so we have to adjust its length here to chop the
451
+ // \r off.
452
+ --token->original_text.length;
453
+ }
454
+ }
455
+
456
+ // Records the doctype public ID, assumed to be in the temporary buffer.
457
+ // Convenience method that also sets has_public_identifier to true.
458
+ static void finish_doctype_public_id(GumboParser* parser) {
459
+ GumboTokenDocType* doc_type_state =
460
+ &parser->_tokenizer_state->_doc_type_state;
461
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->public_identifier);
462
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
463
+ doc_type_state->has_public_identifier = true;
464
+ }
465
+
466
+ // Records the doctype system ID, assumed to be in the temporary buffer.
467
+ // Convenience method that also sets has_system_identifier to true.
468
+ static void finish_doctype_system_id(GumboParser* parser) {
469
+ GumboTokenDocType* doc_type_state =
470
+ &parser->_tokenizer_state->_doc_type_state;
471
+ gumbo_parser_deallocate(parser, (void*) doc_type_state->system_identifier);
472
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
473
+ doc_type_state->has_system_identifier = true;
474
+ }
475
+
476
+ // Writes a single specified character to the output token.
477
+ static void emit_char(GumboParser* parser, int c, GumboToken* output) {
478
+ output->type = get_char_token_type(c);
479
+ output->v.character = c;
480
+ finish_token(parser, output);
481
+ }
482
+
483
+ // Writes a replacement character token and records a parse error.
484
+ // Always returns RETURN_ERROR, per gumbo_lex return value.
485
+ static StateResult emit_replacement_char(
486
+ GumboParser* parser, GumboToken* output) {
487
+ // In all cases, this is because of a null byte in the input stream.
488
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
489
+ emit_char(parser, kUtf8ReplacementChar, output);
490
+ return RETURN_ERROR;
491
+ }
492
+
493
+ // Writes an EOF character token. Always returns RETURN_SUCCESS.
494
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
495
+ emit_char(parser, -1, output);
496
+ return RETURN_SUCCESS;
497
+ }
498
+
499
+ // Writes the current input character out as a character token.
500
+ // Always returns RETURN_SUCCESS.
501
+ static StateResult emit_current_char(GumboParser* parser, GumboToken* output) {
502
+ emit_char(
503
+ parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
504
+ return RETURN_SUCCESS;
505
+ }
506
+
507
+ // Writes out a doctype token, copying it from the tokenizer state.
508
+ static void emit_doctype(GumboParser* parser, GumboToken* output) {
509
+ output->type = GUMBO_TOKEN_DOCTYPE;
510
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
511
+ finish_token(parser, output);
512
+ doc_type_state_init(parser);
513
+ }
514
+
515
+ // Debug-only function that explicitly sets the attribute vector data to NULL so
516
+ // it can be asserted on tag creation, verifying that there are no memory leaks.
517
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
518
+ #ifndef NDEBUG
519
+ tag_state->_attributes = kGumboEmptyVector;
520
+ #endif
521
+ }
522
+
523
+ // Writes out the current tag as a start or end tag token.
524
+ // Always returns RETURN_SUCCESS.
525
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
526
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
527
+ if (tag_state->_is_start_tag) {
528
+ output->type = GUMBO_TOKEN_START_TAG;
529
+ output->v.start_tag.tag = tag_state->_tag;
530
+ output->v.start_tag.attributes = tag_state->_attributes;
531
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
532
+ tag_state->_last_start_tag = tag_state->_tag;
533
+ mark_tag_state_as_empty(tag_state);
534
+ gumbo_debug("Emitted start tag %s.\n",
535
+ gumbo_normalized_tagname(tag_state->_tag));
536
+ } else {
537
+ output->type = GUMBO_TOKEN_END_TAG;
538
+ output->v.end_tag = tag_state->_tag;
539
+ // In end tags, ownership of the attributes vector is not transferred to the
540
+ // token, but it's still initialized as normal, so it must be manually
541
+ // deallocated. There may also be attributes to destroy, in certain broken
542
+ // cases like </div</th> (the "th" is an attribute there).
543
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
544
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
545
+ }
546
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
547
+ mark_tag_state_as_empty(tag_state);
548
+ gumbo_debug("Emitted end tag %s.\n",
549
+ gumbo_normalized_tagname(tag_state->_tag));
550
+ }
551
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
552
+ finish_token(parser, output);
553
+ gumbo_debug("Original text = %.*s.\n", (int) output->original_text.length, output->original_text.data);
554
+ assert(output->original_text.length >= 2);
555
+ assert(output->original_text.data[0] == '<');
556
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
557
+ return RETURN_SUCCESS;
558
+ }
559
+
560
+ // In some states, we speculatively start a tag, but don't know whether it'll be
561
+ // emitted as tag token or as a series of character tokens until we finish it.
562
+ // We need to abandon the tag we'd started & free its memory in that case to
563
+ // avoid a memory leak.
564
+ static void abandon_current_tag(GumboParser* parser) {
565
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
566
+ for (int i = 0; i < tag_state->_attributes.length; ++i) {
567
+ gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
568
+ }
569
+ gumbo_parser_deallocate(parser, tag_state->_attributes.data);
570
+ mark_tag_state_as_empty(tag_state);
571
+ gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
572
+ gumbo_debug("Abandoning current tag.\n");
573
+ }
574
+
575
+ // Wraps the consume_char_ref function to handle its output and make the
576
+ // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
577
+ // error occurred, RETURN_SUCCESS otherwise.
578
+ static StateResult emit_char_ref(
579
+ GumboParser* parser, int additional_allowed_char,
580
+ bool is_in_attribute, GumboToken* output) {
581
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
582
+ OneOrTwoCodepoints char_ref;
583
+ bool status = consume_char_ref(
584
+ parser, &tokenizer->_input, additional_allowed_char, is_in_attribute, &char_ref);
585
+ if (char_ref.first != kGumboNoChar) {
586
+ // consume_char_ref ends with the iterator pointing at the next character,
587
+ // so we need to be sure not to advance it again before reading the next token.
588
+ tokenizer->_reconsume_current_input = true;
589
+ emit_char(parser, char_ref.first, output);
590
+ tokenizer->_buffered_emit_char = char_ref.second;
591
+ } else {
592
+ emit_char(parser, '&', output);
593
+ }
594
+ return status ? RETURN_SUCCESS : RETURN_ERROR;
595
+ }
596
+
597
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
598
+ // data, and then it's copied over and released to the 'text' field of the
599
+ // GumboToken union. Always returns RETURN_SUCCESS.
600
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
601
+ output->type = GUMBO_TOKEN_COMMENT;
602
+ finish_temporary_buffer(parser, &output->v.text);
603
+ finish_token(parser, output);
604
+ return RETURN_SUCCESS;
605
+ }
606
+
607
+ // Checks to see we should be flushing accumulated characters in the temporary
608
+ // buffer, and fills the output token with the next output character if so.
609
+ // Returns true if a character has been emitted and the tokenizer should
610
+ // immediately return, false if we're at the end of the temporary buffer and
611
+ // should resume normal operation.
612
+ static bool maybe_emit_from_temporary_buffer(
613
+ GumboParser* parser, GumboToken* output) {
614
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
615
+ const char* c = tokenizer->_temporary_buffer_emit;
616
+ GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
617
+
618
+ if (!c || c >= buffer->data + buffer->length) {
619
+ tokenizer->_temporary_buffer_emit = NULL;
620
+ return false;
621
+ }
622
+
623
+ assert(*c == utf8iterator_current(&tokenizer->_input));
624
+ // emit_char also advances the input stream. We need to do some juggling of
625
+ // the _reconsume_current_input flag to get the proper behavior when emitting
626
+ // previous tokens. Basically, _reconsume_current_input should *never* be set
627
+ // when emitting anything from the temporary buffer, since those characters
628
+ // have already been advanced past. However, it should be preserved so that
629
+ // when the *next* character is encountered again, the tokenizer knows not to
630
+ // advance past it.
631
+ bool saved_reconsume_state = tokenizer->_reconsume_current_input;
632
+ tokenizer->_reconsume_current_input = false;
633
+ emit_char(parser, *c, output);
634
+ ++tokenizer->_temporary_buffer_emit;
635
+ tokenizer->_reconsume_current_input = saved_reconsume_state;
636
+ return true;
637
+ }
638
+
639
+ // Sets up the tokenizer to begin flushing the temporary buffer.
640
+ // This resets the input iterator stream to the start of the last tag, sets up
641
+ // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
642
+ // the first character in it. It returns true if a character was emitted, false
643
+ // otherwise.
644
+ static bool emit_temporary_buffer(
645
+ GumboParser* parser, GumboToken* output) {
646
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
647
+ assert(tokenizer->_temporary_buffer.data);
648
+ utf8iterator_reset(&tokenizer->_input);
649
+ tokenizer->_temporary_buffer_emit = tokenizer->_temporary_buffer.data;
650
+ return maybe_emit_from_temporary_buffer(parser, output);
651
+ }
652
+
653
+ // Appends a codepoint to the current tag buffer. If
654
+ // reinitilize_position_on_first is set, this also initializes the tag buffer
655
+ // start point; the only time you would *not* want to pass true for this
656
+ // parameter is if you want the original_text to include character (like an
657
+ // opening quote) that doesn't appear in the value.
658
+ static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
659
+ bool reinitilize_position_on_first) {
660
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
661
+ if (buffer->length == 0 && reinitilize_position_on_first) {
662
+ reset_tag_buffer_start_point(parser);
663
+ }
664
+ gumbo_string_buffer_append_codepoint(parser, codepoint, buffer);
665
+ }
666
+
667
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
668
+ // and _start_pos field to point to the current position.
669
+ static void initialize_tag_buffer(GumboParser* parser) {
670
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
671
+ GumboTagState* tag_state = &tokenizer->_tag_state;
672
+
673
+ gumbo_string_buffer_init(parser, &tag_state->_buffer);
674
+ reset_tag_buffer_start_point(parser);
675
+ }
676
+
677
+ // Initializes the tag_state to start a new tag, keeping track of the opening
678
+ // positions and original text. Takes a boolean indicating whether this is a
679
+ // start or end tag.
680
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
681
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
682
+ GumboTagState* tag_state = &tokenizer->_tag_state;
683
+ int c = utf8iterator_current(&tokenizer->_input);
684
+ assert(is_alpha(c));
685
+ c = ensure_lowercase(c);
686
+ assert(is_alpha(c));
687
+
688
+ initialize_tag_buffer(parser);
689
+ gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
690
+
691
+ assert(tag_state->_attributes.data == NULL);
692
+ gumbo_vector_init(parser, 4, &tag_state->_attributes);
693
+ tag_state->_drop_next_attr_value = false;
694
+ tag_state->_is_start_tag = is_start_tag;
695
+ tag_state->_is_self_closing = false;
696
+ gumbo_debug("Starting new tag.\n");
697
+ }
698
+
699
+ // Fills in the specified char* with the contents of the tag buffer.
700
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
701
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
702
+ GumboTagState* tag_state = &tokenizer->_tag_state;
703
+ *output = gumbo_string_buffer_to_string(parser, &tag_state->_buffer);
704
+ }
705
+
706
+ // Fills in:
707
+ // * The original_text GumboStringPiece with the portion of the original
708
+ // buffer that corresponds to the tag buffer.
709
+ // * The start_pos GumboSourcePosition with the start position of the tag
710
+ // buffer.
711
+ // * The end_pos GumboSourcePosition with the current source position.
712
+ static void copy_over_original_tag_text(
713
+ GumboParser* parser, GumboStringPiece* original_text,
714
+ GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
715
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
716
+ GumboTagState* tag_state = &tokenizer->_tag_state;
717
+
718
+ original_text->data = tag_state->_original_text;
719
+ original_text->length =
720
+ utf8iterator_get_char_pointer(&tokenizer->_input) -
721
+ tag_state->_original_text;
722
+ if (original_text->data[original_text->length - 1] == '\r') {
723
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
724
+ // appended to the end of original text even when it's really the first part
725
+ // of the next character. If we detect this situation, shrink the length of
726
+ // the original text by 1 to remove the carriage return.
727
+ --original_text->length;
728
+ }
729
+ *start_pos = tag_state->_start_pos;
730
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
731
+ }
732
+
733
+ // Releases and then re-initializes the tag buffer.
734
+ static void reinitialize_tag_buffer(GumboParser* parser) {
735
+ gumbo_parser_deallocate(
736
+ parser, parser->_tokenizer_state->_tag_state._buffer.data);
737
+ initialize_tag_buffer(parser);
738
+ }
739
+
740
+ // Moves some data from the temporary buffer over the the tag-based fields in
741
+ // TagState.
742
+ static void finish_tag_name(GumboParser* parser) {
743
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
744
+ GumboTagState* tag_state = &tokenizer->_tag_state;
745
+
746
+ const char* temp;
747
+ copy_over_tag_buffer(parser, &temp);
748
+ tag_state->_tag = gumbo_tag_enum(temp);
749
+ reinitialize_tag_buffer(parser);
750
+ gumbo_parser_deallocate(parser, (void*) temp);
751
+ }
752
+
753
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
754
+ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
755
+ int original_index, int new_index) {
756
+ GumboError* error = gumbo_add_error(parser);
757
+ if (!error) {
758
+ return;
759
+ }
760
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
761
+ error->type = GUMBO_ERR_DUPLICATE_ATTR;
762
+ error->position = tag_state->_start_pos;
763
+ error->original_text = tag_state->_original_text;
764
+ error->v.duplicate_attr.original_index = original_index;
765
+ error->v.duplicate_attr.new_index = new_index;
766
+ copy_over_tag_buffer(parser, &error->v.duplicate_attr.name);
767
+ reinitialize_tag_buffer(parser);
768
+ }
769
+
770
+ // Creates a new attribute in the current tag, copying the current tag buffer to
771
+ // the attribute's name. The attribute's value starts out as the empty string
772
+ // (following the "Boolean attributes" section of the spec) and is only
773
+ // overwritten on finish_attribute_value(). If the attribute has already been
774
+ // specified, the new attribute is dropped, a parse error is added, and the
775
+ // function returns false. Otherwise, this returns true.
776
+ static bool finish_attribute_name(GumboParser* parser) {
777
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
778
+ GumboTagState* tag_state = &tokenizer->_tag_state;
779
+ // May've been set by a previous attribute without a value; reset it here.
780
+ tag_state->_drop_next_attr_value = false;
781
+ assert(tag_state->_attributes.data);
782
+ assert(tag_state->_attributes.capacity);
783
+
784
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
785
+ for (int i = 0; i < attributes->length; ++i) {
786
+ GumboAttribute* attr = attributes->data[i];
787
+ if (strlen(attr->name) == tag_state->_buffer.length &&
788
+ memcmp(attr->name, tag_state->_buffer.data,
789
+ tag_state->_buffer.length) == 0) {
790
+ // Identical attribute; bail.
791
+ add_duplicate_attr_error(
792
+ parser, attr->name, i, attributes->length);
793
+ tag_state->_drop_next_attr_value = true;
794
+ return false;
795
+ }
796
+ }
797
+
798
+ GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
799
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
800
+ copy_over_tag_buffer(parser, &attr->name);
801
+ copy_over_original_tag_text(parser, &attr->original_name,
802
+ &attr->name_start, &attr->name_end);
803
+ attr->value = gumbo_copy_stringz(parser, "");
804
+ copy_over_original_tag_text(parser, &attr->original_value,
805
+ &attr->name_start, &attr->name_end);
806
+ gumbo_vector_add(parser, attr, attributes);
807
+ reinitialize_tag_buffer(parser);
808
+ return true;
809
+ }
810
+
811
+ // Finishes an attribute value. This sets the value of the most recently added
812
+ // attribute to the current contents of the tag buffer.
813
+ static void finish_attribute_value(GumboParser* parser) {
814
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
815
+ if (tag_state->_drop_next_attr_value) {
816
+ // Duplicate attribute name detected in an earlier state, so we have to
817
+ // ignore the value.
818
+ tag_state->_drop_next_attr_value = false;
819
+ reinitialize_tag_buffer(parser);
820
+ return;
821
+ }
822
+
823
+ GumboAttribute* attr =
824
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
825
+ gumbo_parser_deallocate(parser, (void*) attr->value);
826
+ copy_over_tag_buffer(parser, &attr->value);
827
+ copy_over_original_tag_text(parser, &attr->original_value,
828
+ &attr->value_start, &attr->value_end);
829
+ reinitialize_tag_buffer(parser);
830
+ }
831
+
832
+ // Returns true if the current end tag matches the last start tag emitted.
833
+ static bool is_appropriate_end_tag(GumboParser* parser) {
834
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
835
+ assert(!tag_state->_is_start_tag);
836
+ // Null terminate the current string buffer, so it can be passed to
837
+ // gumbo_tag_enum, but don't increment the length in case we need to dump the
838
+ // buffer as character tokens.
839
+ gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
840
+ --tag_state->_buffer.length;
841
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
842
+ tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
843
+ }
844
+
845
+ void gumbo_tokenizer_state_init(
846
+ GumboParser* parser, const char* text, size_t text_length) {
847
+ GumboTokenizerState* tokenizer =
848
+ gumbo_parser_allocate(parser, sizeof(GumboTokenizerState));
849
+ parser->_tokenizer_state = tokenizer;
850
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
+ tokenizer->_reconsume_current_input = false;
852
+ tokenizer->_is_current_node_foreign = false;
853
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
854
+
855
+ tokenizer->_buffered_emit_char = kGumboNoChar;
856
+ gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
857
+ tokenizer->_temporary_buffer_emit = NULL;
858
+
859
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
860
+
861
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
862
+ tokenizer->_token_start = text;
863
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
864
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
865
+ doc_type_state_init(parser);
866
+ }
867
+
868
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
869
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
870
+ assert(tokenizer->_doc_type_state.name == NULL);
871
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
872
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
873
+ gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
874
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
875
+ gumbo_parser_deallocate(parser, tokenizer);
876
+ }
877
+
878
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
879
+ parser->_tokenizer_state->_state = state;
880
+ }
881
+
882
+ void gumbo_tokenizer_set_is_current_node_foreign(
883
+ GumboParser* parser, bool is_foreign) {
884
+ if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
885
+ gumbo_debug("Toggling is_current_node_foreign to %s.\n",
886
+ is_foreign ? "true" : "false");
887
+ }
888
+ parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
889
+ }
890
+
891
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
892
+ static StateResult handle_data_state(
893
+ GumboParser* parser, GumboTokenizerState* tokenizer,
894
+ int c, GumboToken* output) {
895
+ switch (c) {
896
+ case '&':
897
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
898
+ // The char_ref machinery expects to be on the & so it can mark that
899
+ // and return to it if the text isn't a char ref, so we need to
900
+ // reconsume it.
901
+ tokenizer->_reconsume_current_input = true;
902
+ return NEXT_CHAR;
903
+ case '<':
904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
905
+ clear_temporary_buffer(parser);
906
+ append_char_to_temporary_buffer(parser, '<');
907
+ return NEXT_CHAR;
908
+ case '\0':
909
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
910
+ emit_char(parser, c, output);
911
+ return RETURN_ERROR;
912
+ default:
913
+ return emit_current_char(parser, output);
914
+ }
915
+ }
916
+
917
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
+ static StateResult handle_char_ref_in_data_state(
919
+ GumboParser* parser, GumboTokenizerState* tokenizer,
920
+ int c, GumboToken* output) {
921
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
922
+ return emit_char_ref(parser, ' ', false, output);
923
+ }
924
+
925
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
926
+ static StateResult handle_rcdata_state(
927
+ GumboParser* parser, GumboTokenizerState* tokenizer,
928
+ int c, GumboToken* output) {
929
+ switch (c) {
930
+ case '&':
931
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
932
+ tokenizer->_reconsume_current_input = true;
933
+ return NEXT_CHAR;
934
+ case '<':
935
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
936
+ clear_temporary_buffer(parser);
937
+ append_char_to_temporary_buffer(parser, '<');
938
+ return NEXT_CHAR;
939
+ case '\0':
940
+ return emit_replacement_char(parser, output);
941
+ case -1:
942
+ return emit_eof(parser, output);
943
+ default:
944
+ return emit_current_char(parser, output);
945
+ }
946
+ }
947
+
948
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
949
+ static StateResult handle_char_ref_in_rcdata_state(
950
+ GumboParser* parser, GumboTokenizerState* tokenizer,
951
+ int c, GumboToken* output) {
952
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
953
+ return emit_char_ref(parser, ' ', false, output);
954
+ }
955
+
956
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
957
+ static StateResult handle_rawtext_state(
958
+ GumboParser* parser, GumboTokenizerState* tokenizer,
959
+ int c, GumboToken* output) {
960
+ switch (c) {
961
+ case '<':
962
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
963
+ clear_temporary_buffer(parser);
964
+ append_char_to_temporary_buffer(parser, '<');
965
+ return NEXT_CHAR;
966
+ case '\0':
967
+ return emit_replacement_char(parser, output);
968
+ case -1:
969
+ return emit_eof(parser, output);
970
+ default:
971
+ return emit_current_char(parser, output);
972
+ }
973
+ }
974
+
975
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
976
+ static StateResult handle_script_state(
977
+ GumboParser* parser, GumboTokenizerState* tokenizer,
978
+ int c, GumboToken* output) {
979
+ switch (c) {
980
+ case '<':
981
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
982
+ clear_temporary_buffer(parser);
983
+ append_char_to_temporary_buffer(parser, '<');
984
+ return NEXT_CHAR;
985
+ case '\0':
986
+ return emit_replacement_char(parser, output);
987
+ case -1:
988
+ return emit_eof(parser, output);
989
+ default:
990
+ return emit_current_char(parser, output);
991
+ }
992
+ }
993
+
994
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
995
+ static StateResult handle_plaintext_state(
996
+ GumboParser* parser, GumboTokenizerState* tokenizer,
997
+ int c, GumboToken* output) {
998
+ switch (c) {
999
+ case '\0':
1000
+ return emit_replacement_char(parser, output);
1001
+ case -1:
1002
+ return emit_eof(parser, output);
1003
+ default:
1004
+ return emit_current_char(parser, output);
1005
+ }
1006
+ }
1007
+
1008
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1009
+ static StateResult handle_tag_open_state(
1010
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1011
+ int c, GumboToken* output) {
1012
+ assert(temporary_buffer_equals(parser, "<"));
1013
+ switch (c) {
1014
+ case '!':
1015
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION);
1016
+ clear_temporary_buffer(parser);
1017
+ return NEXT_CHAR;
1018
+ case '/':
1019
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1020
+ append_char_to_temporary_buffer(parser, '/');
1021
+ return NEXT_CHAR;
1022
+ case '?':
1023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1024
+ clear_temporary_buffer(parser);
1025
+ append_char_to_temporary_buffer(parser, '?');
1026
+ add_parse_error(parser, GUMBO_ERR_TAG_STARTS_WITH_QUESTION);
1027
+ return NEXT_CHAR;
1028
+ default:
1029
+ if (is_alpha(c)) {
1030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1031
+ start_new_tag(parser, true);
1032
+ return NEXT_CHAR;
1033
+ } else {
1034
+ add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
1035
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1036
+ emit_temporary_buffer(parser, output);
1037
+ return RETURN_ERROR;
1038
+ }
1039
+ }
1040
+ }
1041
+
1042
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1043
+ static StateResult handle_end_tag_open_state(
1044
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1045
+ int c, GumboToken* output) {
1046
+ assert(temporary_buffer_equals(parser, "</"));
1047
+ switch (c) {
1048
+ case '>':
1049
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EMPTY);
1050
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1051
+ return NEXT_CHAR;
1052
+ case -1:
1053
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_EOF);
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1055
+ return emit_temporary_buffer(parser, output);
1056
+ default:
1057
+ if (is_alpha(c)) {
1058
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_NAME);
1059
+ start_new_tag(parser, false);
1060
+ } else {
1061
+ add_parse_error(parser, GUMBO_ERR_CLOSE_TAG_INVALID);
1062
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1063
+ clear_temporary_buffer(parser);
1064
+ append_char_to_temporary_buffer(parser, c);
1065
+ }
1066
+ return NEXT_CHAR;
1067
+ }
1068
+ }
1069
+
1070
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1071
+ static StateResult handle_tag_name_state(
1072
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1073
+ int c, GumboToken* output) {
1074
+ switch (c) {
1075
+ case '\t':
1076
+ case '\n':
1077
+ case '\f':
1078
+ case ' ':
1079
+ finish_tag_name(parser);
1080
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1081
+ return NEXT_CHAR;
1082
+ case '/':
1083
+ finish_tag_name(parser);
1084
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1085
+ return NEXT_CHAR;
1086
+ case '>':
1087
+ finish_tag_name(parser);
1088
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1089
+ return emit_current_tag(parser, output);
1090
+ case '\0':
1091
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1092
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1093
+ return NEXT_CHAR;
1094
+ case -1:
1095
+ add_parse_error(parser, GUMBO_ERR_TAG_EOF);
1096
+ abandon_current_tag(parser);
1097
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1098
+ return NEXT_CHAR;
1099
+ default:
1100
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1101
+ return NEXT_CHAR;
1102
+ }
1103
+ }
1104
+
1105
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1106
+ static StateResult handle_rcdata_lt_state(
1107
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1108
+ int c, GumboToken* output) {
1109
+ assert(temporary_buffer_equals(parser, "<"));
1110
+ if (c == '/') {
1111
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1112
+ append_char_to_temporary_buffer(parser, '/');
1113
+ return NEXT_CHAR;
1114
+ } else {
1115
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1116
+ tokenizer->_reconsume_current_input = true;
1117
+ return emit_temporary_buffer(parser, output);
1118
+ }
1119
+ }
1120
+
1121
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1122
+ static StateResult handle_rcdata_end_tag_open_state(
1123
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1124
+ int c, GumboToken* output) {
1125
+ assert(temporary_buffer_equals(parser, "</"));
1126
+ if (is_alpha(c)) {
1127
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1128
+ start_new_tag(parser, false);
1129
+ append_char_to_temporary_buffer(parser, c);
1130
+ return NEXT_CHAR;
1131
+ } else {
1132
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1133
+ return emit_temporary_buffer(parser, output);
1134
+ }
1135
+ return true;
1136
+ }
1137
+
1138
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1139
+ static StateResult handle_rcdata_end_tag_name_state(
1140
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1141
+ int c, GumboToken* output) {
1142
+ assert(tokenizer->_temporary_buffer.length >= 2);
1143
+ if (is_alpha(c)) {
1144
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1145
+ append_char_to_temporary_buffer(parser, c);
1146
+ return NEXT_CHAR;
1147
+ } else if (is_appropriate_end_tag(parser)) {
1148
+ switch (c) {
1149
+ case '\t':
1150
+ case '\n':
1151
+ case '\f':
1152
+ case ' ':
1153
+ finish_tag_name(parser);
1154
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1155
+ return NEXT_CHAR;
1156
+ case '/':
1157
+ finish_tag_name(parser);
1158
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1159
+ return NEXT_CHAR;
1160
+ case '>':
1161
+ finish_tag_name(parser);
1162
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1163
+ return emit_current_tag(parser, output);
1164
+ }
1165
+ }
1166
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
1167
+ abandon_current_tag(parser);
1168
+ return emit_temporary_buffer(parser, output);
1169
+ }
1170
+
1171
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1172
+ static StateResult handle_rawtext_lt_state(
1173
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1174
+ int c, GumboToken* output) {
1175
+ assert(temporary_buffer_equals(parser, "<"));
1176
+ if (c == '/') {
1177
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1178
+ append_char_to_temporary_buffer(parser, '/');
1179
+ return NEXT_CHAR;
1180
+ } else {
1181
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1182
+ tokenizer->_reconsume_current_input = true;
1183
+ return emit_temporary_buffer(parser, output);
1184
+ }
1185
+ }
1186
+
1187
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1188
+ static StateResult handle_rawtext_end_tag_open_state(
1189
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1190
+ int c, GumboToken* output) {
1191
+ assert(temporary_buffer_equals(parser, "</"));
1192
+ if (is_alpha(c)) {
1193
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1194
+ start_new_tag(parser, false);
1195
+ append_char_to_temporary_buffer(parser, c);
1196
+ return NEXT_CHAR;
1197
+ } else {
1198
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1199
+ return emit_temporary_buffer(parser, output);
1200
+ }
1201
+ }
1202
+
1203
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1204
+ static StateResult handle_rawtext_end_tag_name_state(
1205
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1206
+ int c, GumboToken* output) {
1207
+ assert(tokenizer->_temporary_buffer.length >= 2);
1208
+ gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1209
+ tokenizer->_tag_state._buffer.data);
1210
+ if (is_alpha(c)) {
1211
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1212
+ append_char_to_temporary_buffer(parser, c);
1213
+ return NEXT_CHAR;
1214
+ } else if (is_appropriate_end_tag(parser)) {
1215
+ gumbo_debug("Is an appropriate end tag.\n");
1216
+ switch (c) {
1217
+ case '\t':
1218
+ case '\n':
1219
+ case '\f':
1220
+ case ' ':
1221
+ finish_tag_name(parser);
1222
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1223
+ return NEXT_CHAR;
1224
+ case '/':
1225
+ finish_tag_name(parser);
1226
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1227
+ return NEXT_CHAR;
1228
+ case '>':
1229
+ finish_tag_name(parser);
1230
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1231
+ return emit_current_tag(parser, output);
1232
+ }
1233
+ }
1234
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
1235
+ abandon_current_tag(parser);
1236
+ return emit_temporary_buffer(parser, output);
1237
+ }
1238
+
1239
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1240
+ static StateResult handle_script_lt_state(
1241
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1242
+ int c, GumboToken* output) {
1243
+ assert(temporary_buffer_equals(parser, "<"));
1244
+ if (c == '/') {
1245
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
1246
+ append_char_to_temporary_buffer(parser, '/');
1247
+ return NEXT_CHAR;
1248
+ } else if (c == '!') {
1249
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START);
1250
+ append_char_to_temporary_buffer(parser, '!');
1251
+ return emit_temporary_buffer(parser, output);
1252
+ } else {
1253
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1254
+ tokenizer->_reconsume_current_input = true;
1255
+ return emit_temporary_buffer(parser, output);
1256
+ }
1257
+ }
1258
+
1259
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1260
+ static StateResult handle_script_end_tag_open_state(
1261
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1262
+ int c, GumboToken* output) {
1263
+ assert(temporary_buffer_equals(parser, "</"));
1264
+ if (is_alpha(c)) {
1265
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
1266
+ start_new_tag(parser, false);
1267
+ append_char_to_temporary_buffer(parser, c);
1268
+ return NEXT_CHAR;
1269
+ } else {
1270
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1271
+ return emit_temporary_buffer(parser, output);
1272
+ }
1273
+ }
1274
+
1275
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1276
+ static StateResult handle_script_end_tag_name_state(
1277
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1278
+ int c, GumboToken* output) {
1279
+ assert(tokenizer->_temporary_buffer.length >= 2);
1280
+ if (is_alpha(c)) {
1281
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1282
+ append_char_to_temporary_buffer(parser, c);
1283
+ return NEXT_CHAR;
1284
+ } else if (is_appropriate_end_tag(parser)) {
1285
+ switch (c) {
1286
+ case '\t':
1287
+ case '\n':
1288
+ case '\f':
1289
+ case ' ':
1290
+ finish_tag_name(parser);
1291
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1292
+ return NEXT_CHAR;
1293
+ case '/':
1294
+ finish_tag_name(parser);
1295
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1296
+ return NEXT_CHAR;
1297
+ case '>':
1298
+ finish_tag_name(parser);
1299
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1300
+ return emit_current_tag(parser, output);
1301
+ }
1302
+ }
1303
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1304
+ abandon_current_tag(parser);
1305
+ return emit_temporary_buffer(parser, output);
1306
+ }
1307
+
1308
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1309
+ static StateResult handle_script_escaped_start_state(
1310
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1311
+ int c, GumboToken* output) {
1312
+ if (c == '-') {
1313
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1314
+ return emit_current_char(parser, output);
1315
+ } else {
1316
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1317
+ tokenizer->_reconsume_current_input = true;
1318
+ return NEXT_CHAR;
1319
+ }
1320
+ }
1321
+
1322
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1323
+ static StateResult handle_script_escaped_start_dash_state(
1324
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1325
+ int c, GumboToken* output) {
1326
+ if (c == '-') {
1327
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1328
+ return emit_current_char(parser, output);
1329
+ } else {
1330
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1331
+ tokenizer->_reconsume_current_input = true;
1332
+ return NEXT_CHAR;
1333
+ }
1334
+ }
1335
+
1336
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1337
+ static StateResult handle_script_escaped_state(
1338
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1339
+ int c, GumboToken* output) {
1340
+ switch (c) {
1341
+ case '-':
1342
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
1343
+ return emit_current_char(parser, output);
1344
+ case '<':
1345
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1346
+ clear_temporary_buffer(parser);
1347
+ append_char_to_temporary_buffer(parser, c);
1348
+ return NEXT_CHAR;
1349
+ case '\0':
1350
+ return emit_replacement_char(parser, output);
1351
+ case -1:
1352
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1353
+ return emit_eof(parser, output);
1354
+ default:
1355
+ return emit_current_char(parser, output);
1356
+ }
1357
+ }
1358
+
1359
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1360
+ static StateResult handle_script_escaped_dash_state(
1361
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1362
+ int c, GumboToken* output) {
1363
+ switch (c) {
1364
+ case '-':
1365
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1366
+ return emit_current_char(parser, output);
1367
+ case '<':
1368
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1369
+ clear_temporary_buffer(parser);
1370
+ append_char_to_temporary_buffer(parser, c);
1371
+ return NEXT_CHAR;
1372
+ case '\0':
1373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1374
+ return emit_replacement_char(parser, output);
1375
+ case -1:
1376
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1377
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1378
+ return NEXT_CHAR;
1379
+ default:
1380
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1381
+ return emit_current_char(parser, output);
1382
+ }
1383
+ }
1384
+
1385
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1386
+ static StateResult handle_script_escaped_dash_dash_state(
1387
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1388
+ int c, GumboToken* output) {
1389
+ switch (c) {
1390
+ case '-':
1391
+ return emit_current_char(parser, output);
1392
+ case '<':
1393
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_LT);
1394
+ clear_temporary_buffer(parser);
1395
+ append_char_to_temporary_buffer(parser, c);
1396
+ return NEXT_CHAR;
1397
+ case '>':
1398
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1399
+ return emit_current_char(parser, output);
1400
+ case '\0':
1401
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1402
+ return emit_replacement_char(parser, output);
1403
+ case -1:
1404
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1405
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1406
+ return NEXT_CHAR;
1407
+ default:
1408
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1409
+ return emit_current_char(parser, output);
1410
+ }
1411
+ }
1412
+
1413
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1414
+ static StateResult handle_script_escaped_lt_state(
1415
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1416
+ int c, GumboToken* output) {
1417
+ assert(temporary_buffer_equals(parser, "<"));
1418
+ assert(!tokenizer->_script_data_buffer.length);
1419
+ if (c == '/') {
1420
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN);
1421
+ append_char_to_temporary_buffer(parser, c);
1422
+ return NEXT_CHAR;
1423
+ } else if (is_alpha(c)) {
1424
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
1425
+ append_char_to_temporary_buffer(parser, c);
1426
+ gumbo_string_buffer_append_codepoint(
1427
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1428
+ return emit_temporary_buffer(parser, output);
1429
+ } else {
1430
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1431
+ return emit_temporary_buffer(parser, output);
1432
+ }
1433
+ }
1434
+
1435
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1436
+ static StateResult handle_script_escaped_end_tag_open_state(
1437
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1438
+ int c, GumboToken* output) {
1439
+ assert(temporary_buffer_equals(parser, "</"));
1440
+ if (is_alpha(c)) {
1441
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
1442
+ start_new_tag(parser, false);
1443
+ append_char_to_temporary_buffer(parser, c);
1444
+ return NEXT_CHAR;
1445
+ } else {
1446
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1447
+ return emit_temporary_buffer(parser, output);
1448
+ }
1449
+ }
1450
+
1451
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1452
+ static StateResult handle_script_escaped_end_tag_name_state(
1453
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1454
+ int c, GumboToken* output) {
1455
+ assert(tokenizer->_temporary_buffer.length >= 2);
1456
+ if (is_alpha(c)) {
1457
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1458
+ append_char_to_temporary_buffer(parser, c);
1459
+ return NEXT_CHAR;
1460
+ } else if (is_appropriate_end_tag(parser)) {
1461
+ switch (c) {
1462
+ case '\t':
1463
+ case '\n':
1464
+ case '\f':
1465
+ case ' ':
1466
+ finish_tag_name(parser);
1467
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1468
+ return NEXT_CHAR;
1469
+ case '/':
1470
+ finish_tag_name(parser);
1471
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1472
+ return NEXT_CHAR;
1473
+ case '>':
1474
+ finish_tag_name(parser);
1475
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1476
+ return emit_current_tag(parser, output);
1477
+ }
1478
+ }
1479
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1480
+ abandon_current_tag(parser);
1481
+ return emit_temporary_buffer(parser, output);
1482
+ }
1483
+
1484
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1485
+ static StateResult handle_script_double_escaped_start_state(
1486
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1487
+ int c, GumboToken* output) {
1488
+ switch (c) {
1489
+ case '\t':
1490
+ case '\n':
1491
+ case '\f':
1492
+ case ' ':
1493
+ case '/':
1494
+ case '>':
1495
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1496
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1497
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
1498
+ return emit_current_char(parser, output);
1499
+ default:
1500
+ if (is_alpha(c)) {
1501
+ gumbo_string_buffer_append_codepoint(
1502
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1503
+ return emit_current_char(parser, output);
1504
+ } else {
1505
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
1506
+ tokenizer->_reconsume_current_input = true;
1507
+ return NEXT_CHAR;
1508
+ }
1509
+ }
1510
+ }
1511
+
1512
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1513
+ static StateResult handle_script_double_escaped_state(
1514
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1515
+ int c, GumboToken* output) {
1516
+ switch (c) {
1517
+ case '-':
1518
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
1519
+ return emit_current_char(parser, output);
1520
+ case '<':
1521
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1522
+ return emit_current_char(parser, output);
1523
+ case '\0':
1524
+ return emit_replacement_char(parser, output);
1525
+ case -1:
1526
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1527
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1528
+ return NEXT_CHAR;
1529
+ default:
1530
+ return emit_current_char(parser, output);
1531
+ }
1532
+ }
1533
+
1534
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1535
+ static StateResult handle_script_double_escaped_dash_state(
1536
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1537
+ int c, GumboToken* output) {
1538
+ switch (c) {
1539
+ case '-':
1540
+ gumbo_tokenizer_set_state(
1541
+ parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH);
1542
+ return emit_current_char(parser, output);
1543
+ case '<':
1544
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1545
+ return emit_current_char(parser, output);
1546
+ case '\0':
1547
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1548
+ return emit_replacement_char(parser, output);
1549
+ case -1:
1550
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1552
+ return NEXT_CHAR;
1553
+ default:
1554
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1555
+ return emit_current_char(parser, output);
1556
+ }
1557
+ }
1558
+
1559
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1560
+ static StateResult handle_script_double_escaped_dash_dash_state(
1561
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1562
+ int c, GumboToken* output) {
1563
+ switch (c) {
1564
+ case '-':
1565
+ return emit_current_char(parser, output);
1566
+ case '<':
1567
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT);
1568
+ return emit_current_char(parser, output);
1569
+ case '>':
1570
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
1571
+ return emit_current_char(parser, output);
1572
+ case '\0':
1573
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1574
+ return emit_replacement_char(parser, output);
1575
+ case -1:
1576
+ add_parse_error(parser, GUMBO_ERR_SCRIPT_EOF);
1577
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1578
+ return NEXT_CHAR;
1579
+ default:
1580
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1581
+ return emit_current_char(parser, output);
1582
+ }
1583
+ }
1584
+
1585
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1586
+ static StateResult handle_script_double_escaped_lt_state(
1587
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1588
+ int c, GumboToken* output) {
1589
+ if (c == '/') {
1590
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1591
+ gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1592
+ gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1593
+ return emit_current_char(parser, output);
1594
+ } else {
1595
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1596
+ tokenizer->_reconsume_current_input = true;
1597
+ return NEXT_CHAR;
1598
+ }
1599
+
1600
+ }
1601
+
1602
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1603
+ static StateResult handle_script_double_escaped_end_state(
1604
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1605
+ int c, GumboToken* output) {
1606
+ switch (c) {
1607
+ case '\t':
1608
+ case '\n':
1609
+ case '\f':
1610
+ case ' ':
1611
+ case '/':
1612
+ case '>':
1613
+ gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1614
+ &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1615
+ ? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1616
+ return emit_current_char(parser, output);
1617
+ default:
1618
+ if (is_alpha(c)) {
1619
+ gumbo_string_buffer_append_codepoint(
1620
+ parser, ensure_lowercase(c), &tokenizer->_script_data_buffer);
1621
+ return emit_current_char(parser, output);
1622
+ } else {
1623
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1624
+ tokenizer->_reconsume_current_input = true;
1625
+ return NEXT_CHAR;
1626
+ }
1627
+ }
1628
+ }
1629
+
1630
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1631
+ static StateResult handle_before_attr_name_state(
1632
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1633
+ int c, GumboToken* output) {
1634
+ switch (c) {
1635
+ case '\t':
1636
+ case '\n':
1637
+ case '\f':
1638
+ case ' ':
1639
+ return NEXT_CHAR;
1640
+ case '/':
1641
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1642
+ return NEXT_CHAR;
1643
+ case '>':
1644
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1645
+ return emit_current_tag(parser, output);
1646
+ case '\0':
1647
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1649
+ append_char_to_temporary_buffer(parser, 0xfffd);
1650
+ return NEXT_CHAR;
1651
+ case -1:
1652
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1653
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1654
+ abandon_current_tag(parser);
1655
+ return NEXT_CHAR;
1656
+ case '"':
1657
+ case '\'':
1658
+ case '<':
1659
+ case '=':
1660
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1661
+ // Fall through.
1662
+ default:
1663
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1664
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1665
+ return NEXT_CHAR;
1666
+ }
1667
+ }
1668
+
1669
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1670
+ static StateResult handle_attr_name_state(
1671
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1672
+ int c, GumboToken* output) {
1673
+ switch (c) {
1674
+ case '\t':
1675
+ case '\n':
1676
+ case '\f':
1677
+ case ' ':
1678
+ finish_attribute_name(parser);
1679
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1680
+ return NEXT_CHAR;
1681
+ case '/':
1682
+ finish_attribute_name(parser);
1683
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1684
+ return NEXT_CHAR;
1685
+ case '=':
1686
+ finish_attribute_name(parser);
1687
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1688
+ return NEXT_CHAR;
1689
+ case '>':
1690
+ finish_attribute_name(parser);
1691
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1692
+ return emit_current_tag(parser, output);
1693
+ case '\0':
1694
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1695
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1696
+ return NEXT_CHAR;
1697
+ case -1:
1698
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1699
+ abandon_current_tag(parser);
1700
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1701
+ return NEXT_CHAR;
1702
+ case '"':
1703
+ case '\'':
1704
+ case '<':
1705
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1706
+ // Fall through.
1707
+ default:
1708
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1709
+ return NEXT_CHAR;
1710
+ }
1711
+ }
1712
+
1713
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1714
+ static StateResult handle_after_attr_name_state(
1715
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1716
+ int c, GumboToken* output) {
1717
+ switch (c) {
1718
+ case '\t':
1719
+ case '\n':
1720
+ case '\f':
1721
+ case ' ':
1722
+ return NEXT_CHAR;
1723
+ case '/':
1724
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1725
+ return NEXT_CHAR;
1726
+ case '=':
1727
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1728
+ return NEXT_CHAR;
1729
+ case '>':
1730
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1731
+ return emit_current_tag(parser, output);
1732
+ case '\0':
1733
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1734
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1735
+ append_char_to_temporary_buffer(parser, 0xfffd);
1736
+ return NEXT_CHAR;
1737
+ case -1:
1738
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
1739
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1740
+ abandon_current_tag(parser);
1741
+ return NEXT_CHAR;
1742
+ case '"':
1743
+ case '\'':
1744
+ case '<':
1745
+ add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1746
+ // Fall through.
1747
+ default:
1748
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1749
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1750
+ return NEXT_CHAR;
1751
+ }
1752
+ }
1753
+
1754
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1755
+ static StateResult handle_before_attr_value_state(
1756
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1757
+ int c, GumboToken* output) {
1758
+ switch (c) {
1759
+ case '\t':
1760
+ case '\n':
1761
+ case '\f':
1762
+ case ' ':
1763
+ return NEXT_CHAR;
1764
+ case '"':
1765
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1766
+ reset_tag_buffer_start_point(parser);
1767
+ return NEXT_CHAR;
1768
+ case '&':
1769
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1770
+ tokenizer->_reconsume_current_input = true;
1771
+ return NEXT_CHAR;
1772
+ case '\'':
1773
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1774
+ reset_tag_buffer_start_point(parser);
1775
+ return NEXT_CHAR;
1776
+ case '\0':
1777
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1778
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1779
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1780
+ return NEXT_CHAR;
1781
+ case -1:
1782
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1783
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1784
+ abandon_current_tag(parser);
1785
+ tokenizer->_reconsume_current_input = true;
1786
+ return NEXT_CHAR;
1787
+ case '>':
1788
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET);
1789
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1790
+ emit_current_tag(parser, output);
1791
+ return RETURN_ERROR;
1792
+ case '<':
1793
+ case '=':
1794
+ case '`':
1795
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1796
+ // Fall through.
1797
+ default:
1798
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1799
+ append_char_to_tag_buffer(parser, c, true);
1800
+ return NEXT_CHAR;
1801
+ }
1802
+ }
1803
+
1804
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1805
+ static StateResult handle_attr_value_double_quoted_state(
1806
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1807
+ int c, GumboToken* output) {
1808
+ switch (c) {
1809
+ case '"':
1810
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1811
+ return NEXT_CHAR;
1812
+ case '&':
1813
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1814
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1815
+ tokenizer->_reconsume_current_input = true;
1816
+ return NEXT_CHAR;
1817
+ case '\0':
1818
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1819
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1820
+ return NEXT_CHAR;
1821
+ case -1:
1822
+ add_parse_error(parser, GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF);
1823
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1824
+ abandon_current_tag(parser);
1825
+ tokenizer->_reconsume_current_input = true;
1826
+ return NEXT_CHAR;
1827
+ default:
1828
+ append_char_to_tag_buffer(parser, c, false);
1829
+ return NEXT_CHAR;
1830
+ }
1831
+ }
1832
+
1833
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1834
+ static StateResult handle_attr_value_single_quoted_state(
1835
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1836
+ int c, GumboToken* output) {
1837
+ switch (c) {
1838
+ case '\'':
1839
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1840
+ return NEXT_CHAR;
1841
+ case '&':
1842
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1843
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1844
+ tokenizer->_reconsume_current_input = true;
1845
+ return NEXT_CHAR;
1846
+ case '\0':
1847
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1848
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1849
+ return NEXT_CHAR;
1850
+ case -1:
1851
+ add_parse_error(parser, GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF);
1852
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1853
+ abandon_current_tag(parser);
1854
+ tokenizer->_reconsume_current_input = true;
1855
+ return NEXT_CHAR;
1856
+ default:
1857
+ append_char_to_tag_buffer(parser, c, false);
1858
+ return NEXT_CHAR;
1859
+ }
1860
+ }
1861
+
1862
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1863
+ static StateResult handle_attr_value_unquoted_state(
1864
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1865
+ int c, GumboToken* output) {
1866
+ switch (c) {
1867
+ case '\t':
1868
+ case '\n':
1869
+ case '\f':
1870
+ case ' ':
1871
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1872
+ finish_attribute_value(parser);
1873
+ return NEXT_CHAR;
1874
+ case '&':
1875
+ tokenizer->_tag_state._attr_value_state = tokenizer->_state;
1876
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE);
1877
+ tokenizer->_reconsume_current_input = true;
1878
+ return NEXT_CHAR;
1879
+ case '>':
1880
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1881
+ finish_attribute_value(parser);
1882
+ return emit_current_tag(parser, output);
1883
+ case '\0':
1884
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
1885
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1886
+ return NEXT_CHAR;
1887
+ case -1:
1888
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
1889
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1890
+ tokenizer->_reconsume_current_input = true;
1891
+ abandon_current_tag(parser);
1892
+ return NEXT_CHAR;
1893
+ case '<':
1894
+ case '=':
1895
+ case '"':
1896
+ case '\'':
1897
+ case '`':
1898
+ add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1899
+ // Fall through.
1900
+ default:
1901
+ append_char_to_tag_buffer(parser, c, true);
1902
+ return NEXT_CHAR;
1903
+ }
1904
+ }
1905
+
1906
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1907
+ static StateResult handle_char_ref_in_attr_value_state(
1908
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1909
+ int c, GumboToken* output) {
1910
+ OneOrTwoCodepoints char_ref;
1911
+ int allowed_char;
1912
+ bool is_unquoted = false;
1913
+ switch (tokenizer->_tag_state._attr_value_state) {
1914
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
1915
+ allowed_char = '"';
1916
+ break;
1917
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
1918
+ allowed_char = '\'';
1919
+ break;
1920
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
1921
+ allowed_char = '>';
1922
+ is_unquoted = true;
1923
+ break;
1924
+ default:
1925
+ // -Wmaybe-uninitialized is a little overzealous here, and doesn't
1926
+ // get that the assert(0) means this codepath will never happen.
1927
+ allowed_char = ' ';
1928
+ assert(0);
1929
+ }
1930
+
1931
+ // Ignore the status, since we don't have a convenient way of signalling that
1932
+ // a parser error has occurred when the error occurs in the middle of a
1933
+ // multi-state token. We'd need a flag inside the TokenizerState to do this,
1934
+ // but that's a low priority fix.
1935
+ consume_char_ref(parser, &tokenizer->_input, allowed_char, true, &char_ref);
1936
+ if (char_ref.first != kGumboNoChar) {
1937
+ tokenizer->_reconsume_current_input = true;
1938
+ append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
1939
+ if (char_ref.second != kGumboNoChar) {
1940
+ append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
1941
+ }
1942
+ } else {
1943
+ append_char_to_tag_buffer(parser, '&', is_unquoted);
1944
+ }
1945
+ gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
1946
+ return NEXT_CHAR;
1947
+ }
1948
+
1949
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1950
+ static StateResult handle_after_attr_value_quoted_state(
1951
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1952
+ int c, GumboToken* output) {
1953
+ finish_attribute_value(parser);
1954
+ switch (c) {
1955
+ case '\t':
1956
+ case '\n':
1957
+ case '\f':
1958
+ case ' ':
1959
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1960
+ return NEXT_CHAR;
1961
+ case '/':
1962
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1963
+ return NEXT_CHAR;
1964
+ case '>':
1965
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1966
+ return emit_current_tag(parser, output);
1967
+ case -1:
1968
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_EOF);
1969
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1970
+ abandon_current_tag(parser);
1971
+ tokenizer->_reconsume_current_input = true;
1972
+ return NEXT_CHAR;
1973
+ default:
1974
+ add_parse_error(parser, GUMBO_ERR_ATTR_AFTER_INVALID);
1975
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1976
+ tokenizer->_reconsume_current_input = true;
1977
+ return NEXT_CHAR;
1978
+ }
1979
+ }
1980
+
1981
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1982
+ static StateResult handle_self_closing_start_tag_state(
1983
+ GumboParser* parser, GumboTokenizerState* tokenizer,
1984
+ int c, GumboToken* output) {
1985
+ switch (c) {
1986
+ case '>':
1987
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1988
+ tokenizer->_tag_state._is_self_closing = true;
1989
+ return emit_current_tag(parser, output);
1990
+ case -1:
1991
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_EOF);
1992
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1993
+ abandon_current_tag(parser);
1994
+ return NEXT_CHAR;
1995
+ default:
1996
+ add_parse_error(parser, GUMBO_ERR_SOLIDUS_INVALID);
1997
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1998
+ tokenizer->_reconsume_current_input = true;
1999
+ return NEXT_CHAR;
2000
+ }
2001
+ }
2002
+
2003
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
2004
+ static StateResult handle_bogus_comment_state(
2005
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2006
+ int c, GumboToken* output) {
2007
+ while (c != '>' && c != -1) {
2008
+ if (c == '\0') {
2009
+ c = 0xFFFD;
2010
+ }
2011
+ append_char_to_temporary_buffer(parser, c);
2012
+ utf8iterator_next(&tokenizer->_input);
2013
+ c = utf8iterator_current(&tokenizer->_input);
2014
+ }
2015
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2016
+ return emit_comment(parser, output);
2017
+ }
2018
+
2019
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
2020
+ static StateResult handle_markup_declaration_state(
2021
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2022
+ int c, GumboToken* output) {
2023
+ if (utf8iterator_maybe_consume_match(
2024
+ &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2025
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2026
+ tokenizer->_reconsume_current_input = true;
2027
+ } else if (utf8iterator_maybe_consume_match(
2028
+ &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2029
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2030
+ tokenizer->_reconsume_current_input = true;
2031
+ // If we get here, we know we'll eventually emit a doctype token, so now is
2032
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
2033
+ // since then they'll leak if ownership never gets transferred to the
2034
+ // doctype token.)
2035
+ tokenizer->_doc_type_state.name = gumbo_copy_stringz(parser, "");
2036
+ tokenizer->_doc_type_state.public_identifier =
2037
+ gumbo_copy_stringz(parser, "");
2038
+ tokenizer->_doc_type_state.system_identifier =
2039
+ gumbo_copy_stringz(parser, "");
2040
+ } else if (tokenizer->_is_current_node_foreign &&
2041
+ utf8iterator_maybe_consume_match(
2042
+ &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2043
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2044
+ tokenizer->_reconsume_current_input = true;
2045
+ } else {
2046
+ add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
2047
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2048
+ tokenizer->_reconsume_current_input = true;
2049
+ clear_temporary_buffer(parser);
2050
+ }
2051
+ return NEXT_CHAR;
2052
+ }
2053
+
2054
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2055
+ static StateResult handle_comment_start_state(
2056
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2057
+ int c, GumboToken* output) {
2058
+ switch (c) {
2059
+ case '-':
2060
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2061
+ return NEXT_CHAR;
2062
+ case '\0':
2063
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2064
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2065
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2066
+ return NEXT_CHAR;
2067
+ case '>':
2068
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2069
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2070
+ emit_comment(parser, output);
2071
+ return RETURN_ERROR;
2072
+ case -1:
2073
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2074
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2075
+ emit_comment(parser, output);
2076
+ return RETURN_ERROR;
2077
+ default:
2078
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2079
+ append_char_to_temporary_buffer(parser, c);
2080
+ return NEXT_CHAR;
2081
+ }
2082
+ }
2083
+
2084
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2085
+ static StateResult handle_comment_start_dash_state(
2086
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2087
+ int c, GumboToken* output) {
2088
+ switch (c) {
2089
+ case '-':
2090
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2091
+ return NEXT_CHAR;
2092
+ case '\0':
2093
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2094
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2095
+ append_char_to_temporary_buffer(parser, '-');
2096
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2097
+ return NEXT_CHAR;
2098
+ case '>':
2099
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2100
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2101
+ emit_comment(parser, output);
2102
+ return RETURN_ERROR;
2103
+ case -1:
2104
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2105
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2106
+ emit_comment(parser, output);
2107
+ return RETURN_ERROR;
2108
+ default:
2109
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2110
+ append_char_to_temporary_buffer(parser, '-');
2111
+ append_char_to_temporary_buffer(parser, c);
2112
+ return NEXT_CHAR;
2113
+ }
2114
+ }
2115
+
2116
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2117
+ static StateResult handle_comment_state(
2118
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2119
+ int c, GumboToken* output) {
2120
+ switch (c) {
2121
+ case '-':
2122
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2123
+ return NEXT_CHAR;
2124
+ case '\0':
2125
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2126
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2127
+ return NEXT_CHAR;
2128
+ case -1:
2129
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2130
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2131
+ emit_comment(parser, output);
2132
+ return RETURN_ERROR;
2133
+ default:
2134
+ append_char_to_temporary_buffer(parser, c);
2135
+ return NEXT_CHAR;
2136
+ }
2137
+ }
2138
+
2139
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2140
+ static StateResult handle_comment_end_dash_state(
2141
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2142
+ int c, GumboToken* output) {
2143
+ switch (c) {
2144
+ case '-':
2145
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2146
+ return NEXT_CHAR;
2147
+ case '\0':
2148
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2149
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2150
+ append_char_to_temporary_buffer(parser, '-');
2151
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2152
+ return NEXT_CHAR;
2153
+ case -1:
2154
+ add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
2155
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2156
+ emit_comment(parser, output);
2157
+ return RETURN_ERROR;
2158
+ default:
2159
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2160
+ append_char_to_temporary_buffer(parser, '-');
2161
+ append_char_to_temporary_buffer(parser, c);
2162
+ return NEXT_CHAR;
2163
+ }
2164
+ }
2165
+
2166
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2167
+ static StateResult handle_comment_end_state(
2168
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2169
+ int c, GumboToken* output) {
2170
+ switch (c) {
2171
+ case '>':
2172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2173
+ return emit_comment(parser, output);
2174
+ case '\0':
2175
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2176
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2177
+ append_char_to_temporary_buffer(parser, '-');
2178
+ append_char_to_temporary_buffer(parser, '-');
2179
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2180
+ return NEXT_CHAR;
2181
+ case '!':
2182
+ add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2183
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2184
+ return NEXT_CHAR;
2185
+ case '-':
2186
+ add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2187
+ append_char_to_temporary_buffer(parser, '-');
2188
+ return NEXT_CHAR;
2189
+ case -1:
2190
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2191
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2192
+ emit_comment(parser, output);
2193
+ return RETURN_ERROR;
2194
+ default:
2195
+ add_parse_error(parser, GUMBO_ERR_COMMENT_INVALID);
2196
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2197
+ append_char_to_temporary_buffer(parser, '-');
2198
+ append_char_to_temporary_buffer(parser, '-');
2199
+ append_char_to_temporary_buffer(parser, c);
2200
+ return NEXT_CHAR;
2201
+ }
2202
+ }
2203
+
2204
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2205
+ static StateResult handle_comment_end_bang_state(
2206
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2207
+ int c, GumboToken* output) {
2208
+ switch (c) {
2209
+ case '-':
2210
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2211
+ append_char_to_temporary_buffer(parser, '-');
2212
+ append_char_to_temporary_buffer(parser, '-');
2213
+ append_char_to_temporary_buffer(parser, '!');
2214
+ return NEXT_CHAR;
2215
+ case '>':
2216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2217
+ return emit_comment(parser, output);
2218
+ case '\0':
2219
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2221
+ append_char_to_temporary_buffer(parser, '-');
2222
+ append_char_to_temporary_buffer(parser, '-');
2223
+ append_char_to_temporary_buffer(parser, '!');
2224
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2225
+ return NEXT_CHAR;
2226
+ case -1:
2227
+ add_parse_error(parser, GUMBO_ERR_COMMENT_END_BANG_EOF);
2228
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2229
+ emit_comment(parser, output);
2230
+ return RETURN_ERROR;
2231
+ default:
2232
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
2233
+ append_char_to_temporary_buffer(parser, '-');
2234
+ append_char_to_temporary_buffer(parser, '-');
2235
+ append_char_to_temporary_buffer(parser, '!');
2236
+ append_char_to_temporary_buffer(parser, c);
2237
+ return NEXT_CHAR;
2238
+ }
2239
+ }
2240
+
2241
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2242
+ static StateResult handle_doctype_state(
2243
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2244
+ int c, GumboToken* output) {
2245
+ assert(!tokenizer->_temporary_buffer.length);
2246
+ switch (c) {
2247
+ case '\t':
2248
+ case '\n':
2249
+ case '\f':
2250
+ case ' ':
2251
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2252
+ return NEXT_CHAR;
2253
+ case -1:
2254
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2255
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2256
+ tokenizer->_doc_type_state.force_quirks = true;
2257
+ emit_doctype(parser, output);
2258
+ return RETURN_ERROR;
2259
+ default:
2260
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE);
2261
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2262
+ tokenizer->_reconsume_current_input = true;
2263
+ tokenizer->_doc_type_state.force_quirks = true;
2264
+ return NEXT_CHAR;
2265
+ }
2266
+ }
2267
+
2268
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2269
+ static StateResult handle_before_doctype_name_state(
2270
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2271
+ int c, GumboToken* output) {
2272
+ switch (c) {
2273
+ case '\t':
2274
+ case '\n':
2275
+ case '\f':
2276
+ case ' ':
2277
+ return NEXT_CHAR;
2278
+ case '\0':
2279
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2280
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2281
+ tokenizer->_doc_type_state.force_quirks = true;
2282
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2283
+ return NEXT_CHAR;
2284
+ case '>':
2285
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2286
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2287
+ tokenizer->_doc_type_state.force_quirks = true;
2288
+ emit_doctype(parser, output);
2289
+ return RETURN_ERROR;
2290
+ case -1:
2291
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2292
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2293
+ tokenizer->_doc_type_state.force_quirks = true;
2294
+ emit_doctype(parser, output);
2295
+ return RETURN_ERROR;
2296
+ default:
2297
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2298
+ tokenizer->_doc_type_state.force_quirks = false;
2299
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2300
+ return NEXT_CHAR;
2301
+ }
2302
+ }
2303
+
2304
+ // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2305
+ static StateResult handle_doctype_name_state(
2306
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2307
+ int c, GumboToken* output) {
2308
+ switch (c) {
2309
+ case '\t':
2310
+ case '\n':
2311
+ case '\f':
2312
+ case ' ':
2313
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2314
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2315
+ finish_temporary_buffer(
2316
+ parser, &tokenizer->_doc_type_state.name);
2317
+ return NEXT_CHAR;
2318
+ case '>':
2319
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2320
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2321
+ finish_temporary_buffer(
2322
+ parser, &tokenizer->_doc_type_state.name);
2323
+ emit_doctype(parser, output);
2324
+ return RETURN_SUCCESS;
2325
+ case '\0':
2326
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2327
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2328
+ return NEXT_CHAR;
2329
+ case -1:
2330
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2331
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2332
+ tokenizer->_doc_type_state.force_quirks = true;
2333
+ gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2334
+ finish_temporary_buffer(
2335
+ parser, &tokenizer->_doc_type_state.name);
2336
+ emit_doctype(parser, output);
2337
+ return RETURN_ERROR;
2338
+ default:
2339
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2340
+ tokenizer->_doc_type_state.force_quirks = false;
2341
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2342
+ return NEXT_CHAR;
2343
+ }
2344
+ }
2345
+
2346
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2347
+ static StateResult handle_after_doctype_name_state(
2348
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2349
+ int c, GumboToken* output) {
2350
+ switch (c) {
2351
+ case '\t':
2352
+ case '\n':
2353
+ case '\f':
2354
+ case ' ':
2355
+ return NEXT_CHAR;
2356
+ case '>':
2357
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2358
+ emit_doctype(parser, output);
2359
+ return RETURN_SUCCESS;
2360
+ case -1:
2361
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2362
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2363
+ tokenizer->_doc_type_state.force_quirks = true;
2364
+ emit_doctype(parser, output);
2365
+ return RETURN_ERROR;
2366
+ default:
2367
+ if (utf8iterator_maybe_consume_match(
2368
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2369
+ gumbo_tokenizer_set_state(
2370
+ parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2371
+ tokenizer->_reconsume_current_input = true;
2372
+ } else if (utf8iterator_maybe_consume_match(
2373
+ &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
2374
+ gumbo_tokenizer_set_state(
2375
+ parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2376
+ tokenizer->_reconsume_current_input = true;
2377
+ } else {
2378
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2379
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2380
+ tokenizer->_doc_type_state.force_quirks = true;
2381
+ }
2382
+ return NEXT_CHAR;
2383
+ }
2384
+ }
2385
+
2386
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2387
+ static StateResult handle_after_doctype_public_keyword_state(
2388
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2389
+ int c, GumboToken* output) {
2390
+ switch (c) {
2391
+ case '\t':
2392
+ case '\n':
2393
+ case '\f':
2394
+ case ' ':
2395
+ gumbo_tokenizer_set_state(
2396
+ parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2397
+ return NEXT_CHAR;
2398
+ case '"':
2399
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2400
+ assert(temporary_buffer_equals(parser, ""));
2401
+ gumbo_tokenizer_set_state(
2402
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2403
+ return NEXT_CHAR;
2404
+ case '\'':
2405
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2406
+ assert(temporary_buffer_equals(parser, ""));
2407
+ gumbo_tokenizer_set_state(
2408
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2409
+ return NEXT_CHAR;
2410
+ case '>':
2411
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_RIGHT_BRACKET);
2412
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2413
+ tokenizer->_doc_type_state.force_quirks = true;
2414
+ emit_doctype(parser, output);
2415
+ return RETURN_ERROR;
2416
+ case -1:
2417
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2418
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2419
+ tokenizer->_doc_type_state.force_quirks = true;
2420
+ emit_doctype(parser, output);
2421
+ return RETURN_ERROR;
2422
+ default:
2423
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2424
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2425
+ tokenizer->_doc_type_state.force_quirks = true;
2426
+ emit_doctype(parser, output);
2427
+ return RETURN_ERROR;
2428
+ }
2429
+ }
2430
+
2431
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2432
+ static StateResult handle_before_doctype_public_id_state(
2433
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2434
+ int c, GumboToken* output) {
2435
+ switch (c) {
2436
+ case '\t':
2437
+ case '\n':
2438
+ case '\f':
2439
+ case ' ':
2440
+ return NEXT_CHAR;
2441
+ case '"':
2442
+ assert(temporary_buffer_equals(parser, ""));
2443
+ gumbo_tokenizer_set_state(
2444
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2445
+ return NEXT_CHAR;
2446
+ case '\'':
2447
+ assert(temporary_buffer_equals(parser, ""));
2448
+ gumbo_tokenizer_set_state(
2449
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2450
+ return NEXT_CHAR;
2451
+ case '>':
2452
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2453
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2454
+ tokenizer->_doc_type_state.force_quirks = true;
2455
+ emit_doctype(parser, output);
2456
+ return RETURN_ERROR;
2457
+ case -1:
2458
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2459
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2460
+ tokenizer->_doc_type_state.force_quirks = true;
2461
+ emit_doctype(parser, output);
2462
+ return RETURN_ERROR;
2463
+ default:
2464
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2465
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2466
+ tokenizer->_doc_type_state.force_quirks = true;
2467
+ emit_doctype(parser, output);
2468
+ return RETURN_ERROR;
2469
+ }
2470
+ }
2471
+
2472
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2473
+ static StateResult handle_doctype_public_id_double_quoted_state(
2474
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2475
+ int c, GumboToken* output) {
2476
+ switch (c) {
2477
+ case '"':
2478
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2479
+ finish_doctype_public_id(parser);
2480
+ return NEXT_CHAR;
2481
+ case '\0':
2482
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2483
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2484
+ return NEXT_CHAR;
2485
+ case '>':
2486
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2487
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2488
+ tokenizer->_doc_type_state.force_quirks = true;
2489
+ finish_doctype_public_id(parser);
2490
+ emit_doctype(parser, output);
2491
+ return RETURN_ERROR;
2492
+ case -1:
2493
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2494
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2495
+ tokenizer->_doc_type_state.force_quirks = true;
2496
+ finish_doctype_public_id(parser);
2497
+ emit_doctype(parser, output);
2498
+ return RETURN_ERROR;
2499
+ default:
2500
+ append_char_to_temporary_buffer(parser, c);
2501
+ return NEXT_CHAR;
2502
+ }
2503
+ }
2504
+
2505
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2506
+ static StateResult handle_doctype_public_id_single_quoted_state(
2507
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2508
+ int c, GumboToken* output) {
2509
+ switch (c) {
2510
+ case '\'':
2511
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2512
+ finish_doctype_public_id(parser);
2513
+ return NEXT_CHAR;
2514
+ case '\0':
2515
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2516
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2517
+ return NEXT_CHAR;
2518
+ case '>':
2519
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2520
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2521
+ tokenizer->_doc_type_state.force_quirks = true;
2522
+ finish_doctype_public_id(parser);
2523
+ emit_doctype(parser, output);
2524
+ return RETURN_ERROR;
2525
+ case -1:
2526
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2527
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2528
+ tokenizer->_doc_type_state.force_quirks = true;
2529
+ finish_doctype_public_id(parser);
2530
+ emit_doctype(parser, output);
2531
+ return RETURN_ERROR;
2532
+ default:
2533
+ append_char_to_temporary_buffer(parser, c);
2534
+ return NEXT_CHAR;
2535
+ }
2536
+ }
2537
+
2538
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2539
+ static StateResult handle_after_doctype_public_id_state(
2540
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2541
+ int c, GumboToken* output) {
2542
+ switch (c) {
2543
+ case '\t':
2544
+ case '\n':
2545
+ case '\f':
2546
+ case ' ':
2547
+ gumbo_tokenizer_set_state(
2548
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2549
+ return NEXT_CHAR;
2550
+ case '>':
2551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2552
+ emit_doctype(parser, output);
2553
+ return RETURN_SUCCESS;
2554
+ case '"':
2555
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2556
+ assert(temporary_buffer_equals(parser, ""));
2557
+ gumbo_tokenizer_set_state(
2558
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2559
+ return NEXT_CHAR;
2560
+ case '\'':
2561
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2562
+ assert(temporary_buffer_equals(parser, ""));
2563
+ gumbo_tokenizer_set_state(
2564
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2565
+ return NEXT_CHAR;
2566
+ case -1:
2567
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2568
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2569
+ tokenizer->_reconsume_current_input = true;
2570
+ tokenizer->_doc_type_state.force_quirks = true;
2571
+ return NEXT_CHAR;
2572
+ default:
2573
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2574
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2575
+ tokenizer->_doc_type_state.force_quirks = true;
2576
+ return NEXT_CHAR;
2577
+ }
2578
+ }
2579
+
2580
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2581
+ static StateResult handle_between_doctype_public_system_id_state(
2582
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2583
+ int c, GumboToken* output) {
2584
+ switch (c) {
2585
+ case '\t':
2586
+ case '\n':
2587
+ case '\f':
2588
+ case ' ':
2589
+ return NEXT_CHAR;
2590
+ case '>':
2591
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2592
+ emit_doctype(parser, output);
2593
+ return RETURN_SUCCESS;
2594
+ case '"':
2595
+ assert(temporary_buffer_equals(parser, ""));
2596
+ gumbo_tokenizer_set_state(
2597
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2598
+ return NEXT_CHAR;
2599
+ case '\'':
2600
+ assert(temporary_buffer_equals(parser, ""));
2601
+ gumbo_tokenizer_set_state(
2602
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2603
+ return NEXT_CHAR;
2604
+ case -1:
2605
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2606
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2607
+ tokenizer->_doc_type_state.force_quirks = true;
2608
+ emit_doctype(parser, output);
2609
+ return RETURN_ERROR;
2610
+ default:
2611
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2612
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2613
+ tokenizer->_doc_type_state.force_quirks = true;
2614
+ emit_doctype(parser, output);
2615
+ return RETURN_ERROR;
2616
+ }
2617
+ }
2618
+
2619
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2620
+ static StateResult handle_after_doctype_system_keyword_state(
2621
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2622
+ int c, GumboToken* output) {
2623
+ switch (c) {
2624
+ case '\t':
2625
+ case '\n':
2626
+ case '\f':
2627
+ case ' ':
2628
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2629
+ return NEXT_CHAR;
2630
+ case '"':
2631
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2632
+ assert(temporary_buffer_equals(parser, ""));
2633
+ gumbo_tokenizer_set_state(
2634
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2635
+ return NEXT_CHAR;
2636
+ case '\'':
2637
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2638
+ assert(temporary_buffer_equals(parser, ""));
2639
+ gumbo_tokenizer_set_state(
2640
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2641
+ return NEXT_CHAR;
2642
+ case '>':
2643
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2644
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2645
+ tokenizer->_doc_type_state.force_quirks = true;
2646
+ emit_doctype(parser, output);
2647
+ return RETURN_ERROR;
2648
+ case -1:
2649
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2650
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2651
+ tokenizer->_doc_type_state.force_quirks = true;
2652
+ emit_doctype(parser, output);
2653
+ return RETURN_ERROR;
2654
+ default:
2655
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2656
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2657
+ tokenizer->_doc_type_state.force_quirks = true;
2658
+ return NEXT_CHAR;
2659
+ }
2660
+ }
2661
+
2662
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2663
+ static StateResult handle_before_doctype_system_id_state(
2664
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2665
+ int c, GumboToken* output) {
2666
+ switch (c) {
2667
+ case '\t':
2668
+ case '\n':
2669
+ case '\f':
2670
+ case ' ':
2671
+ return NEXT_CHAR;
2672
+ case '"':
2673
+ assert(temporary_buffer_equals(parser, ""));
2674
+ gumbo_tokenizer_set_state(
2675
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2676
+ return NEXT_CHAR;
2677
+ case '\'':
2678
+ assert(temporary_buffer_equals(parser, ""));
2679
+ gumbo_tokenizer_set_state(
2680
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2681
+ return NEXT_CHAR;
2682
+ case '>':
2683
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2684
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2685
+ tokenizer->_doc_type_state.force_quirks = true;
2686
+ emit_doctype(parser, output);
2687
+ return RETURN_ERROR;
2688
+ case -1:
2689
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2690
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2691
+ tokenizer->_doc_type_state.force_quirks = true;
2692
+ emit_doctype(parser, output);
2693
+ return RETURN_ERROR;
2694
+ default:
2695
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2697
+ tokenizer->_doc_type_state.force_quirks = true;
2698
+ return NEXT_CHAR;
2699
+ }
2700
+ }
2701
+
2702
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2703
+ static StateResult handle_doctype_system_id_double_quoted_state(
2704
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2705
+ int c, GumboToken* output) {
2706
+ switch (c) {
2707
+ case '"':
2708
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2709
+ finish_doctype_system_id(parser);
2710
+ return NEXT_CHAR;
2711
+ case '\0':
2712
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2713
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2714
+ return NEXT_CHAR;
2715
+ case '>':
2716
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2717
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2718
+ tokenizer->_doc_type_state.force_quirks = true;
2719
+ finish_doctype_system_id(parser);
2720
+ emit_doctype(parser, output);
2721
+ return RETURN_ERROR;
2722
+ case -1:
2723
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2724
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2725
+ tokenizer->_doc_type_state.force_quirks = true;
2726
+ finish_doctype_system_id(parser);
2727
+ emit_doctype(parser, output);
2728
+ return RETURN_ERROR;
2729
+ default:
2730
+ append_char_to_temporary_buffer(parser, c);
2731
+ return NEXT_CHAR;
2732
+ }
2733
+ }
2734
+
2735
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2736
+ static StateResult handle_doctype_system_id_single_quoted_state(
2737
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2738
+ int c, GumboToken* output) {
2739
+ switch (c) {
2740
+ case '\'':
2741
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2742
+ finish_doctype_system_id(parser);
2743
+ return NEXT_CHAR;
2744
+ case '\0':
2745
+ add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
2746
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2747
+ return NEXT_CHAR;
2748
+ case '>':
2749
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_END);
2750
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2751
+ tokenizer->_doc_type_state.force_quirks = true;
2752
+ finish_doctype_system_id(parser);
2753
+ emit_doctype(parser, output);
2754
+ return RETURN_ERROR;
2755
+ case -1:
2756
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2757
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2758
+ tokenizer->_doc_type_state.force_quirks = true;
2759
+ finish_doctype_system_id(parser);
2760
+ emit_doctype(parser, output);
2761
+ return RETURN_ERROR;
2762
+ default:
2763
+ append_char_to_temporary_buffer(parser, c);
2764
+ return NEXT_CHAR;
2765
+ }
2766
+ }
2767
+
2768
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2769
+ static StateResult handle_after_doctype_system_id_state(
2770
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2771
+ int c, GumboToken* output) {
2772
+ switch (c) {
2773
+ case '\t':
2774
+ case '\n':
2775
+ case '\f':
2776
+ case ' ':
2777
+ return NEXT_CHAR;
2778
+ case '>':
2779
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2780
+ emit_doctype(parser, output);
2781
+ return RETURN_SUCCESS;
2782
+ case -1:
2783
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
2784
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2785
+ tokenizer->_doc_type_state.force_quirks = true;
2786
+ emit_doctype(parser, output);
2787
+ return RETURN_ERROR;
2788
+ default:
2789
+ add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2790
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2791
+ return NEXT_CHAR;
2792
+ }
2793
+ }
2794
+
2795
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2796
+ static StateResult handle_bogus_doctype_state(
2797
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2798
+ int c, GumboToken* output) {
2799
+ if (c == '>' || c == -1) {
2800
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2801
+ emit_doctype(parser, output);
2802
+ return RETURN_ERROR;
2803
+ }
2804
+ return NEXT_CHAR;
2805
+ }
2806
+
2807
+ // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2808
+ static StateResult handle_cdata_state(
2809
+ GumboParser* parser, GumboTokenizerState* tokenizer,
2810
+ int c, GumboToken* output) {
2811
+ if (c == -1 || utf8iterator_maybe_consume_match(
2812
+ &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2813
+ tokenizer->_reconsume_current_input = true;
2814
+ reset_token_start_point(tokenizer);
2815
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2816
+ return NEXT_CHAR;
2817
+ } else {
2818
+ return emit_current_char(parser, output);
2819
+ }
2820
+ }
2821
+
2822
+ typedef StateResult (*GumboLexerStateFunction)(
2823
+ GumboParser*, GumboTokenizerState*, int, GumboToken*);
2824
+
2825
+ static GumboLexerStateFunction dispatch_table[] = {
2826
+ handle_data_state,
2827
+ handle_char_ref_in_data_state,
2828
+ handle_rcdata_state,
2829
+ handle_char_ref_in_rcdata_state,
2830
+ handle_rawtext_state,
2831
+ handle_script_state,
2832
+ handle_plaintext_state,
2833
+ handle_tag_open_state,
2834
+ handle_end_tag_open_state,
2835
+ handle_tag_name_state,
2836
+ handle_rcdata_lt_state,
2837
+ handle_rcdata_end_tag_open_state,
2838
+ handle_rcdata_end_tag_name_state,
2839
+ handle_rawtext_lt_state,
2840
+ handle_rawtext_end_tag_open_state,
2841
+ handle_rawtext_end_tag_name_state,
2842
+ handle_script_lt_state,
2843
+ handle_script_end_tag_open_state,
2844
+ handle_script_end_tag_name_state,
2845
+ handle_script_escaped_start_state,
2846
+ handle_script_escaped_start_dash_state,
2847
+ handle_script_escaped_state,
2848
+ handle_script_escaped_dash_state,
2849
+ handle_script_escaped_dash_dash_state,
2850
+ handle_script_escaped_lt_state,
2851
+ handle_script_escaped_end_tag_open_state,
2852
+ handle_script_escaped_end_tag_name_state,
2853
+ handle_script_double_escaped_start_state,
2854
+ handle_script_double_escaped_state,
2855
+ handle_script_double_escaped_dash_state,
2856
+ handle_script_double_escaped_dash_dash_state,
2857
+ handle_script_double_escaped_lt_state,
2858
+ handle_script_double_escaped_end_state,
2859
+ handle_before_attr_name_state,
2860
+ handle_attr_name_state,
2861
+ handle_after_attr_name_state,
2862
+ handle_before_attr_value_state,
2863
+ handle_attr_value_double_quoted_state,
2864
+ handle_attr_value_single_quoted_state,
2865
+ handle_attr_value_unquoted_state,
2866
+ handle_char_ref_in_attr_value_state,
2867
+ handle_after_attr_value_quoted_state,
2868
+ handle_self_closing_start_tag_state,
2869
+ handle_bogus_comment_state,
2870
+ handle_markup_declaration_state,
2871
+ handle_comment_start_state,
2872
+ handle_comment_start_dash_state,
2873
+ handle_comment_state,
2874
+ handle_comment_end_dash_state,
2875
+ handle_comment_end_state,
2876
+ handle_comment_end_bang_state,
2877
+ handle_doctype_state,
2878
+ handle_before_doctype_name_state,
2879
+ handle_doctype_name_state,
2880
+ handle_after_doctype_name_state,
2881
+ handle_after_doctype_public_keyword_state,
2882
+ handle_before_doctype_public_id_state,
2883
+ handle_doctype_public_id_double_quoted_state,
2884
+ handle_doctype_public_id_single_quoted_state,
2885
+ handle_after_doctype_public_id_state,
2886
+ handle_between_doctype_public_system_id_state,
2887
+ handle_after_doctype_system_keyword_state,
2888
+ handle_before_doctype_system_id_state,
2889
+ handle_doctype_system_id_double_quoted_state,
2890
+ handle_doctype_system_id_single_quoted_state,
2891
+ handle_after_doctype_system_id_state,
2892
+ handle_bogus_doctype_state,
2893
+ handle_cdata_state
2894
+ };
2895
+
2896
+ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2897
+ // Because of the spec requirements that...
2898
+ //
2899
+ // 1. Tokens be handled immediately by the parser upon emission.
2900
+ // 2. Some states (eg. CDATA, or various error conditions) require the
2901
+ // emission of multiple tokens in the same states.
2902
+ // 3. The tokenizer often has to reconsume the same character in a different
2903
+ // state.
2904
+ //
2905
+ // ...all state must be held in the GumboTokenizer struct instead of in local
2906
+ // variables in this function. That allows us to return from this method with
2907
+ // a token, and then immediately jump back to the same state with the same
2908
+ // input if we need to return a different token. The various emit_* functions
2909
+ // are responsible for changing state (eg. flushing the chardata buffer,
2910
+ // reading the next input character) to avoid an infinite loop.
2911
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
2912
+
2913
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
2914
+ tokenizer->_reconsume_current_input = true;
2915
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
2916
+ // And now that we've avoided advancing the input, make sure we set
2917
+ // _reconsume_current_input back to false to make sure the *next* character
2918
+ // isn't consumed twice.
2919
+ tokenizer->_reconsume_current_input = false;
2920
+ tokenizer->_buffered_emit_char = kGumboNoChar;
2921
+ return true;
2922
+ }
2923
+
2924
+ if (maybe_emit_from_temporary_buffer(parser, output)) {
2925
+ return true;
2926
+ }
2927
+
2928
+ while (1) {
2929
+ assert(!tokenizer->_temporary_buffer_emit);
2930
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2931
+ int c = utf8iterator_current(&tokenizer->_input);
2932
+ gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2933
+ StateResult result =
2934
+ dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2935
+ // We need to clear reconsume_current_input before returning to prevent
2936
+ // certain infinite loop states.
2937
+ bool should_advance = !tokenizer->_reconsume_current_input;
2938
+ tokenizer->_reconsume_current_input = false;
2939
+
2940
+ if (result == RETURN_SUCCESS) {
2941
+ return true;
2942
+ } else if(result == RETURN_ERROR) {
2943
+ return false;
2944
+ }
2945
+
2946
+ if (should_advance) {
2947
+ utf8iterator_next(&tokenizer->_input);
2948
+ }
2949
+ }
2950
+ }
2951
+
2952
+ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2953
+ if (!token) return;
2954
+
2955
+ switch (token->type) {
2956
+ case GUMBO_TOKEN_DOCTYPE:
2957
+ gumbo_parser_deallocate(parser, (void*) token->v.doc_type.name);
2958
+ gumbo_parser_deallocate(
2959
+ parser, (void*) token->v.doc_type.public_identifier);
2960
+ gumbo_parser_deallocate(
2961
+ parser, (void*) token->v.doc_type.system_identifier);
2962
+ return;
2963
+ case GUMBO_TOKEN_START_TAG:
2964
+ for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2965
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2966
+ if (attr) {
2967
+ // May have been nulled out if this token was merged with another.
2968
+ gumbo_destroy_attribute(parser, attr);
2969
+ }
2970
+ }
2971
+ gumbo_parser_deallocate(
2972
+ parser, (void*) token->v.start_tag.attributes.data);
2973
+ return;
2974
+ case GUMBO_TOKEN_COMMENT:
2975
+ gumbo_parser_deallocate(parser, (void*) token->v.text);
2976
+ return;
2977
+ default:
2978
+ return;
2979
+ }
2980
+ }