nokogumbo 2.0.0.pre.alpha → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
@@ -0,0 +1,79 @@
|
|
1
|
+
/*
|
2
|
+
Copyright 2018 Stephen Checkoway
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include <assert.h>
|
18
|
+
|
19
|
+
#include "ascii.h"
|
20
|
+
#include "token_buffer.h"
|
21
|
+
#include "tokenizer.h"
|
22
|
+
#include "util.h"
|
23
|
+
|
24
|
+
struct GumboInternalCharacterToken {
|
25
|
+
GumboSourcePosition position;
|
26
|
+
GumboStringPiece original_text;
|
27
|
+
int c;
|
28
|
+
};
|
29
|
+
|
30
|
+
// Puts `buffer` into the empty state: no storage, zero length, zero capacity.
void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
  *buffer = (GumboCharacterTokenBuffer) {
    .data = NULL,
    .length = 0,
    .capacity = 0,
  };
}
|
35
|
+
|
36
|
+
void gumbo_character_token_buffer_append (
|
37
|
+
const GumboToken* token,
|
38
|
+
GumboCharacterTokenBuffer* buffer
|
39
|
+
) {
|
40
|
+
assert(token->type == GUMBO_TOKEN_WHITESPACE
|
41
|
+
|| token->type == GUMBO_TOKEN_CHARACTER);
|
42
|
+
if (buffer->length == buffer->capacity) {
|
43
|
+
if (buffer->capacity == 0)
|
44
|
+
buffer->capacity = 10;
|
45
|
+
else
|
46
|
+
buffer->capacity *= 2;
|
47
|
+
size_t bytes = sizeof(*buffer->data) * buffer->capacity;
|
48
|
+
buffer->data = gumbo_realloc(buffer->data, bytes);
|
49
|
+
}
|
50
|
+
size_t index = buffer->length++;
|
51
|
+
buffer->data[index].position = token->position;
|
52
|
+
buffer->data[index].original_text = token->original_text;
|
53
|
+
buffer->data[index].c = token->v.character;
|
54
|
+
}
|
55
|
+
|
56
|
+
void gumbo_character_token_buffer_get (
|
57
|
+
const GumboCharacterTokenBuffer* buffer,
|
58
|
+
size_t index,
|
59
|
+
struct GumboInternalToken* output
|
60
|
+
) {
|
61
|
+
assert(index < buffer->length);
|
62
|
+
int c = buffer->data[index].c;
|
63
|
+
output->type = gumbo_ascii_isspace(c)?
|
64
|
+
GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
|
65
|
+
output->position = buffer->data[index].position;
|
66
|
+
output->original_text = buffer->data[index].original_text;
|
67
|
+
output->v.character = c;
|
68
|
+
}
|
69
|
+
|
70
|
+
// Empties `buffer` by resetting its length to zero. The data pointer and the
// capacity are left as they are, so the existing storage is reused by later
// appends.
void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
  buffer->length = 0;
}
|
73
|
+
|
74
|
+
// Frees the storage held by `buffer` and returns the buffer to the empty
// state (NULL data, zero length, zero capacity).
void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
  gumbo_free(buffer->data);
  // Re-initializing resets data, length and capacity in one step.
  gumbo_character_token_buffer_init(buffer);
}
|
@@ -0,0 +1,71 @@
|
|
1
|
+
/*
|
2
|
+
Copyright 2018 Stephen Checkoway
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#ifndef GUMBO_TOKEN_BUFFER_H
|
18
|
+
#define GUMBO_TOKEN_BUFFER_H
|
19
|
+
|
20
|
+
#include <stdbool.h>
|
21
|
+
#include <stddef.h>
|
22
|
+
|
23
|
+
#include "gumbo.h"
|
24
|
+
|
25
|
+
#ifdef __cplusplus
|
26
|
+
extern "C" {
|
27
|
+
#endif
|
28
|
+
|
29
|
+
struct GumboInternalCharacterToken;
|
30
|
+
struct GumboInternalToken;
|
31
|
+
|
32
|
+
// A struct representing a growable sequence of character (and whitespace)
|
33
|
+
// tokens.
|
34
|
+
typedef struct {
|
35
|
+
// A pointer to the start of the sequence.
|
36
|
+
struct GumboInternalCharacterToken* data;
|
37
|
+
|
38
|
+
// The length of the sequence.
|
39
|
+
size_t length;
|
40
|
+
|
41
|
+
// The capacity of the buffer.
|
42
|
+
size_t capacity;
|
43
|
+
} GumboCharacterTokenBuffer;
|
44
|
+
|
45
|
+
// Initializes a new GumboCharacterTokenBuffer.
|
46
|
+
void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
|
47
|
+
|
48
|
+
// Appends a character (or whitespace) token.
|
49
|
+
void gumbo_character_token_buffer_append (
|
50
|
+
const struct GumboInternalToken* token,
|
51
|
+
GumboCharacterTokenBuffer* buffer
|
52
|
+
);
|
53
|
+
|
54
|
+
// Fills `output` with the token stored at position `index`, which must be
// less than buffer->length. The resulting token type is either
// GUMBO_TOKEN_WHITESPACE or GUMBO_TOKEN_CHARACTER.
void gumbo_character_token_buffer_get (
|
55
|
+
const GumboCharacterTokenBuffer* buffer,
|
56
|
+
size_t index,
|
57
|
+
struct GumboInternalToken* output
|
58
|
+
);
|
59
|
+
|
60
|
+
// Reinitialize this character token buffer. This clears it by setting length=0. It
|
61
|
+
// does not zero out the buffer itself.
|
62
|
+
void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
|
63
|
+
|
64
|
+
// Deallocates this GumboCharacterTokenBuffer.
|
65
|
+
void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
|
66
|
+
|
67
|
+
#ifdef __cplusplus
|
68
|
+
}
|
69
|
+
#endif
|
70
|
+
|
71
|
+
#endif // GUMBO_TOKEN_BUFFER_H
|
@@ -1,5 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
Copyright 2010 Google Inc.
|
3
|
+
Copyright 2017-2018 Craig Barnes
|
4
|
+
Copyright 2018 Stephen Checkoway
|
3
5
|
|
4
6
|
Licensed under the Apache License, Version 2.0 (the "License");
|
5
7
|
you may not use this file except in compliance with the License.
|
@@ -60,15 +62,18 @@
|
|
60
62
|
#include "util.h"
|
61
63
|
#include "vector.h"
|
62
64
|
|
63
|
-
// Compared against
|
65
|
+
// Compared against _temporary_buffer to determine if we're in
|
64
66
|
// double-escaped script mode.
|
65
67
|
static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
|
66
68
|
|
67
|
-
// An enum for the return value of each individual state.
|
69
|
+
// An enum for the return value of each individual state. Each of the emit_*
|
70
|
+
// functions should return EMIT_TOKEN and should be called as
|
71
|
+
// return emit_foo(parser, ..., output);
|
72
|
+
// Each of the handle_*_state functions that do not return emit_* should
|
73
|
+
// instead return CONTINUE to indicate to gumbo_lex to continue lexing.
|
68
74
|
typedef enum {
|
69
|
-
|
70
|
-
|
71
|
-
NEXT_CHAR // Proceed to the next character and continue lexing.
|
75
|
+
EMIT_TOKEN,
|
76
|
+
CONTINUE,
|
72
77
|
} StateResult;
|
73
78
|
|
74
79
|
// This is a struct containing state necessary to build up a tag token,
|
@@ -103,12 +108,6 @@ typedef struct GumboInternalTagState {
|
|
103
108
|
// the attribute value, but shouldn't overwrite the existing value.
|
104
109
|
bool _drop_next_attr_value;
|
105
110
|
|
106
|
-
// The state that caused the tokenizer to switch into a character reference in
|
107
|
-
// attribute value state. This is used to set the additional allowed
|
108
|
-
// character, and is switched back to on completion. Initialized as the
|
109
|
-
// tokenizer enters the character reference state.
|
110
|
-
GumboTokenizerEnum _attr_value_state;
|
111
|
-
|
112
111
|
// The last start tag to have been emitted by the tokenizer. This is
|
113
112
|
// necessary to check for appropriate end tags.
|
114
113
|
GumboTag _last_start_tag;
|
@@ -133,15 +132,19 @@ typedef struct GumboInternalTokenizerState {
|
|
133
132
|
// "Reconsume the current input character in..."
|
134
133
|
bool _reconsume_current_input;
|
135
134
|
|
136
|
-
// A flag indicating whether the current node is a foreign element.
|
137
|
-
// set by
|
138
|
-
// markup declaration state.
|
139
|
-
bool
|
135
|
+
// A flag indicating whether the adjusted current node is a foreign element.
|
136
|
+
// This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
|
137
|
+
// checked in the markup declaration state.
|
138
|
+
bool _is_adjusted_current_node_foreign;
|
140
139
|
|
141
140
|
// A flag indicating whether the tokenizer is in a CDATA section. If so, then
|
142
141
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
143
142
|
bool _is_in_cdata;
|
144
143
|
|
144
|
+
// A flag indicating whether the tokenizer has seen a parse error since the
|
145
|
+
// last token was emitted.
|
146
|
+
bool _parse_error;
|
147
|
+
|
145
148
|
// Certain states (notably character references) may emit two character tokens
|
146
149
|
// at once, but the contract for lex() fills in only one token at a time. The
|
147
150
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -159,27 +162,24 @@ typedef struct GumboInternalTokenizerState {
|
|
159
162
|
|
160
163
|
// A temporary buffer to accumulate characters, as described by the "temporary
|
161
164
|
// buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
|
162
|
-
// way:
|
163
|
-
//
|
164
|
-
//
|
165
|
-
//
|
166
|
-
//
|
167
|
-
//
|
168
|
-
// input stream, so that tokens emitted by emit_char have the correct position
|
169
|
-
// and original text.
|
165
|
+
// way: In situations where the spec calls for inserting characters into the
|
166
|
+
// temporary buffer that exactly match the input in order to emit them as
|
167
|
+
// character tokens, we don't actually do it.
|
168
|
+
// Instead, we mark the input and reset the input to it using set_mark() and
|
169
|
+
// emit_from_mark(). We do use the temporary buffer for other uses such as
|
170
|
+
// DOCTYPEs, comments, and detecting escaped <script> tags.
|
170
171
|
GumboStringBuffer _temporary_buffer;
|
171
172
|
|
172
|
-
// The
|
173
|
-
//
|
174
|
-
const char*
|
173
|
+
// The position to resume normal operation after we start emitting from the
|
174
|
+
// mark. NULL whenever we're not emitting from the mark.
|
175
|
+
const char* _resume_pos;
|
175
176
|
|
176
|
-
// The
|
177
|
-
//
|
178
|
-
|
179
|
-
|
180
|
-
//
|
181
|
-
|
182
|
-
GumboStringBuffer _script_data_buffer;
|
177
|
+
// The character reference state uses a return state to return to the state
|
178
|
+
// it was invoked from.
|
179
|
+
GumboTokenizerEnum _return_state;
|
180
|
+
|
181
|
+
// Numeric character reference.
|
182
|
+
uint32_t _character_reference_code;
|
183
183
|
|
184
184
|
// Pointer to the beginning of the current token in the original buffer; used
|
185
185
|
// to record the original text.
|
@@ -201,123 +201,69 @@ typedef struct GumboInternalTokenizerState {
|
|
201
201
|
Utf8Iterator _input;
|
202
202
|
} GumboTokenizerState;
|
203
203
|
|
204
|
-
// Adds
|
204
|
+
// Adds a parse error to the parser's error struct.
|
205
205
|
static void tokenizer_add_parse_error (
|
206
206
|
GumboParser* parser,
|
207
207
|
GumboErrorType type
|
208
208
|
) {
|
209
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
210
|
+
tokenizer->_parse_error = true;
|
209
211
|
GumboError* error = gumbo_add_error(parser);
|
210
212
|
if (!error) {
|
211
213
|
return;
|
212
214
|
}
|
215
|
+
const Utf8Iterator* input = &tokenizer->_input;
|
216
|
+
utf8iterator_get_position(input, &error->position);
|
217
|
+
error->original_text.data = utf8iterator_get_char_pointer(input);
|
218
|
+
error->original_text.length = utf8iterator_get_width(input);
|
219
|
+
error->type = type;
|
220
|
+
error->v.tokenizer.state = tokenizer->_state;
|
221
|
+
error->v.tokenizer.codepoint = utf8iterator_current(input);
|
222
|
+
}
|
223
|
+
|
224
|
+
// Adds an error pointing at the start of the character reference.
|
225
|
+
static void tokenizer_add_char_ref_error (
|
226
|
+
struct GumboInternalParser* parser,
|
227
|
+
GumboErrorType type,
|
228
|
+
int codepoint
|
229
|
+
) {
|
213
230
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
214
|
-
|
215
|
-
error
|
231
|
+
tokenizer->_parse_error = true;
|
232
|
+
GumboError* error = gumbo_add_error(parser);
|
233
|
+
if (!error)
|
234
|
+
return;
|
235
|
+
Utf8Iterator* input = &tokenizer->_input;
|
216
236
|
error->type = type;
|
217
|
-
error->
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
case GUMBO_LEX_SCRIPT_LT:
|
244
|
-
case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
|
245
|
-
case GUMBO_LEX_SCRIPT_END_TAG_NAME:
|
246
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_START:
|
247
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
|
248
|
-
case GUMBO_LEX_SCRIPT_ESCAPED:
|
249
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
|
250
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
|
251
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_LT:
|
252
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
|
253
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
|
254
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
|
255
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
|
256
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
|
257
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
|
258
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
|
259
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
|
260
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
|
261
|
-
break;
|
262
|
-
case GUMBO_LEX_TAG_OPEN:
|
263
|
-
case GUMBO_LEX_END_TAG_OPEN:
|
264
|
-
case GUMBO_LEX_TAG_NAME:
|
265
|
-
case GUMBO_LEX_BEFORE_ATTR_NAME:
|
266
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
|
267
|
-
break;
|
268
|
-
case GUMBO_LEX_SELF_CLOSING_START_TAG:
|
269
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
|
270
|
-
break;
|
271
|
-
case GUMBO_LEX_ATTR_NAME:
|
272
|
-
case GUMBO_LEX_AFTER_ATTR_NAME:
|
273
|
-
case GUMBO_LEX_BEFORE_ATTR_VALUE:
|
274
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
|
275
|
-
break;
|
276
|
-
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
277
|
-
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
278
|
-
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
279
|
-
case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
|
280
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
|
281
|
-
break;
|
282
|
-
case GUMBO_LEX_BOGUS_COMMENT:
|
283
|
-
case GUMBO_LEX_COMMENT_START:
|
284
|
-
case GUMBO_LEX_COMMENT_START_DASH:
|
285
|
-
case GUMBO_LEX_COMMENT:
|
286
|
-
case GUMBO_LEX_COMMENT_END_DASH:
|
287
|
-
case GUMBO_LEX_COMMENT_END:
|
288
|
-
case GUMBO_LEX_COMMENT_END_BANG:
|
289
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
|
290
|
-
break;
|
291
|
-
case GUMBO_LEX_MARKUP_DECLARATION:
|
292
|
-
case GUMBO_LEX_DOCTYPE:
|
293
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
|
294
|
-
case GUMBO_LEX_DOCTYPE_NAME:
|
295
|
-
case GUMBO_LEX_AFTER_DOCTYPE_NAME:
|
296
|
-
case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
|
297
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
|
298
|
-
case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
|
299
|
-
case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
|
300
|
-
case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
|
301
|
-
case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
|
302
|
-
case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
|
303
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
|
304
|
-
case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
|
305
|
-
case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
|
306
|
-
case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
|
307
|
-
case GUMBO_LEX_BOGUS_DOCTYPE:
|
308
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
|
309
|
-
break;
|
310
|
-
case GUMBO_LEX_CDATA:
|
311
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
|
312
|
-
break;
|
313
|
-
}
|
237
|
+
error->position = utf8iterator_get_mark_position(input);
|
238
|
+
const char* mark = utf8iterator_get_mark_pointer(input);
|
239
|
+
error->original_text.data = mark;
|
240
|
+
error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
|
241
|
+
error->v.tokenizer.state = tokenizer->_state;
|
242
|
+
error->v.tokenizer.codepoint = codepoint;
|
243
|
+
}
|
244
|
+
|
245
|
+
// Adds an error pointing at the start of the token.
|
246
|
+
static void tokenizer_add_token_parse_error (
|
247
|
+
GumboParser* parser,
|
248
|
+
GumboErrorType type
|
249
|
+
) {
|
250
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
251
|
+
tokenizer->_parse_error = true;
|
252
|
+
GumboError* error = gumbo_add_error(parser);
|
253
|
+
if (!error)
|
254
|
+
return;
|
255
|
+
Utf8Iterator* input = &tokenizer->_input;
|
256
|
+
error->type = type;
|
257
|
+
error->position = tokenizer->_token_start_pos;
|
258
|
+
error->original_text.data = tokenizer->_token_start;
|
259
|
+
error->original_text.length =
|
260
|
+
utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
|
261
|
+
error->v.tokenizer.state = tokenizer->_state;
|
262
|
+
error->v.tokenizer.codepoint = 0;
|
314
263
|
}
|
315
264
|
|
316
265
|
static bool is_alpha(int c) {
|
317
|
-
|
318
|
-
// on the current locale, whereas the behavior in the HTML5 spec is
|
319
|
-
// locale-independent.
|
320
|
-
return ((unsigned) c | 32) - 'a' < 26;
|
266
|
+
return gumbo_ascii_isalpha(c);
|
321
267
|
}
|
322
268
|
|
323
269
|
static int ensure_lowercase(int c) {
|
@@ -347,24 +293,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
|
|
347
293
|
}
|
348
294
|
|
349
295
|
// Starts recording characters in the temporary buffer.
|
350
|
-
// Because this needs to reset the utf8iterator_mark to the beginning of the
|
351
|
-
// text that will eventually be emitted, it needs to be called a couple of
|
352
|
-
// states before the spec says "Set the temporary buffer to the empty string".
|
353
|
-
// In general, this should be called whenever there's a transition to a
|
354
|
-
// "less-than sign state". The initial < and possibly / then need to be
|
355
|
-
// appended to the temporary buffer, their presence needs to be accounted for in
|
356
|
-
// states that compare the temporary buffer against a literal value, and
|
357
|
-
// spec stanzas that say "emit a < and / character token along with a character
|
358
|
-
// token for each character in the temporary buffer" need to be adjusted to
|
359
|
-
// account for the presence of the < and / inside the temporary buffer.
|
360
296
|
static void clear_temporary_buffer(GumboParser* parser) {
|
361
297
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
362
|
-
assert(!tokenizer->_temporary_buffer_emit);
|
363
|
-
utf8iterator_mark(&tokenizer->_input);
|
364
298
|
gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
|
365
|
-
// The temporary buffer and script data buffer are the same object in the
|
366
|
-
// spec, so the script data buffer should be cleared as well.
|
367
|
-
gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
|
368
299
|
}
|
369
300
|
|
370
301
|
// Appends a codepoint to the temporary buffer.
|
@@ -378,25 +309,20 @@ static void append_char_to_temporary_buffer (
|
|
378
309
|
);
|
379
310
|
}
|
380
311
|
|
381
|
-
|
382
|
-
|
383
|
-
const
|
384
|
-
const char* text,
|
385
|
-
size_t text_len
|
312
|
+
static void append_string_to_temporary_buffer (
|
313
|
+
GumboParser* parser,
|
314
|
+
const GumboStringPiece* str
|
386
315
|
) {
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
316
|
+
gumbo_string_buffer_append_string (
|
317
|
+
str,
|
318
|
+
&parser->_tokenizer_state->_temporary_buffer
|
319
|
+
);
|
391
320
|
}
|
392
321
|
|
393
|
-
#define temporary_buffer_equals(parser, text) \
|
394
|
-
temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
|
395
322
|
|
396
323
|
static bool temporary_buffer_is_empty(const GumboParser* parser) {
|
397
324
|
return parser->_tokenizer_state->_temporary_buffer.length == 0;
|
398
325
|
}
|
399
|
-
#endif
|
400
326
|
|
401
327
|
static void doc_type_state_init(GumboParser* parser) {
|
402
328
|
GumboTokenDocType* doc_type_state =
|
@@ -493,56 +419,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
|
|
493
419
|
}
|
494
420
|
|
495
421
|
// Writes a single specified character to the output token.
|
496
|
-
static
|
422
|
+
static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
  // The token type depends on the character and on whether the tokenizer is
  // currently inside a CDATA section.
  const bool in_cdata = parser->_tokenizer_state->_is_in_cdata;
  output->type = get_char_token_type(in_cdata, c);
  output->v.character = c;
  finish_token(parser, output);
  return EMIT_TOKEN;
}
|
501
428
|
|
502
429
|
// Writes a replacement character token and records a parse error.
|
503
|
-
// Always returns
|
430
|
+
// Always returns EMIT_TOKEN, per gumbo_lex return value.
|
504
431
|
static StateResult emit_replacement_char(
|
505
432
|
GumboParser* parser, GumboToken* output) {
|
506
433
|
// In all cases, this is because of a null byte in the input stream.
|
507
|
-
tokenizer_add_parse_error(parser,
|
434
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
508
435
|
emit_char(parser, kUtf8ReplacementChar, output);
|
509
|
-
return
|
436
|
+
return EMIT_TOKEN;
|
510
437
|
}
|
511
438
|
|
512
|
-
// Writes an EOF character token. Always returns
|
439
|
+
// Writes an EOF character token. Always returns EMIT_TOKEN.
|
513
440
|
static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
|
514
|
-
emit_char(parser, -1, output);
|
515
|
-
return RETURN_SUCCESS;
|
516
|
-
}
|
517
|
-
|
518
|
-
// Writes the current input character out as a character token.
|
519
|
-
// Always returns RETURN_SUCCESS.
|
520
|
-
static bool emit_current_char(GumboParser* parser, GumboToken* output) {
|
521
|
-
emit_char(
|
522
|
-
parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
|
523
|
-
return RETURN_SUCCESS;
|
441
|
+
return emit_char(parser, -1, output);
|
524
442
|
}
|
525
443
|
|
526
444
|
// Writes out a doctype token, copying it from the tokenizer state.
|
527
|
-
static
|
445
|
+
static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  output->type = GUMBO_TOKEN_DOCTYPE;
  // Hand the accumulated doctype state to the token, finish the token, and
  // then re-initialize the tokenizer's doctype state.
  output->v.doc_type = tokenizer->_doc_type_state;
  finish_token(parser, output);
  doc_type_state_init(parser);
  return EMIT_TOKEN;
}
|
533
452
|
|
534
453
|
// Debug-only function that explicitly sets the attribute vector data to NULL so
|
535
454
|
// it can be asserted on tag creation, verifying that there are no memory leaks.
|
536
455
|
static void mark_tag_state_as_empty(GumboTagState* tag_state) {
|
537
456
|
UNUSED_IF_NDEBUG(tag_state);
|
538
|
-
#ifndef NDEBUG
|
539
457
|
tag_state->_name = NULL;
|
458
|
+
#ifndef NDEBUG
|
540
459
|
tag_state->_attributes = kGumboEmptyVector;
|
541
460
|
#endif
|
542
461
|
}
|
543
462
|
|
544
463
|
// Writes out the current tag as a start or end tag token.
|
545
|
-
// Always returns
|
464
|
+
// Always returns EMIT_TOKEN.
|
546
465
|
static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
547
466
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
548
467
|
if (tag_state->_is_start_tag) {
|
@@ -559,7 +478,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
559
478
|
output->type = GUMBO_TOKEN_END_TAG;
|
560
479
|
output->v.end_tag.tag = tag_state->_tag;
|
561
480
|
output->v.end_tag.name = tag_state->_name;
|
562
|
-
|
481
|
+
if (tag_state->_is_self_closing)
|
482
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
|
483
|
+
if (tag_state->_attributes.length > 0)
|
484
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
|
563
485
|
// In end tags, ownership of the attributes vector is not transferred to the
|
564
486
|
// token, but it's still initialized as normal, so it must be manually
|
565
487
|
// deallocated. There may also be attributes to destroy, in certain broken
|
@@ -582,7 +504,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
582
504
|
assert(output->original_text.length >= 2);
|
583
505
|
assert(output->original_text.data[0] == '<');
|
584
506
|
assert(output->original_text.data[output->original_text.length - 1] == '>');
|
585
|
-
return
|
507
|
+
return EMIT_TOKEN;
|
586
508
|
}
|
587
509
|
|
588
510
|
// In some states, we speculatively start a tag, but don't know whether it'll be
|
@@ -600,90 +522,59 @@ static void abandon_current_tag(GumboParser* parser) {
|
|
600
522
|
gumbo_debug("Abandoning current tag.\n");
|
601
523
|
}
|
602
524
|
|
603
|
-
// Wraps the gumbo_consume_char_ref function to handle its output and make the
|
604
|
-
// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
|
605
|
-
// error occurred, RETURN_SUCCESS otherwise.
|
606
|
-
static StateResult emit_char_ref (
|
607
|
-
GumboParser* parser,
|
608
|
-
int additional_allowed_char,
|
609
|
-
bool UNUSED_ARG(is_in_attribute),
|
610
|
-
GumboToken* output
|
611
|
-
) {
|
612
|
-
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
613
|
-
OneOrTwoCodepoints char_ref;
|
614
|
-
bool status = gumbo_consume_char_ref (
|
615
|
-
parser,
|
616
|
-
&tokenizer->_input,
|
617
|
-
additional_allowed_char,
|
618
|
-
false,
|
619
|
-
&char_ref
|
620
|
-
);
|
621
|
-
if (char_ref.first != kGumboNoChar) {
|
622
|
-
// gumbo_consume_char_ref ends with the iterator pointing at the next
|
623
|
-
// character, so we need to be sure not advance it again before
|
624
|
-
// reading the next token.
|
625
|
-
tokenizer->_reconsume_current_input = true;
|
626
|
-
emit_char(parser, char_ref.first, output);
|
627
|
-
tokenizer->_buffered_emit_char = char_ref.second;
|
628
|
-
} else {
|
629
|
-
emit_char(parser, '&', output);
|
630
|
-
}
|
631
|
-
return status ? RETURN_SUCCESS : RETURN_ERROR;
|
632
|
-
}
|
633
|
-
|
634
525
|
// Emits a comment token. Comments use the temporary buffer to accumulate their
|
635
526
|
// data, and then it's copied over and released to the 'text' field of the
|
636
|
-
// GumboToken union. Always returns
|
527
|
+
// GumboToken union. Always returns EMIT_TOKEN.
|
637
528
|
static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
|
638
529
|
output->type = GUMBO_TOKEN_COMMENT;
|
639
530
|
finish_temporary_buffer(parser, &output->v.text);
|
640
531
|
finish_token(parser, output);
|
641
|
-
return
|
532
|
+
return EMIT_TOKEN;
|
642
533
|
}
|
643
534
|
|
644
|
-
|
645
|
-
// buffer, and fills the output token with the next output character if so.
|
646
|
-
// Returns true if a character has been emitted and the tokenizer should
|
647
|
-
// immediately return, false if we're at the end of the temporary buffer and
|
648
|
-
// should resume normal operation.
|
649
|
-
static bool maybe_emit_from_temporary_buffer(
|
650
|
-
GumboParser* parser, GumboToken* output) {
|
535
|
+
// Marks the current position of the tokenizer's input iterator.
static void set_mark(GumboParser* parser) {
  utf8iterator_mark(&parser->_tokenizer_state->_input);
}
|
654
539
|
|
655
|
-
|
656
|
-
|
657
|
-
|
540
|
+
// Checks to see whether we should be emitting characters from the mark, and fills the
|
541
|
+
// output token with the next output character if so.
|
542
|
+
// Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
|
543
|
+
// immediately return, CONTINUE if we should resume normal operation.
|
544
|
+
static StateResult maybe_emit_from_mark (
|
545
|
+
GumboParser* parser,
|
546
|
+
GumboToken* output
|
547
|
+
) {
|
548
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
549
|
+
const char* pos = tokenizer->_resume_pos;
|
550
|
+
|
551
|
+
if (!pos)
|
552
|
+
return CONTINUE;
|
553
|
+
if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
|
554
|
+
tokenizer->_resume_pos = NULL;
|
555
|
+
return CONTINUE;
|
658
556
|
}
|
659
557
|
|
660
|
-
|
661
|
-
//
|
662
|
-
//
|
663
|
-
|
664
|
-
|
665
|
-
// have already been advanced past. However, it should be preserved so that
|
666
|
-
// when the *next* character is encountered again, the tokenizer knows not to
|
667
|
-
// advance past it.
|
668
|
-
bool saved_reconsume_state = tokenizer->_reconsume_current_input;
|
669
|
-
tokenizer->_reconsume_current_input = false;
|
670
|
-
emit_char(parser, *c, output);
|
671
|
-
++tokenizer->_temporary_buffer_emit;
|
672
|
-
tokenizer->_reconsume_current_input = saved_reconsume_state;
|
673
|
-
return true;
|
558
|
+
// emit_char advances the input stream. _reconsume_current_input should
|
559
|
+
// *never* be set when emitting from the mark since those characters have
|
560
|
+
// already been advanced past.
|
561
|
+
assert(!tokenizer->_reconsume_current_input);
|
562
|
+
return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
|
674
563
|
}
|
675
564
|
|
676
|
-
// Sets up the tokenizer to begin
|
677
|
-
// This resets the input iterator stream to
|
678
|
-
//
|
679
|
-
//
|
680
|
-
|
681
|
-
static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
565
|
+
// Sets up the tokenizer to begin emitting from the mark up to, but not
|
566
|
+
// including, the current code point. This resets the input iterator stream to
|
567
|
+
// the mark, sets up _resume_pos, and then emits the first character in it.
|
568
|
+
// Returns EMIT_TOKEN.
|
569
|
+
static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  Utf8Iterator* input = &tokenizer->_input;

  // Remember where normal lexing resumes, then rewind the input to the mark.
  tokenizer->_resume_pos = utf8iterator_get_char_pointer(input);
  utf8iterator_reset(input);

  // The rewound input has to be advanced through again, so nothing may be
  // flagged for reconsumption.
  tokenizer->_reconsume_current_input = false;

  const StateResult emitted = maybe_emit_from_mark(parser, output);
  assert(emitted == EMIT_TOKEN);
  return emitted;
}
|
688
579
|
|
689
580
|
// Appends a codepoint to the current tag buffer. If
|
@@ -703,6 +594,19 @@ static void append_char_to_tag_buffer (
|
|
703
594
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
704
595
|
}
|
705
596
|
|
597
|
+
// Like above but append a string.
|
598
|
+
static void append_string_to_tag_buffer (
|
599
|
+
GumboParser* parser,
|
600
|
+
GumboStringPiece* str,
|
601
|
+
bool reinitilize_position_on_first
|
602
|
+
) {
|
603
|
+
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
604
|
+
if (buffer->length == 0 && reinitilize_position_on_first) {
|
605
|
+
reset_tag_buffer_start_point(parser);
|
606
|
+
}
|
607
|
+
gumbo_string_buffer_append_string(str, buffer);
|
608
|
+
}
|
609
|
+
|
706
610
|
// (Re-)initialize the tag buffer. This also resets the original_text pointer
|
707
611
|
// and _start_pos field to point to the current position.
|
708
612
|
static void initialize_tag_buffer(GumboParser* parser) {
|
@@ -713,6 +617,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
|
|
713
617
|
reset_tag_buffer_start_point(parser);
|
714
618
|
}
|
715
619
|
|
620
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
|
621
|
+
static bool character_reference_part_of_attribute(GumboParser* parser) {
|
622
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
623
|
+
switch (tokenizer->_return_state) {
|
624
|
+
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
625
|
+
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
626
|
+
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
627
|
+
return true;
|
628
|
+
default:
|
629
|
+
return false;
|
630
|
+
}
|
631
|
+
}
|
632
|
+
|
633
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
|
634
|
+
// For each code point in the temporary buffer, add to the current attribute
|
635
|
+
// value if the character reference was consumed as part of an attribute or
|
636
|
+
// emit the code point as a character token.
|
637
|
+
static StateResult flush_code_points_consumed_as_character_reference (
|
638
|
+
GumboParser* parser,
|
639
|
+
GumboToken* output
|
640
|
+
) {
|
641
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
642
|
+
if (character_reference_part_of_attribute(parser)) {
|
643
|
+
const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
|
644
|
+
assert(start);
|
645
|
+
GumboStringPiece str = {
|
646
|
+
.data = start,
|
647
|
+
.length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
|
648
|
+
};
|
649
|
+
bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
650
|
+
append_string_to_tag_buffer(parser, &str, unquoted);
|
651
|
+
return CONTINUE;
|
652
|
+
}
|
653
|
+
return emit_from_mark(parser, output);
|
654
|
+
}
|
655
|
+
|
656
|
+
// After a character reference has been successfully constructed, the standard
|
657
|
+
// says to set the temporary buffer equal to the empty string, append the code
|
658
|
+
// point(s) associated with the reference and flush code points consumed as a
|
659
|
+
// character reference.
|
660
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
661
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
662
|
+
// That doesn't work for us because we use the temporary buffer in lock step
|
663
|
+
// with the input for position and that would fail if we inserted a different
|
664
|
+
// number of code points. So duplicate a bit of the above logic.
|
665
|
+
static StateResult flush_char_ref (
|
666
|
+
GumboParser* parser,
|
667
|
+
int first,
|
668
|
+
int second,
|
669
|
+
GumboToken* output
|
670
|
+
) {
|
671
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
672
|
+
if (character_reference_part_of_attribute(parser)) {
|
673
|
+
bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
674
|
+
append_char_to_tag_buffer(parser, first, unquoted);
|
675
|
+
if (second != kGumboNoChar)
|
676
|
+
append_char_to_tag_buffer(parser, second, unquoted);
|
677
|
+
return CONTINUE;
|
678
|
+
}
|
679
|
+
tokenizer->_buffered_emit_char = second;
|
680
|
+
return emit_char(parser, first, output);
|
681
|
+
}
|
682
|
+
|
683
|
+
|
716
684
|
// Initializes the tag_state to start a new tag, keeping track of the opening
|
717
685
|
// positions and original text. Takes a boolean indicating whether this is a
|
718
686
|
// start or end tag.
|
@@ -725,7 +693,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
725
693
|
assert(is_alpha(c));
|
726
694
|
|
727
695
|
initialize_tag_buffer(parser);
|
728
|
-
gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
|
729
696
|
|
730
697
|
assert(tag_state->_name == NULL);
|
731
698
|
assert(tag_state->_attributes.data == NULL);
|
@@ -801,23 +768,20 @@ static void finish_tag_name(GumboParser* parser) {
|
|
801
768
|
}
|
802
769
|
|
803
770
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
804
|
-
static void add_duplicate_attr_error
|
805
|
-
|
806
|
-
|
807
|
-
int new_index
|
808
|
-
) {
|
771
|
+
static void add_duplicate_attr_error(GumboParser* parser) {
|
772
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
773
|
+
tokenizer->_parse_error = true;
|
809
774
|
GumboError* error = gumbo_add_error(parser);
|
810
775
|
if (!error) {
|
811
776
|
return;
|
812
777
|
}
|
813
778
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
814
|
-
error->type =
|
779
|
+
error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
|
815
780
|
error->position = tag_state->_start_pos;
|
816
|
-
error->original_text = tag_state->_original_text;
|
817
|
-
error->
|
818
|
-
|
819
|
-
|
820
|
-
reinitialize_tag_buffer(parser);
|
781
|
+
error->original_text.data = tag_state->_original_text;
|
782
|
+
error->original_text.length =
|
783
|
+
utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
|
784
|
+
error->v.tokenizer.state = tokenizer->_state;
|
821
785
|
}
|
822
786
|
|
823
787
|
// Creates a new attribute in the current tag, copying the current tag buffer to
|
@@ -846,7 +810,8 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
846
810
|
)
|
847
811
|
) {
|
848
812
|
// Identical attribute; bail.
|
849
|
-
add_duplicate_attr_error(parser
|
813
|
+
add_duplicate_attr_error(parser);
|
814
|
+
reinitialize_tag_buffer(parser);
|
850
815
|
tag_state->_drop_next_attr_value = true;
|
851
816
|
return false;
|
852
817
|
}
|
@@ -911,19 +876,21 @@ void gumbo_tokenizer_state_init (
|
|
911
876
|
GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
|
912
877
|
parser->_tokenizer_state = tokenizer;
|
913
878
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
879
|
+
tokenizer->_return_state = GUMBO_LEX_DATA;
|
880
|
+
tokenizer->_character_reference_code = 0;
|
914
881
|
tokenizer->_reconsume_current_input = false;
|
915
|
-
tokenizer->
|
882
|
+
tokenizer->_is_adjusted_current_node_foreign = false;
|
916
883
|
tokenizer->_is_in_cdata = false;
|
884
|
+
tokenizer->_parse_error = false;
|
917
885
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
918
886
|
tokenizer->_tag_state._name = NULL;
|
919
887
|
|
920
888
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
921
889
|
gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
|
922
|
-
tokenizer->
|
890
|
+
tokenizer->_resume_pos = NULL;
|
923
891
|
|
924
892
|
mark_tag_state_as_empty(&tokenizer->_tag_state);
|
925
893
|
|
926
|
-
gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
|
927
894
|
tokenizer->_token_start = text;
|
928
895
|
utf8iterator_init(parser, text, text_length, &tokenizer->_input);
|
929
896
|
utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
|
@@ -936,7 +903,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
|
|
936
903
|
assert(tokenizer->_doc_type_state.public_identifier == NULL);
|
937
904
|
assert(tokenizer->_doc_type_state.system_identifier == NULL);
|
938
905
|
gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
|
939
|
-
gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
|
940
906
|
assert(tokenizer->_tag_state._name == NULL);
|
941
907
|
assert(tokenizer->_tag_state._attributes.data == NULL);
|
942
908
|
gumbo_free(tokenizer);
|
@@ -946,17 +912,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
|
|
946
912
|
parser->_tokenizer_state->_state = state;
|
947
913
|
}
|
948
914
|
|
949
|
-
void
|
915
|
+
void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
950
916
|
GumboParser* parser,
|
951
917
|
bool is_foreign
|
952
918
|
) {
|
953
|
-
if (is_foreign != parser->_tokenizer_state->
|
919
|
+
if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
|
954
920
|
gumbo_debug (
|
955
921
|
"Toggling is_current_node_foreign to %s.\n",
|
956
922
|
is_foreign ? "true" : "false"
|
957
923
|
);
|
958
924
|
}
|
959
|
-
parser->_tokenizer_state->
|
925
|
+
parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
|
926
|
+
}
|
927
|
+
|
928
|
+
static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
|
929
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
930
|
+
tokenizer->_reconsume_current_input = true;
|
931
|
+
tokenizer->_state = state;
|
960
932
|
}
|
961
933
|
|
962
934
|
// https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
@@ -968,37 +940,24 @@ static StateResult handle_data_state (
|
|
968
940
|
) {
|
969
941
|
switch (c) {
|
970
942
|
case '&':
|
971
|
-
gumbo_tokenizer_set_state(parser,
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
tokenizer->_reconsume_current_input = true;
|
976
|
-
return NEXT_CHAR;
|
943
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
944
|
+
set_mark(parser);
|
945
|
+
tokenizer->_return_state = GUMBO_LEX_DATA;
|
946
|
+
return CONTINUE;
|
977
947
|
case '<':
|
978
948
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
|
979
|
-
|
980
|
-
|
981
|
-
return NEXT_CHAR;
|
949
|
+
set_mark(parser);
|
950
|
+
return CONTINUE;
|
982
951
|
case '\0':
|
983
|
-
tokenizer_add_parse_error(parser,
|
984
|
-
emit_char(parser, c, output);
|
985
|
-
|
952
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
953
|
+
return emit_char(parser, c, output);
|
954
|
+
case -1:
|
955
|
+
return emit_eof(parser, output);
|
986
956
|
default:
|
987
|
-
return
|
957
|
+
return emit_char(parser, c, output);
|
988
958
|
}
|
989
959
|
}
|
990
960
|
|
991
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
|
992
|
-
static StateResult handle_char_ref_in_data_state (
|
993
|
-
GumboParser* parser,
|
994
|
-
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
995
|
-
int UNUSED_ARG(c),
|
996
|
-
GumboToken* output
|
997
|
-
) {
|
998
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
999
|
-
return emit_char_ref(parser, ' ', false, output);
|
1000
|
-
}
|
1001
|
-
|
1002
961
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
1003
962
|
static StateResult handle_rcdata_state (
|
1004
963
|
GumboParser* parser,
|
@@ -1008,34 +967,23 @@ static StateResult handle_rcdata_state (
|
|
1008
967
|
) {
|
1009
968
|
switch (c) {
|
1010
969
|
case '&':
|
1011
|
-
gumbo_tokenizer_set_state(parser,
|
1012
|
-
|
1013
|
-
|
970
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
971
|
+
set_mark(parser);
|
972
|
+
tokenizer->_return_state = GUMBO_LEX_RCDATA;
|
973
|
+
return CONTINUE;
|
1014
974
|
case '<':
|
1015
975
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
|
1016
|
-
|
1017
|
-
|
1018
|
-
return NEXT_CHAR;
|
976
|
+
set_mark(parser);
|
977
|
+
return CONTINUE;
|
1019
978
|
case '\0':
|
1020
979
|
return emit_replacement_char(parser, output);
|
1021
980
|
case -1:
|
1022
981
|
return emit_eof(parser, output);
|
1023
982
|
default:
|
1024
|
-
return
|
983
|
+
return emit_char(parser, c, output);
|
1025
984
|
}
|
1026
985
|
}
|
1027
986
|
|
1028
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
|
1029
|
-
static StateResult handle_char_ref_in_rcdata_state (
|
1030
|
-
GumboParser* parser,
|
1031
|
-
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1032
|
-
int UNUSED_ARG(c),
|
1033
|
-
GumboToken* output
|
1034
|
-
) {
|
1035
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1036
|
-
return emit_char_ref(parser, ' ', false, output);
|
1037
|
-
}
|
1038
|
-
|
1039
987
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
1040
988
|
static StateResult handle_rawtext_state (
|
1041
989
|
GumboParser* parser,
|
@@ -1046,20 +994,19 @@ static StateResult handle_rawtext_state (
|
|
1046
994
|
switch (c) {
|
1047
995
|
case '<':
|
1048
996
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
|
1049
|
-
|
1050
|
-
|
1051
|
-
return NEXT_CHAR;
|
997
|
+
set_mark(parser);
|
998
|
+
return CONTINUE;
|
1052
999
|
case '\0':
|
1053
1000
|
return emit_replacement_char(parser, output);
|
1054
1001
|
case -1:
|
1055
1002
|
return emit_eof(parser, output);
|
1056
1003
|
default:
|
1057
|
-
return
|
1004
|
+
return emit_char(parser, c, output);
|
1058
1005
|
}
|
1059
1006
|
}
|
1060
1007
|
|
1061
1008
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
1062
|
-
static StateResult
|
1009
|
+
static StateResult handle_script_data_state (
|
1063
1010
|
GumboParser* parser,
|
1064
1011
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1065
1012
|
int c,
|
@@ -1067,16 +1014,15 @@ static StateResult handle_script_state (
|
|
1067
1014
|
) {
|
1068
1015
|
switch (c) {
|
1069
1016
|
case '<':
|
1070
|
-
gumbo_tokenizer_set_state(parser,
|
1071
|
-
|
1072
|
-
|
1073
|
-
return NEXT_CHAR;
|
1017
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
|
1018
|
+
set_mark(parser);
|
1019
|
+
return CONTINUE;
|
1074
1020
|
case '\0':
|
1075
1021
|
return emit_replacement_char(parser, output);
|
1076
1022
|
case -1:
|
1077
1023
|
return emit_eof(parser, output);
|
1078
1024
|
default:
|
1079
|
-
return
|
1025
|
+
return emit_char(parser, c, output);
|
1080
1026
|
}
|
1081
1027
|
}
|
1082
1028
|
|
@@ -1093,75 +1039,75 @@ static StateResult handle_plaintext_state (
|
|
1093
1039
|
case -1:
|
1094
1040
|
return emit_eof(parser, output);
|
1095
1041
|
default:
|
1096
|
-
return
|
1042
|
+
return emit_char(parser, c, output);
|
1097
1043
|
}
|
1098
1044
|
}
|
1099
1045
|
|
1100
1046
|
// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
1101
1047
|
static StateResult handle_tag_open_state (
|
1102
1048
|
GumboParser* parser,
|
1103
|
-
GumboTokenizerState*
|
1049
|
+
GumboTokenizerState* tokenizer,
|
1104
1050
|
int c,
|
1105
1051
|
GumboToken* output
|
1106
1052
|
) {
|
1107
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1108
1053
|
switch (c) {
|
1109
1054
|
case '!':
|
1110
|
-
gumbo_tokenizer_set_state(parser,
|
1055
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
|
1111
1056
|
clear_temporary_buffer(parser);
|
1112
|
-
return
|
1057
|
+
return CONTINUE;
|
1113
1058
|
case '/':
|
1114
1059
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
|
1115
|
-
|
1116
|
-
return NEXT_CHAR;
|
1060
|
+
return CONTINUE;
|
1117
1061
|
case '?':
|
1118
|
-
|
1062
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
|
1119
1063
|
clear_temporary_buffer(parser);
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1064
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
1065
|
+
return CONTINUE;
|
1066
|
+
case -1:
|
1067
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
|
1068
|
+
// Switch to data to emit EOF.
|
1069
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1070
|
+
return emit_from_mark(parser, output);
|
1123
1071
|
default:
|
1124
1072
|
if (is_alpha(c)) {
|
1125
|
-
|
1073
|
+
reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
|
1126
1074
|
start_new_tag(parser, true);
|
1127
|
-
return
|
1128
|
-
} else {
|
1129
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
|
1130
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1131
|
-
emit_temporary_buffer(parser, output);
|
1132
|
-
return RETURN_ERROR;
|
1075
|
+
return CONTINUE;
|
1133
1076
|
}
|
1077
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
|
1078
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1079
|
+
return emit_from_mark(parser, output);
|
1134
1080
|
}
|
1135
1081
|
}
|
1136
1082
|
|
1137
1083
|
// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
1138
1084
|
static StateResult handle_end_tag_open_state (
|
1139
1085
|
GumboParser* parser,
|
1140
|
-
GumboTokenizerState*
|
1086
|
+
GumboTokenizerState* tokenizer,
|
1141
1087
|
int c,
|
1142
1088
|
GumboToken* output
|
1143
1089
|
) {
|
1144
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1145
1090
|
switch (c) {
|
1146
1091
|
case '>':
|
1147
|
-
tokenizer_add_parse_error(parser,
|
1092
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
|
1148
1093
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1149
|
-
return
|
1094
|
+
return CONTINUE;
|
1150
1095
|
case -1:
|
1151
|
-
tokenizer_add_parse_error(parser,
|
1152
|
-
|
1153
|
-
|
1096
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
|
1097
|
+
// Similar to the tag open state except we need to emit '<' and '/'
|
1098
|
+
// before the EOF.
|
1099
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1100
|
+
return emit_from_mark(parser, output);
|
1154
1101
|
default:
|
1155
1102
|
if (is_alpha(c)) {
|
1156
|
-
|
1103
|
+
reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
|
1157
1104
|
start_new_tag(parser, false);
|
1158
1105
|
} else {
|
1159
|
-
tokenizer_add_parse_error(parser,
|
1160
|
-
|
1106
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
|
1107
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
1161
1108
|
clear_temporary_buffer(parser);
|
1162
|
-
append_char_to_temporary_buffer(parser, c);
|
1163
1109
|
}
|
1164
|
-
return
|
1110
|
+
return CONTINUE;
|
1165
1111
|
}
|
1166
1112
|
}
|
1167
1113
|
|
@@ -1179,27 +1125,26 @@ static StateResult handle_tag_name_state (
|
|
1179
1125
|
case ' ':
|
1180
1126
|
finish_tag_name(parser);
|
1181
1127
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1182
|
-
return
|
1128
|
+
return CONTINUE;
|
1183
1129
|
case '/':
|
1184
1130
|
finish_tag_name(parser);
|
1185
1131
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1186
|
-
return
|
1132
|
+
return CONTINUE;
|
1187
1133
|
case '>':
|
1188
1134
|
finish_tag_name(parser);
|
1189
1135
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1190
1136
|
return emit_current_tag(parser, output);
|
1191
1137
|
case '\0':
|
1192
|
-
tokenizer_add_parse_error(parser,
|
1138
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
1193
1139
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1194
|
-
return
|
1140
|
+
return CONTINUE;
|
1195
1141
|
case -1:
|
1196
|
-
tokenizer_add_parse_error(parser,
|
1142
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
1197
1143
|
abandon_current_tag(parser);
|
1198
|
-
|
1199
|
-
return NEXT_CHAR;
|
1144
|
+
return emit_eof(parser, output);
|
1200
1145
|
default:
|
1201
1146
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1202
|
-
return
|
1147
|
+
return CONTINUE;
|
1203
1148
|
}
|
1204
1149
|
}
|
1205
1150
|
|
@@ -1210,36 +1155,29 @@ static StateResult handle_rcdata_lt_state (
|
|
1210
1155
|
int c,
|
1211
1156
|
GumboToken* output
|
1212
1157
|
) {
|
1213
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1214
1158
|
if (c == '/') {
|
1215
1159
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
|
1216
|
-
|
1217
|
-
return NEXT_CHAR;
|
1160
|
+
return CONTINUE;
|
1218
1161
|
} else {
|
1219
|
-
|
1220
|
-
|
1221
|
-
return emit_temporary_buffer(parser, output);
|
1162
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1163
|
+
return emit_from_mark(parser, output);
|
1222
1164
|
}
|
1223
1165
|
}
|
1224
1166
|
|
1225
1167
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
1226
1168
|
static StateResult handle_rcdata_end_tag_open_state (
|
1227
1169
|
GumboParser* parser,
|
1228
|
-
GumboTokenizerState*
|
1170
|
+
GumboTokenizerState* tokenizer,
|
1229
1171
|
int c,
|
1230
1172
|
GumboToken* output
|
1231
1173
|
) {
|
1232
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1233
1174
|
if (is_alpha(c)) {
|
1234
|
-
|
1175
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
|
1235
1176
|
start_new_tag(parser, false);
|
1236
|
-
|
1237
|
-
return NEXT_CHAR;
|
1238
|
-
} else {
|
1239
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1240
|
-
return emit_temporary_buffer(parser, output);
|
1177
|
+
return CONTINUE;
|
1241
1178
|
}
|
1242
|
-
|
1179
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1180
|
+
return emit_from_mark(parser, output);
|
1243
1181
|
}
|
1244
1182
|
|
1245
1183
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
@@ -1250,33 +1188,39 @@ static StateResult handle_rcdata_end_tag_name_state (
|
|
1250
1188
|
GumboToken* output
|
1251
1189
|
) {
|
1252
1190
|
UNUSED_IF_NDEBUG(tokenizer);
|
1253
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1254
1191
|
if (is_alpha(c)) {
|
1255
1192
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1193
|
+
return CONTINUE;
|
1194
|
+
}
|
1195
|
+
switch (c) {
|
1196
|
+
case '\t':
|
1197
|
+
case '\n':
|
1198
|
+
case '\f':
|
1199
|
+
case ' ':
|
1200
|
+
if (is_appropriate_end_tag(parser)) {
|
1201
|
+
finish_tag_name(parser);
|
1202
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1203
|
+
return CONTINUE;
|
1204
|
+
}
|
1205
|
+
break;
|
1206
|
+
case '/':
|
1207
|
+
if (is_appropriate_end_tag(parser)) {
|
1208
|
+
finish_tag_name(parser);
|
1209
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1210
|
+
return CONTINUE;
|
1211
|
+
}
|
1212
|
+
break;
|
1213
|
+
case '>':
|
1214
|
+
if (is_appropriate_end_tag(parser)) {
|
1215
|
+
finish_tag_name(parser);
|
1216
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1217
|
+
return emit_current_tag(parser, output);
|
1275
1218
|
}
|
1219
|
+
break;
|
1276
1220
|
}
|
1277
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1278
1221
|
abandon_current_tag(parser);
|
1279
|
-
|
1222
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1223
|
+
return emit_from_mark(parser, output);
|
1280
1224
|
}
|
1281
1225
|
|
1282
1226
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
@@ -1286,34 +1230,29 @@ static StateResult handle_rawtext_lt_state (
|
|
1286
1230
|
int c,
|
1287
1231
|
GumboToken* output
|
1288
1232
|
) {
|
1289
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1290
1233
|
if (c == '/') {
|
1291
1234
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
|
1292
|
-
|
1293
|
-
return NEXT_CHAR;
|
1235
|
+
return CONTINUE;
|
1294
1236
|
} else {
|
1295
|
-
|
1296
|
-
|
1297
|
-
return emit_temporary_buffer(parser, output);
|
1237
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1238
|
+
return emit_from_mark(parser, output);
|
1298
1239
|
}
|
1299
1240
|
}
|
1300
1241
|
|
1301
1242
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
1302
1243
|
static StateResult handle_rawtext_end_tag_open_state (
|
1303
1244
|
GumboParser* parser,
|
1304
|
-
GumboTokenizerState*
|
1245
|
+
GumboTokenizerState* tokenizer,
|
1305
1246
|
int c,
|
1306
1247
|
GumboToken* output
|
1307
1248
|
) {
|
1308
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1309
1249
|
if (is_alpha(c)) {
|
1310
|
-
|
1250
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
|
1311
1251
|
start_new_tag(parser, false);
|
1312
|
-
|
1313
|
-
return NEXT_CHAR;
|
1252
|
+
return CONTINUE;
|
1314
1253
|
} else {
|
1315
|
-
|
1316
|
-
return
|
1254
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1255
|
+
return emit_from_mark(parser, output);
|
1317
1256
|
}
|
1318
1257
|
}
|
1319
1258
|
|
@@ -1324,153 +1263,156 @@ static StateResult handle_rawtext_end_tag_name_state (
|
|
1324
1263
|
int c,
|
1325
1264
|
GumboToken* output
|
1326
1265
|
) {
|
1327
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1328
|
-
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
|
1329
|
-
tokenizer->_tag_state._buffer.data);
|
1330
1266
|
if (is_alpha(c)) {
|
1331
1267
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1351
|
-
return emit_current_tag(parser, output);
|
1268
|
+
return CONTINUE;
|
1269
|
+
}
|
1270
|
+
switch (c) {
|
1271
|
+
case '\t':
|
1272
|
+
case '\n':
|
1273
|
+
case '\f':
|
1274
|
+
case ' ':
|
1275
|
+
if (is_appropriate_end_tag(parser)) {
|
1276
|
+
finish_tag_name(parser);
|
1277
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1278
|
+
return CONTINUE;
|
1279
|
+
}
|
1280
|
+
break;
|
1281
|
+
case '/':
|
1282
|
+
if (is_appropriate_end_tag(parser)) {
|
1283
|
+
finish_tag_name(parser);
|
1284
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1285
|
+
return CONTINUE;
|
1352
1286
|
}
|
1287
|
+
break;
|
1288
|
+
case '>':
|
1289
|
+
if (is_appropriate_end_tag(parser)) {
|
1290
|
+
finish_tag_name(parser);
|
1291
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1292
|
+
return emit_current_tag(parser, output);
|
1293
|
+
}
|
1294
|
+
break;
|
1353
1295
|
}
|
1354
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
1355
1296
|
abandon_current_tag(parser);
|
1356
|
-
|
1297
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1298
|
+
return emit_from_mark(parser, output);
|
1357
1299
|
}
|
1358
1300
|
|
1359
1301
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
1360
|
-
static StateResult
|
1302
|
+
static StateResult handle_script_data_lt_state (
|
1361
1303
|
GumboParser* parser,
|
1362
1304
|
GumboTokenizerState* tokenizer,
|
1363
1305
|
int c,
|
1364
1306
|
GumboToken* output
|
1365
1307
|
) {
|
1366
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1367
1308
|
if (c == '/') {
|
1368
|
-
gumbo_tokenizer_set_state(parser,
|
1369
|
-
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1378
|
-
return
|
1309
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
|
1310
|
+
return CONTINUE;
|
1311
|
+
}
|
1312
|
+
if (c == '!') {
|
1313
|
+
// This is the only place we don't reconsume the input before emitting the
|
1314
|
+
// temporary buffer. Since the current position is stored and the current
|
1315
|
+
// character is not emitted, we need to advance the input and then
|
1316
|
+
// reconsume.
|
1317
|
+
utf8iterator_next(&tokenizer->_input);
|
1318
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
|
1319
|
+
return emit_from_mark(parser, output);
|
1379
1320
|
}
|
1321
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1322
|
+
return emit_from_mark(parser, output);
|
1380
1323
|
}
|
1381
1324
|
|
1382
1325
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
1383
|
-
static StateResult
|
1326
|
+
static StateResult handle_script_data_end_tag_open_state (
|
1384
1327
|
GumboParser* parser,
|
1385
|
-
GumboTokenizerState*
|
1328
|
+
GumboTokenizerState* tokenizer,
|
1386
1329
|
int c,
|
1387
1330
|
GumboToken* output
|
1388
1331
|
) {
|
1389
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1390
1332
|
if (is_alpha(c)) {
|
1391
|
-
|
1333
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
|
1392
1334
|
start_new_tag(parser, false);
|
1393
|
-
|
1394
|
-
return NEXT_CHAR;
|
1395
|
-
} else {
|
1396
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1397
|
-
return emit_temporary_buffer(parser, output);
|
1335
|
+
return CONTINUE;
|
1398
1336
|
}
|
1337
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1338
|
+
return emit_from_mark(parser, output);
|
1399
1339
|
}
|
1400
1340
|
|
1401
1341
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
1402
|
-
static StateResult
|
1342
|
+
static StateResult handle_script_data_end_tag_name_state (
|
1403
1343
|
GumboParser* parser,
|
1404
1344
|
GumboTokenizerState* tokenizer,
|
1405
1345
|
int c,
|
1406
1346
|
GumboToken* output
|
1407
1347
|
) {
|
1408
|
-
UNUSED_IF_NDEBUG(tokenizer);
|
1409
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1410
1348
|
if (is_alpha(c)) {
|
1411
1349
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1418
|
-
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
|
1429
|
-
|
1430
|
-
|
1350
|
+
return CONTINUE;
|
1351
|
+
}
|
1352
|
+
switch (c) {
|
1353
|
+
case '\t':
|
1354
|
+
case '\n':
|
1355
|
+
case '\f':
|
1356
|
+
case ' ':
|
1357
|
+
if (is_appropriate_end_tag(parser)) {
|
1358
|
+
finish_tag_name(parser);
|
1359
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1360
|
+
return CONTINUE;
|
1361
|
+
}
|
1362
|
+
break;
|
1363
|
+
case '/':
|
1364
|
+
if (is_appropriate_end_tag(parser)) {
|
1365
|
+
finish_tag_name(parser);
|
1366
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1367
|
+
return CONTINUE;
|
1368
|
+
}
|
1369
|
+
break;
|
1370
|
+
case '>':
|
1371
|
+
if (is_appropriate_end_tag(parser)) {
|
1372
|
+
finish_tag_name(parser);
|
1373
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1374
|
+
return emit_current_tag(parser, output);
|
1431
1375
|
}
|
1376
|
+
break;
|
1432
1377
|
}
|
1433
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1434
1378
|
abandon_current_tag(parser);
|
1435
|
-
|
1379
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1380
|
+
return emit_from_mark(parser, output);
|
1436
1381
|
}
|
1437
1382
|
|
1438
1383
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
1439
|
-
static StateResult
|
1384
|
+
static StateResult handle_script_data_escaped_start_state (
|
1440
1385
|
GumboParser* parser,
|
1441
1386
|
GumboTokenizerState* tokenizer,
|
1442
1387
|
int c,
|
1443
1388
|
GumboToken* output
|
1444
1389
|
) {
|
1445
1390
|
if (c == '-') {
|
1446
|
-
gumbo_tokenizer_set_state(parser,
|
1447
|
-
return
|
1448
|
-
} else {
|
1449
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1450
|
-
tokenizer->_reconsume_current_input = true;
|
1451
|
-
return NEXT_CHAR;
|
1391
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
|
1392
|
+
return emit_char(parser, c, output);
|
1452
1393
|
}
|
1394
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1395
|
+
return CONTINUE;
|
1453
1396
|
}
|
1454
1397
|
|
1455
1398
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
1456
|
-
static StateResult
|
1399
|
+
static StateResult handle_script_data_escaped_start_dash_state (
|
1457
1400
|
GumboParser* parser,
|
1458
1401
|
GumboTokenizerState* tokenizer,
|
1459
1402
|
int c,
|
1460
1403
|
GumboToken* output
|
1461
1404
|
) {
|
1462
1405
|
if (c == '-') {
|
1463
|
-
gumbo_tokenizer_set_state(parser,
|
1464
|
-
return
|
1406
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
|
1407
|
+
return emit_char(parser, c, output);
|
1465
1408
|
} else {
|
1466
|
-
|
1467
|
-
|
1468
|
-
return NEXT_CHAR;
|
1409
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1410
|
+
return CONTINUE;
|
1469
1411
|
}
|
1470
1412
|
}
|
1471
1413
|
|
1472
1414
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
1473
|
-
static StateResult
|
1415
|
+
static StateResult handle_script_data_escaped_state (
|
1474
1416
|
GumboParser* parser,
|
1475
1417
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1476
1418
|
int c,
|
@@ -1478,25 +1420,25 @@ static StateResult handle_script_escaped_state (
|
|
1478
1420
|
) {
|
1479
1421
|
switch (c) {
|
1480
1422
|
case '-':
|
1481
|
-
gumbo_tokenizer_set_state(parser,
|
1482
|
-
return
|
1423
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
|
1424
|
+
return emit_char(parser, c, output);
|
1483
1425
|
case '<':
|
1484
|
-
gumbo_tokenizer_set_state(parser,
|
1426
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1485
1427
|
clear_temporary_buffer(parser);
|
1486
|
-
|
1487
|
-
return
|
1428
|
+
set_mark(parser);
|
1429
|
+
return CONTINUE;
|
1488
1430
|
case '\0':
|
1489
1431
|
return emit_replacement_char(parser, output);
|
1490
1432
|
case -1:
|
1491
|
-
tokenizer_add_parse_error(parser,
|
1433
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1492
1434
|
return emit_eof(parser, output);
|
1493
1435
|
default:
|
1494
|
-
return
|
1436
|
+
return emit_char(parser, c, output);
|
1495
1437
|
}
|
1496
1438
|
}
|
1497
1439
|
|
1498
1440
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
1499
|
-
static StateResult
|
1441
|
+
static StateResult handle_script_data_escaped_dash_state (
|
1500
1442
|
GumboParser* parser,
|
1501
1443
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1502
1444
|
int c,
|
@@ -1504,28 +1446,27 @@ static StateResult handle_script_escaped_dash_state (
|
|
1504
1446
|
) {
|
1505
1447
|
switch (c) {
|
1506
1448
|
case '-':
|
1507
|
-
gumbo_tokenizer_set_state(parser,
|
1508
|
-
return
|
1449
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
|
1450
|
+
return emit_char(parser, c, output);
|
1509
1451
|
case '<':
|
1510
|
-
gumbo_tokenizer_set_state(parser,
|
1452
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1511
1453
|
clear_temporary_buffer(parser);
|
1512
|
-
|
1513
|
-
return
|
1454
|
+
set_mark(parser);
|
1455
|
+
return CONTINUE;
|
1514
1456
|
case '\0':
|
1515
|
-
gumbo_tokenizer_set_state(parser,
|
1457
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1516
1458
|
return emit_replacement_char(parser, output);
|
1517
1459
|
case -1:
|
1518
|
-
tokenizer_add_parse_error(parser,
|
1519
|
-
|
1520
|
-
return NEXT_CHAR;
|
1460
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1461
|
+
return emit_eof(parser, output);
|
1521
1462
|
default:
|
1522
|
-
gumbo_tokenizer_set_state(parser,
|
1523
|
-
return
|
1463
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1464
|
+
return emit_char(parser, c, output);
|
1524
1465
|
}
|
1525
1466
|
}
|
1526
1467
|
|
1527
1468
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
1528
|
-
static StateResult
|
1469
|
+
static StateResult handle_script_data_escaped_dash_dash_state (
|
1529
1470
|
GumboParser* parser,
|
1530
1471
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1531
1472
|
int c,
|
@@ -1533,113 +1474,107 @@ static StateResult handle_script_escaped_dash_dash_state (
|
|
1533
1474
|
) {
|
1534
1475
|
switch (c) {
|
1535
1476
|
case '-':
|
1536
|
-
return
|
1477
|
+
return emit_char(parser, c, output);
|
1537
1478
|
case '<':
|
1538
|
-
gumbo_tokenizer_set_state(parser,
|
1479
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1539
1480
|
clear_temporary_buffer(parser);
|
1540
|
-
|
1541
|
-
return
|
1481
|
+
set_mark(parser);
|
1482
|
+
return CONTINUE;
|
1542
1483
|
case '>':
|
1543
|
-
gumbo_tokenizer_set_state(parser,
|
1544
|
-
return
|
1484
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1485
|
+
return emit_char(parser, c, output);
|
1545
1486
|
case '\0':
|
1546
|
-
gumbo_tokenizer_set_state(parser,
|
1487
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1547
1488
|
return emit_replacement_char(parser, output);
|
1548
1489
|
case -1:
|
1549
|
-
tokenizer_add_parse_error(parser,
|
1550
|
-
|
1551
|
-
return NEXT_CHAR;
|
1490
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1491
|
+
return emit_eof(parser, output);
|
1552
1492
|
default:
|
1553
|
-
gumbo_tokenizer_set_state(parser,
|
1554
|
-
return
|
1493
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1494
|
+
return emit_char(parser, c, output);
|
1555
1495
|
}
|
1556
1496
|
}
|
1557
1497
|
|
1558
1498
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
1559
|
-
static StateResult
|
1499
|
+
static StateResult handle_script_data_escaped_lt_state (
|
1560
1500
|
GumboParser* parser,
|
1561
1501
|
GumboTokenizerState* tokenizer,
|
1562
1502
|
int c,
|
1563
1503
|
GumboToken* output
|
1564
1504
|
) {
|
1565
|
-
assert(
|
1566
|
-
assert(!tokenizer->_script_data_buffer.length);
|
1505
|
+
assert(temporary_buffer_is_empty(parser));
|
1567
1506
|
if (c == '/') {
|
1568
|
-
gumbo_tokenizer_set_state(parser,
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1574
|
-
gumbo_string_buffer_append_codepoint (
|
1575
|
-
ensure_lowercase(c),
|
1576
|
-
&tokenizer->_script_data_buffer
|
1577
|
-
);
|
1578
|
-
return emit_temporary_buffer(parser, output);
|
1579
|
-
} else {
|
1580
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1581
|
-
return emit_temporary_buffer(parser, output);
|
1507
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
|
1508
|
+
return CONTINUE;
|
1509
|
+
}
|
1510
|
+
if (is_alpha(c)) {
|
1511
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
|
1512
|
+
return emit_from_mark(parser, output);
|
1582
1513
|
}
|
1514
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1515
|
+
return emit_from_mark(parser, output);
|
1583
1516
|
}
|
1584
1517
|
|
1585
1518
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
1586
|
-
static StateResult
|
1519
|
+
static StateResult handle_script_data_escaped_end_tag_open_state (
|
1587
1520
|
GumboParser* parser,
|
1588
|
-
GumboTokenizerState*
|
1521
|
+
GumboTokenizerState* tokenizer,
|
1589
1522
|
int c,
|
1590
1523
|
GumboToken* output
|
1591
1524
|
) {
|
1592
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1593
1525
|
if (is_alpha(c)) {
|
1594
|
-
|
1526
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
|
1595
1527
|
start_new_tag(parser, false);
|
1596
|
-
|
1597
|
-
return NEXT_CHAR;
|
1598
|
-
} else {
|
1599
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1600
|
-
return emit_temporary_buffer(parser, output);
|
1528
|
+
return CONTINUE;
|
1601
1529
|
}
|
1530
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1531
|
+
return emit_from_mark(parser, output);
|
1602
1532
|
}
|
1603
1533
|
|
1604
1534
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
1605
|
-
static StateResult
|
1535
|
+
static StateResult handle_script_data_escaped_end_tag_name_state (
|
1606
1536
|
GumboParser* parser,
|
1607
1537
|
GumboTokenizerState* tokenizer,
|
1608
1538
|
int c,
|
1609
1539
|
GumboToken* output
|
1610
1540
|
) {
|
1611
|
-
UNUSED_IF_NDEBUG(tokenizer);
|
1612
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1613
1541
|
if (is_alpha(c)) {
|
1614
1542
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
|
1633
|
-
|
1543
|
+
return CONTINUE;
|
1544
|
+
}
|
1545
|
+
switch (c) {
|
1546
|
+
case '\t':
|
1547
|
+
case '\n':
|
1548
|
+
case '\f':
|
1549
|
+
case ' ':
|
1550
|
+
if (is_appropriate_end_tag(parser)) {
|
1551
|
+
finish_tag_name(parser);
|
1552
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1553
|
+
return CONTINUE;
|
1554
|
+
}
|
1555
|
+
break;
|
1556
|
+
case '/':
|
1557
|
+
if (is_appropriate_end_tag(parser)) {
|
1558
|
+
finish_tag_name(parser);
|
1559
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1560
|
+
return CONTINUE;
|
1561
|
+
}
|
1562
|
+
break;
|
1563
|
+
case '>':
|
1564
|
+
if (is_appropriate_end_tag(parser)) {
|
1565
|
+
finish_tag_name(parser);
|
1566
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1567
|
+
return emit_current_tag(parser, output);
|
1634
1568
|
}
|
1569
|
+
break;
|
1635
1570
|
}
|
1636
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1637
1571
|
abandon_current_tag(parser);
|
1638
|
-
|
1572
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1573
|
+
return emit_from_mark(parser, output);
|
1639
1574
|
}
|
1640
1575
|
|
1641
1576
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
1642
|
-
static StateResult
|
1577
|
+
static StateResult handle_script_data_double_escaped_start_state (
|
1643
1578
|
GumboParser* parser,
|
1644
1579
|
GumboTokenizerState* tokenizer,
|
1645
1580
|
int c,
|
@@ -1656,29 +1591,23 @@ static StateResult handle_script_double_escaped_start_state (
|
|
1656
1591
|
parser,
|
1657
1592
|
gumbo_string_equals (
|
1658
1593
|
&kScriptTag,
|
1659
|
-
(GumboStringPiece*) &tokenizer->
|
1594
|
+
(GumboStringPiece*) &tokenizer->_temporary_buffer
|
1660
1595
|
)
|
1661
|
-
?
|
1662
|
-
:
|
1596
|
+
? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
|
1597
|
+
: GUMBO_LEX_SCRIPT_DATA_ESCAPED
|
1663
1598
|
);
|
1664
|
-
return
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
&tokenizer->_script_data_buffer
|
1670
|
-
);
|
1671
|
-
return emit_current_char(parser, output);
|
1672
|
-
} else {
|
1673
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1674
|
-
tokenizer->_reconsume_current_input = true;
|
1675
|
-
return NEXT_CHAR;
|
1676
|
-
}
|
1599
|
+
return emit_char(parser, c, output);
|
1600
|
+
}
|
1601
|
+
if (is_alpha(c)) {
|
1602
|
+
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
1603
|
+
return emit_char(parser, c, output);
|
1677
1604
|
}
|
1605
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1606
|
+
return CONTINUE;
|
1678
1607
|
}
|
1679
1608
|
|
1680
1609
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
|
1681
|
-
static StateResult
|
1610
|
+
static StateResult handle_script_data_double_escaped_state (
|
1682
1611
|
GumboParser* parser,
|
1683
1612
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1684
1613
|
int c,
|
@@ -1686,24 +1615,23 @@ static StateResult handle_script_double_escaped_state (
|
|
1686
1615
|
) {
|
1687
1616
|
switch (c) {
|
1688
1617
|
case '-':
|
1689
|
-
gumbo_tokenizer_set_state(parser,
|
1690
|
-
return
|
1618
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
|
1619
|
+
return emit_char(parser, c, output);
|
1691
1620
|
case '<':
|
1692
|
-
gumbo_tokenizer_set_state(parser,
|
1693
|
-
return
|
1621
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1622
|
+
return emit_char(parser, c, output);
|
1694
1623
|
case '\0':
|
1695
1624
|
return emit_replacement_char(parser, output);
|
1696
1625
|
case -1:
|
1697
|
-
tokenizer_add_parse_error(parser,
|
1698
|
-
|
1699
|
-
return NEXT_CHAR;
|
1626
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1627
|
+
return emit_eof(parser, output);
|
1700
1628
|
default:
|
1701
|
-
return
|
1629
|
+
return emit_char(parser, c, output);
|
1702
1630
|
}
|
1703
1631
|
}
|
1704
1632
|
|
1705
1633
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
|
1706
|
-
static StateResult
|
1634
|
+
static StateResult handle_script_data_double_escaped_dash_state (
|
1707
1635
|
GumboParser* parser,
|
1708
1636
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1709
1637
|
int c,
|
@@ -1712,26 +1640,25 @@ static StateResult handle_script_double_escaped_dash_state (
|
|
1712
1640
|
switch (c) {
|
1713
1641
|
case '-':
|
1714
1642
|
gumbo_tokenizer_set_state(
|
1715
|
-
parser,
|
1716
|
-
return
|
1643
|
+
parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
|
1644
|
+
return emit_char(parser, c, output);
|
1717
1645
|
case '<':
|
1718
|
-
gumbo_tokenizer_set_state(parser,
|
1719
|
-
return
|
1646
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1647
|
+
return emit_char(parser, c, output);
|
1720
1648
|
case '\0':
|
1721
|
-
gumbo_tokenizer_set_state(parser,
|
1649
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1722
1650
|
return emit_replacement_char(parser, output);
|
1723
1651
|
case -1:
|
1724
|
-
tokenizer_add_parse_error(parser,
|
1725
|
-
|
1726
|
-
return NEXT_CHAR;
|
1652
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1653
|
+
return emit_eof(parser, output);
|
1727
1654
|
default:
|
1728
|
-
gumbo_tokenizer_set_state(parser,
|
1729
|
-
return
|
1655
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1656
|
+
return emit_char(parser, c, output);
|
1730
1657
|
}
|
1731
1658
|
}
|
1732
1659
|
|
1733
1660
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
1734
|
-
static StateResult
|
1661
|
+
static StateResult handle_script_data_double_escaped_dash_dash_state (
|
1735
1662
|
GumboParser* parser,
|
1736
1663
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1737
1664
|
int c,
|
@@ -1739,46 +1666,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
|
|
1739
1666
|
) {
|
1740
1667
|
switch (c) {
|
1741
1668
|
case '-':
|
1742
|
-
return
|
1669
|
+
return emit_char(parser, c, output);
|
1743
1670
|
case '<':
|
1744
|
-
gumbo_tokenizer_set_state(parser,
|
1745
|
-
return
|
1671
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1672
|
+
return emit_char(parser, c, output);
|
1746
1673
|
case '>':
|
1747
|
-
gumbo_tokenizer_set_state(parser,
|
1748
|
-
return
|
1674
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1675
|
+
return emit_char(parser, c, output);
|
1749
1676
|
case '\0':
|
1750
|
-
gumbo_tokenizer_set_state(parser,
|
1677
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1751
1678
|
return emit_replacement_char(parser, output);
|
1752
1679
|
case -1:
|
1753
|
-
tokenizer_add_parse_error(parser,
|
1754
|
-
|
1755
|
-
return NEXT_CHAR;
|
1680
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1681
|
+
return emit_eof(parser, output);
|
1756
1682
|
default:
|
1757
|
-
gumbo_tokenizer_set_state(parser,
|
1758
|
-
return
|
1683
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1684
|
+
return emit_char(parser, c, output);
|
1759
1685
|
}
|
1760
1686
|
}
|
1761
1687
|
|
1762
1688
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
1763
|
-
static StateResult
|
1689
|
+
static StateResult handle_script_data_double_escaped_lt_state (
|
1764
1690
|
GumboParser* parser,
|
1765
1691
|
GumboTokenizerState* tokenizer,
|
1766
1692
|
int c,
|
1767
1693
|
GumboToken* output
|
1768
1694
|
) {
|
1769
1695
|
if (c == '/') {
|
1770
|
-
gumbo_tokenizer_set_state(parser,
|
1771
|
-
|
1772
|
-
return
|
1696
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
|
1697
|
+
clear_temporary_buffer(parser);
|
1698
|
+
return emit_char(parser, c, output);
|
1773
1699
|
} else {
|
1774
|
-
|
1775
|
-
|
1776
|
-
return NEXT_CHAR;
|
1700
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1701
|
+
return CONTINUE;
|
1777
1702
|
}
|
1778
1703
|
}
|
1779
1704
|
|
1780
1705
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
1781
|
-
static StateResult
|
1706
|
+
static StateResult handle_script_data_double_escaped_end_state (
|
1782
1707
|
GumboParser* parser,
|
1783
1708
|
GumboTokenizerState* tokenizer,
|
1784
1709
|
int c,
|
@@ -1793,29 +1718,23 @@ static StateResult handle_script_double_escaped_end_state (
|
|
1793
1718
|
case '>':
|
1794
1719
|
gumbo_tokenizer_set_state(
|
1795
1720
|
parser, gumbo_string_equals(&kScriptTag,
|
1796
|
-
(GumboStringPiece*) &tokenizer->
|
1797
|
-
?
|
1798
|
-
:
|
1799
|
-
return
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
&tokenizer->_script_data_buffer
|
1805
|
-
);
|
1806
|
-
return emit_current_char(parser, output);
|
1807
|
-
} else {
|
1808
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1809
|
-
tokenizer->_reconsume_current_input = true;
|
1810
|
-
return NEXT_CHAR;
|
1811
|
-
}
|
1721
|
+
(GumboStringPiece*) &tokenizer->_temporary_buffer)
|
1722
|
+
? GUMBO_LEX_SCRIPT_DATA_ESCAPED
|
1723
|
+
: GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1724
|
+
return emit_char(parser, c, output);
|
1725
|
+
}
|
1726
|
+
if (is_alpha(c)) {
|
1727
|
+
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
1728
|
+
return emit_char(parser, c, output);
|
1812
1729
|
}
|
1730
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1731
|
+
return CONTINUE;
|
1813
1732
|
}
|
1814
1733
|
|
1815
1734
|
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
1816
1735
|
static StateResult handle_before_attr_name_state (
|
1817
1736
|
GumboParser* parser,
|
1818
|
-
GumboTokenizerState*
|
1737
|
+
GumboTokenizerState* tokenizer,
|
1819
1738
|
int c,
|
1820
1739
|
GumboToken* output
|
1821
1740
|
) {
|
@@ -1824,40 +1743,27 @@ static StateResult handle_before_attr_name_state (
|
|
1824
1743
|
case '\n':
|
1825
1744
|
case '\f':
|
1826
1745
|
case ' ':
|
1827
|
-
return
|
1746
|
+
return CONTINUE;
|
1828
1747
|
case '/':
|
1829
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1830
|
-
return NEXT_CHAR;
|
1831
1748
|
case '>':
|
1832
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1833
|
-
return emit_current_tag(parser, output);
|
1834
|
-
case '\0':
|
1835
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1836
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1837
|
-
append_char_to_temporary_buffer(parser, 0xfffd);
|
1838
|
-
return NEXT_CHAR;
|
1839
1749
|
case -1:
|
1840
|
-
|
1841
|
-
|
1842
|
-
abandon_current_tag(parser);
|
1843
|
-
return NEXT_CHAR;
|
1844
|
-
case '"':
|
1845
|
-
case '\'':
|
1846
|
-
case '<':
|
1750
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1751
|
+
return CONTINUE;
|
1847
1752
|
case '=':
|
1848
|
-
tokenizer_add_parse_error(parser,
|
1849
|
-
// Fall through.
|
1850
|
-
default:
|
1753
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
|
1851
1754
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1852
|
-
append_char_to_tag_buffer(parser,
|
1853
|
-
return
|
1755
|
+
append_char_to_tag_buffer(parser, c, true);
|
1756
|
+
return CONTINUE;
|
1757
|
+
default:
|
1758
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
|
1759
|
+
return CONTINUE;
|
1854
1760
|
}
|
1855
1761
|
}
|
1856
1762
|
|
1857
1763
|
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
1858
1764
|
static StateResult handle_attr_name_state (
|
1859
1765
|
GumboParser* parser,
|
1860
|
-
GumboTokenizerState*
|
1766
|
+
GumboTokenizerState* tokenizer,
|
1861
1767
|
int c,
|
1862
1768
|
GumboToken* output
|
1863
1769
|
) {
|
@@ -1866,45 +1772,35 @@ static StateResult handle_attr_name_state (
|
|
1866
1772
|
case '\n':
|
1867
1773
|
case '\f':
|
1868
1774
|
case ' ':
|
1869
|
-
finish_attribute_name(parser);
|
1870
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1871
|
-
return NEXT_CHAR;
|
1872
1775
|
case '/':
|
1776
|
+
case '>':
|
1777
|
+
case -1:
|
1873
1778
|
finish_attribute_name(parser);
|
1874
|
-
|
1875
|
-
return
|
1779
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1780
|
+
return CONTINUE;
|
1876
1781
|
case '=':
|
1877
1782
|
finish_attribute_name(parser);
|
1878
1783
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
|
1879
|
-
return
|
1880
|
-
case '>':
|
1881
|
-
finish_attribute_name(parser);
|
1882
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1883
|
-
return emit_current_tag(parser, output);
|
1784
|
+
return CONTINUE;
|
1884
1785
|
case '\0':
|
1885
|
-
tokenizer_add_parse_error(parser,
|
1786
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
1886
1787
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1887
|
-
return
|
1888
|
-
case -1:
|
1889
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1890
|
-
abandon_current_tag(parser);
|
1891
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
|
1892
|
-
return NEXT_CHAR;
|
1788
|
+
return CONTINUE;
|
1893
1789
|
case '"':
|
1894
1790
|
case '\'':
|
1895
1791
|
case '<':
|
1896
|
-
tokenizer_add_parse_error(parser,
|
1792
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
|
1897
1793
|
// Fall through.
|
1898
1794
|
default:
|
1899
1795
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1900
|
-
return
|
1796
|
+
return CONTINUE;
|
1901
1797
|
}
|
1902
1798
|
}
|
1903
1799
|
|
1904
1800
|
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
1905
1801
|
static StateResult handle_after_attr_name_state (
|
1906
1802
|
GumboParser* parser,
|
1907
|
-
GumboTokenizerState*
|
1803
|
+
GumboTokenizerState* tokenizer,
|
1908
1804
|
int c,
|
1909
1805
|
GumboToken* output
|
1910
1806
|
) {
|
@@ -1913,35 +1809,23 @@ static StateResult handle_after_attr_name_state (
|
|
1913
1809
|
case '\n':
|
1914
1810
|
case '\f':
|
1915
1811
|
case ' ':
|
1916
|
-
return
|
1812
|
+
return CONTINUE;
|
1917
1813
|
case '/':
|
1918
1814
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1919
|
-
return
|
1815
|
+
return CONTINUE;
|
1920
1816
|
case '=':
|
1921
1817
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
|
1922
|
-
return
|
1818
|
+
return CONTINUE;
|
1923
1819
|
case '>':
|
1924
1820
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1925
1821
|
return emit_current_tag(parser, output);
|
1926
|
-
case '\0':
|
1927
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1928
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1929
|
-
append_char_to_temporary_buffer(parser, 0xfffd);
|
1930
|
-
return NEXT_CHAR;
|
1931
1822
|
case -1:
|
1932
|
-
tokenizer_add_parse_error(parser,
|
1933
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1823
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
1934
1824
|
abandon_current_tag(parser);
|
1935
|
-
return
|
1936
|
-
case '"':
|
1937
|
-
case '\'':
|
1938
|
-
case '<':
|
1939
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1940
|
-
// Fall through.
|
1825
|
+
return emit_eof(parser, output);
|
1941
1826
|
default:
|
1942
|
-
|
1943
|
-
|
1944
|
-
return NEXT_CHAR;
|
1827
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
|
1828
|
+
return CONTINUE;
|
1945
1829
|
}
|
1946
1830
|
}
|
1947
1831
|
|
@@ -1957,45 +1841,22 @@ static StateResult handle_before_attr_value_state (
|
|
1957
1841
|
case '\n':
|
1958
1842
|
case '\f':
|
1959
1843
|
case ' ':
|
1960
|
-
return
|
1844
|
+
return CONTINUE;
|
1961
1845
|
case '"':
|
1962
1846
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
|
1963
1847
|
reset_tag_buffer_start_point(parser);
|
1964
|
-
return
|
1965
|
-
case '&':
|
1966
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1967
|
-
tokenizer->_reconsume_current_input = true;
|
1968
|
-
return NEXT_CHAR;
|
1848
|
+
return CONTINUE;
|
1969
1849
|
case '\'':
|
1970
1850
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
|
1971
1851
|
reset_tag_buffer_start_point(parser);
|
1972
|
-
return
|
1973
|
-
case '\0':
|
1974
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1975
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1976
|
-
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1977
|
-
return NEXT_CHAR;
|
1978
|
-
case -1:
|
1979
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
|
1980
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1981
|
-
abandon_current_tag(parser);
|
1982
|
-
tokenizer->_reconsume_current_input = true;
|
1983
|
-
return NEXT_CHAR;
|
1852
|
+
return CONTINUE;
|
1984
1853
|
case '>':
|
1985
|
-
tokenizer_add_parse_error(parser,
|
1854
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
|
1986
1855
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1987
|
-
emit_current_tag(parser, output);
|
1988
|
-
return RETURN_ERROR;
|
1989
|
-
case '<':
|
1990
|
-
case '=':
|
1991
|
-
case '`':
|
1992
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1993
|
-
// Fall through.
|
1994
|
-
default:
|
1995
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1996
|
-
append_char_to_tag_buffer(parser, c, true);
|
1997
|
-
return NEXT_CHAR;
|
1856
|
+
return emit_current_tag(parser, output);
|
1998
1857
|
}
|
1858
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1859
|
+
return CONTINUE;
|
1999
1860
|
}
|
2000
1861
|
|
2001
1862
|
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
|
@@ -2003,30 +1864,28 @@ static StateResult handle_attr_value_double_quoted_state (
|
|
2003
1864
|
GumboParser* parser,
|
2004
1865
|
GumboTokenizerState* tokenizer,
|
2005
1866
|
int c,
|
2006
|
-
GumboToken*
|
1867
|
+
GumboToken* output
|
2007
1868
|
) {
|
2008
1869
|
switch (c) {
|
2009
1870
|
case '"':
|
2010
1871
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
2011
|
-
return
|
1872
|
+
return CONTINUE;
|
2012
1873
|
case '&':
|
2013
|
-
|
2014
|
-
|
2015
|
-
tokenizer->
|
2016
|
-
return
|
1874
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1875
|
+
set_mark(parser);
|
1876
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
|
1877
|
+
return CONTINUE;
|
2017
1878
|
case '\0':
|
2018
|
-
tokenizer_add_parse_error(parser,
|
1879
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2019
1880
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
|
2020
|
-
return
|
1881
|
+
return CONTINUE;
|
2021
1882
|
case -1:
|
2022
|
-
tokenizer_add_parse_error(parser,
|
2023
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1883
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2024
1884
|
abandon_current_tag(parser);
|
2025
|
-
|
2026
|
-
return NEXT_CHAR;
|
1885
|
+
return emit_eof(parser, output);
|
2027
1886
|
default:
|
2028
1887
|
append_char_to_tag_buffer(parser, c, false);
|
2029
|
-
return
|
1888
|
+
return CONTINUE;
|
2030
1889
|
}
|
2031
1890
|
}
|
2032
1891
|
|
@@ -2035,30 +1894,28 @@ static StateResult handle_attr_value_single_quoted_state (
|
|
2035
1894
|
GumboParser* parser,
|
2036
1895
|
GumboTokenizerState* tokenizer,
|
2037
1896
|
int c,
|
2038
|
-
GumboToken*
|
1897
|
+
GumboToken* output
|
2039
1898
|
) {
|
2040
1899
|
switch (c) {
|
2041
1900
|
case '\'':
|
2042
1901
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
2043
|
-
return
|
1902
|
+
return CONTINUE;
|
2044
1903
|
case '&':
|
2045
|
-
|
2046
|
-
|
2047
|
-
tokenizer->
|
2048
|
-
return
|
1904
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1905
|
+
set_mark(parser);
|
1906
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
|
1907
|
+
return CONTINUE;
|
2049
1908
|
case '\0':
|
2050
|
-
tokenizer_add_parse_error(parser,
|
1909
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2051
1910
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
|
2052
|
-
return
|
1911
|
+
return CONTINUE;
|
2053
1912
|
case -1:
|
2054
|
-
tokenizer_add_parse_error(parser,
|
2055
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1913
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2056
1914
|
abandon_current_tag(parser);
|
2057
|
-
|
2058
|
-
return NEXT_CHAR;
|
1915
|
+
return emit_eof(parser, output);
|
2059
1916
|
default:
|
2060
1917
|
append_char_to_tag_buffer(parser, c, false);
|
2061
|
-
return
|
1918
|
+
return CONTINUE;
|
2062
1919
|
}
|
2063
1920
|
}
|
2064
1921
|
|
@@ -2076,91 +1933,37 @@ static StateResult handle_attr_value_unquoted_state (
|
|
2076
1933
|
case ' ':
|
2077
1934
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2078
1935
|
finish_attribute_value(parser);
|
2079
|
-
return
|
1936
|
+
return CONTINUE;
|
2080
1937
|
case '&':
|
2081
|
-
|
2082
|
-
|
2083
|
-
tokenizer->
|
2084
|
-
return
|
1938
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1939
|
+
set_mark(parser);
|
1940
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
1941
|
+
return CONTINUE;
|
2085
1942
|
case '>':
|
2086
1943
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2087
1944
|
finish_attribute_value(parser);
|
2088
1945
|
return emit_current_tag(parser, output);
|
2089
1946
|
case '\0':
|
2090
|
-
tokenizer_add_parse_error(parser,
|
1947
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2091
1948
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
2092
|
-
return
|
1949
|
+
return CONTINUE;
|
2093
1950
|
case -1:
|
2094
|
-
tokenizer_add_parse_error(parser,
|
2095
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2096
|
-
tokenizer->_reconsume_current_input = true;
|
1951
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2097
1952
|
abandon_current_tag(parser);
|
2098
|
-
return
|
2099
|
-
case '<':
|
2100
|
-
case '=':
|
1953
|
+
return emit_eof(parser, output);
|
2101
1954
|
case '"':
|
2102
1955
|
case '\'':
|
1956
|
+
case '<':
|
1957
|
+
case '=':
|
2103
1958
|
case '`':
|
2104
|
-
tokenizer_add_parse_error(parser,
|
1959
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
|
2105
1960
|
// Fall through.
|
2106
1961
|
default:
|
2107
1962
|
append_char_to_tag_buffer(parser, c, true);
|
2108
|
-
return
|
1963
|
+
return CONTINUE;
|
2109
1964
|
}
|
2110
1965
|
}
|
2111
1966
|
|
2112
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
|
2113
|
-
static StateResult handle_char_ref_in_attr_value_state (
|
2114
|
-
GumboParser* parser,
|
2115
|
-
GumboTokenizerState* tokenizer,
|
2116
|
-
int UNUSED_ARG(c),
|
2117
|
-
GumboToken* UNUSED_ARG(output)
|
2118
|
-
) {
|
2119
|
-
OneOrTwoCodepoints char_ref;
|
2120
|
-
int allowed_char;
|
2121
|
-
bool is_unquoted = false;
|
2122
|
-
switch (tokenizer->_tag_state._attr_value_state) {
|
2123
|
-
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
2124
|
-
allowed_char = '"';
|
2125
|
-
break;
|
2126
|
-
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
2127
|
-
allowed_char = '\'';
|
2128
|
-
break;
|
2129
|
-
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
2130
|
-
allowed_char = '>';
|
2131
|
-
is_unquoted = true;
|
2132
|
-
break;
|
2133
|
-
default:
|
2134
|
-
// -Wmaybe-uninitialized is a little overzealous here, and doesn't
|
2135
|
-
// get that the assert(0) means this codepath will never happen.
|
2136
|
-
allowed_char = ' ';
|
2137
|
-
assert(0);
|
2138
|
-
}
|
2139
|
-
|
2140
|
-
// Ignore the status, since we don't have a convenient way of signalling that
|
2141
|
-
// a parser error has occurred when the error occurs in the middle of a
|
2142
|
-
// multi-state token. We'd need a flag inside the TokenizerState to do this,
|
2143
|
-
// but that's a low priority fix.
|
2144
|
-
gumbo_consume_char_ref (
|
2145
|
-
parser,
|
2146
|
-
&tokenizer->_input,
|
2147
|
-
allowed_char,
|
2148
|
-
true,
|
2149
|
-
&char_ref
|
2150
|
-
);
|
2151
|
-
if (char_ref.first != kGumboNoChar) {
|
2152
|
-
tokenizer->_reconsume_current_input = true;
|
2153
|
-
append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
|
2154
|
-
if (char_ref.second != kGumboNoChar) {
|
2155
|
-
append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
|
2156
|
-
}
|
2157
|
-
} else {
|
2158
|
-
append_char_to_tag_buffer(parser, '&', is_unquoted);
|
2159
|
-
}
|
2160
|
-
gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
|
2161
|
-
return NEXT_CHAR;
|
2162
|
-
}
|
2163
|
-
|
2164
1967
|
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
|
2165
1968
|
static StateResult handle_after_attr_value_quoted_state (
|
2166
1969
|
GumboParser* parser,
|
@@ -2175,24 +1978,21 @@ static StateResult handle_after_attr_value_quoted_state (
|
|
2175
1978
|
case '\f':
|
2176
1979
|
case ' ':
|
2177
1980
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2178
|
-
return
|
1981
|
+
return CONTINUE;
|
2179
1982
|
case '/':
|
2180
1983
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
2181
|
-
return
|
1984
|
+
return CONTINUE;
|
2182
1985
|
case '>':
|
2183
1986
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2184
1987
|
return emit_current_tag(parser, output);
|
2185
1988
|
case -1:
|
2186
|
-
tokenizer_add_parse_error(parser,
|
2187
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1989
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2188
1990
|
abandon_current_tag(parser);
|
2189
|
-
|
2190
|
-
return NEXT_CHAR;
|
1991
|
+
return emit_eof(parser, output);
|
2191
1992
|
default:
|
2192
|
-
tokenizer_add_parse_error(parser,
|
2193
|
-
|
2194
|
-
|
2195
|
-
return NEXT_CHAR;
|
1993
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
|
1994
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1995
|
+
return CONTINUE;
|
2196
1996
|
}
|
2197
1997
|
}
|
2198
1998
|
|
@@ -2209,15 +2009,13 @@ static StateResult handle_self_closing_start_tag_state (
|
|
2209
2009
|
tokenizer->_tag_state._is_self_closing = true;
|
2210
2010
|
return emit_current_tag(parser, output);
|
2211
2011
|
case -1:
|
2212
|
-
tokenizer_add_parse_error(parser,
|
2213
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2012
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2214
2013
|
abandon_current_tag(parser);
|
2215
|
-
return
|
2014
|
+
return emit_eof(parser, output);
|
2216
2015
|
default:
|
2217
|
-
tokenizer_add_parse_error(parser,
|
2218
|
-
|
2219
|
-
|
2220
|
-
return NEXT_CHAR;
|
2016
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
|
2017
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2018
|
+
return CONTINUE;
|
2221
2019
|
}
|
2222
2020
|
}
|
2223
2021
|
|
@@ -2228,21 +2026,27 @@ static StateResult handle_bogus_comment_state (
|
|
2228
2026
|
int c,
|
2229
2027
|
GumboToken* output
|
2230
2028
|
) {
|
2231
|
-
|
2232
|
-
|
2233
|
-
|
2234
|
-
|
2235
|
-
|
2029
|
+
switch (c) {
|
2030
|
+
case '>':
|
2031
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2032
|
+
return emit_comment(parser, output);
|
2033
|
+
case -1:
|
2034
|
+
// We need to emit the comment and then the EOF, so reconsume in data
|
2035
|
+
// state.
|
2036
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2037
|
+
return emit_comment(parser, output);
|
2038
|
+
case '\0':
|
2039
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2040
|
+
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2041
|
+
return CONTINUE;
|
2042
|
+
default:
|
2236
2043
|
append_char_to_temporary_buffer(parser, c);
|
2237
|
-
|
2238
|
-
c = utf8iterator_current(&tokenizer->_input);
|
2044
|
+
return CONTINUE;
|
2239
2045
|
}
|
2240
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2241
|
-
return emit_comment(parser, output);
|
2242
2046
|
}
|
2243
2047
|
|
2244
2048
|
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
2245
|
-
static StateResult
|
2049
|
+
static StateResult handle_markup_declaration_open_state (
|
2246
2050
|
GumboParser* parser,
|
2247
2051
|
GumboTokenizerState* tokenizer,
|
2248
2052
|
int UNUSED_ARG(c),
|
@@ -2253,21 +2057,21 @@ static StateResult handle_markup_declaration_state (
|
|
2253
2057
|
&tokenizer->_input,
|
2254
2058
|
"--",
|
2255
2059
|
sizeof("--") - 1,
|
2256
|
-
true
|
2060
|
+
/* case sensitive */ true
|
2257
2061
|
)
|
2258
2062
|
) {
|
2259
|
-
|
2260
|
-
|
2261
|
-
}
|
2063
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
|
2064
|
+
return CONTINUE;
|
2065
|
+
}
|
2066
|
+
if (
|
2262
2067
|
utf8iterator_maybe_consume_match (
|
2263
2068
|
&tokenizer->_input,
|
2264
2069
|
"DOCTYPE",
|
2265
2070
|
sizeof("DOCTYPE") - 1,
|
2266
|
-
false
|
2071
|
+
/* case sensitive */ false
|
2267
2072
|
)
|
2268
2073
|
) {
|
2269
|
-
|
2270
|
-
tokenizer->_reconsume_current_input = true;
|
2074
|
+
reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
|
2271
2075
|
// If we get here, we know we'll eventually emit a doctype token, so now is
|
2272
2076
|
// the time to initialize the doctype strings. (Not in doctype_state_init,
|
2273
2077
|
// since then they'll leak if ownership never gets transferred to the
|
@@ -2275,24 +2079,35 @@ static StateResult handle_markup_declaration_state (
|
|
2275
2079
|
tokenizer->_doc_type_state.name = gumbo_strdup("");
|
2276
2080
|
tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
|
2277
2081
|
tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
|
2278
|
-
|
2279
|
-
|
2280
|
-
|
2082
|
+
return CONTINUE;
|
2083
|
+
}
|
2084
|
+
if (
|
2085
|
+
utf8iterator_maybe_consume_match (
|
2281
2086
|
&tokenizer->_input,
|
2282
2087
|
"[CDATA[", sizeof("[CDATA[") - 1,
|
2283
|
-
true
|
2088
|
+
/* case sensitive */ true
|
2284
2089
|
)
|
2285
2090
|
) {
|
2286
|
-
|
2287
|
-
|
2288
|
-
|
2289
|
-
|
2290
|
-
|
2291
|
-
|
2292
|
-
|
2293
|
-
|
2091
|
+
if (tokenizer->_is_adjusted_current_node_foreign) {
|
2092
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2093
|
+
tokenizer->_is_in_cdata = true;
|
2094
|
+
// Start the token after the <![CDATA[.
|
2095
|
+
reset_token_start_point(tokenizer);
|
2096
|
+
} else {
|
2097
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
|
2098
|
+
clear_temporary_buffer(parser);
|
2099
|
+
append_string_to_temporary_buffer (
|
2100
|
+
parser,
|
2101
|
+
&(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
|
2102
|
+
);
|
2103
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
2104
|
+
}
|
2105
|
+
return CONTINUE;
|
2294
2106
|
}
|
2295
|
-
|
2107
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
|
2108
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
2109
|
+
clear_temporary_buffer(parser);
|
2110
|
+
return CONTINUE;
|
2296
2111
|
}
|
2297
2112
|
|
2298
2113
|
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
@@ -2305,26 +2120,14 @@ static StateResult handle_comment_start_state (
|
|
2305
2120
|
switch (c) {
|
2306
2121
|
case '-':
|
2307
2122
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
|
2308
|
-
return
|
2309
|
-
case '\0':
|
2310
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2311
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2312
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2313
|
-
return NEXT_CHAR;
|
2123
|
+
return CONTINUE;
|
2314
2124
|
case '>':
|
2315
|
-
tokenizer_add_parse_error(parser,
|
2125
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
|
2316
2126
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2317
|
-
emit_comment(parser, output);
|
2318
|
-
return RETURN_ERROR;
|
2319
|
-
case -1:
|
2320
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
|
2321
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2322
|
-
emit_comment(parser, output);
|
2323
|
-
return RETURN_ERROR;
|
2127
|
+
return emit_comment(parser, output);
|
2324
2128
|
default:
|
2325
|
-
|
2326
|
-
|
2327
|
-
return NEXT_CHAR;
|
2129
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2130
|
+
return CONTINUE;
|
2328
2131
|
}
|
2329
2132
|
}
|
2330
2133
|
|
@@ -2338,28 +2141,20 @@ static StateResult handle_comment_start_dash_state (
|
|
2338
2141
|
switch (c) {
|
2339
2142
|
case '-':
|
2340
2143
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
2341
|
-
return
|
2342
|
-
case '\0':
|
2343
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2344
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2345
|
-
append_char_to_temporary_buffer(parser, '-');
|
2346
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2347
|
-
return NEXT_CHAR;
|
2144
|
+
return CONTINUE;
|
2348
2145
|
case '>':
|
2349
|
-
tokenizer_add_parse_error(parser,
|
2146
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
|
2350
2147
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2351
|
-
emit_comment(parser, output);
|
2352
|
-
return RETURN_ERROR;
|
2148
|
+
return emit_comment(parser, output);
|
2353
2149
|
case -1:
|
2354
|
-
tokenizer_add_parse_error(parser,
|
2355
|
-
|
2356
|
-
|
2357
|
-
return
|
2150
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2151
|
+
// Switch to data to emit the EOF next.
|
2152
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2153
|
+
return emit_comment(parser, output);
|
2358
2154
|
default:
|
2359
|
-
|
2155
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2360
2156
|
append_char_to_temporary_buffer(parser, '-');
|
2361
|
-
|
2362
|
-
return NEXT_CHAR;
|
2157
|
+
return CONTINUE;
|
2363
2158
|
}
|
2364
2159
|
}
|
2365
2160
|
|
@@ -2371,21 +2166,99 @@ static StateResult handle_comment_state (
|
|
2371
2166
|
GumboToken* output
|
2372
2167
|
) {
|
2373
2168
|
switch (c) {
|
2169
|
+
case '<':
|
2170
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
|
2171
|
+
append_char_to_temporary_buffer(parser, c);
|
2172
|
+
return CONTINUE;
|
2374
2173
|
case '-':
|
2375
2174
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
2376
|
-
return
|
2175
|
+
return CONTINUE;
|
2377
2176
|
case '\0':
|
2378
|
-
tokenizer_add_parse_error(parser,
|
2177
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2379
2178
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2380
|
-
return
|
2179
|
+
return CONTINUE;
|
2381
2180
|
case -1:
|
2382
|
-
tokenizer_add_parse_error(parser,
|
2383
|
-
|
2384
|
-
|
2385
|
-
return
|
2181
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2182
|
+
// Switch to data to emit the EOF token next.
|
2183
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2184
|
+
return emit_comment(parser, output);
|
2386
2185
|
default:
|
2387
2186
|
append_char_to_temporary_buffer(parser, c);
|
2388
|
-
return
|
2187
|
+
return CONTINUE;
|
2188
|
+
}
|
2189
|
+
}
|
2190
|
+
|
2191
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
|
2192
|
+
static StateResult handle_comment_lt_state (
|
2193
|
+
GumboParser* parser,
|
2194
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2195
|
+
int c,
|
2196
|
+
GumboToken* output
|
2197
|
+
) {
|
2198
|
+
switch (c) {
|
2199
|
+
case '!':
|
2200
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
|
2201
|
+
append_char_to_temporary_buffer(parser, c);
|
2202
|
+
return CONTINUE;
|
2203
|
+
case '<':
|
2204
|
+
append_char_to_temporary_buffer(parser, c);
|
2205
|
+
return CONTINUE;
|
2206
|
+
default:
|
2207
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2208
|
+
return CONTINUE;
|
2209
|
+
}
|
2210
|
+
}
|
2211
|
+
|
2212
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
|
2213
|
+
static StateResult handle_comment_lt_bang_state (
|
2214
|
+
GumboParser* parser,
|
2215
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2216
|
+
int c,
|
2217
|
+
GumboToken* output
|
2218
|
+
) {
|
2219
|
+
switch (c) {
|
2220
|
+
case '-':
|
2221
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
|
2222
|
+
return CONTINUE;
|
2223
|
+
default:
|
2224
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2225
|
+
return CONTINUE;
|
2226
|
+
}
|
2227
|
+
}
|
2228
|
+
|
2229
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
|
2230
|
+
static StateResult handle_comment_lt_bang_dash_state (
|
2231
|
+
GumboParser* parser,
|
2232
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2233
|
+
int c,
|
2234
|
+
GumboToken* output
|
2235
|
+
) {
|
2236
|
+
switch (c) {
|
2237
|
+
case '-':
|
2238
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
|
2239
|
+
return CONTINUE;
|
2240
|
+
default:
|
2241
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
2242
|
+
return CONTINUE;
|
2243
|
+
}
|
2244
|
+
}
|
2245
|
+
|
2246
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
|
2247
|
+
static StateResult handle_comment_lt_bang_dash_dash_state (
|
2248
|
+
GumboParser* parser,
|
2249
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2250
|
+
int c,
|
2251
|
+
GumboToken* output
|
2252
|
+
) {
|
2253
|
+
switch (c) {
|
2254
|
+
case '>':
|
2255
|
+
case -1:
|
2256
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
|
2257
|
+
return CONTINUE;
|
2258
|
+
default:
|
2259
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
|
2260
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
|
2261
|
+
return CONTINUE;
|
2389
2262
|
}
|
2390
2263
|
}
|
2391
2264
|
|
@@ -2397,25 +2270,18 @@ static StateResult handle_comment_end_dash_state (
|
|
2397
2270
|
GumboToken* output
|
2398
2271
|
) {
|
2399
2272
|
switch (c) {
|
2400
|
-
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
|
2405
|
-
|
2406
|
-
|
2407
|
-
|
2408
|
-
|
2409
|
-
|
2410
|
-
|
2411
|
-
|
2412
|
-
emit_comment(parser, output);
|
2413
|
-
return RETURN_ERROR;
|
2414
|
-
default:
|
2415
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2416
|
-
append_char_to_temporary_buffer(parser, '-');
|
2417
|
-
append_char_to_temporary_buffer(parser, c);
|
2418
|
-
return NEXT_CHAR;
|
2273
|
+
case '-':
|
2274
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
2275
|
+
return CONTINUE;
|
2276
|
+
case -1:
|
2277
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2278
|
+
// Switch to data to emit EOF next.
|
2279
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2280
|
+
return emit_comment(parser, output);
|
2281
|
+
default:
|
2282
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2283
|
+
append_char_to_temporary_buffer(parser, '-');
|
2284
|
+
return CONTINUE;
|
2419
2285
|
}
|
2420
2286
|
}
|
2421
2287
|
|
@@ -2430,35 +2296,22 @@ static StateResult handle_comment_end_state (
|
|
2430
2296
|
case '>':
|
2431
2297
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2432
2298
|
return emit_comment(parser, output);
|
2433
|
-
case '\0':
|
2434
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2435
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2436
|
-
append_char_to_temporary_buffer(parser, '-');
|
2437
|
-
append_char_to_temporary_buffer(parser, '-');
|
2438
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2439
|
-
return NEXT_CHAR;
|
2440
2299
|
case '!':
|
2441
|
-
tokenizer_add_parse_error(
|
2442
|
-
parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
|
2443
2300
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
|
2444
|
-
return
|
2301
|
+
return CONTINUE;
|
2445
2302
|
case '-':
|
2446
|
-
tokenizer_add_parse_error(
|
2447
|
-
parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
|
2448
2303
|
append_char_to_temporary_buffer(parser, '-');
|
2449
|
-
return
|
2304
|
+
return CONTINUE;
|
2450
2305
|
case -1:
|
2451
|
-
tokenizer_add_parse_error(parser,
|
2306
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2307
|
+
// Switch to data to emit EOF next.
|
2452
2308
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2453
|
-
emit_comment(parser, output);
|
2454
|
-
return RETURN_ERROR;
|
2309
|
+
return emit_comment(parser, output);
|
2455
2310
|
default:
|
2456
|
-
|
2457
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2311
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2458
2312
|
append_char_to_temporary_buffer(parser, '-');
|
2459
2313
|
append_char_to_temporary_buffer(parser, '-');
|
2460
|
-
|
2461
|
-
return NEXT_CHAR;
|
2314
|
+
return CONTINUE;
|
2462
2315
|
}
|
2463
2316
|
}
|
2464
2317
|
|
@@ -2475,30 +2328,22 @@ static StateResult handle_comment_end_bang_state (
|
|
2475
2328
|
append_char_to_temporary_buffer(parser, '-');
|
2476
2329
|
append_char_to_temporary_buffer(parser, '-');
|
2477
2330
|
append_char_to_temporary_buffer(parser, '!');
|
2478
|
-
return
|
2331
|
+
return CONTINUE;
|
2479
2332
|
case '>':
|
2333
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
|
2480
2334
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2481
2335
|
return emit_comment(parser, output);
|
2482
|
-
case '\0':
|
2483
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2484
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2485
|
-
append_char_to_temporary_buffer(parser, '-');
|
2486
|
-
append_char_to_temporary_buffer(parser, '-');
|
2487
|
-
append_char_to_temporary_buffer(parser, '!');
|
2488
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2489
|
-
return NEXT_CHAR;
|
2490
2336
|
case -1:
|
2491
|
-
tokenizer_add_parse_error(parser,
|
2337
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2338
|
+
// Switch to data to emit EOF next.
|
2492
2339
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2493
|
-
emit_comment(parser, output);
|
2494
|
-
return RETURN_ERROR;
|
2340
|
+
return emit_comment(parser, output);
|
2495
2341
|
default:
|
2496
|
-
|
2342
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2497
2343
|
append_char_to_temporary_buffer(parser, '-');
|
2498
2344
|
append_char_to_temporary_buffer(parser, '-');
|
2499
2345
|
append_char_to_temporary_buffer(parser, '!');
|
2500
|
-
|
2501
|
-
return NEXT_CHAR;
|
2346
|
+
return CONTINUE;
|
2502
2347
|
}
|
2503
2348
|
}
|
2504
2349
|
|
@@ -2509,26 +2354,27 @@ static StateResult handle_doctype_state (
|
|
2509
2354
|
int c,
|
2510
2355
|
GumboToken* output
|
2511
2356
|
) {
|
2512
|
-
assert(
|
2357
|
+
assert(temporary_buffer_is_empty(parser));
|
2513
2358
|
switch (c) {
|
2514
2359
|
case '\t':
|
2515
2360
|
case '\n':
|
2516
2361
|
case '\f':
|
2517
2362
|
case ' ':
|
2518
2363
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2519
|
-
return
|
2364
|
+
return CONTINUE;
|
2365
|
+
case '>':
|
2366
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2367
|
+
return CONTINUE;
|
2520
2368
|
case -1:
|
2521
|
-
tokenizer_add_parse_error(parser,
|
2522
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2369
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2523
2370
|
tokenizer->_doc_type_state.force_quirks = true;
|
2524
|
-
|
2525
|
-
|
2371
|
+
// Switch to data to emit EOF next.
|
2372
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2373
|
+
return emit_doctype(parser, output);
|
2526
2374
|
default:
|
2527
|
-
tokenizer_add_parse_error(parser,
|
2528
|
-
|
2529
|
-
|
2530
|
-
tokenizer->_doc_type_state.force_quirks = true;
|
2531
|
-
return NEXT_CHAR;
|
2375
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
|
2376
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2377
|
+
return CONTINUE;
|
2532
2378
|
}
|
2533
2379
|
}
|
2534
2380
|
|
@@ -2544,30 +2390,27 @@ static StateResult handle_before_doctype_name_state (
|
|
2544
2390
|
case '\n':
|
2545
2391
|
case '\f':
|
2546
2392
|
case ' ':
|
2547
|
-
return
|
2393
|
+
return CONTINUE;
|
2548
2394
|
case '\0':
|
2549
|
-
tokenizer_add_parse_error(parser,
|
2395
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2550
2396
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2551
|
-
tokenizer->_doc_type_state.force_quirks = true;
|
2552
2397
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2553
|
-
return
|
2398
|
+
return CONTINUE;
|
2554
2399
|
case '>':
|
2555
|
-
tokenizer_add_parse_error(parser,
|
2400
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
|
2556
2401
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2557
2402
|
tokenizer->_doc_type_state.force_quirks = true;
|
2558
|
-
emit_doctype(parser, output);
|
2559
|
-
return RETURN_ERROR;
|
2403
|
+
return emit_doctype(parser, output);
|
2560
2404
|
case -1:
|
2561
|
-
tokenizer_add_parse_error(parser,
|
2562
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2405
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2563
2406
|
tokenizer->_doc_type_state.force_quirks = true;
|
2564
|
-
|
2565
|
-
|
2407
|
+
// Switch to data to emit EOF next.
|
2408
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2409
|
+
return emit_doctype(parser, output);
|
2566
2410
|
default:
|
2567
2411
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2568
|
-
tokenizer->_doc_type_state.force_quirks = false;
|
2569
2412
|
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
2570
|
-
return
|
2413
|
+
return CONTINUE;
|
2571
2414
|
}
|
2572
2415
|
}
|
2573
2416
|
|
@@ -2586,30 +2429,26 @@ static StateResult handle_doctype_name_state (
|
|
2586
2429
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
|
2587
2430
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2588
2431
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2589
|
-
return
|
2432
|
+
return CONTINUE;
|
2590
2433
|
case '>':
|
2591
2434
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2592
2435
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2593
2436
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2594
|
-
emit_doctype(parser, output);
|
2595
|
-
return RETURN_SUCCESS;
|
2437
|
+
return emit_doctype(parser, output);
|
2596
2438
|
case '\0':
|
2597
|
-
tokenizer_add_parse_error(parser,
|
2439
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2598
2440
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2599
|
-
return
|
2441
|
+
return CONTINUE;
|
2600
2442
|
case -1:
|
2601
|
-
tokenizer_add_parse_error(parser,
|
2602
|
-
|
2443
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2444
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2603
2445
|
tokenizer->_doc_type_state.force_quirks = true;
|
2604
2446
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2605
2447
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2606
|
-
emit_doctype(parser, output);
|
2607
|
-
return RETURN_ERROR;
|
2448
|
+
return emit_doctype(parser, output);
|
2608
2449
|
default:
|
2609
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2610
|
-
tokenizer->_doc_type_state.force_quirks = false;
|
2611
2450
|
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
2612
|
-
return
|
2451
|
+
return CONTINUE;
|
2613
2452
|
}
|
2614
2453
|
}
|
2615
2454
|
|
@@ -2625,35 +2464,29 @@ static StateResult handle_after_doctype_name_state (
|
|
2625
2464
|
case '\n':
|
2626
2465
|
case '\f':
|
2627
2466
|
case ' ':
|
2628
|
-
return
|
2467
|
+
return CONTINUE;
|
2629
2468
|
case '>':
|
2630
2469
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2631
|
-
emit_doctype(parser, output);
|
2632
|
-
return RETURN_SUCCESS;
|
2470
|
+
return emit_doctype(parser, output);
|
2633
2471
|
case -1:
|
2634
|
-
tokenizer_add_parse_error(parser,
|
2472
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2635
2473
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2636
2474
|
tokenizer->_doc_type_state.force_quirks = true;
|
2637
|
-
emit_doctype(parser, output);
|
2638
|
-
return RETURN_ERROR;
|
2475
|
+
return emit_doctype(parser, output);
|
2639
2476
|
default:
|
2640
2477
|
if (utf8iterator_maybe_consume_match(
|
2641
2478
|
&tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
|
2642
|
-
|
2643
|
-
parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2644
|
-
tokenizer->_reconsume_current_input = true;
|
2479
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2645
2480
|
} else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
|
2646
2481
|
sizeof("SYSTEM") - 1, false)) {
|
2647
|
-
|
2648
|
-
parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2649
|
-
tokenizer->_reconsume_current_input = true;
|
2482
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2650
2483
|
} else {
|
2651
2484
|
tokenizer_add_parse_error(
|
2652
|
-
parser,
|
2653
|
-
|
2485
|
+
parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
|
2486
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2654
2487
|
tokenizer->_doc_type_state.force_quirks = true;
|
2655
2488
|
}
|
2656
|
-
return
|
2489
|
+
return CONTINUE;
|
2657
2490
|
}
|
2658
2491
|
}
|
2659
2492
|
|
@@ -2670,37 +2503,34 @@ static StateResult handle_after_doctype_public_keyword_state (
|
|
2670
2503
|
case '\f':
|
2671
2504
|
case ' ':
|
2672
2505
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2673
|
-
return
|
2506
|
+
return CONTINUE;
|
2674
2507
|
case '"':
|
2675
|
-
tokenizer_add_parse_error(parser,
|
2508
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2676
2509
|
assert(temporary_buffer_is_empty(parser));
|
2677
2510
|
gumbo_tokenizer_set_state(
|
2678
2511
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2679
|
-
return
|
2512
|
+
return CONTINUE;
|
2680
2513
|
case '\'':
|
2681
|
-
tokenizer_add_parse_error(parser,
|
2514
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2682
2515
|
assert(temporary_buffer_is_empty(parser));
|
2683
2516
|
gumbo_tokenizer_set_state(
|
2684
2517
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2685
|
-
return
|
2518
|
+
return CONTINUE;
|
2686
2519
|
case '>':
|
2687
|
-
tokenizer_add_parse_error(parser,
|
2520
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
|
2688
2521
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2689
2522
|
tokenizer->_doc_type_state.force_quirks = true;
|
2690
|
-
emit_doctype(parser, output);
|
2691
|
-
return RETURN_ERROR;
|
2523
|
+
return emit_doctype(parser, output);
|
2692
2524
|
case -1:
|
2693
|
-
tokenizer_add_parse_error(parser,
|
2694
|
-
|
2525
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2526
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2695
2527
|
tokenizer->_doc_type_state.force_quirks = true;
|
2696
|
-
emit_doctype(parser, output);
|
2697
|
-
return RETURN_ERROR;
|
2528
|
+
return emit_doctype(parser, output);
|
2698
2529
|
default:
|
2699
|
-
tokenizer_add_parse_error(parser,
|
2700
|
-
|
2530
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
|
2531
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2701
2532
|
tokenizer->_doc_type_state.force_quirks = true;
|
2702
|
-
|
2703
|
-
return RETURN_ERROR;
|
2533
|
+
return CONTINUE;
|
2704
2534
|
}
|
2705
2535
|
}
|
2706
2536
|
|
@@ -2716,35 +2546,32 @@ static StateResult handle_before_doctype_public_id_state (
|
|
2716
2546
|
case '\n':
|
2717
2547
|
case '\f':
|
2718
2548
|
case ' ':
|
2719
|
-
return
|
2549
|
+
return CONTINUE;
|
2720
2550
|
case '"':
|
2721
2551
|
assert(temporary_buffer_is_empty(parser));
|
2722
2552
|
gumbo_tokenizer_set_state(
|
2723
2553
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2724
|
-
return
|
2554
|
+
return CONTINUE;
|
2725
2555
|
case '\'':
|
2726
2556
|
assert(temporary_buffer_is_empty(parser));
|
2727
2557
|
gumbo_tokenizer_set_state(
|
2728
2558
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2729
|
-
return
|
2559
|
+
return CONTINUE;
|
2730
2560
|
case '>':
|
2731
|
-
tokenizer_add_parse_error(parser,
|
2561
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
|
2732
2562
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2733
2563
|
tokenizer->_doc_type_state.force_quirks = true;
|
2734
|
-
emit_doctype(parser, output);
|
2735
|
-
return RETURN_ERROR;
|
2564
|
+
return emit_doctype(parser, output);
|
2736
2565
|
case -1:
|
2737
|
-
tokenizer_add_parse_error(parser,
|
2738
|
-
|
2566
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2567
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2739
2568
|
tokenizer->_doc_type_state.force_quirks = true;
|
2740
|
-
emit_doctype(parser, output);
|
2741
|
-
return RETURN_ERROR;
|
2569
|
+
return emit_doctype(parser, output);
|
2742
2570
|
default:
|
2743
|
-
tokenizer_add_parse_error(parser,
|
2744
|
-
|
2571
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
|
2572
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2745
2573
|
tokenizer->_doc_type_state.force_quirks = true;
|
2746
|
-
|
2747
|
-
return RETURN_ERROR;
|
2574
|
+
return CONTINUE;
|
2748
2575
|
}
|
2749
2576
|
}
|
2750
2577
|
|
@@ -2759,28 +2586,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
|
|
2759
2586
|
case '"':
|
2760
2587
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
2761
2588
|
finish_doctype_public_id(parser);
|
2762
|
-
return
|
2589
|
+
return CONTINUE;
|
2763
2590
|
case '\0':
|
2764
|
-
tokenizer_add_parse_error(parser,
|
2591
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2765
2592
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2766
|
-
return
|
2593
|
+
return CONTINUE;
|
2767
2594
|
case '>':
|
2768
|
-
tokenizer_add_parse_error(parser,
|
2595
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
|
2769
2596
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2770
2597
|
tokenizer->_doc_type_state.force_quirks = true;
|
2771
2598
|
finish_doctype_public_id(parser);
|
2772
|
-
emit_doctype(parser, output);
|
2773
|
-
return RETURN_ERROR;
|
2599
|
+
return emit_doctype(parser, output);
|
2774
2600
|
case -1:
|
2775
|
-
tokenizer_add_parse_error(parser,
|
2776
|
-
|
2601
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2602
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2777
2603
|
tokenizer->_doc_type_state.force_quirks = true;
|
2778
2604
|
finish_doctype_public_id(parser);
|
2779
|
-
emit_doctype(parser, output);
|
2780
|
-
return RETURN_ERROR;
|
2605
|
+
return emit_doctype(parser, output);
|
2781
2606
|
default:
|
2782
2607
|
append_char_to_temporary_buffer(parser, c);
|
2783
|
-
return
|
2608
|
+
return CONTINUE;
|
2784
2609
|
}
|
2785
2610
|
}
|
2786
2611
|
|
@@ -2795,28 +2620,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
|
|
2795
2620
|
case '\'':
|
2796
2621
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
2797
2622
|
finish_doctype_public_id(parser);
|
2798
|
-
return
|
2623
|
+
return CONTINUE;
|
2799
2624
|
case '\0':
|
2800
|
-
tokenizer_add_parse_error(parser,
|
2625
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2801
2626
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2802
|
-
return
|
2627
|
+
return CONTINUE;
|
2803
2628
|
case '>':
|
2804
|
-
tokenizer_add_parse_error(parser,
|
2629
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
|
2805
2630
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2806
2631
|
tokenizer->_doc_type_state.force_quirks = true;
|
2807
2632
|
finish_doctype_public_id(parser);
|
2808
|
-
emit_doctype(parser, output);
|
2809
|
-
return RETURN_ERROR;
|
2633
|
+
return emit_doctype(parser, output);
|
2810
2634
|
case -1:
|
2811
|
-
tokenizer_add_parse_error(parser,
|
2812
|
-
|
2635
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2636
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2813
2637
|
tokenizer->_doc_type_state.force_quirks = true;
|
2814
2638
|
finish_doctype_public_id(parser);
|
2815
|
-
emit_doctype(parser, output);
|
2816
|
-
return RETURN_ERROR;
|
2639
|
+
return emit_doctype(parser, output);
|
2817
2640
|
default:
|
2818
2641
|
append_char_to_temporary_buffer(parser, c);
|
2819
|
-
return
|
2642
|
+
return CONTINUE;
|
2820
2643
|
}
|
2821
2644
|
}
|
2822
2645
|
|
@@ -2834,35 +2657,38 @@ static StateResult handle_after_doctype_public_id_state (
|
|
2834
2657
|
case ' ':
|
2835
2658
|
gumbo_tokenizer_set_state(
|
2836
2659
|
parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
|
2837
|
-
return
|
2660
|
+
return CONTINUE;
|
2838
2661
|
case '>':
|
2839
2662
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2840
|
-
emit_doctype(parser, output);
|
2841
|
-
return RETURN_SUCCESS;
|
2663
|
+
return emit_doctype(parser, output);
|
2842
2664
|
case '"':
|
2843
|
-
tokenizer_add_parse_error(
|
2665
|
+
tokenizer_add_parse_error (
|
2666
|
+
parser,
|
2667
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
|
2668
|
+
);
|
2844
2669
|
assert(temporary_buffer_is_empty(parser));
|
2845
2670
|
gumbo_tokenizer_set_state(
|
2846
2671
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2847
|
-
return
|
2672
|
+
return CONTINUE;
|
2848
2673
|
case '\'':
|
2849
|
-
tokenizer_add_parse_error(
|
2674
|
+
tokenizer_add_parse_error (
|
2675
|
+
parser,
|
2676
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
|
2677
|
+
);
|
2850
2678
|
assert(temporary_buffer_is_empty(parser));
|
2851
2679
|
gumbo_tokenizer_set_state(
|
2852
2680
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2853
|
-
return
|
2681
|
+
return CONTINUE;
|
2854
2682
|
case -1:
|
2855
|
-
tokenizer_add_parse_error(parser,
|
2856
|
-
|
2857
|
-
tokenizer->_reconsume_current_input = true;
|
2683
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2684
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2858
2685
|
tokenizer->_doc_type_state.force_quirks = true;
|
2859
|
-
emit_doctype(parser, output);
|
2860
|
-
return RETURN_ERROR;
|
2686
|
+
return emit_doctype(parser, output);
|
2861
2687
|
default:
|
2862
|
-
tokenizer_add_parse_error(parser,
|
2863
|
-
|
2688
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2689
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2864
2690
|
tokenizer->_doc_type_state.force_quirks = true;
|
2865
|
-
return
|
2691
|
+
return CONTINUE;
|
2866
2692
|
}
|
2867
2693
|
}
|
2868
2694
|
|
@@ -2878,33 +2704,30 @@ static StateResult handle_between_doctype_public_system_id_state (
|
|
2878
2704
|
case '\n':
|
2879
2705
|
case '\f':
|
2880
2706
|
case ' ':
|
2881
|
-
return
|
2707
|
+
return CONTINUE;
|
2882
2708
|
case '>':
|
2883
2709
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2884
|
-
emit_doctype(parser, output);
|
2885
|
-
return RETURN_SUCCESS;
|
2710
|
+
return emit_doctype(parser, output);
|
2886
2711
|
case '"':
|
2887
2712
|
assert(temporary_buffer_is_empty(parser));
|
2888
2713
|
gumbo_tokenizer_set_state(
|
2889
2714
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2890
|
-
return
|
2715
|
+
return CONTINUE;
|
2891
2716
|
case '\'':
|
2892
2717
|
assert(temporary_buffer_is_empty(parser));
|
2893
2718
|
gumbo_tokenizer_set_state(
|
2894
2719
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2895
|
-
return
|
2720
|
+
return CONTINUE;
|
2896
2721
|
case -1:
|
2897
|
-
tokenizer_add_parse_error(parser,
|
2898
|
-
|
2722
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2723
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2899
2724
|
tokenizer->_doc_type_state.force_quirks = true;
|
2900
|
-
emit_doctype(parser, output);
|
2901
|
-
return RETURN_ERROR;
|
2725
|
+
return emit_doctype(parser, output);
|
2902
2726
|
default:
|
2903
|
-
tokenizer_add_parse_error(parser,
|
2904
|
-
|
2727
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2728
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2905
2729
|
tokenizer->_doc_type_state.force_quirks = true;
|
2906
|
-
|
2907
|
-
return RETURN_ERROR;
|
2730
|
+
return CONTINUE;
|
2908
2731
|
}
|
2909
2732
|
}
|
2910
2733
|
|
@@ -2921,36 +2744,34 @@ static StateResult handle_after_doctype_system_keyword_state (
|
|
2921
2744
|
case '\f':
|
2922
2745
|
case ' ':
|
2923
2746
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
|
2924
|
-
return
|
2747
|
+
return CONTINUE;
|
2925
2748
|
case '"':
|
2926
|
-
tokenizer_add_parse_error(parser,
|
2749
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2927
2750
|
assert(temporary_buffer_is_empty(parser));
|
2928
2751
|
gumbo_tokenizer_set_state(
|
2929
2752
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2930
|
-
return
|
2753
|
+
return CONTINUE;
|
2931
2754
|
case '\'':
|
2932
|
-
tokenizer_add_parse_error(parser,
|
2755
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2933
2756
|
assert(temporary_buffer_is_empty(parser));
|
2934
2757
|
gumbo_tokenizer_set_state(
|
2935
2758
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2936
|
-
return
|
2759
|
+
return CONTINUE;
|
2937
2760
|
case '>':
|
2938
|
-
tokenizer_add_parse_error(parser,
|
2761
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
|
2939
2762
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2940
2763
|
tokenizer->_doc_type_state.force_quirks = true;
|
2941
|
-
emit_doctype(parser, output);
|
2942
|
-
return RETURN_ERROR;
|
2764
|
+
return emit_doctype(parser, output);
|
2943
2765
|
case -1:
|
2944
|
-
tokenizer_add_parse_error(parser,
|
2945
|
-
|
2766
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2767
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2946
2768
|
tokenizer->_doc_type_state.force_quirks = true;
|
2947
|
-
emit_doctype(parser, output);
|
2948
|
-
return RETURN_ERROR;
|
2769
|
+
return emit_doctype(parser, output);
|
2949
2770
|
default:
|
2950
|
-
tokenizer_add_parse_error(parser,
|
2951
|
-
|
2771
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2772
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2952
2773
|
tokenizer->_doc_type_state.force_quirks = true;
|
2953
|
-
return
|
2774
|
+
return CONTINUE;
|
2954
2775
|
}
|
2955
2776
|
}
|
2956
2777
|
|
@@ -2966,34 +2787,32 @@ static StateResult handle_before_doctype_system_id_state (
|
|
2966
2787
|
case '\n':
|
2967
2788
|
case '\f':
|
2968
2789
|
case ' ':
|
2969
|
-
return
|
2790
|
+
return CONTINUE;
|
2970
2791
|
case '"':
|
2971
2792
|
assert(temporary_buffer_is_empty(parser));
|
2972
2793
|
gumbo_tokenizer_set_state(
|
2973
2794
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2974
|
-
return
|
2795
|
+
return CONTINUE;
|
2975
2796
|
case '\'':
|
2976
2797
|
assert(temporary_buffer_is_empty(parser));
|
2977
2798
|
gumbo_tokenizer_set_state(
|
2978
2799
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2979
|
-
return
|
2800
|
+
return CONTINUE;
|
2980
2801
|
case '>':
|
2981
|
-
tokenizer_add_parse_error(parser,
|
2802
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
|
2982
2803
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2983
2804
|
tokenizer->_doc_type_state.force_quirks = true;
|
2984
|
-
emit_doctype(parser, output);
|
2985
|
-
return RETURN_ERROR;
|
2805
|
+
return emit_doctype(parser, output);
|
2986
2806
|
case -1:
|
2987
|
-
tokenizer_add_parse_error(parser,
|
2988
|
-
|
2807
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2808
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2989
2809
|
tokenizer->_doc_type_state.force_quirks = true;
|
2990
|
-
emit_doctype(parser, output);
|
2991
|
-
return RETURN_ERROR;
|
2810
|
+
return emit_doctype(parser, output);
|
2992
2811
|
default:
|
2993
|
-
tokenizer_add_parse_error(parser,
|
2994
|
-
|
2812
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2813
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2995
2814
|
tokenizer->_doc_type_state.force_quirks = true;
|
2996
|
-
return
|
2815
|
+
return CONTINUE;
|
2997
2816
|
}
|
2998
2817
|
}
|
2999
2818
|
|
@@ -3008,28 +2827,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
|
|
3008
2827
|
case '"':
|
3009
2828
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
3010
2829
|
finish_doctype_system_id(parser);
|
3011
|
-
return
|
2830
|
+
return CONTINUE;
|
3012
2831
|
case '\0':
|
3013
|
-
tokenizer_add_parse_error(parser,
|
2832
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
3014
2833
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
3015
|
-
return
|
2834
|
+
return CONTINUE;
|
3016
2835
|
case '>':
|
3017
|
-
tokenizer_add_parse_error(parser,
|
2836
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
|
3018
2837
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3019
2838
|
tokenizer->_doc_type_state.force_quirks = true;
|
3020
2839
|
finish_doctype_system_id(parser);
|
3021
|
-
emit_doctype(parser, output);
|
3022
|
-
return RETURN_ERROR;
|
2840
|
+
return emit_doctype(parser, output);
|
3023
2841
|
case -1:
|
3024
|
-
tokenizer_add_parse_error(parser,
|
3025
|
-
|
2842
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2843
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3026
2844
|
tokenizer->_doc_type_state.force_quirks = true;
|
3027
2845
|
finish_doctype_system_id(parser);
|
3028
|
-
emit_doctype(parser, output);
|
3029
|
-
return RETURN_ERROR;
|
2846
|
+
return emit_doctype(parser, output);
|
3030
2847
|
default:
|
3031
2848
|
append_char_to_temporary_buffer(parser, c);
|
3032
|
-
return
|
2849
|
+
return CONTINUE;
|
3033
2850
|
}
|
3034
2851
|
}
|
3035
2852
|
|
@@ -3044,28 +2861,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
|
|
3044
2861
|
case '\'':
|
3045
2862
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
3046
2863
|
finish_doctype_system_id(parser);
|
3047
|
-
return
|
2864
|
+
return CONTINUE;
|
3048
2865
|
case '\0':
|
3049
|
-
tokenizer_add_parse_error(parser,
|
2866
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
3050
2867
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
3051
|
-
return
|
2868
|
+
return CONTINUE;
|
3052
2869
|
case '>':
|
3053
|
-
tokenizer_add_parse_error(parser,
|
2870
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
|
3054
2871
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3055
2872
|
tokenizer->_doc_type_state.force_quirks = true;
|
3056
2873
|
finish_doctype_system_id(parser);
|
3057
|
-
emit_doctype(parser, output);
|
3058
|
-
return RETURN_ERROR;
|
2874
|
+
return emit_doctype(parser, output);
|
3059
2875
|
case -1:
|
3060
|
-
tokenizer_add_parse_error(parser,
|
3061
|
-
|
2876
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2877
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3062
2878
|
tokenizer->_doc_type_state.force_quirks = true;
|
3063
2879
|
finish_doctype_system_id(parser);
|
3064
|
-
emit_doctype(parser, output);
|
3065
|
-
return RETURN_ERROR;
|
2880
|
+
return emit_doctype(parser, output);
|
3066
2881
|
default:
|
3067
2882
|
append_char_to_temporary_buffer(parser, c);
|
3068
|
-
return
|
2883
|
+
return CONTINUE;
|
3069
2884
|
}
|
3070
2885
|
}
|
3071
2886
|
|
@@ -3081,21 +2896,19 @@ static StateResult handle_after_doctype_system_id_state (
|
|
3081
2896
|
case '\n':
|
3082
2897
|
case '\f':
|
3083
2898
|
case ' ':
|
3084
|
-
return
|
2899
|
+
return CONTINUE;
|
3085
2900
|
case '>':
|
3086
2901
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3087
|
-
emit_doctype(parser, output);
|
3088
|
-
return RETURN_SUCCESS;
|
2902
|
+
return emit_doctype(parser, output);
|
3089
2903
|
case -1:
|
3090
|
-
tokenizer_add_parse_error(parser,
|
3091
|
-
|
2904
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2905
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3092
2906
|
tokenizer->_doc_type_state.force_quirks = true;
|
3093
|
-
emit_doctype(parser, output);
|
3094
|
-
return RETURN_ERROR;
|
2907
|
+
return emit_doctype(parser, output);
|
3095
2908
|
default:
|
3096
|
-
tokenizer_add_parse_error(parser,
|
3097
|
-
|
3098
|
-
return
|
2909
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
|
2910
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2911
|
+
return CONTINUE;
|
3099
2912
|
}
|
3100
2913
|
}
|
3101
2914
|
|
@@ -3106,33 +2919,370 @@ static StateResult handle_bogus_doctype_state (
|
|
3106
2919
|
int c,
|
3107
2920
|
GumboToken* output
|
3108
2921
|
) {
|
3109
|
-
|
2922
|
+
switch (c) {
|
2923
|
+
case '>':
|
3110
2924
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3111
|
-
emit_doctype(parser, output);
|
3112
|
-
|
2925
|
+
return emit_doctype(parser, output);
|
2926
|
+
case '\0':
|
2927
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2928
|
+
return CONTINUE;
|
2929
|
+
case -1:
|
2930
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2931
|
+
return emit_doctype(parser, output);
|
2932
|
+
default:
|
2933
|
+
return CONTINUE;
|
3113
2934
|
}
|
3114
|
-
return NEXT_CHAR;
|
3115
2935
|
}
|
3116
2936
|
|
3117
2937
|
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
3118
|
-
static StateResult
|
2938
|
+
static StateResult handle_cdata_section_state (
|
3119
2939
|
GumboParser* parser,
|
3120
2940
|
GumboTokenizerState* tokenizer,
|
3121
2941
|
int c,
|
3122
2942
|
GumboToken* output
|
3123
2943
|
) {
|
3124
|
-
|
3125
|
-
|
3126
|
-
|
2944
|
+
switch (c) {
|
2945
|
+
case ']':
|
2946
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
|
2947
|
+
set_mark(parser);
|
2948
|
+
return CONTINUE;
|
2949
|
+
case -1:
|
2950
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
|
2951
|
+
return emit_eof(parser, output);
|
2952
|
+
default:
|
2953
|
+
return emit_char(parser, c, output);
|
2954
|
+
}
|
2955
|
+
}
|
2956
|
+
|
2957
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
|
2958
|
+
static StateResult handle_cdata_section_bracket_state (
|
2959
|
+
GumboParser* parser,
|
2960
|
+
GumboTokenizerState* tokenizer,
|
2961
|
+
int c,
|
2962
|
+
GumboToken* output
|
2963
|
+
) {
|
2964
|
+
switch (c) {
|
2965
|
+
case ']':
|
2966
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
|
2967
|
+
return CONTINUE;
|
2968
|
+
default:
|
2969
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2970
|
+
// Emit the ].
|
2971
|
+
return emit_from_mark(parser, output);
|
2972
|
+
}
|
2973
|
+
}
|
2974
|
+
|
2975
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
|
2976
|
+
static StateResult handle_cdata_section_end_state (
|
2977
|
+
GumboParser* parser,
|
2978
|
+
GumboTokenizerState* tokenizer,
|
2979
|
+
int c,
|
2980
|
+
GumboToken* output
|
2981
|
+
) {
|
2982
|
+
switch (c) {
|
2983
|
+
case ']':
|
2984
|
+
{
|
2985
|
+
// XXX: This is terrible. We want to emit a ] corresponding to the first
|
2986
|
+
// of the three in a row we've seen. So let's emit one token from the
|
2987
|
+
// temporary buffer (which will rewind 3 characters, emit the ] and
|
2988
|
+
// advance one). Next, let's clear the temporary buffer which will set the
|
2989
|
+
// mark to the middle of the three brackets. Finally, let's move to the
|
2990
|
+
// appropriate state.
|
2991
|
+
StateResult result = emit_from_mark(parser, output);
|
2992
|
+
tokenizer->_resume_pos = NULL;
|
2993
|
+
set_mark(parser);
|
2994
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2995
|
+
return result;
|
2996
|
+
}
|
2997
|
+
case '>':
|
2998
|
+
// We're done with CDATA so move past the >, reset the token start point
|
2999
|
+
// to point after the >, and then reconsume in the data state.
|
3000
|
+
utf8iterator_next(&tokenizer->_input);
|
3127
3001
|
reset_token_start_point(tokenizer);
|
3128
|
-
|
3002
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3129
3003
|
tokenizer->_is_in_cdata = false;
|
3130
|
-
return
|
3131
|
-
|
3132
|
-
|
3004
|
+
return CONTINUE;
|
3005
|
+
default:
|
3006
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
3007
|
+
return emit_from_mark(parser, output);
|
3133
3008
|
}
|
3134
3009
|
}
|
3135
3010
|
|
3011
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
|
3012
|
+
static StateResult handle_character_reference_state (
|
3013
|
+
GumboParser* parser,
|
3014
|
+
GumboTokenizerState* tokenizer,
|
3015
|
+
int c,
|
3016
|
+
GumboToken* output
|
3017
|
+
) {
|
3018
|
+
if (gumbo_ascii_isalnum(c)) {
|
3019
|
+
reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
|
3020
|
+
return CONTINUE;
|
3021
|
+
}
|
3022
|
+
if (c == '#') {
|
3023
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
|
3024
|
+
return CONTINUE;
|
3025
|
+
}
|
3026
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3027
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3028
|
+
}
|
3029
|
+
|
3030
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
3031
|
+
static StateResult handle_named_character_reference_state (
|
3032
|
+
GumboParser* parser,
|
3033
|
+
GumboTokenizerState* tokenizer,
|
3034
|
+
int c,
|
3035
|
+
GumboToken* output
|
3036
|
+
) {
|
3037
|
+
const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
|
3038
|
+
const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
|
3039
|
+
int code_point[2];
|
3040
|
+
size_t size = match_named_char_ref(cur, end - cur, code_point);
|
3041
|
+
|
3042
|
+
if (size > 0) {
|
3043
|
+
utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
|
3044
|
+
int next = utf8iterator_current(&tokenizer->_input);
|
3045
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3046
|
+
if (character_reference_part_of_attribute(parser)
|
3047
|
+
&& cur[size-1] != ';'
|
3048
|
+
&& (next == '=' || gumbo_ascii_isalnum(next))) {
|
3049
|
+
GumboStringPiece str = { .data = cur, .length = size };
|
3050
|
+
append_string_to_temporary_buffer(parser, &str);
|
3051
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3052
|
+
}
|
3053
|
+
if (cur[size-1] != ';')
|
3054
|
+
tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
|
3055
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3056
|
+
return flush_char_ref(parser, code_point[0], code_point[1], output);
|
3057
|
+
}
|
3058
|
+
reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
|
3059
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3060
|
+
}
|
3061
|
+
|
3062
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
|
3063
|
+
static StateResult handle_ambiguous_ampersand_state (
|
3064
|
+
GumboParser* parser,
|
3065
|
+
GumboTokenizerState* tokenizer,
|
3066
|
+
int c,
|
3067
|
+
GumboToken* output
|
3068
|
+
) {
|
3069
|
+
if (gumbo_ascii_isalnum(c)) {
|
3070
|
+
if (character_reference_part_of_attribute(parser)) {
|
3071
|
+
append_char_to_tag_buffer(parser, c, true);
|
3072
|
+
return CONTINUE;
|
3073
|
+
}
|
3074
|
+
return emit_char(parser, c, output);
|
3075
|
+
}
|
3076
|
+
if (c == ';') {
|
3077
|
+
tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
|
3078
|
+
// fall through
|
3079
|
+
}
|
3080
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3081
|
+
return CONTINUE;
|
3082
|
+
}
|
3083
|
+
|
3084
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
|
3085
|
+
static StateResult handle_numeric_character_reference_state (
|
3086
|
+
GumboParser* parser,
|
3087
|
+
GumboTokenizerState* tokenizer,
|
3088
|
+
int c,
|
3089
|
+
GumboToken* output
|
3090
|
+
) {
|
3091
|
+
tokenizer->_character_reference_code = 0;
|
3092
|
+
switch (c) {
|
3093
|
+
case 'x':
|
3094
|
+
case 'X':
|
3095
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
|
3096
|
+
return CONTINUE;
|
3097
|
+
default:
|
3098
|
+
reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
|
3099
|
+
return CONTINUE;
|
3100
|
+
}
|
3101
|
+
}
|
3102
|
+
|
3103
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-start-state
|
3104
|
+
static StateResult handle_hexadecimal_character_reference_start_state (
|
3105
|
+
GumboParser* parser,
|
3106
|
+
GumboTokenizerState* tokenizer,
|
3107
|
+
int c,
|
3108
|
+
GumboToken* output
|
3109
|
+
) {
|
3110
|
+
if (gumbo_ascii_isxdigit(c)) {
|
3111
|
+
reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
|
3112
|
+
return CONTINUE;
|
3113
|
+
}
|
3114
|
+
tokenizer_add_char_ref_error (
|
3115
|
+
parser,
|
3116
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
3117
|
+
-1
|
3118
|
+
);
|
3119
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3120
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3121
|
+
}
|
3122
|
+
|
3123
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
|
3124
|
+
static StateResult handle_decimal_character_reference_start_state (
|
3125
|
+
GumboParser* parser,
|
3126
|
+
GumboTokenizerState* tokenizer,
|
3127
|
+
int c,
|
3128
|
+
GumboToken* output
|
3129
|
+
) {
|
3130
|
+
if (gumbo_ascii_isdigit(c)) {
|
3131
|
+
reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
|
3132
|
+
return CONTINUE;
|
3133
|
+
}
|
3134
|
+
tokenizer_add_char_ref_error (
|
3135
|
+
parser,
|
3136
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
3137
|
+
-1
|
3138
|
+
);
|
3139
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3140
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3141
|
+
}
|
3142
|
+
|
3143
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-state
|
3144
|
+
static StateResult handle_hexadecimal_character_reference_state (
|
3145
|
+
GumboParser* parser,
|
3146
|
+
GumboTokenizerState* tokenizer,
|
3147
|
+
int c,
|
3148
|
+
GumboToken* output
|
3149
|
+
) {
|
3150
|
+
if (gumbo_ascii_isdigit(c)) {
|
3151
|
+
tokenizer->_character_reference_code =
|
3152
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0030);
|
3153
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3154
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3155
|
+
return CONTINUE;
|
3156
|
+
}
|
3157
|
+
if (gumbo_ascii_isupper_xdigit(c)) {
|
3158
|
+
tokenizer->_character_reference_code =
|
3159
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0037);
|
3160
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3161
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3162
|
+
return CONTINUE;
|
3163
|
+
}
|
3164
|
+
if (gumbo_ascii_islower_xdigit(c)) {
|
3165
|
+
tokenizer->_character_reference_code =
|
3166
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0057);
|
3167
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3168
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3169
|
+
return CONTINUE;
|
3170
|
+
}
|
3171
|
+
if (c == ';') {
|
3172
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3173
|
+
return CONTINUE;
|
3174
|
+
}
|
3175
|
+
tokenizer_add_char_ref_error(
|
3176
|
+
parser,
|
3177
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
3178
|
+
tokenizer->_character_reference_code
|
3179
|
+
);
|
3180
|
+
reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3181
|
+
return CONTINUE;
|
3182
|
+
}
|
3183
|
+
|
3184
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
|
3185
|
+
static StateResult handle_decimal_character_reference_state (
|
3186
|
+
GumboParser* parser,
|
3187
|
+
GumboTokenizerState* tokenizer,
|
3188
|
+
int c,
|
3189
|
+
GumboToken* output
|
3190
|
+
) {
|
3191
|
+
if (gumbo_ascii_isdigit(c)) {
|
3192
|
+
tokenizer->_character_reference_code =
|
3193
|
+
tokenizer->_character_reference_code * 10 + (c - 0x0030);
|
3194
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3195
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3196
|
+
return CONTINUE;
|
3197
|
+
}
|
3198
|
+
if (c == ';') {
|
3199
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3200
|
+
return CONTINUE;
|
3201
|
+
}
|
3202
|
+
tokenizer_add_char_ref_error(
|
3203
|
+
parser,
|
3204
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
3205
|
+
tokenizer->_character_reference_code
|
3206
|
+
);
|
3207
|
+
reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3208
|
+
return CONTINUE;
|
3209
|
+
}
|
3210
|
+
|
3211
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
3212
|
+
static StateResult handle_numeric_character_reference_end_state (
|
3213
|
+
GumboParser* parser,
|
3214
|
+
GumboTokenizerState* tokenizer,
|
3215
|
+
int c,
|
3216
|
+
GumboToken* output
|
3217
|
+
) {
|
3218
|
+
c = tokenizer->_character_reference_code;
|
3219
|
+
if (c == 0) {
|
3220
|
+
tokenizer_add_char_ref_error(
|
3221
|
+
parser,
|
3222
|
+
GUMBO_ERR_NULL_CHARACTER_REFERENCE,
|
3223
|
+
c
|
3224
|
+
);
|
3225
|
+
c = kUtf8ReplacementChar;
|
3226
|
+
} else if (c > kUtf8MaxChar) {
|
3227
|
+
tokenizer_add_char_ref_error(
|
3228
|
+
parser,
|
3229
|
+
GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
|
3230
|
+
c
|
3231
|
+
);
|
3232
|
+
c = kUtf8ReplacementChar;
|
3233
|
+
} else if (utf8_is_surrogate(c)) {
|
3234
|
+
tokenizer_add_char_ref_error(
|
3235
|
+
parser,
|
3236
|
+
GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
|
3237
|
+
c
|
3238
|
+
);
|
3239
|
+
c = kUtf8ReplacementChar;
|
3240
|
+
} else if (utf8_is_noncharacter(c)) {
|
3241
|
+
tokenizer_add_char_ref_error(
|
3242
|
+
parser,
|
3243
|
+
GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
|
3244
|
+
c
|
3245
|
+
);
|
3246
|
+
} else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
|
3247
|
+
tokenizer_add_char_ref_error(
|
3248
|
+
parser,
|
3249
|
+
GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
|
3250
|
+
c
|
3251
|
+
);
|
3252
|
+
switch (c) {
|
3253
|
+
case 0x80: c = 0x20AC; break;
|
3254
|
+
case 0x82: c = 0x201A; break;
|
3255
|
+
case 0x83: c = 0x0192; break;
|
3256
|
+
case 0x84: c = 0x201E; break;
|
3257
|
+
case 0x85: c = 0x2026; break;
|
3258
|
+
case 0x86: c = 0x2020; break;
|
3259
|
+
case 0x87: c = 0x2021; break;
|
3260
|
+
case 0x88: c = 0x02C6; break;
|
3261
|
+
case 0x89: c = 0x2030; break;
|
3262
|
+
case 0x8A: c = 0x0160; break;
|
3263
|
+
case 0x8B: c = 0x2039; break;
|
3264
|
+
case 0x8C: c = 0x0152; break;
|
3265
|
+
case 0x8E: c = 0x017D; break;
|
3266
|
+
case 0x91: c = 0x2018; break;
|
3267
|
+
case 0x92: c = 0x2019; break;
|
3268
|
+
case 0x93: c = 0x201C; break;
|
3269
|
+
case 0x94: c = 0x201D; break;
|
3270
|
+
case 0x95: c = 0x2022; break;
|
3271
|
+
case 0x96: c = 0x2013; break;
|
3272
|
+
case 0x97: c = 0x2014; break;
|
3273
|
+
case 0x98: c = 0x02DC; break;
|
3274
|
+
case 0x99: c = 0x2122; break;
|
3275
|
+
case 0x9A: c = 0x0161; break;
|
3276
|
+
case 0x9B: c = 0x203A; break;
|
3277
|
+
case 0x9C: c = 0x0153; break;
|
3278
|
+
case 0x9E: c = 0x017E; break;
|
3279
|
+
case 0x9F: c = 0x0178; break;
|
3280
|
+
}
|
3281
|
+
}
|
3282
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3283
|
+
return flush_char_ref(parser, c, kGumboNoChar, output);
|
3284
|
+
}
|
3285
|
+
|
3136
3286
|
typedef StateResult (*GumboLexerStateFunction) (
|
3137
3287
|
GumboParser* parser,
|
3138
3288
|
GumboTokenizerState* tokenizer,
|
@@ -3141,74 +3291,86 @@ typedef StateResult (*GumboLexerStateFunction) (
|
|
3141
3291
|
);
|
3142
3292
|
|
3143
3293
|
static GumboLexerStateFunction dispatch_table[] = {
|
3144
|
-
handle_data_state,
|
3145
|
-
|
3146
|
-
|
3147
|
-
|
3148
|
-
|
3149
|
-
|
3150
|
-
|
3151
|
-
|
3152
|
-
|
3153
|
-
|
3154
|
-
|
3155
|
-
|
3156
|
-
|
3157
|
-
|
3158
|
-
|
3159
|
-
|
3160
|
-
|
3161
|
-
|
3162
|
-
|
3163
|
-
|
3164
|
-
|
3165
|
-
|
3166
|
-
|
3167
|
-
|
3168
|
-
|
3169
|
-
|
3170
|
-
|
3171
|
-
|
3172
|
-
|
3173
|
-
|
3174
|
-
|
3175
|
-
|
3176
|
-
|
3177
|
-
|
3178
|
-
|
3179
|
-
|
3180
|
-
|
3181
|
-
|
3182
|
-
|
3183
|
-
|
3184
|
-
|
3185
|
-
|
3186
|
-
|
3187
|
-
|
3188
|
-
|
3189
|
-
|
3190
|
-
|
3191
|
-
|
3192
|
-
|
3193
|
-
|
3194
|
-
|
3195
|
-
|
3196
|
-
|
3197
|
-
|
3198
|
-
|
3199
|
-
|
3200
|
-
|
3201
|
-
|
3202
|
-
|
3203
|
-
|
3204
|
-
|
3205
|
-
|
3206
|
-
|
3207
|
-
|
3208
|
-
|
3209
|
-
|
3210
|
-
|
3211
|
-
|
3294
|
+
[GUMBO_LEX_DATA] = handle_data_state,
|
3295
|
+
[GUMBO_LEX_RCDATA] = handle_rcdata_state,
|
3296
|
+
[GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
|
3297
|
+
[GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
|
3298
|
+
[GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
|
3299
|
+
[GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
|
3300
|
+
[GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
|
3301
|
+
[GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
|
3302
|
+
[GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
|
3303
|
+
[GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
|
3304
|
+
[GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
|
3305
|
+
[GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
|
3306
|
+
[GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
|
3307
|
+
[GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
|
3308
|
+
[GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
|
3309
|
+
[GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
|
3310
|
+
[GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
|
3311
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
|
3312
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
|
3313
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
|
3314
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
|
3315
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
|
3316
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
|
3317
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
|
3318
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
|
3319
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
|
3320
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
|
3321
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
|
3322
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
|
3323
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
|
3324
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
|
3325
|
+
[GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
|
3326
|
+
[GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
|
3327
|
+
[GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
|
3328
|
+
[GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
|
3329
|
+
[GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
|
3330
|
+
[GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
|
3331
|
+
[GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
|
3332
|
+
[GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
|
3333
|
+
[GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
|
3334
|
+
[GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
|
3335
|
+
[GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
|
3336
|
+
[GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
|
3337
|
+
[GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
|
3338
|
+
[GUMBO_LEX_COMMENT] = handle_comment_state,
|
3339
|
+
[GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
|
3340
|
+
[GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
|
3341
|
+
[GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
|
3342
|
+
[GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
|
3343
|
+
[GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
|
3344
|
+
[GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
|
3345
|
+
[GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
|
3346
|
+
[GUMBO_LEX_DOCTYPE] = handle_doctype_state,
|
3347
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
|
3348
|
+
[GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
|
3349
|
+
[GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
|
3350
|
+
[GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
|
3351
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
|
3352
|
+
[GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
|
3353
|
+
[GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
|
3354
|
+
[GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
|
3355
|
+
[GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
|
3356
|
+
[GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
|
3357
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
|
3358
|
+
[GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
|
3359
|
+
[GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
|
3360
|
+
[GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
|
3361
|
+
[GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
|
3362
|
+
[GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
|
3363
|
+
[GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
|
3364
|
+
[GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
|
3365
|
+
[GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
|
3366
|
+
[GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
|
3367
|
+
[GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
|
3368
|
+
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
|
3369
|
+
[GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
|
3370
|
+
[GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
|
3371
|
+
[GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
|
3372
|
+
[GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
|
3373
|
+
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3212
3374
|
};
|
3213
3375
|
|
3214
3376
|
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
@@ -3239,12 +3401,14 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3239
3401
|
return true;
|
3240
3402
|
}
|
3241
3403
|
|
3242
|
-
if (
|
3404
|
+
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3405
|
+
// Return no error.
|
3243
3406
|
return true;
|
3244
3407
|
}
|
3245
3408
|
|
3409
|
+
tokenizer->_parse_error = false;
|
3246
3410
|
while (1) {
|
3247
|
-
assert(!tokenizer->
|
3411
|
+
assert(!tokenizer->_resume_pos);
|
3248
3412
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
3249
3413
|
int c = utf8iterator_current(&tokenizer->_input);
|
3250
3414
|
GumboTokenizerEnum state = tokenizer->_state;
|
@@ -3255,11 +3419,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3255
3419
|
bool should_advance = !tokenizer->_reconsume_current_input;
|
3256
3420
|
tokenizer->_reconsume_current_input = false;
|
3257
3421
|
|
3258
|
-
if (result ==
|
3259
|
-
return
|
3260
|
-
} else if (result == RETURN_ERROR) {
|
3261
|
-
return false;
|
3262
|
-
}
|
3422
|
+
if (result == EMIT_TOKEN)
|
3423
|
+
return !tokenizer->_parse_error;
|
3263
3424
|
|
3264
3425
|
if (should_advance) {
|
3265
3426
|
utf8iterator_next(&tokenizer->_input);
|
@@ -3285,12 +3446,16 @@ void gumbo_token_destroy(GumboToken* token) {
|
|
3285
3446
|
}
|
3286
3447
|
}
|
3287
3448
|
gumbo_free((void*) token->v.start_tag.attributes.data);
|
3288
|
-
if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
|
3449
|
+
if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
|
3289
3450
|
gumbo_free(token->v.start_tag.name);
|
3451
|
+
token->v.start_tag.name = NULL;
|
3452
|
+
}
|
3290
3453
|
return;
|
3291
3454
|
case GUMBO_TOKEN_END_TAG:
|
3292
|
-
if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
3455
|
+
if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
|
3293
3456
|
gumbo_free(token->v.end_tag.name);
|
3457
|
+
token->v.end_tag.name = NULL;
|
3458
|
+
}
|
3294
3459
|
break;
|
3295
3460
|
case GUMBO_TOKEN_COMMENT:
|
3296
3461
|
gumbo_free((void*) token->v.text);
|