nokogumbo 2.0.0.pre.alpha → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +101 -14
- data/ext/nokogumbo/extconf.rb +7 -2
- data/ext/nokogumbo/nokogumbo.c +630 -235
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +391 -126
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +74 -4
- data/gumbo-parser/src/parser.c +1161 -1025
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1440 -1278
- data/gumbo-parser/src/tokenizer.h +7 -18
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +17 -59
- data/gumbo-parser/src/utf8.h +52 -16
- data/lib/nokogumbo.rb +3 -1
- data/lib/nokogumbo/html5.rb +17 -15
- data/lib/nokogumbo/html5/document.rb +19 -3
- data/lib/nokogumbo/html5/document_fragment.rb +36 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +20 -14
- data/CHANGELOG.md +0 -56
@@ -0,0 +1,79 @@
|
|
1
|
+
/*
|
2
|
+
Copyright 2018 Stephen Checkoway
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include <assert.h>
|
18
|
+
|
19
|
+
#include "ascii.h"
|
20
|
+
#include "token_buffer.h"
|
21
|
+
#include "tokenizer.h"
|
22
|
+
#include "util.h"
|
23
|
+
|
24
|
+
// One buffered character (or whitespace) token. Only the pieces of a token
// that gumbo_character_token_buffer_append copies are kept here.
struct GumboInternalCharacterToken {
|
25
|
+
GumboSourcePosition position;  // Copied from the token's position.
|
26
|
+
GumboStringPiece original_text;  // Copied from the token's original_text.
|
27
|
+
int c;  // Copied from the token's v.character.
|
28
|
+
};
|
29
|
+
|
30
|
+
// Puts the buffer into its empty state: no backing storage, zero length and
// zero capacity. Storage is allocated lazily by the first append.
void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer) {
  *buffer = (GumboCharacterTokenBuffer) {
    .data = NULL,
    .length = 0,
    .capacity = 0,
  };
}
|
35
|
+
|
36
|
+
void gumbo_character_token_buffer_append (
|
37
|
+
const GumboToken* token,
|
38
|
+
GumboCharacterTokenBuffer* buffer
|
39
|
+
) {
|
40
|
+
assert(token->type == GUMBO_TOKEN_WHITESPACE
|
41
|
+
|| token->type == GUMBO_TOKEN_CHARACTER);
|
42
|
+
if (buffer->length == buffer->capacity) {
|
43
|
+
if (buffer->capacity == 0)
|
44
|
+
buffer->capacity = 10;
|
45
|
+
else
|
46
|
+
buffer->capacity *= 2;
|
47
|
+
size_t bytes = sizeof(*buffer->data) * buffer->capacity;
|
48
|
+
buffer->data = gumbo_realloc(buffer->data, bytes);
|
49
|
+
}
|
50
|
+
size_t index = buffer->length++;
|
51
|
+
buffer->data[index].position = token->position;
|
52
|
+
buffer->data[index].original_text = token->original_text;
|
53
|
+
buffer->data[index].c = token->v.character;
|
54
|
+
}
|
55
|
+
|
56
|
+
void gumbo_character_token_buffer_get (
|
57
|
+
const GumboCharacterTokenBuffer* buffer,
|
58
|
+
size_t index,
|
59
|
+
struct GumboInternalToken* output
|
60
|
+
) {
|
61
|
+
assert(index < buffer->length);
|
62
|
+
int c = buffer->data[index].c;
|
63
|
+
output->type = gumbo_ascii_isspace(c)?
|
64
|
+
GUMBO_TOKEN_WHITESPACE : GUMBO_TOKEN_CHARACTER;
|
65
|
+
output->position = buffer->data[index].position;
|
66
|
+
output->original_text = buffer->data[index].original_text;
|
67
|
+
output->v.character = c;
|
68
|
+
}
|
69
|
+
|
70
|
+
// Forgets every stored token by resetting the length to zero. The backing
// array and its capacity are left untouched so they can be reused.
void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer) {
  buffer->length = 0;
}
|
73
|
+
|
74
|
+
// Releases the backing array and returns the buffer to the same empty state
// that gumbo_character_token_buffer_init produces (NULL data, zero length,
// zero capacity).
void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer) {
  gumbo_free(buffer->data);
  gumbo_character_token_buffer_init(buffer);
}
|
@@ -0,0 +1,71 @@
|
|
1
|
+
/*
|
2
|
+
Copyright 2018 Stephen Checkoway
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#ifndef GUMBO_TOKEN_BUFFER_H
|
18
|
+
#define GUMBO_TOKEN_BUFFER_H
|
19
|
+
|
20
|
+
#include <stdbool.h>
|
21
|
+
#include <stddef.h>
|
22
|
+
|
23
|
+
#include "gumbo.h"
|
24
|
+
|
25
|
+
#ifdef __cplusplus
|
26
|
+
extern "C" {
|
27
|
+
#endif
|
28
|
+
|
29
|
+
struct GumboInternalCharacterToken;
|
30
|
+
struct GumboInternalToken;
|
31
|
+
|
32
|
+
// A struct representing a growable sequence of character (and whitespace)
|
33
|
+
// tokens.
|
34
|
+
typedef struct {
|
35
|
+
// A pointer to the first stored token; NULL until storage is allocated.
|
36
|
+
struct GumboInternalCharacterToken* data;
|
37
|
+
|
38
|
+
// The number of tokens currently stored in the sequence.
|
39
|
+
size_t length;
|
40
|
+
|
41
|
+
// The number of tokens that data has room for.
|
42
|
+
size_t capacity;
|
43
|
+
} GumboCharacterTokenBuffer;
|
44
|
+
|
45
|
+
// Initializes a new GumboCharacterTokenBuffer.
|
46
|
+
void gumbo_character_token_buffer_init(GumboCharacterTokenBuffer* buffer);
|
47
|
+
|
48
|
+
// Appends a character (or whitespace) token.
|
49
|
+
void gumbo_character_token_buffer_append (
|
50
|
+
const struct GumboInternalToken* token,
|
51
|
+
GumboCharacterTokenBuffer* buffer
|
52
|
+
);
|
53
|
+
|
54
|
+
// Fills *output with the character (or whitespace) token stored at the given
// index. The index must be less than buffer->length.
void gumbo_character_token_buffer_get (
|
55
|
+
const GumboCharacterTokenBuffer* buffer,
|
56
|
+
size_t index,
|
57
|
+
struct GumboInternalToken* output
|
58
|
+
);
|
59
|
+
|
60
|
+
// Reinitialize this character token buffer. This clears it by setting length=0. It
|
61
|
+
// does not zero out the buffer itself.
|
62
|
+
void gumbo_character_token_buffer_clear(GumboCharacterTokenBuffer* buffer);
|
63
|
+
|
64
|
+
// Deallocates this GumboCharacterTokenBuffer.
|
65
|
+
void gumbo_character_token_buffer_destroy(GumboCharacterTokenBuffer* buffer);
|
66
|
+
|
67
|
+
#ifdef __cplusplus
|
68
|
+
}
|
69
|
+
#endif
|
70
|
+
|
71
|
+
#endif // GUMBO_TOKEN_BUFFER_H
|
@@ -1,5 +1,7 @@
|
|
1
1
|
/*
|
2
2
|
Copyright 2010 Google Inc.
|
3
|
+
Copyright 2017-2018 Craig Barnes
|
4
|
+
Copyright 2018 Stephen Checkoway
|
3
5
|
|
4
6
|
Licensed under the Apache License, Version 2.0 (the "License");
|
5
7
|
you may not use this file except in compliance with the License.
|
@@ -18,10 +20,7 @@
|
|
18
20
|
Coding conventions specific to this file:
|
19
21
|
|
20
22
|
1. Functions that fill in a token should be named emit_*, and should be
|
21
|
-
followed immediately by a return from the tokenizer
|
22
|
-
occurred, false if an error occurred). Sometimes the emit functions
|
23
|
-
themselves return a boolean so that they can be combined with the return
|
24
|
-
statement; in this case, they should match this convention.
|
23
|
+
followed immediately by a return from the tokenizer.
|
25
24
|
2. Functions that shuffle data from temporaries to final API structures
|
26
25
|
should be named finish_*, and be called just before the tokenizer exits the
|
27
26
|
state that accumulates the temporary.
|
@@ -60,15 +59,18 @@
|
|
60
59
|
#include "util.h"
|
61
60
|
#include "vector.h"
|
62
61
|
|
63
|
-
// Compared against
|
62
|
+
// Compared against _temporary_buffer to determine if we're in
|
64
63
|
// double-escaped script mode.
|
65
64
|
static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
|
66
65
|
|
67
|
-
// An enum for the return value of each individual state.
|
66
|
+
// An enum for the return value of each individual state. Each of the emit_*
|
67
|
+
// functions should return EMIT_TOKEN and should be called as
|
68
|
+
// return emit_foo(parser, ..., output);
|
69
|
+
// Each of the handle_*_state functions that do not return emit_* should
|
70
|
+
// instead return CONTINUE to indicate to gumbo_lex to continue lexing.
|
68
71
|
typedef enum {
|
69
|
-
|
70
|
-
|
71
|
-
NEXT_CHAR // Proceed to the next character and continue lexing.
|
72
|
+
EMIT_TOKEN,
|
73
|
+
CONTINUE,
|
72
74
|
} StateResult;
|
73
75
|
|
74
76
|
// This is a struct containing state necessary to build up a tag token,
|
@@ -103,12 +105,6 @@ typedef struct GumboInternalTagState {
|
|
103
105
|
// the attribute value, but shouldn't overwrite the existing value.
|
104
106
|
bool _drop_next_attr_value;
|
105
107
|
|
106
|
-
// The state that caused the tokenizer to switch into a character reference in
|
107
|
-
// attribute value state. This is used to set the additional allowed
|
108
|
-
// character, and is switched back to on completion. Initialized as the
|
109
|
-
// tokenizer enters the character reference state.
|
110
|
-
GumboTokenizerEnum _attr_value_state;
|
111
|
-
|
112
108
|
// The last start tag to have been emitted by the tokenizer. This is
|
113
109
|
// necessary to check for appropriate end tags.
|
114
110
|
GumboTag _last_start_tag;
|
@@ -133,10 +129,10 @@ typedef struct GumboInternalTokenizerState {
|
|
133
129
|
// "Reconsume the current input character in..."
|
134
130
|
bool _reconsume_current_input;
|
135
131
|
|
136
|
-
// A flag indicating whether the current node is a foreign element.
|
137
|
-
// set by
|
138
|
-
// markup declaration state.
|
139
|
-
bool
|
132
|
+
// A flag indicating whether the adjusted current node is a foreign element.
|
133
|
+
// This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
|
134
|
+
// checked in the markup declaration state.
|
135
|
+
bool _is_adjusted_current_node_foreign;
|
140
136
|
|
141
137
|
// A flag indicating whether the tokenizer is in a CDATA section. If so, then
|
142
138
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
@@ -159,27 +155,24 @@ typedef struct GumboInternalTokenizerState {
|
|
159
155
|
|
160
156
|
// A temporary buffer to accumulate characters, as described by the "temporary
|
161
157
|
// buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
|
162
|
-
// way:
|
163
|
-
//
|
164
|
-
//
|
165
|
-
//
|
166
|
-
//
|
167
|
-
//
|
168
|
-
// input stream, so that tokens emitted by emit_char have the correct position
|
169
|
-
// and original text.
|
158
|
+
// way: In situations where the spec calls for inserting characters into the
|
159
|
+
// temporary buffer that exactly match the input in order to emit them as
|
160
|
+
// character tokens, we don't actually do it.
|
161
|
+
// Instead, we mark the input and reset the input to it using set_mark() and
|
162
|
+
// emit_from_mark(). We do use the temporary buffer for other uses such as
|
163
|
+
// DOCTYPEs, comments, and detecting escaped <script> tags.
|
170
164
|
GumboStringBuffer _temporary_buffer;
|
171
165
|
|
172
|
-
// The
|
173
|
-
//
|
174
|
-
const char*
|
166
|
+
// The position to resume normal operation after we start emitting from the
|
167
|
+
// mark. NULL whenever we're not emitting from the mark.
|
168
|
+
const char* _resume_pos;
|
169
|
+
|
170
|
+
// The character reference state uses a return state to return to the state
|
171
|
+
// it was invoked from.
|
172
|
+
GumboTokenizerEnum _return_state;
|
175
173
|
|
176
|
-
//
|
177
|
-
|
178
|
-
// buffer for both because we have to flush out "<s" as emits while still
|
179
|
-
// maintaining the context that will eventually become "script". This is a
|
180
|
-
// separate buffer that's used in place of the temporary buffer for states
|
181
|
-
// that may enter the script data double escape start state.
|
182
|
-
GumboStringBuffer _script_data_buffer;
|
174
|
+
// Numeric character reference.
|
175
|
+
uint32_t _character_reference_code;
|
183
176
|
|
184
177
|
// Pointer to the beginning of the current token in the original buffer; used
|
185
178
|
// to record the original text.
|
@@ -201,123 +194,66 @@ typedef struct GumboInternalTokenizerState {
|
|
201
194
|
Utf8Iterator _input;
|
202
195
|
} GumboTokenizerState;
|
203
196
|
|
204
|
-
// Adds
|
197
|
+
// Adds a parse error to the parser's error struct.
|
205
198
|
static void tokenizer_add_parse_error (
|
206
199
|
GumboParser* parser,
|
207
200
|
GumboErrorType type
|
208
201
|
) {
|
202
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
209
203
|
GumboError* error = gumbo_add_error(parser);
|
210
204
|
if (!error) {
|
211
205
|
return;
|
212
206
|
}
|
207
|
+
const Utf8Iterator* input = &tokenizer->_input;
|
208
|
+
utf8iterator_get_position(input, &error->position);
|
209
|
+
error->original_text.data = utf8iterator_get_char_pointer(input);
|
210
|
+
error->original_text.length = utf8iterator_get_width(input);
|
211
|
+
error->type = type;
|
212
|
+
error->v.tokenizer.state = tokenizer->_state;
|
213
|
+
error->v.tokenizer.codepoint = utf8iterator_current(input);
|
214
|
+
}
|
215
|
+
|
216
|
+
// Adds an error pointing at the start of the character reference.
|
217
|
+
static void tokenizer_add_char_ref_error (
|
218
|
+
struct GumboInternalParser* parser,
|
219
|
+
GumboErrorType type,
|
220
|
+
int codepoint
|
221
|
+
) {
|
213
222
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
214
|
-
|
215
|
-
|
223
|
+
GumboError* error = gumbo_add_error(parser);
|
224
|
+
if (!error)
|
225
|
+
return;
|
226
|
+
Utf8Iterator* input = &tokenizer->_input;
|
216
227
|
error->type = type;
|
217
|
-
error->
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
case GUMBO_LEX_SCRIPT:
|
243
|
-
case GUMBO_LEX_SCRIPT_LT:
|
244
|
-
case GUMBO_LEX_SCRIPT_END_TAG_OPEN:
|
245
|
-
case GUMBO_LEX_SCRIPT_END_TAG_NAME:
|
246
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_START:
|
247
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_START_DASH:
|
248
|
-
case GUMBO_LEX_SCRIPT_ESCAPED:
|
249
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_DASH:
|
250
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH:
|
251
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_LT:
|
252
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN:
|
253
|
-
case GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME:
|
254
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START:
|
255
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED:
|
256
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH:
|
257
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH:
|
258
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT:
|
259
|
-
case GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END:
|
260
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SCRIPT;
|
261
|
-
break;
|
262
|
-
case GUMBO_LEX_TAG_OPEN:
|
263
|
-
case GUMBO_LEX_END_TAG_OPEN:
|
264
|
-
case GUMBO_LEX_TAG_NAME:
|
265
|
-
case GUMBO_LEX_BEFORE_ATTR_NAME:
|
266
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_TAG;
|
267
|
-
break;
|
268
|
-
case GUMBO_LEX_SELF_CLOSING_START_TAG:
|
269
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG;
|
270
|
-
break;
|
271
|
-
case GUMBO_LEX_ATTR_NAME:
|
272
|
-
case GUMBO_LEX_AFTER_ATTR_NAME:
|
273
|
-
case GUMBO_LEX_BEFORE_ATTR_VALUE:
|
274
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_NAME;
|
275
|
-
break;
|
276
|
-
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
277
|
-
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
278
|
-
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
279
|
-
case GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED:
|
280
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_ATTR_VALUE;
|
281
|
-
break;
|
282
|
-
case GUMBO_LEX_BOGUS_COMMENT:
|
283
|
-
case GUMBO_LEX_COMMENT_START:
|
284
|
-
case GUMBO_LEX_COMMENT_START_DASH:
|
285
|
-
case GUMBO_LEX_COMMENT:
|
286
|
-
case GUMBO_LEX_COMMENT_END_DASH:
|
287
|
-
case GUMBO_LEX_COMMENT_END:
|
288
|
-
case GUMBO_LEX_COMMENT_END_BANG:
|
289
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_COMMENT;
|
290
|
-
break;
|
291
|
-
case GUMBO_LEX_MARKUP_DECLARATION:
|
292
|
-
case GUMBO_LEX_DOCTYPE:
|
293
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_NAME:
|
294
|
-
case GUMBO_LEX_DOCTYPE_NAME:
|
295
|
-
case GUMBO_LEX_AFTER_DOCTYPE_NAME:
|
296
|
-
case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD:
|
297
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID:
|
298
|
-
case GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED:
|
299
|
-
case GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED:
|
300
|
-
case GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID:
|
301
|
-
case GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID:
|
302
|
-
case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD:
|
303
|
-
case GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID:
|
304
|
-
case GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED:
|
305
|
-
case GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED:
|
306
|
-
case GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID:
|
307
|
-
case GUMBO_LEX_BOGUS_DOCTYPE:
|
308
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_DOCTYPE;
|
309
|
-
break;
|
310
|
-
case GUMBO_LEX_CDATA:
|
311
|
-
error->v.tokenizer.state = GUMBO_ERR_TOKENIZER_CDATA;
|
312
|
-
break;
|
313
|
-
}
|
228
|
+
error->position = utf8iterator_get_mark_position(input);
|
229
|
+
const char* mark = utf8iterator_get_mark_pointer(input);
|
230
|
+
error->original_text.data = mark;
|
231
|
+
error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
|
232
|
+
error->v.tokenizer.state = tokenizer->_state;
|
233
|
+
error->v.tokenizer.codepoint = codepoint;
|
234
|
+
}
|
235
|
+
|
236
|
+
// Adds an error pointing at the start of the token.
|
237
|
+
static void tokenizer_add_token_parse_error (
|
238
|
+
GumboParser* parser,
|
239
|
+
GumboErrorType type
|
240
|
+
) {
|
241
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
242
|
+
GumboError* error = gumbo_add_error(parser);
|
243
|
+
if (!error)
|
244
|
+
return;
|
245
|
+
Utf8Iterator* input = &tokenizer->_input;
|
246
|
+
error->type = type;
|
247
|
+
error->position = tokenizer->_token_start_pos;
|
248
|
+
error->original_text.data = tokenizer->_token_start;
|
249
|
+
error->original_text.length =
|
250
|
+
utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
|
251
|
+
error->v.tokenizer.state = tokenizer->_state;
|
252
|
+
error->v.tokenizer.codepoint = 0;
|
314
253
|
}
|
315
254
|
|
316
255
|
static bool is_alpha(int c) {
|
317
|
-
|
318
|
-
// on the current locale, whereas the behavior in the HTML5 spec is
|
319
|
-
// locale-independent.
|
320
|
-
return ((unsigned) c | 32) - 'a' < 26;
|
256
|
+
return gumbo_ascii_isalpha(c);
|
321
257
|
}
|
322
258
|
|
323
259
|
static int ensure_lowercase(int c) {
|
@@ -347,24 +283,9 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
|
|
347
283
|
}
|
348
284
|
|
349
285
|
// Starts recording characters in the temporary buffer.
|
350
|
-
// Because this needs to reset the utf8iterator_mark to the beginning of the
|
351
|
-
// text that will eventually be emitted, it needs to be called a couple of
|
352
|
-
// states before the spec says "Set the temporary buffer to the empty string".
|
353
|
-
// In general, this should be called whenever there's a transition to a
|
354
|
-
// "less-than sign state". The initial < and possibly / then need to be
|
355
|
-
// appended to the temporary buffer, their presence needs to be accounted for in
|
356
|
-
// states that compare the temporary buffer against a literal value, and
|
357
|
-
// spec stanzas that say "emit a < and / character token along with a character
|
358
|
-
// token for each character in the temporary buffer" need to be adjusted to
|
359
|
-
// account for the presence of the < and / inside the temporary buffer.
|
360
286
|
static void clear_temporary_buffer(GumboParser* parser) {
|
361
287
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
362
|
-
assert(!tokenizer->_temporary_buffer_emit);
|
363
|
-
utf8iterator_mark(&tokenizer->_input);
|
364
288
|
gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
|
365
|
-
// The temporary buffer and script data buffer are the same object in the
|
366
|
-
// spec, so the script data buffer should be cleared as well.
|
367
|
-
gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
|
368
289
|
}
|
369
290
|
|
370
291
|
// Appends a codepoint to the temporary buffer.
|
@@ -378,25 +299,20 @@ static void append_char_to_temporary_buffer (
|
|
378
299
|
);
|
379
300
|
}
|
380
301
|
|
381
|
-
|
382
|
-
|
383
|
-
const
|
384
|
-
const char* text,
|
385
|
-
size_t text_len
|
302
|
+
static void append_string_to_temporary_buffer (
|
303
|
+
GumboParser* parser,
|
304
|
+
const GumboStringPiece* str
|
386
305
|
) {
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
306
|
+
gumbo_string_buffer_append_string (
|
307
|
+
str,
|
308
|
+
&parser->_tokenizer_state->_temporary_buffer
|
309
|
+
);
|
391
310
|
}
|
392
311
|
|
393
|
-
#define temporary_buffer_equals(parser, text) \
|
394
|
-
temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
|
395
312
|
|
396
313
|
static bool temporary_buffer_is_empty(const GumboParser* parser) {
|
397
314
|
return parser->_tokenizer_state->_temporary_buffer.length == 0;
|
398
315
|
}
|
399
|
-
#endif
|
400
316
|
|
401
317
|
static void doc_type_state_init(GumboParser* parser) {
|
402
318
|
GumboTokenDocType* doc_type_state =
|
@@ -493,56 +409,49 @@ static void finish_doctype_system_id(GumboParser* parser) {
|
|
493
409
|
}
|
494
410
|
|
495
411
|
// Writes a single specified character to the output token.
|
496
|
-
static
|
412
|
+
// The token type is chosen by get_char_token_type from the code point and the
// tokenizer's CDATA flag. Always returns EMIT_TOKEN so that callers can write
// `return emit_char(...)`.
static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
  const bool in_cdata = parser->_tokenizer_state->_is_in_cdata;
  output->v.character = c;
  output->type = get_char_token_type(in_cdata, c);
  finish_token(parser, output);
  return EMIT_TOKEN;
}
|
501
418
|
|
502
419
|
// Writes a replacement character token and records a parse error.
|
503
|
-
// Always returns
|
420
|
+
// Always returns EMIT_TOKEN, per gumbo_lex return value.
|
504
421
|
static StateResult emit_replacement_char(
|
505
422
|
GumboParser* parser, GumboToken* output) {
|
506
423
|
// In all cases, this is because of a null byte in the input stream.
|
507
|
-
tokenizer_add_parse_error(parser,
|
424
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
508
425
|
emit_char(parser, kUtf8ReplacementChar, output);
|
509
|
-
return
|
426
|
+
return EMIT_TOKEN;
|
510
427
|
}
|
511
428
|
|
512
|
-
// Writes an EOF character token. Always returns
|
429
|
+
// Writes an EOF character token. Always returns EMIT_TOKEN.
|
513
430
|
static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
|
514
|
-
emit_char(parser, -1, output);
|
515
|
-
return RETURN_SUCCESS;
|
516
|
-
}
|
517
|
-
|
518
|
-
// Writes the current input character out as a character token.
|
519
|
-
// Always returns RETURN_SUCCESS.
|
520
|
-
static bool emit_current_char(GumboParser* parser, GumboToken* output) {
|
521
|
-
emit_char(
|
522
|
-
parser, utf8iterator_current(&parser->_tokenizer_state->_input), output);
|
523
|
-
return RETURN_SUCCESS;
|
431
|
+
return emit_char(parser, -1, output);
|
524
432
|
}
|
525
433
|
|
526
434
|
// Writes out a doctype token, copying it from the tokenizer state.
|
527
|
-
static
|
435
|
+
// Hands the accumulated doctype state over to the output token, finishes the
// token, and then re-initializes the tokenizer's doctype state for the next
// doctype. Always returns EMIT_TOKEN.
static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  output->v.doc_type = tokenizer->_doc_type_state;
  output->type = GUMBO_TOKEN_DOCTYPE;
  finish_token(parser, output);
  doc_type_state_init(parser);
  return EMIT_TOKEN;
}
|
533
442
|
|
534
443
|
// Debug-only function that explicitly sets the attribute vector data to NULL so
|
535
444
|
// it can be asserted on tag creation, verifying that there are no memory leaks.
|
536
445
|
static void mark_tag_state_as_empty(GumboTagState* tag_state) {
|
537
446
|
UNUSED_IF_NDEBUG(tag_state);
|
538
|
-
#ifndef NDEBUG
|
539
447
|
tag_state->_name = NULL;
|
448
|
+
#ifndef NDEBUG
|
540
449
|
tag_state->_attributes = kGumboEmptyVector;
|
541
450
|
#endif
|
542
451
|
}
|
543
452
|
|
544
453
|
// Writes out the current tag as a start or end tag token.
|
545
|
-
// Always returns
|
454
|
+
// Always returns EMIT_TOKEN.
|
546
455
|
static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
547
456
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
548
457
|
if (tag_state->_is_start_tag) {
|
@@ -559,7 +468,10 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
559
468
|
output->type = GUMBO_TOKEN_END_TAG;
|
560
469
|
output->v.end_tag.tag = tag_state->_tag;
|
561
470
|
output->v.end_tag.name = tag_state->_name;
|
562
|
-
|
471
|
+
if (tag_state->_is_self_closing)
|
472
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
|
473
|
+
if (tag_state->_attributes.length > 0)
|
474
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
|
563
475
|
// In end tags, ownership of the attributes vector is not transferred to the
|
564
476
|
// token, but it's still initialized as normal, so it must be manually
|
565
477
|
// deallocated. There may also be attributes to destroy, in certain broken
|
@@ -582,7 +494,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
582
494
|
assert(output->original_text.length >= 2);
|
583
495
|
assert(output->original_text.data[0] == '<');
|
584
496
|
assert(output->original_text.data[output->original_text.length - 1] == '>');
|
585
|
-
return
|
497
|
+
return EMIT_TOKEN;
|
586
498
|
}
|
587
499
|
|
588
500
|
// In some states, we speculatively start a tag, but don't know whether it'll be
|
@@ -600,90 +512,59 @@ static void abandon_current_tag(GumboParser* parser) {
|
|
600
512
|
gumbo_debug("Abandoning current tag.\n");
|
601
513
|
}
|
602
514
|
|
603
|
-
// Wraps the gumbo_consume_char_ref function to handle its output and make the
|
604
|
-
// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
|
605
|
-
// error occurred, RETURN_SUCCESS otherwise.
|
606
|
-
static StateResult emit_char_ref (
|
607
|
-
GumboParser* parser,
|
608
|
-
int additional_allowed_char,
|
609
|
-
bool UNUSED_ARG(is_in_attribute),
|
610
|
-
GumboToken* output
|
611
|
-
) {
|
612
|
-
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
613
|
-
OneOrTwoCodepoints char_ref;
|
614
|
-
bool status = gumbo_consume_char_ref (
|
615
|
-
parser,
|
616
|
-
&tokenizer->_input,
|
617
|
-
additional_allowed_char,
|
618
|
-
false,
|
619
|
-
&char_ref
|
620
|
-
);
|
621
|
-
if (char_ref.first != kGumboNoChar) {
|
622
|
-
// gumbo_consume_char_ref ends with the iterator pointing at the next
|
623
|
-
// character, so we need to be sure not advance it again before
|
624
|
-
// reading the next token.
|
625
|
-
tokenizer->_reconsume_current_input = true;
|
626
|
-
emit_char(parser, char_ref.first, output);
|
627
|
-
tokenizer->_buffered_emit_char = char_ref.second;
|
628
|
-
} else {
|
629
|
-
emit_char(parser, '&', output);
|
630
|
-
}
|
631
|
-
return status ? RETURN_SUCCESS : RETURN_ERROR;
|
632
|
-
}
|
633
|
-
|
634
515
|
// Emits a comment token. Comments use the temporary buffer to accumulate their
|
635
516
|
// data, and then it's copied over and released to the 'text' field of the
|
636
|
-
// GumboToken union. Always returns
|
517
|
+
// GumboToken union. Always returns EMIT_TOKEN.
|
637
518
|
static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
|
638
519
|
output->type = GUMBO_TOKEN_COMMENT;
|
639
520
|
finish_temporary_buffer(parser, &output->v.text);
|
640
521
|
finish_token(parser, output);
|
641
|
-
return
|
522
|
+
return EMIT_TOKEN;
|
642
523
|
}
|
643
524
|
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
525
|
+
// Marks the current position of the tokenizer's input iterator so that the
// input can later be reset to it.
static void set_mark(GumboParser* parser) {
  utf8iterator_mark(&parser->_tokenizer_state->_input);
}
|
529
|
+
|
530
|
+
// Checks to see if we should be emitting characters from the mark, and fills the
|
531
|
+
// output token with the next output character if so.
|
532
|
+
// Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
|
533
|
+
// immediately return, CONTINUE if we should resume normal operation.
|
534
|
+
static StateResult maybe_emit_from_mark (
|
535
|
+
GumboParser* parser,
|
536
|
+
GumboToken* output
|
537
|
+
) {
|
651
538
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
652
|
-
const char*
|
653
|
-
GumboStringBuffer* buffer = &tokenizer->_temporary_buffer;
|
539
|
+
const char* pos = tokenizer->_resume_pos;
|
654
540
|
|
655
|
-
if (!
|
656
|
-
|
657
|
-
|
541
|
+
if (!pos)
|
542
|
+
return CONTINUE;
|
543
|
+
if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
|
544
|
+
tokenizer->_resume_pos = NULL;
|
545
|
+
return CONTINUE;
|
658
546
|
}
|
659
547
|
|
660
|
-
|
661
|
-
//
|
662
|
-
//
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
tokenizer->_reconsume_current_input = saved_reconsume_state;
|
673
|
-
return true;
|
674
|
-
}
|
675
|
-
|
676
|
-
// Sets up the tokenizer to begin flushing the temporary buffer.
|
677
|
-
// This resets the input iterator stream to the start of the last tag, sets up
|
678
|
-
// _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
|
679
|
-
// the first character in it. It returns true if a character was emitted, false
|
680
|
-
// otherwise.
|
681
|
-
static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
548
|
+
// emit_char advances the input stream. _reconsume_current_input should
|
549
|
+
// *never* be set when emitting from the mark since those characters have
|
550
|
+
// already been advanced past.
|
551
|
+
assert(!tokenizer->_reconsume_current_input);
|
552
|
+
return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
|
553
|
+
}
|
554
|
+
|
555
|
+
// Sets up the tokenizer to begin emitting from the mark up to, but not
|
556
|
+
// including, the current code point. This resets the input iterator stream to
|
557
|
+
// the mark, sets up _resume_pos, and then emits the first character in it.
|
558
|
+
// Returns EMIT_TOKEN.
|
559
|
+
static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
|
682
560
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
683
|
-
|
561
|
+
tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
|
684
562
|
utf8iterator_reset(&tokenizer->_input);
|
685
|
-
|
686
|
-
|
563
|
+
// Now that we have reset the input, we need to advance through it.
|
564
|
+
tokenizer->_reconsume_current_input = false;
|
565
|
+
StateResult result = maybe_emit_from_mark(parser, output);
|
566
|
+
assert(result == EMIT_TOKEN);
|
567
|
+
return result;
|
687
568
|
}
|
688
569
|
|
689
570
|
// Appends a codepoint to the current tag buffer. If
|
@@ -703,6 +584,19 @@ static void append_char_to_tag_buffer (
|
|
703
584
|
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
704
585
|
}
|
705
586
|
|
587
|
+
// Like above but append a string.
|
588
|
+
static void append_string_to_tag_buffer (
|
589
|
+
GumboParser* parser,
|
590
|
+
GumboStringPiece* str,
|
591
|
+
bool reinitilize_position_on_first
|
592
|
+
) {
|
593
|
+
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
594
|
+
if (buffer->length == 0 && reinitilize_position_on_first) {
|
595
|
+
reset_tag_buffer_start_point(parser);
|
596
|
+
}
|
597
|
+
gumbo_string_buffer_append_string(str, buffer);
|
598
|
+
}
|
599
|
+
|
706
600
|
// (Re-)initialize the tag buffer. This also resets the original_text pointer
|
707
601
|
// and _start_pos field to point to the current position.
|
708
602
|
static void initialize_tag_buffer(GumboParser* parser) {
|
@@ -713,6 +607,70 @@ static void initialize_tag_buffer(GumboParser* parser) {
|
|
713
607
|
reset_tag_buffer_start_point(parser);
|
714
608
|
}
|
715
609
|
|
610
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
|
611
|
+
static bool character_reference_part_of_attribute(GumboParser* parser) {
|
612
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
613
|
+
switch (tokenizer->_return_state) {
|
614
|
+
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
615
|
+
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
616
|
+
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
617
|
+
return true;
|
618
|
+
default:
|
619
|
+
return false;
|
620
|
+
}
|
621
|
+
}
|
622
|
+
|
623
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
|
624
|
+
// For each code point in the temporary buffer, add to the current attribute
|
625
|
+
// value if the character reference was consumed as part of an attribute or
|
626
|
+
// emit the code point as a character token.
|
627
|
+
static StateResult flush_code_points_consumed_as_character_reference (
|
628
|
+
GumboParser* parser,
|
629
|
+
GumboToken* output
|
630
|
+
) {
|
631
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
632
|
+
if (character_reference_part_of_attribute(parser)) {
|
633
|
+
const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
|
634
|
+
assert(start);
|
635
|
+
GumboStringPiece str = {
|
636
|
+
.data = start,
|
637
|
+
.length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
|
638
|
+
};
|
639
|
+
bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
640
|
+
append_string_to_tag_buffer(parser, &str, unquoted);
|
641
|
+
return CONTINUE;
|
642
|
+
}
|
643
|
+
return emit_from_mark(parser, output);
|
644
|
+
}
|
645
|
+
|
646
|
+
// After a character reference has been successfully constructed, the standard
|
647
|
+
// says to set the temporary buffer equal to the empty string, append the code
|
648
|
+
// point(s) associated with the reference and flush code points consumed as a
|
649
|
+
// character reference.
|
650
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
651
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
652
|
+
// That doesn't work for us because we use the temporary buffer in lock step
|
653
|
+
// with the input for position and that would fail if we inserted a different
|
654
|
+
// number of code points. So duplicate a bit of the above logic.
|
655
|
+
static StateResult flush_char_ref (
|
656
|
+
GumboParser* parser,
|
657
|
+
int first,
|
658
|
+
int second,
|
659
|
+
GumboToken* output
|
660
|
+
) {
|
661
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
662
|
+
if (character_reference_part_of_attribute(parser)) {
|
663
|
+
bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
664
|
+
append_char_to_tag_buffer(parser, first, unquoted);
|
665
|
+
if (second != kGumboNoChar)
|
666
|
+
append_char_to_tag_buffer(parser, second, unquoted);
|
667
|
+
return CONTINUE;
|
668
|
+
}
|
669
|
+
tokenizer->_buffered_emit_char = second;
|
670
|
+
return emit_char(parser, first, output);
|
671
|
+
}
|
672
|
+
|
673
|
+
|
716
674
|
// Initializes the tag_state to start a new tag, keeping track of the opening
|
717
675
|
// positions and original text. Takes a boolean indicating whether this is a
|
718
676
|
// start or end tag.
|
@@ -725,7 +683,6 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
725
683
|
assert(is_alpha(c));
|
726
684
|
|
727
685
|
initialize_tag_buffer(parser);
|
728
|
-
gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
|
729
686
|
|
730
687
|
assert(tag_state->_name == NULL);
|
731
688
|
assert(tag_state->_attributes.data == NULL);
|
@@ -765,7 +722,10 @@ static void copy_over_original_tag_text (
|
|
765
722
|
original_text->data = tag_state->_original_text;
|
766
723
|
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
767
724
|
tag_state->_original_text;
|
768
|
-
if (
|
725
|
+
if (
|
726
|
+
original_text->length
|
727
|
+
&& original_text->data[original_text->length - 1] == '\r'
|
728
|
+
) {
|
769
729
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
770
730
|
// appended to the end of original text even when it's really the first part
|
771
731
|
// of the next character. If we detect this situation, shrink the length of
|
@@ -801,40 +761,45 @@ static void finish_tag_name(GumboParser* parser) {
|
|
801
761
|
}
|
802
762
|
|
803
763
|
// Adds a GUMBO_ERR_DUPLICATE_ATTRIBUTE parse error to the parser's error struct.
|
804
|
-
static void add_duplicate_attr_error
|
805
|
-
|
806
|
-
int original_index,
|
807
|
-
int new_index
|
808
|
-
) {
|
764
|
+
static void add_duplicate_attr_error(GumboParser* parser) {
|
765
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
809
766
|
GumboError* error = gumbo_add_error(parser);
|
810
767
|
if (!error) {
|
811
768
|
return;
|
812
769
|
}
|
813
770
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
814
|
-
error->type =
|
771
|
+
error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
|
815
772
|
error->position = tag_state->_start_pos;
|
816
|
-
error->original_text = tag_state->_original_text;
|
817
|
-
error->
|
818
|
-
|
819
|
-
|
820
|
-
reinitialize_tag_buffer(parser);
|
773
|
+
error->original_text.data = tag_state->_original_text;
|
774
|
+
error->original_text.length =
|
775
|
+
utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
|
776
|
+
error->v.tokenizer.state = tokenizer->_state;
|
821
777
|
}
|
822
778
|
|
823
779
|
// Creates a new attribute in the current tag, copying the current tag buffer to
|
824
780
|
// the attribute's name. The attribute's value starts out as the empty string
|
825
781
|
// (following the "Boolean attributes" section of the spec) and is only
|
826
782
|
// overwritten on finish_attribute_value(). If the attribute has already been
|
827
|
-
// specified, the new attribute is dropped
|
828
|
-
|
829
|
-
static bool finish_attribute_name(GumboParser* parser) {
|
783
|
+
// specified, the new attribute is dropped and a parse error is added.
|
784
|
+
static void finish_attribute_name(GumboParser* parser) {
|
830
785
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
831
786
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
787
|
+
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
788
|
+
|
789
|
+
int max_attributes = parser->_options->max_attributes;
|
790
|
+
if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
|
791
|
+
parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
|
792
|
+
gumbo_debug("Attributes limit exceeded.\n");
|
793
|
+
reinitialize_tag_buffer(parser);
|
794
|
+
tag_state->_drop_next_attr_value = true;
|
795
|
+
return;
|
796
|
+
}
|
797
|
+
|
832
798
|
// May've been set by a previous attribute without a value; reset it here.
|
833
799
|
tag_state->_drop_next_attr_value = false;
|
834
800
|
assert(tag_state->_attributes.data);
|
835
801
|
assert(tag_state->_attributes.capacity);
|
836
802
|
|
837
|
-
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
838
803
|
for (unsigned int i = 0; i < attributes->length; ++i) {
|
839
804
|
GumboAttribute* attr = attributes->data[i];
|
840
805
|
if (
|
@@ -846,9 +811,10 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
846
811
|
)
|
847
812
|
) {
|
848
813
|
// Identical attribute; bail.
|
849
|
-
add_duplicate_attr_error(parser
|
814
|
+
add_duplicate_attr_error(parser);
|
815
|
+
reinitialize_tag_buffer(parser);
|
850
816
|
tag_state->_drop_next_attr_value = true;
|
851
|
-
return
|
817
|
+
return;
|
852
818
|
}
|
853
819
|
}
|
854
820
|
|
@@ -870,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
870
836
|
);
|
871
837
|
gumbo_vector_add(attr, attributes);
|
872
838
|
reinitialize_tag_buffer(parser);
|
873
|
-
return true;
|
874
839
|
}
|
875
840
|
|
876
841
|
// Finishes an attribute value. This sets the value of the most recently added
|
@@ -911,22 +876,23 @@ void gumbo_tokenizer_state_init (
|
|
911
876
|
GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
|
912
877
|
parser->_tokenizer_state = tokenizer;
|
913
878
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
879
|
+
tokenizer->_return_state = GUMBO_LEX_DATA;
|
880
|
+
tokenizer->_character_reference_code = 0;
|
914
881
|
tokenizer->_reconsume_current_input = false;
|
915
|
-
tokenizer->
|
882
|
+
tokenizer->_is_adjusted_current_node_foreign = false;
|
916
883
|
tokenizer->_is_in_cdata = false;
|
917
884
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
918
885
|
tokenizer->_tag_state._name = NULL;
|
919
886
|
|
920
887
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
921
888
|
gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
|
922
|
-
tokenizer->
|
889
|
+
tokenizer->_resume_pos = NULL;
|
923
890
|
|
924
891
|
mark_tag_state_as_empty(&tokenizer->_tag_state);
|
925
892
|
|
926
|
-
gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
|
927
|
-
tokenizer->_token_start = text;
|
928
893
|
utf8iterator_init(parser, text, text_length, &tokenizer->_input);
|
929
894
|
utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
|
895
|
+
tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
|
930
896
|
doc_type_state_init(parser);
|
931
897
|
}
|
932
898
|
|
@@ -936,7 +902,6 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
|
|
936
902
|
assert(tokenizer->_doc_type_state.public_identifier == NULL);
|
937
903
|
assert(tokenizer->_doc_type_state.system_identifier == NULL);
|
938
904
|
gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
|
939
|
-
gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
|
940
905
|
assert(tokenizer->_tag_state._name == NULL);
|
941
906
|
assert(tokenizer->_tag_state._attributes.data == NULL);
|
942
907
|
gumbo_free(tokenizer);
|
@@ -946,17 +911,23 @@ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
|
|
946
911
|
parser->_tokenizer_state->_state = state;
|
947
912
|
}
|
948
913
|
|
949
|
-
void
|
914
|
+
void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
950
915
|
GumboParser* parser,
|
951
916
|
bool is_foreign
|
952
917
|
) {
|
953
|
-
if (is_foreign != parser->_tokenizer_state->
|
918
|
+
if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
|
954
919
|
gumbo_debug (
|
955
920
|
"Toggling is_current_node_foreign to %s.\n",
|
956
921
|
is_foreign ? "true" : "false"
|
957
922
|
);
|
958
923
|
}
|
959
|
-
parser->_tokenizer_state->
|
924
|
+
parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
|
925
|
+
}
|
926
|
+
|
927
|
+
static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
|
928
|
+
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
929
|
+
tokenizer->_reconsume_current_input = true;
|
930
|
+
tokenizer->_state = state;
|
960
931
|
}
|
961
932
|
|
962
933
|
// https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
@@ -968,37 +939,24 @@ static StateResult handle_data_state (
|
|
968
939
|
) {
|
969
940
|
switch (c) {
|
970
941
|
case '&':
|
971
|
-
gumbo_tokenizer_set_state(parser,
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
tokenizer->_reconsume_current_input = true;
|
976
|
-
return NEXT_CHAR;
|
942
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
943
|
+
set_mark(parser);
|
944
|
+
tokenizer->_return_state = GUMBO_LEX_DATA;
|
945
|
+
return CONTINUE;
|
977
946
|
case '<':
|
978
947
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
|
979
|
-
|
980
|
-
|
981
|
-
return NEXT_CHAR;
|
948
|
+
set_mark(parser);
|
949
|
+
return CONTINUE;
|
982
950
|
case '\0':
|
983
|
-
tokenizer_add_parse_error(parser,
|
984
|
-
emit_char(parser, c, output);
|
985
|
-
|
951
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
952
|
+
return emit_char(parser, c, output);
|
953
|
+
case -1:
|
954
|
+
return emit_eof(parser, output);
|
986
955
|
default:
|
987
|
-
return
|
956
|
+
return emit_char(parser, c, output);
|
988
957
|
}
|
989
958
|
}
|
990
959
|
|
991
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
|
992
|
-
static StateResult handle_char_ref_in_data_state (
|
993
|
-
GumboParser* parser,
|
994
|
-
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
995
|
-
int UNUSED_ARG(c),
|
996
|
-
GumboToken* output
|
997
|
-
) {
|
998
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
999
|
-
return emit_char_ref(parser, ' ', false, output);
|
1000
|
-
}
|
1001
|
-
|
1002
960
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
1003
961
|
static StateResult handle_rcdata_state (
|
1004
962
|
GumboParser* parser,
|
@@ -1008,34 +966,23 @@ static StateResult handle_rcdata_state (
|
|
1008
966
|
) {
|
1009
967
|
switch (c) {
|
1010
968
|
case '&':
|
1011
|
-
gumbo_tokenizer_set_state(parser,
|
1012
|
-
|
1013
|
-
|
969
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
970
|
+
set_mark(parser);
|
971
|
+
tokenizer->_return_state = GUMBO_LEX_RCDATA;
|
972
|
+
return CONTINUE;
|
1014
973
|
case '<':
|
1015
974
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
|
1016
|
-
|
1017
|
-
|
1018
|
-
return NEXT_CHAR;
|
975
|
+
set_mark(parser);
|
976
|
+
return CONTINUE;
|
1019
977
|
case '\0':
|
1020
978
|
return emit_replacement_char(parser, output);
|
1021
979
|
case -1:
|
1022
980
|
return emit_eof(parser, output);
|
1023
981
|
default:
|
1024
|
-
return
|
982
|
+
return emit_char(parser, c, output);
|
1025
983
|
}
|
1026
984
|
}
|
1027
985
|
|
1028
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
|
1029
|
-
static StateResult handle_char_ref_in_rcdata_state (
|
1030
|
-
GumboParser* parser,
|
1031
|
-
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1032
|
-
int UNUSED_ARG(c),
|
1033
|
-
GumboToken* output
|
1034
|
-
) {
|
1035
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1036
|
-
return emit_char_ref(parser, ' ', false, output);
|
1037
|
-
}
|
1038
|
-
|
1039
986
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
1040
987
|
static StateResult handle_rawtext_state (
|
1041
988
|
GumboParser* parser,
|
@@ -1046,20 +993,19 @@ static StateResult handle_rawtext_state (
|
|
1046
993
|
switch (c) {
|
1047
994
|
case '<':
|
1048
995
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
|
1049
|
-
|
1050
|
-
|
1051
|
-
return NEXT_CHAR;
|
996
|
+
set_mark(parser);
|
997
|
+
return CONTINUE;
|
1052
998
|
case '\0':
|
1053
999
|
return emit_replacement_char(parser, output);
|
1054
1000
|
case -1:
|
1055
1001
|
return emit_eof(parser, output);
|
1056
1002
|
default:
|
1057
|
-
return
|
1003
|
+
return emit_char(parser, c, output);
|
1058
1004
|
}
|
1059
1005
|
}
|
1060
1006
|
|
1061
1007
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
1062
|
-
static StateResult
|
1008
|
+
static StateResult handle_script_data_state (
|
1063
1009
|
GumboParser* parser,
|
1064
1010
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1065
1011
|
int c,
|
@@ -1067,16 +1013,15 @@ static StateResult handle_script_state (
|
|
1067
1013
|
) {
|
1068
1014
|
switch (c) {
|
1069
1015
|
case '<':
|
1070
|
-
gumbo_tokenizer_set_state(parser,
|
1071
|
-
|
1072
|
-
|
1073
|
-
return NEXT_CHAR;
|
1016
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
|
1017
|
+
set_mark(parser);
|
1018
|
+
return CONTINUE;
|
1074
1019
|
case '\0':
|
1075
1020
|
return emit_replacement_char(parser, output);
|
1076
1021
|
case -1:
|
1077
1022
|
return emit_eof(parser, output);
|
1078
1023
|
default:
|
1079
|
-
return
|
1024
|
+
return emit_char(parser, c, output);
|
1080
1025
|
}
|
1081
1026
|
}
|
1082
1027
|
|
@@ -1093,75 +1038,75 @@ static StateResult handle_plaintext_state (
|
|
1093
1038
|
case -1:
|
1094
1039
|
return emit_eof(parser, output);
|
1095
1040
|
default:
|
1096
|
-
return
|
1041
|
+
return emit_char(parser, c, output);
|
1097
1042
|
}
|
1098
1043
|
}
|
1099
1044
|
|
1100
1045
|
// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
1101
1046
|
static StateResult handle_tag_open_state (
|
1102
1047
|
GumboParser* parser,
|
1103
|
-
GumboTokenizerState*
|
1048
|
+
GumboTokenizerState* tokenizer,
|
1104
1049
|
int c,
|
1105
1050
|
GumboToken* output
|
1106
1051
|
) {
|
1107
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1108
1052
|
switch (c) {
|
1109
1053
|
case '!':
|
1110
|
-
gumbo_tokenizer_set_state(parser,
|
1054
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
|
1111
1055
|
clear_temporary_buffer(parser);
|
1112
|
-
return
|
1056
|
+
return CONTINUE;
|
1113
1057
|
case '/':
|
1114
1058
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
|
1115
|
-
|
1116
|
-
return NEXT_CHAR;
|
1059
|
+
return CONTINUE;
|
1117
1060
|
case '?':
|
1118
|
-
|
1061
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
|
1119
1062
|
clear_temporary_buffer(parser);
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1063
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
1064
|
+
return CONTINUE;
|
1065
|
+
case -1:
|
1066
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
|
1067
|
+
// Switch to data to emit EOF.
|
1068
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1069
|
+
return emit_from_mark(parser, output);
|
1123
1070
|
default:
|
1124
1071
|
if (is_alpha(c)) {
|
1125
|
-
|
1072
|
+
reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
|
1126
1073
|
start_new_tag(parser, true);
|
1127
|
-
return
|
1128
|
-
} else {
|
1129
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_TAG_INVALID);
|
1130
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1131
|
-
emit_temporary_buffer(parser, output);
|
1132
|
-
return RETURN_ERROR;
|
1074
|
+
return CONTINUE;
|
1133
1075
|
}
|
1076
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
|
1077
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1078
|
+
return emit_from_mark(parser, output);
|
1134
1079
|
}
|
1135
1080
|
}
|
1136
1081
|
|
1137
1082
|
// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
1138
1083
|
static StateResult handle_end_tag_open_state (
|
1139
1084
|
GumboParser* parser,
|
1140
|
-
GumboTokenizerState*
|
1085
|
+
GumboTokenizerState* tokenizer,
|
1141
1086
|
int c,
|
1142
1087
|
GumboToken* output
|
1143
1088
|
) {
|
1144
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1145
1089
|
switch (c) {
|
1146
1090
|
case '>':
|
1147
|
-
tokenizer_add_parse_error(parser,
|
1091
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
|
1148
1092
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1149
|
-
return
|
1093
|
+
return CONTINUE;
|
1150
1094
|
case -1:
|
1151
|
-
tokenizer_add_parse_error(parser,
|
1152
|
-
|
1153
|
-
|
1095
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
|
1096
|
+
// Similar to the tag open state except we need to emit '<' and '/'
|
1097
|
+
// before the EOF.
|
1098
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
1099
|
+
return emit_from_mark(parser, output);
|
1154
1100
|
default:
|
1155
1101
|
if (is_alpha(c)) {
|
1156
|
-
|
1102
|
+
reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
|
1157
1103
|
start_new_tag(parser, false);
|
1158
1104
|
} else {
|
1159
|
-
tokenizer_add_parse_error(parser,
|
1160
|
-
|
1105
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
|
1106
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
1161
1107
|
clear_temporary_buffer(parser);
|
1162
|
-
append_char_to_temporary_buffer(parser, c);
|
1163
1108
|
}
|
1164
|
-
return
|
1109
|
+
return CONTINUE;
|
1165
1110
|
}
|
1166
1111
|
}
|
1167
1112
|
|
@@ -1179,27 +1124,26 @@ static StateResult handle_tag_name_state (
|
|
1179
1124
|
case ' ':
|
1180
1125
|
finish_tag_name(parser);
|
1181
1126
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1182
|
-
return
|
1127
|
+
return CONTINUE;
|
1183
1128
|
case '/':
|
1184
1129
|
finish_tag_name(parser);
|
1185
1130
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1186
|
-
return
|
1131
|
+
return CONTINUE;
|
1187
1132
|
case '>':
|
1188
1133
|
finish_tag_name(parser);
|
1189
1134
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1190
1135
|
return emit_current_tag(parser, output);
|
1191
1136
|
case '\0':
|
1192
|
-
tokenizer_add_parse_error(parser,
|
1137
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
1193
1138
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1194
|
-
return
|
1139
|
+
return CONTINUE;
|
1195
1140
|
case -1:
|
1196
|
-
tokenizer_add_parse_error(parser,
|
1141
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
1197
1142
|
abandon_current_tag(parser);
|
1198
|
-
|
1199
|
-
return NEXT_CHAR;
|
1143
|
+
return emit_eof(parser, output);
|
1200
1144
|
default:
|
1201
1145
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1202
|
-
return
|
1146
|
+
return CONTINUE;
|
1203
1147
|
}
|
1204
1148
|
}
|
1205
1149
|
|
@@ -1210,36 +1154,29 @@ static StateResult handle_rcdata_lt_state (
|
|
1210
1154
|
int c,
|
1211
1155
|
GumboToken* output
|
1212
1156
|
) {
|
1213
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1214
1157
|
if (c == '/') {
|
1215
1158
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
|
1216
|
-
|
1217
|
-
return NEXT_CHAR;
|
1159
|
+
return CONTINUE;
|
1218
1160
|
} else {
|
1219
|
-
|
1220
|
-
|
1221
|
-
return emit_temporary_buffer(parser, output);
|
1161
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1162
|
+
return emit_from_mark(parser, output);
|
1222
1163
|
}
|
1223
1164
|
}
|
1224
1165
|
|
1225
1166
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
1226
1167
|
static StateResult handle_rcdata_end_tag_open_state (
|
1227
1168
|
GumboParser* parser,
|
1228
|
-
GumboTokenizerState*
|
1169
|
+
GumboTokenizerState* tokenizer,
|
1229
1170
|
int c,
|
1230
1171
|
GumboToken* output
|
1231
1172
|
) {
|
1232
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1233
1173
|
if (is_alpha(c)) {
|
1234
|
-
|
1174
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
|
1235
1175
|
start_new_tag(parser, false);
|
1236
|
-
|
1237
|
-
return NEXT_CHAR;
|
1238
|
-
} else {
|
1239
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1240
|
-
return emit_temporary_buffer(parser, output);
|
1176
|
+
return CONTINUE;
|
1241
1177
|
}
|
1242
|
-
|
1178
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1179
|
+
return emit_from_mark(parser, output);
|
1243
1180
|
}
|
1244
1181
|
|
1245
1182
|
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
@@ -1250,33 +1187,39 @@ static StateResult handle_rcdata_end_tag_name_state (
|
|
1250
1187
|
GumboToken* output
|
1251
1188
|
) {
|
1252
1189
|
UNUSED_IF_NDEBUG(tokenizer);
|
1253
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1254
1190
|
if (is_alpha(c)) {
|
1255
1191
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1192
|
+
return CONTINUE;
|
1193
|
+
}
|
1194
|
+
switch (c) {
|
1195
|
+
case '\t':
|
1196
|
+
case '\n':
|
1197
|
+
case '\f':
|
1198
|
+
case ' ':
|
1199
|
+
if (is_appropriate_end_tag(parser)) {
|
1200
|
+
finish_tag_name(parser);
|
1201
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1202
|
+
return CONTINUE;
|
1203
|
+
}
|
1204
|
+
break;
|
1205
|
+
case '/':
|
1206
|
+
if (is_appropriate_end_tag(parser)) {
|
1207
|
+
finish_tag_name(parser);
|
1208
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1209
|
+
return CONTINUE;
|
1210
|
+
}
|
1211
|
+
break;
|
1212
|
+
case '>':
|
1213
|
+
if (is_appropriate_end_tag(parser)) {
|
1214
|
+
finish_tag_name(parser);
|
1215
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1216
|
+
return emit_current_tag(parser, output);
|
1275
1217
|
}
|
1218
|
+
break;
|
1276
1219
|
}
|
1277
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
1278
1220
|
abandon_current_tag(parser);
|
1279
|
-
|
1221
|
+
reconsume_in_state(parser, GUMBO_LEX_RCDATA);
|
1222
|
+
return emit_from_mark(parser, output);
|
1280
1223
|
}
|
1281
1224
|
|
1282
1225
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
@@ -1286,34 +1229,29 @@ static StateResult handle_rawtext_lt_state (
|
|
1286
1229
|
int c,
|
1287
1230
|
GumboToken* output
|
1288
1231
|
) {
|
1289
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1290
1232
|
if (c == '/') {
|
1291
1233
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
|
1292
|
-
|
1293
|
-
return NEXT_CHAR;
|
1234
|
+
return CONTINUE;
|
1294
1235
|
} else {
|
1295
|
-
|
1296
|
-
|
1297
|
-
return emit_temporary_buffer(parser, output);
|
1236
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1237
|
+
return emit_from_mark(parser, output);
|
1298
1238
|
}
|
1299
1239
|
}
|
1300
1240
|
|
1301
1241
|
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
1302
1242
|
static StateResult handle_rawtext_end_tag_open_state (
|
1303
1243
|
GumboParser* parser,
|
1304
|
-
GumboTokenizerState*
|
1244
|
+
GumboTokenizerState* tokenizer,
|
1305
1245
|
int c,
|
1306
1246
|
GumboToken* output
|
1307
1247
|
) {
|
1308
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1309
1248
|
if (is_alpha(c)) {
|
1310
|
-
|
1249
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
|
1311
1250
|
start_new_tag(parser, false);
|
1312
|
-
|
1313
|
-
return NEXT_CHAR;
|
1251
|
+
return CONTINUE;
|
1314
1252
|
} else {
|
1315
|
-
|
1316
|
-
return
|
1253
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1254
|
+
return emit_from_mark(parser, output);
|
1317
1255
|
}
|
1318
1256
|
}
|
1319
1257
|
|
@@ -1324,153 +1262,156 @@ static StateResult handle_rawtext_end_tag_name_state (
|
|
1324
1262
|
int c,
|
1325
1263
|
GumboToken* output
|
1326
1264
|
) {
|
1327
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1328
|
-
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
|
1329
|
-
tokenizer->_tag_state._buffer.data);
|
1330
1265
|
if (is_alpha(c)) {
|
1331
1266
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1332
|
-
|
1333
|
-
|
1334
|
-
|
1335
|
-
|
1336
|
-
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1340
|
-
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1348
|
-
|
1349
|
-
|
1350
|
-
|
1351
|
-
|
1267
|
+
return CONTINUE;
|
1268
|
+
}
|
1269
|
+
switch (c) {
|
1270
|
+
case '\t':
|
1271
|
+
case '\n':
|
1272
|
+
case '\f':
|
1273
|
+
case ' ':
|
1274
|
+
if (is_appropriate_end_tag(parser)) {
|
1275
|
+
finish_tag_name(parser);
|
1276
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1277
|
+
return CONTINUE;
|
1278
|
+
}
|
1279
|
+
break;
|
1280
|
+
case '/':
|
1281
|
+
if (is_appropriate_end_tag(parser)) {
|
1282
|
+
finish_tag_name(parser);
|
1283
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1284
|
+
return CONTINUE;
|
1285
|
+
}
|
1286
|
+
break;
|
1287
|
+
case '>':
|
1288
|
+
if (is_appropriate_end_tag(parser)) {
|
1289
|
+
finish_tag_name(parser);
|
1290
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1291
|
+
return emit_current_tag(parser, output);
|
1352
1292
|
}
|
1293
|
+
break;
|
1353
1294
|
}
|
1354
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
1355
1295
|
abandon_current_tag(parser);
|
1356
|
-
|
1296
|
+
reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
|
1297
|
+
return emit_from_mark(parser, output);
|
1357
1298
|
}
|
1358
1299
|
|
1359
1300
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
1360
|
-
static StateResult
|
1301
|
+
static StateResult handle_script_data_lt_state (
|
1361
1302
|
GumboParser* parser,
|
1362
1303
|
GumboTokenizerState* tokenizer,
|
1363
1304
|
int c,
|
1364
1305
|
GumboToken* output
|
1365
1306
|
) {
|
1366
|
-
assert(temporary_buffer_equals(parser, "<"));
|
1367
1307
|
if (c == '/') {
|
1368
|
-
gumbo_tokenizer_set_state(parser,
|
1369
|
-
|
1370
|
-
|
1371
|
-
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1375
|
-
|
1376
|
-
|
1377
|
-
|
1378
|
-
return
|
1308
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
|
1309
|
+
return CONTINUE;
|
1310
|
+
}
|
1311
|
+
if (c == '!') {
|
1312
|
+
// This is the only place we don't reconsume the input before emitting the
|
1313
|
+
// temporary buffer. Since the current position is stored and the current
|
1314
|
+
// character is not emitted, we need to advance the input and then
|
1315
|
+
// reconsume.
|
1316
|
+
utf8iterator_next(&tokenizer->_input);
|
1317
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
|
1318
|
+
return emit_from_mark(parser, output);
|
1379
1319
|
}
|
1320
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1321
|
+
return emit_from_mark(parser, output);
|
1380
1322
|
}
|
1381
1323
|
|
1382
1324
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
1383
|
-
static StateResult
|
1325
|
+
static StateResult handle_script_data_end_tag_open_state (
|
1384
1326
|
GumboParser* parser,
|
1385
|
-
GumboTokenizerState*
|
1327
|
+
GumboTokenizerState* tokenizer,
|
1386
1328
|
int c,
|
1387
1329
|
GumboToken* output
|
1388
1330
|
) {
|
1389
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1390
1331
|
if (is_alpha(c)) {
|
1391
|
-
|
1332
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
|
1392
1333
|
start_new_tag(parser, false);
|
1393
|
-
|
1394
|
-
return NEXT_CHAR;
|
1395
|
-
} else {
|
1396
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1397
|
-
return emit_temporary_buffer(parser, output);
|
1334
|
+
return CONTINUE;
|
1398
1335
|
}
|
1336
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1337
|
+
return emit_from_mark(parser, output);
|
1399
1338
|
}
|
1400
1339
|
|
1401
1340
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
1402
|
-
static StateResult
|
1341
|
+
static StateResult handle_script_data_end_tag_name_state (
|
1403
1342
|
GumboParser* parser,
|
1404
1343
|
GumboTokenizerState* tokenizer,
|
1405
1344
|
int c,
|
1406
1345
|
GumboToken* output
|
1407
1346
|
) {
|
1408
|
-
UNUSED_IF_NDEBUG(tokenizer);
|
1409
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1410
1347
|
if (is_alpha(c)) {
|
1411
1348
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1418
|
-
|
1419
|
-
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
1425
|
-
|
1426
|
-
|
1427
|
-
|
1428
|
-
|
1429
|
-
|
1430
|
-
|
1349
|
+
return CONTINUE;
|
1350
|
+
}
|
1351
|
+
switch (c) {
|
1352
|
+
case '\t':
|
1353
|
+
case '\n':
|
1354
|
+
case '\f':
|
1355
|
+
case ' ':
|
1356
|
+
if (is_appropriate_end_tag(parser)) {
|
1357
|
+
finish_tag_name(parser);
|
1358
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1359
|
+
return CONTINUE;
|
1360
|
+
}
|
1361
|
+
break;
|
1362
|
+
case '/':
|
1363
|
+
if (is_appropriate_end_tag(parser)) {
|
1364
|
+
finish_tag_name(parser);
|
1365
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1366
|
+
return CONTINUE;
|
1367
|
+
}
|
1368
|
+
break;
|
1369
|
+
case '>':
|
1370
|
+
if (is_appropriate_end_tag(parser)) {
|
1371
|
+
finish_tag_name(parser);
|
1372
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1373
|
+
return emit_current_tag(parser, output);
|
1431
1374
|
}
|
1375
|
+
break;
|
1432
1376
|
}
|
1433
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1434
1377
|
abandon_current_tag(parser);
|
1435
|
-
|
1378
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1379
|
+
return emit_from_mark(parser, output);
|
1436
1380
|
}
|
1437
1381
|
|
1438
1382
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
1439
|
-
static StateResult
|
1383
|
+
static StateResult handle_script_data_escaped_start_state (
|
1440
1384
|
GumboParser* parser,
|
1441
1385
|
GumboTokenizerState* tokenizer,
|
1442
1386
|
int c,
|
1443
1387
|
GumboToken* output
|
1444
1388
|
) {
|
1445
1389
|
if (c == '-') {
|
1446
|
-
gumbo_tokenizer_set_state(parser,
|
1447
|
-
return
|
1448
|
-
} else {
|
1449
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
1450
|
-
tokenizer->_reconsume_current_input = true;
|
1451
|
-
return NEXT_CHAR;
|
1390
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
|
1391
|
+
return emit_char(parser, c, output);
|
1452
1392
|
}
|
1393
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1394
|
+
return CONTINUE;
|
1453
1395
|
}
|
1454
1396
|
|
1455
1397
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
1456
|
-
static StateResult
|
1398
|
+
static StateResult handle_script_data_escaped_start_dash_state (
|
1457
1399
|
GumboParser* parser,
|
1458
1400
|
GumboTokenizerState* tokenizer,
|
1459
1401
|
int c,
|
1460
1402
|
GumboToken* output
|
1461
1403
|
) {
|
1462
1404
|
if (c == '-') {
|
1463
|
-
gumbo_tokenizer_set_state(parser,
|
1464
|
-
return
|
1405
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
|
1406
|
+
return emit_char(parser, c, output);
|
1465
1407
|
} else {
|
1466
|
-
|
1467
|
-
|
1468
|
-
return NEXT_CHAR;
|
1408
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1409
|
+
return CONTINUE;
|
1469
1410
|
}
|
1470
1411
|
}
|
1471
1412
|
|
1472
1413
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
1473
|
-
static StateResult
|
1414
|
+
static StateResult handle_script_data_escaped_state (
|
1474
1415
|
GumboParser* parser,
|
1475
1416
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1476
1417
|
int c,
|
@@ -1478,25 +1419,25 @@ static StateResult handle_script_escaped_state (
|
|
1478
1419
|
) {
|
1479
1420
|
switch (c) {
|
1480
1421
|
case '-':
|
1481
|
-
gumbo_tokenizer_set_state(parser,
|
1482
|
-
return
|
1422
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
|
1423
|
+
return emit_char(parser, c, output);
|
1483
1424
|
case '<':
|
1484
|
-
gumbo_tokenizer_set_state(parser,
|
1425
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1485
1426
|
clear_temporary_buffer(parser);
|
1486
|
-
|
1487
|
-
return
|
1427
|
+
set_mark(parser);
|
1428
|
+
return CONTINUE;
|
1488
1429
|
case '\0':
|
1489
1430
|
return emit_replacement_char(parser, output);
|
1490
1431
|
case -1:
|
1491
|
-
tokenizer_add_parse_error(parser,
|
1432
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1492
1433
|
return emit_eof(parser, output);
|
1493
1434
|
default:
|
1494
|
-
return
|
1435
|
+
return emit_char(parser, c, output);
|
1495
1436
|
}
|
1496
1437
|
}
|
1497
1438
|
|
1498
1439
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
1499
|
-
static StateResult
|
1440
|
+
static StateResult handle_script_data_escaped_dash_state (
|
1500
1441
|
GumboParser* parser,
|
1501
1442
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1502
1443
|
int c,
|
@@ -1504,28 +1445,27 @@ static StateResult handle_script_escaped_dash_state (
|
|
1504
1445
|
) {
|
1505
1446
|
switch (c) {
|
1506
1447
|
case '-':
|
1507
|
-
gumbo_tokenizer_set_state(parser,
|
1508
|
-
return
|
1448
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
|
1449
|
+
return emit_char(parser, c, output);
|
1509
1450
|
case '<':
|
1510
|
-
gumbo_tokenizer_set_state(parser,
|
1451
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1511
1452
|
clear_temporary_buffer(parser);
|
1512
|
-
|
1513
|
-
return
|
1453
|
+
set_mark(parser);
|
1454
|
+
return CONTINUE;
|
1514
1455
|
case '\0':
|
1515
|
-
gumbo_tokenizer_set_state(parser,
|
1456
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1516
1457
|
return emit_replacement_char(parser, output);
|
1517
1458
|
case -1:
|
1518
|
-
tokenizer_add_parse_error(parser,
|
1519
|
-
|
1520
|
-
return NEXT_CHAR;
|
1459
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1460
|
+
return emit_eof(parser, output);
|
1521
1461
|
default:
|
1522
|
-
gumbo_tokenizer_set_state(parser,
|
1523
|
-
return
|
1462
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1463
|
+
return emit_char(parser, c, output);
|
1524
1464
|
}
|
1525
1465
|
}
|
1526
1466
|
|
1527
1467
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
1528
|
-
static StateResult
|
1468
|
+
static StateResult handle_script_data_escaped_dash_dash_state (
|
1529
1469
|
GumboParser* parser,
|
1530
1470
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1531
1471
|
int c,
|
@@ -1533,113 +1473,107 @@ static StateResult handle_script_escaped_dash_dash_state (
|
|
1533
1473
|
) {
|
1534
1474
|
switch (c) {
|
1535
1475
|
case '-':
|
1536
|
-
return
|
1476
|
+
return emit_char(parser, c, output);
|
1537
1477
|
case '<':
|
1538
|
-
gumbo_tokenizer_set_state(parser,
|
1478
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
|
1539
1479
|
clear_temporary_buffer(parser);
|
1540
|
-
|
1541
|
-
return
|
1480
|
+
set_mark(parser);
|
1481
|
+
return CONTINUE;
|
1542
1482
|
case '>':
|
1543
|
-
gumbo_tokenizer_set_state(parser,
|
1544
|
-
return
|
1483
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1484
|
+
return emit_char(parser, c, output);
|
1545
1485
|
case '\0':
|
1546
|
-
gumbo_tokenizer_set_state(parser,
|
1486
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1547
1487
|
return emit_replacement_char(parser, output);
|
1548
1488
|
case -1:
|
1549
|
-
tokenizer_add_parse_error(parser,
|
1550
|
-
|
1551
|
-
return NEXT_CHAR;
|
1489
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1490
|
+
return emit_eof(parser, output);
|
1552
1491
|
default:
|
1553
|
-
gumbo_tokenizer_set_state(parser,
|
1554
|
-
return
|
1492
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1493
|
+
return emit_char(parser, c, output);
|
1555
1494
|
}
|
1556
1495
|
}
|
1557
1496
|
|
1558
1497
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
1559
|
-
static StateResult
|
1498
|
+
static StateResult handle_script_data_escaped_lt_state (
|
1560
1499
|
GumboParser* parser,
|
1561
1500
|
GumboTokenizerState* tokenizer,
|
1562
1501
|
int c,
|
1563
1502
|
GumboToken* output
|
1564
1503
|
) {
|
1565
|
-
assert(
|
1566
|
-
assert(!tokenizer->_script_data_buffer.length);
|
1504
|
+
assert(temporary_buffer_is_empty(parser));
|
1567
1505
|
if (c == '/') {
|
1568
|
-
gumbo_tokenizer_set_state(parser,
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1574
|
-
gumbo_string_buffer_append_codepoint (
|
1575
|
-
ensure_lowercase(c),
|
1576
|
-
&tokenizer->_script_data_buffer
|
1577
|
-
);
|
1578
|
-
return emit_temporary_buffer(parser, output);
|
1579
|
-
} else {
|
1580
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1581
|
-
return emit_temporary_buffer(parser, output);
|
1506
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
|
1507
|
+
return CONTINUE;
|
1508
|
+
}
|
1509
|
+
if (is_alpha(c)) {
|
1510
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
|
1511
|
+
return emit_from_mark(parser, output);
|
1582
1512
|
}
|
1513
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1514
|
+
return emit_from_mark(parser, output);
|
1583
1515
|
}
|
1584
1516
|
|
1585
1517
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
1586
|
-
static StateResult
|
1518
|
+
static StateResult handle_script_data_escaped_end_tag_open_state (
|
1587
1519
|
GumboParser* parser,
|
1588
|
-
GumboTokenizerState*
|
1520
|
+
GumboTokenizerState* tokenizer,
|
1589
1521
|
int c,
|
1590
1522
|
GumboToken* output
|
1591
1523
|
) {
|
1592
|
-
assert(temporary_buffer_equals(parser, "</"));
|
1593
1524
|
if (is_alpha(c)) {
|
1594
|
-
|
1525
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
|
1595
1526
|
start_new_tag(parser, false);
|
1596
|
-
|
1597
|
-
return NEXT_CHAR;
|
1598
|
-
} else {
|
1599
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1600
|
-
return emit_temporary_buffer(parser, output);
|
1527
|
+
return CONTINUE;
|
1601
1528
|
}
|
1529
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1530
|
+
return emit_from_mark(parser, output);
|
1602
1531
|
}
|
1603
1532
|
|
1604
1533
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
1605
|
-
static StateResult
|
1534
|
+
static StateResult handle_script_data_escaped_end_tag_name_state (
|
1606
1535
|
GumboParser* parser,
|
1607
1536
|
GumboTokenizerState* tokenizer,
|
1608
1537
|
int c,
|
1609
1538
|
GumboToken* output
|
1610
1539
|
) {
|
1611
|
-
UNUSED_IF_NDEBUG(tokenizer);
|
1612
|
-
assert(tokenizer->_temporary_buffer.length >= 2);
|
1613
1540
|
if (is_alpha(c)) {
|
1614
1541
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1615
|
-
|
1616
|
-
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
|
1633
|
-
|
1542
|
+
return CONTINUE;
|
1543
|
+
}
|
1544
|
+
switch (c) {
|
1545
|
+
case '\t':
|
1546
|
+
case '\n':
|
1547
|
+
case '\f':
|
1548
|
+
case ' ':
|
1549
|
+
if (is_appropriate_end_tag(parser)) {
|
1550
|
+
finish_tag_name(parser);
|
1551
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1552
|
+
return CONTINUE;
|
1553
|
+
}
|
1554
|
+
break;
|
1555
|
+
case '/':
|
1556
|
+
if (is_appropriate_end_tag(parser)) {
|
1557
|
+
finish_tag_name(parser);
|
1558
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1559
|
+
return CONTINUE;
|
1560
|
+
}
|
1561
|
+
break;
|
1562
|
+
case '>':
|
1563
|
+
if (is_appropriate_end_tag(parser)) {
|
1564
|
+
finish_tag_name(parser);
|
1565
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1566
|
+
return emit_current_tag(parser, output);
|
1634
1567
|
}
|
1568
|
+
break;
|
1635
1569
|
}
|
1636
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1637
1570
|
abandon_current_tag(parser);
|
1638
|
-
|
1571
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1572
|
+
return emit_from_mark(parser, output);
|
1639
1573
|
}
|
1640
1574
|
|
1641
1575
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
1642
|
-
static StateResult
|
1576
|
+
static StateResult handle_script_data_double_escaped_start_state (
|
1643
1577
|
GumboParser* parser,
|
1644
1578
|
GumboTokenizerState* tokenizer,
|
1645
1579
|
int c,
|
@@ -1656,29 +1590,23 @@ static StateResult handle_script_double_escaped_start_state (
|
|
1656
1590
|
parser,
|
1657
1591
|
gumbo_string_equals (
|
1658
1592
|
&kScriptTag,
|
1659
|
-
(GumboStringPiece*) &tokenizer->
|
1593
|
+
(GumboStringPiece*) &tokenizer->_temporary_buffer
|
1660
1594
|
)
|
1661
|
-
?
|
1662
|
-
:
|
1595
|
+
? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
|
1596
|
+
: GUMBO_LEX_SCRIPT_DATA_ESCAPED
|
1663
1597
|
);
|
1664
|
-
return
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
&tokenizer->_script_data_buffer
|
1670
|
-
);
|
1671
|
-
return emit_current_char(parser, output);
|
1672
|
-
} else {
|
1673
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
1674
|
-
tokenizer->_reconsume_current_input = true;
|
1675
|
-
return NEXT_CHAR;
|
1676
|
-
}
|
1598
|
+
return emit_char(parser, c, output);
|
1599
|
+
}
|
1600
|
+
if (is_alpha(c)) {
|
1601
|
+
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
1602
|
+
return emit_char(parser, c, output);
|
1677
1603
|
}
|
1604
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
|
1605
|
+
return CONTINUE;
|
1678
1606
|
}
|
1679
1607
|
|
1680
1608
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
|
1681
|
-
static StateResult
|
1609
|
+
static StateResult handle_script_data_double_escaped_state (
|
1682
1610
|
GumboParser* parser,
|
1683
1611
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1684
1612
|
int c,
|
@@ -1686,24 +1614,23 @@ static StateResult handle_script_double_escaped_state (
|
|
1686
1614
|
) {
|
1687
1615
|
switch (c) {
|
1688
1616
|
case '-':
|
1689
|
-
gumbo_tokenizer_set_state(parser,
|
1690
|
-
return
|
1617
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
|
1618
|
+
return emit_char(parser, c, output);
|
1691
1619
|
case '<':
|
1692
|
-
gumbo_tokenizer_set_state(parser,
|
1693
|
-
return
|
1620
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1621
|
+
return emit_char(parser, c, output);
|
1694
1622
|
case '\0':
|
1695
1623
|
return emit_replacement_char(parser, output);
|
1696
1624
|
case -1:
|
1697
|
-
tokenizer_add_parse_error(parser,
|
1698
|
-
|
1699
|
-
return NEXT_CHAR;
|
1625
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1626
|
+
return emit_eof(parser, output);
|
1700
1627
|
default:
|
1701
|
-
return
|
1628
|
+
return emit_char(parser, c, output);
|
1702
1629
|
}
|
1703
1630
|
}
|
1704
1631
|
|
1705
1632
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
|
1706
|
-
static StateResult
|
1633
|
+
static StateResult handle_script_data_double_escaped_dash_state (
|
1707
1634
|
GumboParser* parser,
|
1708
1635
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1709
1636
|
int c,
|
@@ -1712,26 +1639,25 @@ static StateResult handle_script_double_escaped_dash_state (
|
|
1712
1639
|
switch (c) {
|
1713
1640
|
case '-':
|
1714
1641
|
gumbo_tokenizer_set_state(
|
1715
|
-
parser,
|
1716
|
-
return
|
1642
|
+
parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
|
1643
|
+
return emit_char(parser, c, output);
|
1717
1644
|
case '<':
|
1718
|
-
gumbo_tokenizer_set_state(parser,
|
1719
|
-
return
|
1645
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1646
|
+
return emit_char(parser, c, output);
|
1720
1647
|
case '\0':
|
1721
|
-
gumbo_tokenizer_set_state(parser,
|
1648
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1722
1649
|
return emit_replacement_char(parser, output);
|
1723
1650
|
case -1:
|
1724
|
-
tokenizer_add_parse_error(parser,
|
1725
|
-
|
1726
|
-
return NEXT_CHAR;
|
1651
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1652
|
+
return emit_eof(parser, output);
|
1727
1653
|
default:
|
1728
|
-
gumbo_tokenizer_set_state(parser,
|
1729
|
-
return
|
1654
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1655
|
+
return emit_char(parser, c, output);
|
1730
1656
|
}
|
1731
1657
|
}
|
1732
1658
|
|
1733
1659
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
1734
|
-
static StateResult
|
1660
|
+
static StateResult handle_script_data_double_escaped_dash_dash_state (
|
1735
1661
|
GumboParser* parser,
|
1736
1662
|
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1737
1663
|
int c,
|
@@ -1739,46 +1665,44 @@ static StateResult handle_script_double_escaped_dash_dash_state (
|
|
1739
1665
|
) {
|
1740
1666
|
switch (c) {
|
1741
1667
|
case '-':
|
1742
|
-
return
|
1668
|
+
return emit_char(parser, c, output);
|
1743
1669
|
case '<':
|
1744
|
-
gumbo_tokenizer_set_state(parser,
|
1745
|
-
return
|
1670
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
|
1671
|
+
return emit_char(parser, c, output);
|
1746
1672
|
case '>':
|
1747
|
-
gumbo_tokenizer_set_state(parser,
|
1748
|
-
return
|
1673
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
|
1674
|
+
return emit_char(parser, c, output);
|
1749
1675
|
case '\0':
|
1750
|
-
gumbo_tokenizer_set_state(parser,
|
1676
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1751
1677
|
return emit_replacement_char(parser, output);
|
1752
1678
|
case -1:
|
1753
|
-
tokenizer_add_parse_error(parser,
|
1754
|
-
|
1755
|
-
return NEXT_CHAR;
|
1679
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
|
1680
|
+
return emit_eof(parser, output);
|
1756
1681
|
default:
|
1757
|
-
gumbo_tokenizer_set_state(parser,
|
1758
|
-
return
|
1682
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1683
|
+
return emit_char(parser, c, output);
|
1759
1684
|
}
|
1760
1685
|
}
|
1761
1686
|
|
1762
1687
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
1763
|
-
static StateResult
|
1688
|
+
static StateResult handle_script_data_double_escaped_lt_state (
|
1764
1689
|
GumboParser* parser,
|
1765
1690
|
GumboTokenizerState* tokenizer,
|
1766
1691
|
int c,
|
1767
1692
|
GumboToken* output
|
1768
1693
|
) {
|
1769
1694
|
if (c == '/') {
|
1770
|
-
gumbo_tokenizer_set_state(parser,
|
1771
|
-
|
1772
|
-
return
|
1695
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
|
1696
|
+
clear_temporary_buffer(parser);
|
1697
|
+
return emit_char(parser, c, output);
|
1773
1698
|
} else {
|
1774
|
-
|
1775
|
-
|
1776
|
-
return NEXT_CHAR;
|
1699
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1700
|
+
return CONTINUE;
|
1777
1701
|
}
|
1778
1702
|
}
|
1779
1703
|
|
1780
1704
|
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
1781
|
-
static StateResult
|
1705
|
+
static StateResult handle_script_data_double_escaped_end_state (
|
1782
1706
|
GumboParser* parser,
|
1783
1707
|
GumboTokenizerState* tokenizer,
|
1784
1708
|
int c,
|
@@ -1793,29 +1717,23 @@ static StateResult handle_script_double_escaped_end_state (
|
|
1793
1717
|
case '>':
|
1794
1718
|
gumbo_tokenizer_set_state(
|
1795
1719
|
parser, gumbo_string_equals(&kScriptTag,
|
1796
|
-
(GumboStringPiece*) &tokenizer->
|
1797
|
-
?
|
1798
|
-
:
|
1799
|
-
return
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
&tokenizer->_script_data_buffer
|
1805
|
-
);
|
1806
|
-
return emit_current_char(parser, output);
|
1807
|
-
} else {
|
1808
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1809
|
-
tokenizer->_reconsume_current_input = true;
|
1810
|
-
return NEXT_CHAR;
|
1811
|
-
}
|
1720
|
+
(GumboStringPiece*) &tokenizer->_temporary_buffer)
|
1721
|
+
? GUMBO_LEX_SCRIPT_DATA_ESCAPED
|
1722
|
+
: GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1723
|
+
return emit_char(parser, c, output);
|
1724
|
+
}
|
1725
|
+
if (is_alpha(c)) {
|
1726
|
+
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
1727
|
+
return emit_char(parser, c, output);
|
1812
1728
|
}
|
1729
|
+
reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
|
1730
|
+
return CONTINUE;
|
1813
1731
|
}
|
1814
1732
|
|
1815
1733
|
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
1816
1734
|
static StateResult handle_before_attr_name_state (
|
1817
1735
|
GumboParser* parser,
|
1818
|
-
GumboTokenizerState*
|
1736
|
+
GumboTokenizerState* tokenizer,
|
1819
1737
|
int c,
|
1820
1738
|
GumboToken* output
|
1821
1739
|
) {
|
@@ -1824,40 +1742,27 @@ static StateResult handle_before_attr_name_state (
|
|
1824
1742
|
case '\n':
|
1825
1743
|
case '\f':
|
1826
1744
|
case ' ':
|
1827
|
-
return
|
1745
|
+
return CONTINUE;
|
1828
1746
|
case '/':
|
1829
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1830
|
-
return NEXT_CHAR;
|
1831
1747
|
case '>':
|
1832
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1833
|
-
return emit_current_tag(parser, output);
|
1834
|
-
case '\0':
|
1835
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1836
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1837
|
-
append_char_to_temporary_buffer(parser, 0xfffd);
|
1838
|
-
return NEXT_CHAR;
|
1839
1748
|
case -1:
|
1840
|
-
|
1841
|
-
|
1842
|
-
abandon_current_tag(parser);
|
1843
|
-
return NEXT_CHAR;
|
1844
|
-
case '"':
|
1845
|
-
case '\'':
|
1846
|
-
case '<':
|
1749
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1750
|
+
return CONTINUE;
|
1847
1751
|
case '=':
|
1848
|
-
tokenizer_add_parse_error(parser,
|
1849
|
-
// Fall through.
|
1850
|
-
default:
|
1752
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
|
1851
1753
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1852
|
-
append_char_to_tag_buffer(parser,
|
1853
|
-
return
|
1754
|
+
append_char_to_tag_buffer(parser, c, true);
|
1755
|
+
return CONTINUE;
|
1756
|
+
default:
|
1757
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
|
1758
|
+
return CONTINUE;
|
1854
1759
|
}
|
1855
1760
|
}
|
1856
1761
|
|
1857
1762
|
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
1858
1763
|
static StateResult handle_attr_name_state (
|
1859
1764
|
GumboParser* parser,
|
1860
|
-
GumboTokenizerState*
|
1765
|
+
GumboTokenizerState* tokenizer,
|
1861
1766
|
int c,
|
1862
1767
|
GumboToken* output
|
1863
1768
|
) {
|
@@ -1866,45 +1771,35 @@ static StateResult handle_attr_name_state (
|
|
1866
1771
|
case '\n':
|
1867
1772
|
case '\f':
|
1868
1773
|
case ' ':
|
1869
|
-
finish_attribute_name(parser);
|
1870
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1871
|
-
return NEXT_CHAR;
|
1872
1774
|
case '/':
|
1775
|
+
case '>':
|
1776
|
+
case -1:
|
1873
1777
|
finish_attribute_name(parser);
|
1874
|
-
|
1875
|
-
return
|
1778
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
|
1779
|
+
return CONTINUE;
|
1876
1780
|
case '=':
|
1877
1781
|
finish_attribute_name(parser);
|
1878
1782
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
|
1879
|
-
return
|
1880
|
-
case '>':
|
1881
|
-
finish_attribute_name(parser);
|
1882
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1883
|
-
return emit_current_tag(parser, output);
|
1783
|
+
return CONTINUE;
|
1884
1784
|
case '\0':
|
1885
|
-
tokenizer_add_parse_error(parser,
|
1785
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
1886
1786
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1887
|
-
return
|
1888
|
-
case -1:
|
1889
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1890
|
-
abandon_current_tag(parser);
|
1891
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_EOF);
|
1892
|
-
return NEXT_CHAR;
|
1787
|
+
return CONTINUE;
|
1893
1788
|
case '"':
|
1894
1789
|
case '\'':
|
1895
1790
|
case '<':
|
1896
|
-
tokenizer_add_parse_error(parser,
|
1791
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
|
1897
1792
|
// Fall through.
|
1898
1793
|
default:
|
1899
1794
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1900
|
-
return
|
1795
|
+
return CONTINUE;
|
1901
1796
|
}
|
1902
1797
|
}
|
1903
1798
|
|
1904
1799
|
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
1905
1800
|
static StateResult handle_after_attr_name_state (
|
1906
1801
|
GumboParser* parser,
|
1907
|
-
GumboTokenizerState*
|
1802
|
+
GumboTokenizerState* tokenizer,
|
1908
1803
|
int c,
|
1909
1804
|
GumboToken* output
|
1910
1805
|
) {
|
@@ -1913,35 +1808,23 @@ static StateResult handle_after_attr_name_state (
|
|
1913
1808
|
case '\n':
|
1914
1809
|
case '\f':
|
1915
1810
|
case ' ':
|
1916
|
-
return
|
1811
|
+
return CONTINUE;
|
1917
1812
|
case '/':
|
1918
1813
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
1919
|
-
return
|
1814
|
+
return CONTINUE;
|
1920
1815
|
case '=':
|
1921
1816
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
|
1922
|
-
return
|
1817
|
+
return CONTINUE;
|
1923
1818
|
case '>':
|
1924
1819
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1925
1820
|
return emit_current_tag(parser, output);
|
1926
|
-
case '\0':
|
1927
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1928
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1929
|
-
append_char_to_temporary_buffer(parser, 0xfffd);
|
1930
|
-
return NEXT_CHAR;
|
1931
1821
|
case -1:
|
1932
|
-
tokenizer_add_parse_error(parser,
|
1933
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1822
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
1934
1823
|
abandon_current_tag(parser);
|
1935
|
-
return
|
1936
|
-
case '"':
|
1937
|
-
case '\'':
|
1938
|
-
case '<':
|
1939
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1940
|
-
// Fall through.
|
1824
|
+
return emit_eof(parser, output);
|
1941
1825
|
default:
|
1942
|
-
|
1943
|
-
|
1944
|
-
return NEXT_CHAR;
|
1826
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
|
1827
|
+
return CONTINUE;
|
1945
1828
|
}
|
1946
1829
|
}
|
1947
1830
|
|
@@ -1957,45 +1840,22 @@ static StateResult handle_before_attr_value_state (
|
|
1957
1840
|
case '\n':
|
1958
1841
|
case '\f':
|
1959
1842
|
case ' ':
|
1960
|
-
return
|
1843
|
+
return CONTINUE;
|
1961
1844
|
case '"':
|
1962
1845
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
|
1963
1846
|
reset_tag_buffer_start_point(parser);
|
1964
|
-
return
|
1965
|
-
case '&':
|
1966
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1967
|
-
tokenizer->_reconsume_current_input = true;
|
1968
|
-
return NEXT_CHAR;
|
1847
|
+
return CONTINUE;
|
1969
1848
|
case '\'':
|
1970
1849
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
|
1971
1850
|
reset_tag_buffer_start_point(parser);
|
1972
|
-
return
|
1973
|
-
case '\0':
|
1974
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1975
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1976
|
-
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
1977
|
-
return NEXT_CHAR;
|
1978
|
-
case -1:
|
1979
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EOF);
|
1980
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1981
|
-
abandon_current_tag(parser);
|
1982
|
-
tokenizer->_reconsume_current_input = true;
|
1983
|
-
return NEXT_CHAR;
|
1851
|
+
return CONTINUE;
|
1984
1852
|
case '>':
|
1985
|
-
tokenizer_add_parse_error(parser,
|
1853
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
|
1986
1854
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1987
|
-
emit_current_tag(parser, output);
|
1988
|
-
return RETURN_ERROR;
|
1989
|
-
case '<':
|
1990
|
-
case '=':
|
1991
|
-
case '`':
|
1992
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1993
|
-
// Fall through.
|
1994
|
-
default:
|
1995
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1996
|
-
append_char_to_tag_buffer(parser, c, true);
|
1997
|
-
return NEXT_CHAR;
|
1855
|
+
return emit_current_tag(parser, output);
|
1998
1856
|
}
|
1857
|
+
reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1858
|
+
return CONTINUE;
|
1999
1859
|
}
|
2000
1860
|
|
2001
1861
|
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
|
@@ -2003,30 +1863,28 @@ static StateResult handle_attr_value_double_quoted_state (
|
|
2003
1863
|
GumboParser* parser,
|
2004
1864
|
GumboTokenizerState* tokenizer,
|
2005
1865
|
int c,
|
2006
|
-
GumboToken*
|
1866
|
+
GumboToken* output
|
2007
1867
|
) {
|
2008
1868
|
switch (c) {
|
2009
1869
|
case '"':
|
2010
1870
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
2011
|
-
return
|
1871
|
+
return CONTINUE;
|
2012
1872
|
case '&':
|
2013
|
-
|
2014
|
-
|
2015
|
-
tokenizer->
|
2016
|
-
return
|
1873
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1874
|
+
set_mark(parser);
|
1875
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
|
1876
|
+
return CONTINUE;
|
2017
1877
|
case '\0':
|
2018
|
-
tokenizer_add_parse_error(parser,
|
1878
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2019
1879
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
|
2020
|
-
return
|
1880
|
+
return CONTINUE;
|
2021
1881
|
case -1:
|
2022
|
-
tokenizer_add_parse_error(parser,
|
2023
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1882
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2024
1883
|
abandon_current_tag(parser);
|
2025
|
-
|
2026
|
-
return NEXT_CHAR;
|
1884
|
+
return emit_eof(parser, output);
|
2027
1885
|
default:
|
2028
1886
|
append_char_to_tag_buffer(parser, c, false);
|
2029
|
-
return
|
1887
|
+
return CONTINUE;
|
2030
1888
|
}
|
2031
1889
|
}
|
2032
1890
|
|
@@ -2035,30 +1893,28 @@ static StateResult handle_attr_value_single_quoted_state (
|
|
2035
1893
|
GumboParser* parser,
|
2036
1894
|
GumboTokenizerState* tokenizer,
|
2037
1895
|
int c,
|
2038
|
-
GumboToken*
|
1896
|
+
GumboToken* output
|
2039
1897
|
) {
|
2040
1898
|
switch (c) {
|
2041
1899
|
case '\'':
|
2042
1900
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
2043
|
-
return
|
1901
|
+
return CONTINUE;
|
2044
1902
|
case '&':
|
2045
|
-
|
2046
|
-
|
2047
|
-
tokenizer->
|
2048
|
-
return
|
1903
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1904
|
+
set_mark(parser);
|
1905
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
|
1906
|
+
return CONTINUE;
|
2049
1907
|
case '\0':
|
2050
|
-
tokenizer_add_parse_error(parser,
|
1908
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2051
1909
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
|
2052
|
-
return
|
1910
|
+
return CONTINUE;
|
2053
1911
|
case -1:
|
2054
|
-
tokenizer_add_parse_error(parser,
|
2055
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1912
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2056
1913
|
abandon_current_tag(parser);
|
2057
|
-
|
2058
|
-
return NEXT_CHAR;
|
1914
|
+
return emit_eof(parser, output);
|
2059
1915
|
default:
|
2060
1916
|
append_char_to_tag_buffer(parser, c, false);
|
2061
|
-
return
|
1917
|
+
return CONTINUE;
|
2062
1918
|
}
|
2063
1919
|
}
|
2064
1920
|
|
@@ -2076,89 +1932,35 @@ static StateResult handle_attr_value_unquoted_state (
|
|
2076
1932
|
case ' ':
|
2077
1933
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2078
1934
|
finish_attribute_value(parser);
|
2079
|
-
return
|
1935
|
+
return CONTINUE;
|
2080
1936
|
case '&':
|
2081
|
-
|
2082
|
-
|
2083
|
-
tokenizer->
|
2084
|
-
return
|
1937
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
|
1938
|
+
set_mark(parser);
|
1939
|
+
tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
|
1940
|
+
return CONTINUE;
|
2085
1941
|
case '>':
|
2086
1942
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2087
1943
|
finish_attribute_value(parser);
|
2088
1944
|
return emit_current_tag(parser, output);
|
2089
1945
|
case '\0':
|
2090
|
-
tokenizer_add_parse_error(parser,
|
1946
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2091
1947
|
append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
|
2092
|
-
return
|
1948
|
+
return CONTINUE;
|
2093
1949
|
case -1:
|
2094
|
-
tokenizer_add_parse_error(parser,
|
2095
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2096
|
-
tokenizer->_reconsume_current_input = true;
|
1950
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2097
1951
|
abandon_current_tag(parser);
|
2098
|
-
return
|
2099
|
-
case '<':
|
2100
|
-
case '=':
|
1952
|
+
return emit_eof(parser, output);
|
2101
1953
|
case '"':
|
2102
1954
|
case '\'':
|
1955
|
+
case '<':
|
1956
|
+
case '=':
|
2103
1957
|
case '`':
|
2104
|
-
tokenizer_add_parse_error(parser,
|
1958
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
|
2105
1959
|
// Fall through.
|
2106
1960
|
default:
|
2107
1961
|
append_char_to_tag_buffer(parser, c, true);
|
2108
|
-
return
|
2109
|
-
}
|
2110
|
-
}
|
2111
|
-
|
2112
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
|
2113
|
-
static StateResult handle_char_ref_in_attr_value_state (
|
2114
|
-
GumboParser* parser,
|
2115
|
-
GumboTokenizerState* tokenizer,
|
2116
|
-
int UNUSED_ARG(c),
|
2117
|
-
GumboToken* UNUSED_ARG(output)
|
2118
|
-
) {
|
2119
|
-
OneOrTwoCodepoints char_ref;
|
2120
|
-
int allowed_char;
|
2121
|
-
bool is_unquoted = false;
|
2122
|
-
switch (tokenizer->_tag_state._attr_value_state) {
|
2123
|
-
case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
|
2124
|
-
allowed_char = '"';
|
2125
|
-
break;
|
2126
|
-
case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
|
2127
|
-
allowed_char = '\'';
|
2128
|
-
break;
|
2129
|
-
case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
|
2130
|
-
allowed_char = '>';
|
2131
|
-
is_unquoted = true;
|
2132
|
-
break;
|
2133
|
-
default:
|
2134
|
-
// -Wmaybe-uninitialized is a little overzealous here, and doesn't
|
2135
|
-
// get that the assert(0) means this codepath will never happen.
|
2136
|
-
allowed_char = ' ';
|
2137
|
-
assert(0);
|
1962
|
+
return CONTINUE;
|
2138
1963
|
}
|
2139
|
-
|
2140
|
-
// Ignore the status, since we don't have a convenient way of signalling that
|
2141
|
-
// a parser error has occurred when the error occurs in the middle of a
|
2142
|
-
// multi-state token. We'd need a flag inside the TokenizerState to do this,
|
2143
|
-
// but that's a low priority fix.
|
2144
|
-
gumbo_consume_char_ref (
|
2145
|
-
parser,
|
2146
|
-
&tokenizer->_input,
|
2147
|
-
allowed_char,
|
2148
|
-
true,
|
2149
|
-
&char_ref
|
2150
|
-
);
|
2151
|
-
if (char_ref.first != kGumboNoChar) {
|
2152
|
-
tokenizer->_reconsume_current_input = true;
|
2153
|
-
append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
|
2154
|
-
if (char_ref.second != kGumboNoChar) {
|
2155
|
-
append_char_to_tag_buffer(parser, char_ref.second, is_unquoted);
|
2156
|
-
}
|
2157
|
-
} else {
|
2158
|
-
append_char_to_tag_buffer(parser, '&', is_unquoted);
|
2159
|
-
}
|
2160
|
-
gumbo_tokenizer_set_state(parser, tokenizer->_tag_state._attr_value_state);
|
2161
|
-
return NEXT_CHAR;
|
2162
1964
|
}
|
2163
1965
|
|
2164
1966
|
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
|
@@ -2175,24 +1977,21 @@ static StateResult handle_after_attr_value_quoted_state (
|
|
2175
1977
|
case '\f':
|
2176
1978
|
case ' ':
|
2177
1979
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2178
|
-
return
|
1980
|
+
return CONTINUE;
|
2179
1981
|
case '/':
|
2180
1982
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
|
2181
|
-
return
|
1983
|
+
return CONTINUE;
|
2182
1984
|
case '>':
|
2183
1985
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2184
1986
|
return emit_current_tag(parser, output);
|
2185
1987
|
case -1:
|
2186
|
-
tokenizer_add_parse_error(parser,
|
2187
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
1988
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2188
1989
|
abandon_current_tag(parser);
|
2189
|
-
|
2190
|
-
return NEXT_CHAR;
|
1990
|
+
return emit_eof(parser, output);
|
2191
1991
|
default:
|
2192
|
-
tokenizer_add_parse_error(parser,
|
2193
|
-
|
2194
|
-
|
2195
|
-
return NEXT_CHAR;
|
1992
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
|
1993
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
1994
|
+
return CONTINUE;
|
2196
1995
|
}
|
2197
1996
|
}
|
2198
1997
|
|
@@ -2209,15 +2008,13 @@ static StateResult handle_self_closing_start_tag_state (
|
|
2209
2008
|
tokenizer->_tag_state._is_self_closing = true;
|
2210
2009
|
return emit_current_tag(parser, output);
|
2211
2010
|
case -1:
|
2212
|
-
tokenizer_add_parse_error(parser,
|
2213
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2011
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
|
2214
2012
|
abandon_current_tag(parser);
|
2215
|
-
return
|
2013
|
+
return emit_eof(parser, output);
|
2216
2014
|
default:
|
2217
|
-
tokenizer_add_parse_error(parser,
|
2218
|
-
|
2219
|
-
|
2220
|
-
return NEXT_CHAR;
|
2015
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
|
2016
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
|
2017
|
+
return CONTINUE;
|
2221
2018
|
}
|
2222
2019
|
}
|
2223
2020
|
|
@@ -2228,21 +2025,27 @@ static StateResult handle_bogus_comment_state (
|
|
2228
2025
|
int c,
|
2229
2026
|
GumboToken* output
|
2230
2027
|
) {
|
2231
|
-
|
2232
|
-
|
2233
|
-
|
2234
|
-
|
2235
|
-
|
2028
|
+
switch (c) {
|
2029
|
+
case '>':
|
2030
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2031
|
+
return emit_comment(parser, output);
|
2032
|
+
case -1:
|
2033
|
+
// We need to emit the comment and then the EOF, so reconsume in data
|
2034
|
+
// state.
|
2035
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2036
|
+
return emit_comment(parser, output);
|
2037
|
+
case '\0':
|
2038
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2039
|
+
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2040
|
+
return CONTINUE;
|
2041
|
+
default:
|
2236
2042
|
append_char_to_temporary_buffer(parser, c);
|
2237
|
-
|
2238
|
-
c = utf8iterator_current(&tokenizer->_input);
|
2043
|
+
return CONTINUE;
|
2239
2044
|
}
|
2240
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2241
|
-
return emit_comment(parser, output);
|
2242
2045
|
}
|
2243
2046
|
|
2244
2047
|
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
2245
|
-
static StateResult
|
2048
|
+
static StateResult handle_markup_declaration_open_state (
|
2246
2049
|
GumboParser* parser,
|
2247
2050
|
GumboTokenizerState* tokenizer,
|
2248
2051
|
int UNUSED_ARG(c),
|
@@ -2253,21 +2056,21 @@ static StateResult handle_markup_declaration_state (
|
|
2253
2056
|
&tokenizer->_input,
|
2254
2057
|
"--",
|
2255
2058
|
sizeof("--") - 1,
|
2256
|
-
true
|
2059
|
+
/* case sensitive */ true
|
2257
2060
|
)
|
2258
2061
|
) {
|
2259
|
-
|
2260
|
-
|
2261
|
-
}
|
2062
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
|
2063
|
+
return CONTINUE;
|
2064
|
+
}
|
2065
|
+
if (
|
2262
2066
|
utf8iterator_maybe_consume_match (
|
2263
2067
|
&tokenizer->_input,
|
2264
2068
|
"DOCTYPE",
|
2265
2069
|
sizeof("DOCTYPE") - 1,
|
2266
|
-
false
|
2070
|
+
/* case sensitive */ false
|
2267
2071
|
)
|
2268
2072
|
) {
|
2269
|
-
|
2270
|
-
tokenizer->_reconsume_current_input = true;
|
2073
|
+
reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
|
2271
2074
|
// If we get here, we know we'll eventually emit a doctype token, so now is
|
2272
2075
|
// the time to initialize the doctype strings. (Not in doctype_state_init,
|
2273
2076
|
// since then they'll leak if ownership never gets transferred to the
|
@@ -2275,24 +2078,35 @@ static StateResult handle_markup_declaration_state (
|
|
2275
2078
|
tokenizer->_doc_type_state.name = gumbo_strdup("");
|
2276
2079
|
tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
|
2277
2080
|
tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
|
2278
|
-
|
2279
|
-
|
2280
|
-
|
2081
|
+
return CONTINUE;
|
2082
|
+
}
|
2083
|
+
if (
|
2084
|
+
utf8iterator_maybe_consume_match (
|
2281
2085
|
&tokenizer->_input,
|
2282
2086
|
"[CDATA[", sizeof("[CDATA[") - 1,
|
2283
|
-
true
|
2087
|
+
/* case sensitive */ true
|
2284
2088
|
)
|
2285
2089
|
) {
|
2286
|
-
|
2287
|
-
|
2288
|
-
|
2289
|
-
|
2290
|
-
|
2291
|
-
|
2292
|
-
|
2293
|
-
|
2090
|
+
if (tokenizer->_is_adjusted_current_node_foreign) {
|
2091
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2092
|
+
tokenizer->_is_in_cdata = true;
|
2093
|
+
// Start the token after the <![CDATA[.
|
2094
|
+
reset_token_start_point(tokenizer);
|
2095
|
+
} else {
|
2096
|
+
tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
|
2097
|
+
clear_temporary_buffer(parser);
|
2098
|
+
append_string_to_temporary_buffer (
|
2099
|
+
parser,
|
2100
|
+
&(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
|
2101
|
+
);
|
2102
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
2103
|
+
}
|
2104
|
+
return CONTINUE;
|
2294
2105
|
}
|
2295
|
-
|
2106
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
|
2107
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
|
2108
|
+
clear_temporary_buffer(parser);
|
2109
|
+
return CONTINUE;
|
2296
2110
|
}
|
2297
2111
|
|
2298
2112
|
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
@@ -2305,26 +2119,14 @@ static StateResult handle_comment_start_state (
|
|
2305
2119
|
switch (c) {
|
2306
2120
|
case '-':
|
2307
2121
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
|
2308
|
-
return
|
2309
|
-
case '\0':
|
2310
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2311
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2312
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2313
|
-
return NEXT_CHAR;
|
2122
|
+
return CONTINUE;
|
2314
2123
|
case '>':
|
2315
|
-
tokenizer_add_parse_error(parser,
|
2124
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
|
2316
2125
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2317
|
-
emit_comment(parser, output);
|
2318
|
-
return RETURN_ERROR;
|
2319
|
-
case -1:
|
2320
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_EOF);
|
2321
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2322
|
-
emit_comment(parser, output);
|
2323
|
-
return RETURN_ERROR;
|
2126
|
+
return emit_comment(parser, output);
|
2324
2127
|
default:
|
2325
|
-
|
2326
|
-
|
2327
|
-
return NEXT_CHAR;
|
2128
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2129
|
+
return CONTINUE;
|
2328
2130
|
}
|
2329
2131
|
}
|
2330
2132
|
|
@@ -2338,28 +2140,20 @@ static StateResult handle_comment_start_dash_state (
|
|
2338
2140
|
switch (c) {
|
2339
2141
|
case '-':
|
2340
2142
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
2341
|
-
return
|
2342
|
-
case '\0':
|
2343
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2344
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2345
|
-
append_char_to_temporary_buffer(parser, '-');
|
2346
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2347
|
-
return NEXT_CHAR;
|
2143
|
+
return CONTINUE;
|
2348
2144
|
case '>':
|
2349
|
-
tokenizer_add_parse_error(parser,
|
2145
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
|
2350
2146
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2351
|
-
emit_comment(parser, output);
|
2352
|
-
return RETURN_ERROR;
|
2147
|
+
return emit_comment(parser, output);
|
2353
2148
|
case -1:
|
2354
|
-
tokenizer_add_parse_error(parser,
|
2355
|
-
|
2356
|
-
|
2357
|
-
return
|
2149
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2150
|
+
// Switch to data to emit the EOF next.
|
2151
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2152
|
+
return emit_comment(parser, output);
|
2358
2153
|
default:
|
2359
|
-
|
2154
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2360
2155
|
append_char_to_temporary_buffer(parser, '-');
|
2361
|
-
|
2362
|
-
return NEXT_CHAR;
|
2156
|
+
return CONTINUE;
|
2363
2157
|
}
|
2364
2158
|
}
|
2365
2159
|
|
@@ -2371,21 +2165,99 @@ static StateResult handle_comment_state (
|
|
2371
2165
|
GumboToken* output
|
2372
2166
|
) {
|
2373
2167
|
switch (c) {
|
2168
|
+
case '<':
|
2169
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
|
2170
|
+
append_char_to_temporary_buffer(parser, c);
|
2171
|
+
return CONTINUE;
|
2374
2172
|
case '-':
|
2375
2173
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
2376
|
-
return
|
2174
|
+
return CONTINUE;
|
2377
2175
|
case '\0':
|
2378
|
-
tokenizer_add_parse_error(parser,
|
2176
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2379
2177
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2380
|
-
return
|
2178
|
+
return CONTINUE;
|
2381
2179
|
case -1:
|
2382
|
-
tokenizer_add_parse_error(parser,
|
2383
|
-
|
2384
|
-
|
2385
|
-
return
|
2180
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2181
|
+
// Switch to data to emit the EOF token next.
|
2182
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2183
|
+
return emit_comment(parser, output);
|
2386
2184
|
default:
|
2387
2185
|
append_char_to_temporary_buffer(parser, c);
|
2388
|
-
return
|
2186
|
+
return CONTINUE;
|
2187
|
+
}
|
2188
|
+
}
|
2189
|
+
|
2190
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
|
2191
|
+
static StateResult handle_comment_lt_state (
|
2192
|
+
GumboParser* parser,
|
2193
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2194
|
+
int c,
|
2195
|
+
GumboToken* output
|
2196
|
+
) {
|
2197
|
+
switch (c) {
|
2198
|
+
case '!':
|
2199
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
|
2200
|
+
append_char_to_temporary_buffer(parser, c);
|
2201
|
+
return CONTINUE;
|
2202
|
+
case '<':
|
2203
|
+
append_char_to_temporary_buffer(parser, c);
|
2204
|
+
return CONTINUE;
|
2205
|
+
default:
|
2206
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2207
|
+
return CONTINUE;
|
2208
|
+
}
|
2209
|
+
}
|
2210
|
+
|
2211
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
|
2212
|
+
static StateResult handle_comment_lt_bang_state (
|
2213
|
+
GumboParser* parser,
|
2214
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2215
|
+
int c,
|
2216
|
+
GumboToken* output
|
2217
|
+
) {
|
2218
|
+
switch (c) {
|
2219
|
+
case '-':
|
2220
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
|
2221
|
+
return CONTINUE;
|
2222
|
+
default:
|
2223
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2224
|
+
return CONTINUE;
|
2225
|
+
}
|
2226
|
+
}
|
2227
|
+
|
2228
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
|
2229
|
+
static StateResult handle_comment_lt_bang_dash_state (
|
2230
|
+
GumboParser* parser,
|
2231
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2232
|
+
int c,
|
2233
|
+
GumboToken* output
|
2234
|
+
) {
|
2235
|
+
switch (c) {
|
2236
|
+
case '-':
|
2237
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
|
2238
|
+
return CONTINUE;
|
2239
|
+
default:
|
2240
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
2241
|
+
return CONTINUE;
|
2242
|
+
}
|
2243
|
+
}
|
2244
|
+
|
2245
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
|
2246
|
+
static StateResult handle_comment_lt_bang_dash_dash_state (
|
2247
|
+
GumboParser* parser,
|
2248
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2249
|
+
int c,
|
2250
|
+
GumboToken* output
|
2251
|
+
) {
|
2252
|
+
switch (c) {
|
2253
|
+
case '>':
|
2254
|
+
case -1:
|
2255
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
|
2256
|
+
return CONTINUE;
|
2257
|
+
default:
|
2258
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
|
2259
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
|
2260
|
+
return CONTINUE;
|
2389
2261
|
}
|
2390
2262
|
}
|
2391
2263
|
|
@@ -2397,25 +2269,18 @@ static StateResult handle_comment_end_dash_state (
|
|
2397
2269
|
GumboToken* output
|
2398
2270
|
) {
|
2399
2271
|
switch (c) {
|
2400
|
-
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
|
2405
|
-
|
2406
|
-
|
2407
|
-
|
2408
|
-
|
2409
|
-
|
2410
|
-
|
2411
|
-
|
2412
|
-
emit_comment(parser, output);
|
2413
|
-
return RETURN_ERROR;
|
2414
|
-
default:
|
2415
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2416
|
-
append_char_to_temporary_buffer(parser, '-');
|
2417
|
-
append_char_to_temporary_buffer(parser, c);
|
2418
|
-
return NEXT_CHAR;
|
2272
|
+
case '-':
|
2273
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
2274
|
+
return CONTINUE;
|
2275
|
+
case -1:
|
2276
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2277
|
+
// Switch to data to emit EOF next.
|
2278
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2279
|
+
return emit_comment(parser, output);
|
2280
|
+
default:
|
2281
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2282
|
+
append_char_to_temporary_buffer(parser, '-');
|
2283
|
+
return CONTINUE;
|
2419
2284
|
}
|
2420
2285
|
}
|
2421
2286
|
|
@@ -2430,35 +2295,22 @@ static StateResult handle_comment_end_state (
|
|
2430
2295
|
case '>':
|
2431
2296
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2432
2297
|
return emit_comment(parser, output);
|
2433
|
-
case '\0':
|
2434
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2435
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2436
|
-
append_char_to_temporary_buffer(parser, '-');
|
2437
|
-
append_char_to_temporary_buffer(parser, '-');
|
2438
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2439
|
-
return NEXT_CHAR;
|
2440
2298
|
case '!':
|
2441
|
-
tokenizer_add_parse_error(
|
2442
|
-
parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
|
2443
2299
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
|
2444
|
-
return
|
2300
|
+
return CONTINUE;
|
2445
2301
|
case '-':
|
2446
|
-
tokenizer_add_parse_error(
|
2447
|
-
parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
|
2448
2302
|
append_char_to_temporary_buffer(parser, '-');
|
2449
|
-
return
|
2303
|
+
return CONTINUE;
|
2450
2304
|
case -1:
|
2451
|
-
tokenizer_add_parse_error(parser,
|
2305
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2306
|
+
// Switch to data to emit EOF next.
|
2452
2307
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2453
|
-
emit_comment(parser, output);
|
2454
|
-
return RETURN_ERROR;
|
2308
|
+
return emit_comment(parser, output);
|
2455
2309
|
default:
|
2456
|
-
|
2457
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2310
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2458
2311
|
append_char_to_temporary_buffer(parser, '-');
|
2459
2312
|
append_char_to_temporary_buffer(parser, '-');
|
2460
|
-
|
2461
|
-
return NEXT_CHAR;
|
2313
|
+
return CONTINUE;
|
2462
2314
|
}
|
2463
2315
|
}
|
2464
2316
|
|
@@ -2475,30 +2327,22 @@ static StateResult handle_comment_end_bang_state (
|
|
2475
2327
|
append_char_to_temporary_buffer(parser, '-');
|
2476
2328
|
append_char_to_temporary_buffer(parser, '-');
|
2477
2329
|
append_char_to_temporary_buffer(parser, '!');
|
2478
|
-
return
|
2330
|
+
return CONTINUE;
|
2479
2331
|
case '>':
|
2332
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
|
2480
2333
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2481
2334
|
return emit_comment(parser, output);
|
2482
|
-
case '\0':
|
2483
|
-
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
2484
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT);
|
2485
|
-
append_char_to_temporary_buffer(parser, '-');
|
2486
|
-
append_char_to_temporary_buffer(parser, '-');
|
2487
|
-
append_char_to_temporary_buffer(parser, '!');
|
2488
|
-
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2489
|
-
return NEXT_CHAR;
|
2490
2335
|
case -1:
|
2491
|
-
tokenizer_add_parse_error(parser,
|
2336
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
|
2337
|
+
// Switch to data to emit EOF next.
|
2492
2338
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2493
|
-
emit_comment(parser, output);
|
2494
|
-
return RETURN_ERROR;
|
2339
|
+
return emit_comment(parser, output);
|
2495
2340
|
default:
|
2496
|
-
|
2341
|
+
reconsume_in_state(parser, GUMBO_LEX_COMMENT);
|
2497
2342
|
append_char_to_temporary_buffer(parser, '-');
|
2498
2343
|
append_char_to_temporary_buffer(parser, '-');
|
2499
2344
|
append_char_to_temporary_buffer(parser, '!');
|
2500
|
-
|
2501
|
-
return NEXT_CHAR;
|
2345
|
+
return CONTINUE;
|
2502
2346
|
}
|
2503
2347
|
}
|
2504
2348
|
|
@@ -2509,26 +2353,27 @@ static StateResult handle_doctype_state (
|
|
2509
2353
|
int c,
|
2510
2354
|
GumboToken* output
|
2511
2355
|
) {
|
2512
|
-
assert(
|
2356
|
+
assert(temporary_buffer_is_empty(parser));
|
2513
2357
|
switch (c) {
|
2514
2358
|
case '\t':
|
2515
2359
|
case '\n':
|
2516
2360
|
case '\f':
|
2517
2361
|
case ' ':
|
2518
2362
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2519
|
-
return
|
2363
|
+
return CONTINUE;
|
2364
|
+
case '>':
|
2365
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2366
|
+
return CONTINUE;
|
2520
2367
|
case -1:
|
2521
|
-
tokenizer_add_parse_error(parser,
|
2522
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2368
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2523
2369
|
tokenizer->_doc_type_state.force_quirks = true;
|
2524
|
-
|
2525
|
-
|
2370
|
+
// Switch to data to emit EOF next.
|
2371
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2372
|
+
return emit_doctype(parser, output);
|
2526
2373
|
default:
|
2527
|
-
tokenizer_add_parse_error(parser,
|
2528
|
-
|
2529
|
-
|
2530
|
-
tokenizer->_doc_type_state.force_quirks = true;
|
2531
|
-
return NEXT_CHAR;
|
2374
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
|
2375
|
+
reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
|
2376
|
+
return CONTINUE;
|
2532
2377
|
}
|
2533
2378
|
}
|
2534
2379
|
|
@@ -2544,30 +2389,27 @@ static StateResult handle_before_doctype_name_state (
|
|
2544
2389
|
case '\n':
|
2545
2390
|
case '\f':
|
2546
2391
|
case ' ':
|
2547
|
-
return
|
2392
|
+
return CONTINUE;
|
2548
2393
|
case '\0':
|
2549
|
-
tokenizer_add_parse_error(parser,
|
2394
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2550
2395
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2551
|
-
tokenizer->_doc_type_state.force_quirks = true;
|
2552
2396
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2553
|
-
return
|
2397
|
+
return CONTINUE;
|
2554
2398
|
case '>':
|
2555
|
-
tokenizer_add_parse_error(parser,
|
2399
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
|
2556
2400
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2557
2401
|
tokenizer->_doc_type_state.force_quirks = true;
|
2558
|
-
emit_doctype(parser, output);
|
2559
|
-
return RETURN_ERROR;
|
2402
|
+
return emit_doctype(parser, output);
|
2560
2403
|
case -1:
|
2561
|
-
tokenizer_add_parse_error(parser,
|
2562
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2404
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2563
2405
|
tokenizer->_doc_type_state.force_quirks = true;
|
2564
|
-
|
2565
|
-
|
2406
|
+
// Switch to data to emit EOF next.
|
2407
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2408
|
+
return emit_doctype(parser, output);
|
2566
2409
|
default:
|
2567
2410
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2568
|
-
tokenizer->_doc_type_state.force_quirks = false;
|
2569
2411
|
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
2570
|
-
return
|
2412
|
+
return CONTINUE;
|
2571
2413
|
}
|
2572
2414
|
}
|
2573
2415
|
|
@@ -2586,30 +2428,26 @@ static StateResult handle_doctype_name_state (
|
|
2586
2428
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
|
2587
2429
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2588
2430
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2589
|
-
return
|
2431
|
+
return CONTINUE;
|
2590
2432
|
case '>':
|
2591
2433
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2592
2434
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2593
2435
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2594
|
-
emit_doctype(parser, output);
|
2595
|
-
return RETURN_SUCCESS;
|
2436
|
+
return emit_doctype(parser, output);
|
2596
2437
|
case '\0':
|
2597
|
-
tokenizer_add_parse_error(parser,
|
2438
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2598
2439
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2599
|
-
return
|
2440
|
+
return CONTINUE;
|
2600
2441
|
case -1:
|
2601
|
-
tokenizer_add_parse_error(parser,
|
2602
|
-
|
2442
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2443
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2603
2444
|
tokenizer->_doc_type_state.force_quirks = true;
|
2604
2445
|
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2605
2446
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2606
|
-
emit_doctype(parser, output);
|
2607
|
-
return RETURN_ERROR;
|
2447
|
+
return emit_doctype(parser, output);
|
2608
2448
|
default:
|
2609
|
-
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
|
2610
|
-
tokenizer->_doc_type_state.force_quirks = false;
|
2611
2449
|
append_char_to_temporary_buffer(parser, ensure_lowercase(c));
|
2612
|
-
return
|
2450
|
+
return CONTINUE;
|
2613
2451
|
}
|
2614
2452
|
}
|
2615
2453
|
|
@@ -2625,35 +2463,29 @@ static StateResult handle_after_doctype_name_state (
|
|
2625
2463
|
case '\n':
|
2626
2464
|
case '\f':
|
2627
2465
|
case ' ':
|
2628
|
-
return
|
2466
|
+
return CONTINUE;
|
2629
2467
|
case '>':
|
2630
2468
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2631
|
-
emit_doctype(parser, output);
|
2632
|
-
return RETURN_SUCCESS;
|
2469
|
+
return emit_doctype(parser, output);
|
2633
2470
|
case -1:
|
2634
|
-
tokenizer_add_parse_error(parser,
|
2471
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2635
2472
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2636
2473
|
tokenizer->_doc_type_state.force_quirks = true;
|
2637
|
-
emit_doctype(parser, output);
|
2638
|
-
return RETURN_ERROR;
|
2474
|
+
return emit_doctype(parser, output);
|
2639
2475
|
default:
|
2640
2476
|
if (utf8iterator_maybe_consume_match(
|
2641
2477
|
&tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
|
2642
|
-
|
2643
|
-
parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2644
|
-
tokenizer->_reconsume_current_input = true;
|
2478
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2645
2479
|
} else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
|
2646
2480
|
sizeof("SYSTEM") - 1, false)) {
|
2647
|
-
|
2648
|
-
parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2649
|
-
tokenizer->_reconsume_current_input = true;
|
2481
|
+
reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2650
2482
|
} else {
|
2651
2483
|
tokenizer_add_parse_error(
|
2652
|
-
parser,
|
2653
|
-
|
2484
|
+
parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
|
2485
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2654
2486
|
tokenizer->_doc_type_state.force_quirks = true;
|
2655
2487
|
}
|
2656
|
-
return
|
2488
|
+
return CONTINUE;
|
2657
2489
|
}
|
2658
2490
|
}
|
2659
2491
|
|
@@ -2670,37 +2502,34 @@ static StateResult handle_after_doctype_public_keyword_state (
|
|
2670
2502
|
case '\f':
|
2671
2503
|
case ' ':
|
2672
2504
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2673
|
-
return
|
2505
|
+
return CONTINUE;
|
2674
2506
|
case '"':
|
2675
|
-
tokenizer_add_parse_error(parser,
|
2507
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2676
2508
|
assert(temporary_buffer_is_empty(parser));
|
2677
2509
|
gumbo_tokenizer_set_state(
|
2678
2510
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2679
|
-
return
|
2511
|
+
return CONTINUE;
|
2680
2512
|
case '\'':
|
2681
|
-
tokenizer_add_parse_error(parser,
|
2513
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2682
2514
|
assert(temporary_buffer_is_empty(parser));
|
2683
2515
|
gumbo_tokenizer_set_state(
|
2684
2516
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2685
|
-
return
|
2517
|
+
return CONTINUE;
|
2686
2518
|
case '>':
|
2687
|
-
tokenizer_add_parse_error(parser,
|
2519
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
|
2688
2520
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2689
2521
|
tokenizer->_doc_type_state.force_quirks = true;
|
2690
|
-
emit_doctype(parser, output);
|
2691
|
-
return RETURN_ERROR;
|
2522
|
+
return emit_doctype(parser, output);
|
2692
2523
|
case -1:
|
2693
|
-
tokenizer_add_parse_error(parser,
|
2694
|
-
|
2524
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2525
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2695
2526
|
tokenizer->_doc_type_state.force_quirks = true;
|
2696
|
-
emit_doctype(parser, output);
|
2697
|
-
return RETURN_ERROR;
|
2527
|
+
return emit_doctype(parser, output);
|
2698
2528
|
default:
|
2699
|
-
tokenizer_add_parse_error(parser,
|
2700
|
-
|
2529
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
|
2530
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2701
2531
|
tokenizer->_doc_type_state.force_quirks = true;
|
2702
|
-
|
2703
|
-
return RETURN_ERROR;
|
2532
|
+
return CONTINUE;
|
2704
2533
|
}
|
2705
2534
|
}
|
2706
2535
|
|
@@ -2716,35 +2545,32 @@ static StateResult handle_before_doctype_public_id_state (
|
|
2716
2545
|
case '\n':
|
2717
2546
|
case '\f':
|
2718
2547
|
case ' ':
|
2719
|
-
return
|
2548
|
+
return CONTINUE;
|
2720
2549
|
case '"':
|
2721
2550
|
assert(temporary_buffer_is_empty(parser));
|
2722
2551
|
gumbo_tokenizer_set_state(
|
2723
2552
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2724
|
-
return
|
2553
|
+
return CONTINUE;
|
2725
2554
|
case '\'':
|
2726
2555
|
assert(temporary_buffer_is_empty(parser));
|
2727
2556
|
gumbo_tokenizer_set_state(
|
2728
2557
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2729
|
-
return
|
2558
|
+
return CONTINUE;
|
2730
2559
|
case '>':
|
2731
|
-
tokenizer_add_parse_error(parser,
|
2560
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
|
2732
2561
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2733
2562
|
tokenizer->_doc_type_state.force_quirks = true;
|
2734
|
-
emit_doctype(parser, output);
|
2735
|
-
return RETURN_ERROR;
|
2563
|
+
return emit_doctype(parser, output);
|
2736
2564
|
case -1:
|
2737
|
-
tokenizer_add_parse_error(parser,
|
2738
|
-
|
2565
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2566
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2739
2567
|
tokenizer->_doc_type_state.force_quirks = true;
|
2740
|
-
emit_doctype(parser, output);
|
2741
|
-
return RETURN_ERROR;
|
2568
|
+
return emit_doctype(parser, output);
|
2742
2569
|
default:
|
2743
|
-
tokenizer_add_parse_error(parser,
|
2744
|
-
|
2570
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
|
2571
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2745
2572
|
tokenizer->_doc_type_state.force_quirks = true;
|
2746
|
-
|
2747
|
-
return RETURN_ERROR;
|
2573
|
+
return CONTINUE;
|
2748
2574
|
}
|
2749
2575
|
}
|
2750
2576
|
|
@@ -2759,28 +2585,26 @@ static StateResult handle_doctype_public_id_double_quoted_state (
|
|
2759
2585
|
case '"':
|
2760
2586
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
2761
2587
|
finish_doctype_public_id(parser);
|
2762
|
-
return
|
2588
|
+
return CONTINUE;
|
2763
2589
|
case '\0':
|
2764
|
-
tokenizer_add_parse_error(parser,
|
2590
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2765
2591
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2766
|
-
return
|
2592
|
+
return CONTINUE;
|
2767
2593
|
case '>':
|
2768
|
-
tokenizer_add_parse_error(parser,
|
2594
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
|
2769
2595
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2770
2596
|
tokenizer->_doc_type_state.force_quirks = true;
|
2771
2597
|
finish_doctype_public_id(parser);
|
2772
|
-
emit_doctype(parser, output);
|
2773
|
-
return RETURN_ERROR;
|
2598
|
+
return emit_doctype(parser, output);
|
2774
2599
|
case -1:
|
2775
|
-
tokenizer_add_parse_error(parser,
|
2776
|
-
|
2600
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2601
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2777
2602
|
tokenizer->_doc_type_state.force_quirks = true;
|
2778
2603
|
finish_doctype_public_id(parser);
|
2779
|
-
emit_doctype(parser, output);
|
2780
|
-
return RETURN_ERROR;
|
2604
|
+
return emit_doctype(parser, output);
|
2781
2605
|
default:
|
2782
2606
|
append_char_to_temporary_buffer(parser, c);
|
2783
|
-
return
|
2607
|
+
return CONTINUE;
|
2784
2608
|
}
|
2785
2609
|
}
|
2786
2610
|
|
@@ -2795,28 +2619,26 @@ static StateResult handle_doctype_public_id_single_quoted_state (
|
|
2795
2619
|
case '\'':
|
2796
2620
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
2797
2621
|
finish_doctype_public_id(parser);
|
2798
|
-
return
|
2622
|
+
return CONTINUE;
|
2799
2623
|
case '\0':
|
2800
|
-
tokenizer_add_parse_error(parser,
|
2624
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2801
2625
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2802
|
-
return
|
2626
|
+
return CONTINUE;
|
2803
2627
|
case '>':
|
2804
|
-
tokenizer_add_parse_error(parser,
|
2628
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
|
2805
2629
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2806
2630
|
tokenizer->_doc_type_state.force_quirks = true;
|
2807
2631
|
finish_doctype_public_id(parser);
|
2808
|
-
emit_doctype(parser, output);
|
2809
|
-
return RETURN_ERROR;
|
2632
|
+
return emit_doctype(parser, output);
|
2810
2633
|
case -1:
|
2811
|
-
tokenizer_add_parse_error(parser,
|
2812
|
-
|
2634
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2635
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2813
2636
|
tokenizer->_doc_type_state.force_quirks = true;
|
2814
2637
|
finish_doctype_public_id(parser);
|
2815
|
-
emit_doctype(parser, output);
|
2816
|
-
return RETURN_ERROR;
|
2638
|
+
return emit_doctype(parser, output);
|
2817
2639
|
default:
|
2818
2640
|
append_char_to_temporary_buffer(parser, c);
|
2819
|
-
return
|
2641
|
+
return CONTINUE;
|
2820
2642
|
}
|
2821
2643
|
}
|
2822
2644
|
|
@@ -2834,35 +2656,38 @@ static StateResult handle_after_doctype_public_id_state (
|
|
2834
2656
|
case ' ':
|
2835
2657
|
gumbo_tokenizer_set_state(
|
2836
2658
|
parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
|
2837
|
-
return
|
2659
|
+
return CONTINUE;
|
2838
2660
|
case '>':
|
2839
2661
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2840
|
-
emit_doctype(parser, output);
|
2841
|
-
return RETURN_SUCCESS;
|
2662
|
+
return emit_doctype(parser, output);
|
2842
2663
|
case '"':
|
2843
|
-
tokenizer_add_parse_error(
|
2664
|
+
tokenizer_add_parse_error (
|
2665
|
+
parser,
|
2666
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
|
2667
|
+
);
|
2844
2668
|
assert(temporary_buffer_is_empty(parser));
|
2845
2669
|
gumbo_tokenizer_set_state(
|
2846
2670
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2847
|
-
return
|
2671
|
+
return CONTINUE;
|
2848
2672
|
case '\'':
|
2849
|
-
tokenizer_add_parse_error(
|
2673
|
+
tokenizer_add_parse_error (
|
2674
|
+
parser,
|
2675
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
|
2676
|
+
);
|
2850
2677
|
assert(temporary_buffer_is_empty(parser));
|
2851
2678
|
gumbo_tokenizer_set_state(
|
2852
2679
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2853
|
-
return
|
2680
|
+
return CONTINUE;
|
2854
2681
|
case -1:
|
2855
|
-
tokenizer_add_parse_error(parser,
|
2856
|
-
|
2857
|
-
tokenizer->_reconsume_current_input = true;
|
2682
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2683
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2858
2684
|
tokenizer->_doc_type_state.force_quirks = true;
|
2859
|
-
emit_doctype(parser, output);
|
2860
|
-
return RETURN_ERROR;
|
2685
|
+
return emit_doctype(parser, output);
|
2861
2686
|
default:
|
2862
|
-
tokenizer_add_parse_error(parser,
|
2863
|
-
|
2687
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2688
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2864
2689
|
tokenizer->_doc_type_state.force_quirks = true;
|
2865
|
-
return
|
2690
|
+
return CONTINUE;
|
2866
2691
|
}
|
2867
2692
|
}
|
2868
2693
|
|
@@ -2878,33 +2703,30 @@ static StateResult handle_between_doctype_public_system_id_state (
|
|
2878
2703
|
case '\n':
|
2879
2704
|
case '\f':
|
2880
2705
|
case ' ':
|
2881
|
-
return
|
2706
|
+
return CONTINUE;
|
2882
2707
|
case '>':
|
2883
2708
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2884
|
-
emit_doctype(parser, output);
|
2885
|
-
return RETURN_SUCCESS;
|
2709
|
+
return emit_doctype(parser, output);
|
2886
2710
|
case '"':
|
2887
2711
|
assert(temporary_buffer_is_empty(parser));
|
2888
2712
|
gumbo_tokenizer_set_state(
|
2889
2713
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2890
|
-
return
|
2714
|
+
return CONTINUE;
|
2891
2715
|
case '\'':
|
2892
2716
|
assert(temporary_buffer_is_empty(parser));
|
2893
2717
|
gumbo_tokenizer_set_state(
|
2894
2718
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2895
|
-
return
|
2719
|
+
return CONTINUE;
|
2896
2720
|
case -1:
|
2897
|
-
tokenizer_add_parse_error(parser,
|
2898
|
-
|
2721
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2722
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2899
2723
|
tokenizer->_doc_type_state.force_quirks = true;
|
2900
|
-
emit_doctype(parser, output);
|
2901
|
-
return RETURN_ERROR;
|
2724
|
+
return emit_doctype(parser, output);
|
2902
2725
|
default:
|
2903
|
-
tokenizer_add_parse_error(parser,
|
2904
|
-
|
2726
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2727
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2905
2728
|
tokenizer->_doc_type_state.force_quirks = true;
|
2906
|
-
|
2907
|
-
return RETURN_ERROR;
|
2729
|
+
return CONTINUE;
|
2908
2730
|
}
|
2909
2731
|
}
|
2910
2732
|
|
@@ -2921,36 +2743,34 @@ static StateResult handle_after_doctype_system_keyword_state (
|
|
2921
2743
|
case '\f':
|
2922
2744
|
case ' ':
|
2923
2745
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
|
2924
|
-
return
|
2746
|
+
return CONTINUE;
|
2925
2747
|
case '"':
|
2926
|
-
tokenizer_add_parse_error(parser,
|
2748
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2927
2749
|
assert(temporary_buffer_is_empty(parser));
|
2928
2750
|
gumbo_tokenizer_set_state(
|
2929
2751
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2930
|
-
return
|
2752
|
+
return CONTINUE;
|
2931
2753
|
case '\'':
|
2932
|
-
tokenizer_add_parse_error(parser,
|
2754
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2933
2755
|
assert(temporary_buffer_is_empty(parser));
|
2934
2756
|
gumbo_tokenizer_set_state(
|
2935
2757
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2936
|
-
return
|
2758
|
+
return CONTINUE;
|
2937
2759
|
case '>':
|
2938
|
-
tokenizer_add_parse_error(parser,
|
2760
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
|
2939
2761
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2940
2762
|
tokenizer->_doc_type_state.force_quirks = true;
|
2941
|
-
emit_doctype(parser, output);
|
2942
|
-
return RETURN_ERROR;
|
2763
|
+
return emit_doctype(parser, output);
|
2943
2764
|
case -1:
|
2944
|
-
tokenizer_add_parse_error(parser,
|
2945
|
-
|
2765
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2766
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2946
2767
|
tokenizer->_doc_type_state.force_quirks = true;
|
2947
|
-
emit_doctype(parser, output);
|
2948
|
-
return RETURN_ERROR;
|
2768
|
+
return emit_doctype(parser, output);
|
2949
2769
|
default:
|
2950
|
-
tokenizer_add_parse_error(parser,
|
2951
|
-
|
2770
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2771
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2952
2772
|
tokenizer->_doc_type_state.force_quirks = true;
|
2953
|
-
return
|
2773
|
+
return CONTINUE;
|
2954
2774
|
}
|
2955
2775
|
}
|
2956
2776
|
|
@@ -2966,34 +2786,32 @@ static StateResult handle_before_doctype_system_id_state (
|
|
2966
2786
|
case '\n':
|
2967
2787
|
case '\f':
|
2968
2788
|
case ' ':
|
2969
|
-
return
|
2789
|
+
return CONTINUE;
|
2970
2790
|
case '"':
|
2971
2791
|
assert(temporary_buffer_is_empty(parser));
|
2972
2792
|
gumbo_tokenizer_set_state(
|
2973
2793
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2974
|
-
return
|
2794
|
+
return CONTINUE;
|
2975
2795
|
case '\'':
|
2976
2796
|
assert(temporary_buffer_is_empty(parser));
|
2977
2797
|
gumbo_tokenizer_set_state(
|
2978
2798
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2979
|
-
return
|
2799
|
+
return CONTINUE;
|
2980
2800
|
case '>':
|
2981
|
-
tokenizer_add_parse_error(parser,
|
2801
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
|
2982
2802
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2983
2803
|
tokenizer->_doc_type_state.force_quirks = true;
|
2984
|
-
emit_doctype(parser, output);
|
2985
|
-
return RETURN_ERROR;
|
2804
|
+
return emit_doctype(parser, output);
|
2986
2805
|
case -1:
|
2987
|
-
tokenizer_add_parse_error(parser,
|
2988
|
-
|
2806
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2807
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2989
2808
|
tokenizer->_doc_type_state.force_quirks = true;
|
2990
|
-
emit_doctype(parser, output);
|
2991
|
-
return RETURN_ERROR;
|
2809
|
+
return emit_doctype(parser, output);
|
2992
2810
|
default:
|
2993
|
-
tokenizer_add_parse_error(parser,
|
2994
|
-
|
2811
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
|
2812
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2995
2813
|
tokenizer->_doc_type_state.force_quirks = true;
|
2996
|
-
return
|
2814
|
+
return CONTINUE;
|
2997
2815
|
}
|
2998
2816
|
}
|
2999
2817
|
|
@@ -3008,28 +2826,26 @@ static StateResult handle_doctype_system_id_double_quoted_state (
|
|
3008
2826
|
case '"':
|
3009
2827
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
3010
2828
|
finish_doctype_system_id(parser);
|
3011
|
-
return
|
2829
|
+
return CONTINUE;
|
3012
2830
|
case '\0':
|
3013
|
-
tokenizer_add_parse_error(parser,
|
2831
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
3014
2832
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
3015
|
-
return
|
2833
|
+
return CONTINUE;
|
3016
2834
|
case '>':
|
3017
|
-
tokenizer_add_parse_error(parser,
|
2835
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
|
3018
2836
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3019
2837
|
tokenizer->_doc_type_state.force_quirks = true;
|
3020
2838
|
finish_doctype_system_id(parser);
|
3021
|
-
emit_doctype(parser, output);
|
3022
|
-
return RETURN_ERROR;
|
2839
|
+
return emit_doctype(parser, output);
|
3023
2840
|
case -1:
|
3024
|
-
tokenizer_add_parse_error(parser,
|
3025
|
-
|
2841
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2842
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3026
2843
|
tokenizer->_doc_type_state.force_quirks = true;
|
3027
2844
|
finish_doctype_system_id(parser);
|
3028
|
-
emit_doctype(parser, output);
|
3029
|
-
return RETURN_ERROR;
|
2845
|
+
return emit_doctype(parser, output);
|
3030
2846
|
default:
|
3031
2847
|
append_char_to_temporary_buffer(parser, c);
|
3032
|
-
return
|
2848
|
+
return CONTINUE;
|
3033
2849
|
}
|
3034
2850
|
}
|
3035
2851
|
|
@@ -3044,28 +2860,26 @@ static StateResult handle_doctype_system_id_single_quoted_state (
|
|
3044
2860
|
case '\'':
|
3045
2861
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
3046
2862
|
finish_doctype_system_id(parser);
|
3047
|
-
return
|
2863
|
+
return CONTINUE;
|
3048
2864
|
case '\0':
|
3049
|
-
tokenizer_add_parse_error(parser,
|
2865
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
3050
2866
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
3051
|
-
return
|
2867
|
+
return CONTINUE;
|
3052
2868
|
case '>':
|
3053
|
-
tokenizer_add_parse_error(parser,
|
2869
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
|
3054
2870
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3055
2871
|
tokenizer->_doc_type_state.force_quirks = true;
|
3056
2872
|
finish_doctype_system_id(parser);
|
3057
|
-
emit_doctype(parser, output);
|
3058
|
-
return RETURN_ERROR;
|
2873
|
+
return emit_doctype(parser, output);
|
3059
2874
|
case -1:
|
3060
|
-
tokenizer_add_parse_error(parser,
|
3061
|
-
|
2875
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2876
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3062
2877
|
tokenizer->_doc_type_state.force_quirks = true;
|
3063
2878
|
finish_doctype_system_id(parser);
|
3064
|
-
emit_doctype(parser, output);
|
3065
|
-
return RETURN_ERROR;
|
2879
|
+
return emit_doctype(parser, output);
|
3066
2880
|
default:
|
3067
2881
|
append_char_to_temporary_buffer(parser, c);
|
3068
|
-
return
|
2882
|
+
return CONTINUE;
|
3069
2883
|
}
|
3070
2884
|
}
|
3071
2885
|
|
@@ -3081,21 +2895,19 @@ static StateResult handle_after_doctype_system_id_state (
|
|
3081
2895
|
case '\n':
|
3082
2896
|
case '\f':
|
3083
2897
|
case ' ':
|
3084
|
-
return
|
2898
|
+
return CONTINUE;
|
3085
2899
|
case '>':
|
3086
2900
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3087
|
-
emit_doctype(parser, output);
|
3088
|
-
return RETURN_SUCCESS;
|
2901
|
+
return emit_doctype(parser, output);
|
3089
2902
|
case -1:
|
3090
|
-
tokenizer_add_parse_error(parser,
|
3091
|
-
|
2903
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
|
2904
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3092
2905
|
tokenizer->_doc_type_state.force_quirks = true;
|
3093
|
-
emit_doctype(parser, output);
|
3094
|
-
return RETURN_ERROR;
|
2906
|
+
return emit_doctype(parser, output);
|
3095
2907
|
default:
|
3096
|
-
tokenizer_add_parse_error(parser,
|
3097
|
-
|
3098
|
-
return
|
2908
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
|
2909
|
+
reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2910
|
+
return CONTINUE;
|
3099
2911
|
}
|
3100
2912
|
}
|
3101
2913
|
|
@@ -3106,33 +2918,370 @@ static StateResult handle_bogus_doctype_state (
|
|
3106
2918
|
int c,
|
3107
2919
|
GumboToken* output
|
3108
2920
|
) {
|
3109
|
-
|
2921
|
+
switch (c) {
|
2922
|
+
case '>':
|
3110
2923
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
3111
|
-
emit_doctype(parser, output);
|
3112
|
-
|
2924
|
+
return emit_doctype(parser, output);
|
2925
|
+
case '\0':
|
2926
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
|
2927
|
+
return CONTINUE;
|
2928
|
+
case -1:
|
2929
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
2930
|
+
return emit_doctype(parser, output);
|
2931
|
+
default:
|
2932
|
+
return CONTINUE;
|
3113
2933
|
}
|
3114
|
-
return NEXT_CHAR;
|
3115
2934
|
}
|
3116
2935
|
|
3117
2936
|
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
3118
|
-
static StateResult
|
2937
|
+
static StateResult handle_cdata_section_state (
|
3119
2938
|
GumboParser* parser,
|
3120
2939
|
GumboTokenizerState* tokenizer,
|
3121
2940
|
int c,
|
3122
2941
|
GumboToken* output
|
3123
2942
|
) {
|
3124
|
-
|
3125
|
-
|
3126
|
-
|
2943
|
+
switch (c) {
|
2944
|
+
case ']':
|
2945
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
|
2946
|
+
set_mark(parser);
|
2947
|
+
return CONTINUE;
|
2948
|
+
case -1:
|
2949
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
|
2950
|
+
return emit_eof(parser, output);
|
2951
|
+
default:
|
2952
|
+
return emit_char(parser, c, output);
|
2953
|
+
}
|
2954
|
+
}
|
2955
|
+
|
2956
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
|
2957
|
+
static StateResult handle_cdata_section_bracket_state (
|
2958
|
+
GumboParser* parser,
|
2959
|
+
GumboTokenizerState* tokenizer,
|
2960
|
+
int c,
|
2961
|
+
GumboToken* output
|
2962
|
+
) {
|
2963
|
+
switch (c) {
|
2964
|
+
case ']':
|
2965
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
|
2966
|
+
return CONTINUE;
|
2967
|
+
default:
|
2968
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2969
|
+
// Emit the ].
|
2970
|
+
return emit_from_mark(parser, output);
|
2971
|
+
}
|
2972
|
+
}
|
2973
|
+
|
2974
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
|
2975
|
+
static StateResult handle_cdata_section_end_state (
|
2976
|
+
GumboParser* parser,
|
2977
|
+
GumboTokenizerState* tokenizer,
|
2978
|
+
int c,
|
2979
|
+
GumboToken* output
|
2980
|
+
) {
|
2981
|
+
switch (c) {
|
2982
|
+
case ']':
|
2983
|
+
{
|
2984
|
+
// XXX: This is terrible. We want to emit a ] corresponding to the first
|
2985
|
+
// of the three in a row we've seen. So let's emit one token from the
|
2986
|
+
// temporary buffer (which will rewind 3 characters, emit the ] and
|
2987
|
+
// advance one). Next, let's clear the temporary buffer which will set the
|
2988
|
+
// mark to the middle of the three brackets. Finally, let's move to the
|
2989
|
+
// appropriate state.
|
2990
|
+
StateResult result = emit_from_mark(parser, output);
|
2991
|
+
tokenizer->_resume_pos = NULL;
|
2992
|
+
set_mark(parser);
|
2993
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
|
2994
|
+
return result;
|
2995
|
+
}
|
2996
|
+
case '>':
|
2997
|
+
// We're done with CDATA so move past the >, reset the token start point
|
2998
|
+
// to point after the >, and then reconsume in the data state.
|
2999
|
+
utf8iterator_next(&tokenizer->_input);
|
3127
3000
|
reset_token_start_point(tokenizer);
|
3128
|
-
|
3001
|
+
reconsume_in_state(parser, GUMBO_LEX_DATA);
|
3129
3002
|
tokenizer->_is_in_cdata = false;
|
3130
|
-
return
|
3131
|
-
|
3132
|
-
|
3003
|
+
return CONTINUE;
|
3004
|
+
default:
|
3005
|
+
reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
|
3006
|
+
return emit_from_mark(parser, output);
|
3007
|
+
}
|
3008
|
+
}
|
3009
|
+
|
3010
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
|
3011
|
+
static StateResult handle_character_reference_state (
|
3012
|
+
GumboParser* parser,
|
3013
|
+
GumboTokenizerState* tokenizer,
|
3014
|
+
int c,
|
3015
|
+
GumboToken* output
|
3016
|
+
) {
|
3017
|
+
if (gumbo_ascii_isalnum(c)) {
|
3018
|
+
reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
|
3019
|
+
return CONTINUE;
|
3020
|
+
}
|
3021
|
+
if (c == '#') {
|
3022
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
|
3023
|
+
return CONTINUE;
|
3024
|
+
}
|
3025
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3026
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3027
|
+
}
|
3028
|
+
|
3029
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
3030
|
+
static StateResult handle_named_character_reference_state (
|
3031
|
+
GumboParser* parser,
|
3032
|
+
GumboTokenizerState* tokenizer,
|
3033
|
+
int c,
|
3034
|
+
GumboToken* output
|
3035
|
+
) {
|
3036
|
+
const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
|
3037
|
+
const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
|
3038
|
+
int code_point[2];
|
3039
|
+
size_t size = match_named_char_ref(cur, end - cur, code_point);
|
3040
|
+
|
3041
|
+
if (size > 0) {
|
3042
|
+
utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
|
3043
|
+
int next = utf8iterator_current(&tokenizer->_input);
|
3044
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3045
|
+
if (character_reference_part_of_attribute(parser)
|
3046
|
+
&& cur[size-1] != ';'
|
3047
|
+
&& (next == '=' || gumbo_ascii_isalnum(next))) {
|
3048
|
+
GumboStringPiece str = { .data = cur, .length = size };
|
3049
|
+
append_string_to_temporary_buffer(parser, &str);
|
3050
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3051
|
+
}
|
3052
|
+
if (cur[size-1] != ';')
|
3053
|
+
tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
|
3054
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3055
|
+
return flush_char_ref(parser, code_point[0], code_point[1], output);
|
3056
|
+
}
|
3057
|
+
reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
|
3058
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3059
|
+
}
|
3060
|
+
|
3061
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
|
3062
|
+
static StateResult handle_ambiguous_ampersand_state (
|
3063
|
+
GumboParser* parser,
|
3064
|
+
GumboTokenizerState* tokenizer,
|
3065
|
+
int c,
|
3066
|
+
GumboToken* output
|
3067
|
+
) {
|
3068
|
+
if (gumbo_ascii_isalnum(c)) {
|
3069
|
+
if (character_reference_part_of_attribute(parser)) {
|
3070
|
+
append_char_to_tag_buffer(parser, c, true);
|
3071
|
+
return CONTINUE;
|
3072
|
+
}
|
3073
|
+
return emit_char(parser, c, output);
|
3074
|
+
}
|
3075
|
+
if (c == ';') {
|
3076
|
+
tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
|
3077
|
+
// fall through
|
3078
|
+
}
|
3079
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3080
|
+
return CONTINUE;
|
3081
|
+
}
|
3082
|
+
|
3083
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
|
3084
|
+
static StateResult handle_numeric_character_reference_state (
|
3085
|
+
GumboParser* parser,
|
3086
|
+
GumboTokenizerState* tokenizer,
|
3087
|
+
int c,
|
3088
|
+
GumboToken* output
|
3089
|
+
) {
|
3090
|
+
tokenizer->_character_reference_code = 0;
|
3091
|
+
switch (c) {
|
3092
|
+
case 'x':
|
3093
|
+
case 'X':
|
3094
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
|
3095
|
+
return CONTINUE;
|
3096
|
+
default:
|
3097
|
+
reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
|
3098
|
+
return CONTINUE;
|
3133
3099
|
}
|
3134
3100
|
}
|
3135
3101
|
|
3102
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-start-state
|
3103
|
+
static StateResult handle_hexadecimal_character_reference_start_state (
|
3104
|
+
GumboParser* parser,
|
3105
|
+
GumboTokenizerState* tokenizer,
|
3106
|
+
int c,
|
3107
|
+
GumboToken* output
|
3108
|
+
) {
|
3109
|
+
if (gumbo_ascii_isxdigit(c)) {
|
3110
|
+
reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
|
3111
|
+
return CONTINUE;
|
3112
|
+
}
|
3113
|
+
tokenizer_add_char_ref_error (
|
3114
|
+
parser,
|
3115
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
3116
|
+
-1
|
3117
|
+
);
|
3118
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3119
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3120
|
+
}
|
3121
|
+
|
3122
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
|
3123
|
+
static StateResult handle_decimal_character_reference_start_state (
|
3124
|
+
GumboParser* parser,
|
3125
|
+
GumboTokenizerState* tokenizer,
|
3126
|
+
int c,
|
3127
|
+
GumboToken* output
|
3128
|
+
) {
|
3129
|
+
if (gumbo_ascii_isdigit(c)) {
|
3130
|
+
reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
|
3131
|
+
return CONTINUE;
|
3132
|
+
}
|
3133
|
+
tokenizer_add_char_ref_error (
|
3134
|
+
parser,
|
3135
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
3136
|
+
-1
|
3137
|
+
);
|
3138
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3139
|
+
return flush_code_points_consumed_as_character_reference(parser, output);
|
3140
|
+
}
|
3141
|
+
|
3142
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-state
|
3143
|
+
static StateResult handle_hexadecimal_character_reference_state (
|
3144
|
+
GumboParser* parser,
|
3145
|
+
GumboTokenizerState* tokenizer,
|
3146
|
+
int c,
|
3147
|
+
GumboToken* output
|
3148
|
+
) {
|
3149
|
+
if (gumbo_ascii_isdigit(c)) {
|
3150
|
+
tokenizer->_character_reference_code =
|
3151
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0030);
|
3152
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3153
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3154
|
+
return CONTINUE;
|
3155
|
+
}
|
3156
|
+
if (gumbo_ascii_isupper_xdigit(c)) {
|
3157
|
+
tokenizer->_character_reference_code =
|
3158
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0037);
|
3159
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3160
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3161
|
+
return CONTINUE;
|
3162
|
+
}
|
3163
|
+
if (gumbo_ascii_islower_xdigit(c)) {
|
3164
|
+
tokenizer->_character_reference_code =
|
3165
|
+
tokenizer->_character_reference_code * 16 + (c - 0x0057);
|
3166
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3167
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3168
|
+
return CONTINUE;
|
3169
|
+
}
|
3170
|
+
if (c == ';') {
|
3171
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3172
|
+
return CONTINUE;
|
3173
|
+
}
|
3174
|
+
tokenizer_add_char_ref_error(
|
3175
|
+
parser,
|
3176
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
3177
|
+
tokenizer->_character_reference_code
|
3178
|
+
);
|
3179
|
+
reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3180
|
+
return CONTINUE;
|
3181
|
+
}
|
3182
|
+
|
3183
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
|
3184
|
+
static StateResult handle_decimal_character_reference_state (
|
3185
|
+
GumboParser* parser,
|
3186
|
+
GumboTokenizerState* tokenizer,
|
3187
|
+
int c,
|
3188
|
+
GumboToken* output
|
3189
|
+
) {
|
3190
|
+
if (gumbo_ascii_isdigit(c)) {
|
3191
|
+
tokenizer->_character_reference_code =
|
3192
|
+
tokenizer->_character_reference_code * 10 + (c - 0x0030);
|
3193
|
+
if (tokenizer->_character_reference_code > kUtf8MaxChar)
|
3194
|
+
tokenizer->_character_reference_code = kUtf8MaxChar+1;
|
3195
|
+
return CONTINUE;
|
3196
|
+
}
|
3197
|
+
if (c == ';') {
|
3198
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3199
|
+
return CONTINUE;
|
3200
|
+
}
|
3201
|
+
tokenizer_add_char_ref_error(
|
3202
|
+
parser,
|
3203
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
3204
|
+
tokenizer->_character_reference_code
|
3205
|
+
);
|
3206
|
+
reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
|
3207
|
+
return CONTINUE;
|
3208
|
+
}
|
3209
|
+
|
3210
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
3211
|
+
static StateResult handle_numeric_character_reference_end_state (
|
3212
|
+
GumboParser* parser,
|
3213
|
+
GumboTokenizerState* tokenizer,
|
3214
|
+
int c,
|
3215
|
+
GumboToken* output
|
3216
|
+
) {
|
3217
|
+
c = tokenizer->_character_reference_code;
|
3218
|
+
if (c == 0) {
|
3219
|
+
tokenizer_add_char_ref_error(
|
3220
|
+
parser,
|
3221
|
+
GUMBO_ERR_NULL_CHARACTER_REFERENCE,
|
3222
|
+
c
|
3223
|
+
);
|
3224
|
+
c = kUtf8ReplacementChar;
|
3225
|
+
} else if (c > kUtf8MaxChar) {
|
3226
|
+
tokenizer_add_char_ref_error(
|
3227
|
+
parser,
|
3228
|
+
GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
|
3229
|
+
c
|
3230
|
+
);
|
3231
|
+
c = kUtf8ReplacementChar;
|
3232
|
+
} else if (utf8_is_surrogate(c)) {
|
3233
|
+
tokenizer_add_char_ref_error(
|
3234
|
+
parser,
|
3235
|
+
GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
|
3236
|
+
c
|
3237
|
+
);
|
3238
|
+
c = kUtf8ReplacementChar;
|
3239
|
+
} else if (utf8_is_noncharacter(c)) {
|
3240
|
+
tokenizer_add_char_ref_error(
|
3241
|
+
parser,
|
3242
|
+
GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
|
3243
|
+
c
|
3244
|
+
);
|
3245
|
+
} else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
|
3246
|
+
tokenizer_add_char_ref_error(
|
3247
|
+
parser,
|
3248
|
+
GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
|
3249
|
+
c
|
3250
|
+
);
|
3251
|
+
switch (c) {
|
3252
|
+
case 0x80: c = 0x20AC; break;
|
3253
|
+
case 0x82: c = 0x201A; break;
|
3254
|
+
case 0x83: c = 0x0192; break;
|
3255
|
+
case 0x84: c = 0x201E; break;
|
3256
|
+
case 0x85: c = 0x2026; break;
|
3257
|
+
case 0x86: c = 0x2020; break;
|
3258
|
+
case 0x87: c = 0x2021; break;
|
3259
|
+
case 0x88: c = 0x02C6; break;
|
3260
|
+
case 0x89: c = 0x2030; break;
|
3261
|
+
case 0x8A: c = 0x0160; break;
|
3262
|
+
case 0x8B: c = 0x2039; break;
|
3263
|
+
case 0x8C: c = 0x0152; break;
|
3264
|
+
case 0x8E: c = 0x017D; break;
|
3265
|
+
case 0x91: c = 0x2018; break;
|
3266
|
+
case 0x92: c = 0x2019; break;
|
3267
|
+
case 0x93: c = 0x201C; break;
|
3268
|
+
case 0x94: c = 0x201D; break;
|
3269
|
+
case 0x95: c = 0x2022; break;
|
3270
|
+
case 0x96: c = 0x2013; break;
|
3271
|
+
case 0x97: c = 0x2014; break;
|
3272
|
+
case 0x98: c = 0x02DC; break;
|
3273
|
+
case 0x99: c = 0x2122; break;
|
3274
|
+
case 0x9A: c = 0x0161; break;
|
3275
|
+
case 0x9B: c = 0x203A; break;
|
3276
|
+
case 0x9C: c = 0x0153; break;
|
3277
|
+
case 0x9E: c = 0x017E; break;
|
3278
|
+
case 0x9F: c = 0x0178; break;
|
3279
|
+
}
|
3280
|
+
}
|
3281
|
+
reconsume_in_state(parser, tokenizer->_return_state);
|
3282
|
+
return flush_char_ref(parser, c, kGumboNoChar, output);
|
3283
|
+
}
|
3284
|
+
|
3136
3285
|
typedef StateResult (*GumboLexerStateFunction) (
|
3137
3286
|
GumboParser* parser,
|
3138
3287
|
GumboTokenizerState* tokenizer,
|
@@ -3141,77 +3290,89 @@ typedef StateResult (*GumboLexerStateFunction) (
|
|
3141
3290
|
);
|
3142
3291
|
|
3143
3292
|
static GumboLexerStateFunction dispatch_table[] = {
|
3144
|
-
handle_data_state,
|
3145
|
-
|
3146
|
-
|
3147
|
-
|
3148
|
-
|
3149
|
-
|
3150
|
-
|
3151
|
-
|
3152
|
-
|
3153
|
-
|
3154
|
-
|
3155
|
-
|
3156
|
-
|
3157
|
-
|
3158
|
-
|
3159
|
-
|
3160
|
-
|
3161
|
-
|
3162
|
-
|
3163
|
-
|
3164
|
-
|
3165
|
-
|
3166
|
-
|
3167
|
-
|
3168
|
-
|
3169
|
-
|
3170
|
-
|
3171
|
-
|
3172
|
-
|
3173
|
-
|
3174
|
-
|
3175
|
-
|
3176
|
-
|
3177
|
-
|
3178
|
-
|
3179
|
-
|
3180
|
-
|
3181
|
-
|
3182
|
-
|
3183
|
-
|
3184
|
-
|
3185
|
-
|
3186
|
-
|
3187
|
-
|
3188
|
-
|
3189
|
-
|
3190
|
-
|
3191
|
-
|
3192
|
-
|
3193
|
-
|
3194
|
-
|
3195
|
-
|
3196
|
-
|
3197
|
-
|
3198
|
-
|
3199
|
-
|
3200
|
-
|
3201
|
-
|
3202
|
-
|
3203
|
-
|
3204
|
-
|
3205
|
-
|
3206
|
-
|
3207
|
-
|
3208
|
-
|
3209
|
-
|
3210
|
-
|
3211
|
-
|
3293
|
+
[GUMBO_LEX_DATA] = handle_data_state,
|
3294
|
+
[GUMBO_LEX_RCDATA] = handle_rcdata_state,
|
3295
|
+
[GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
|
3296
|
+
[GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
|
3297
|
+
[GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
|
3298
|
+
[GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
|
3299
|
+
[GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
|
3300
|
+
[GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
|
3301
|
+
[GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
|
3302
|
+
[GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
|
3303
|
+
[GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
|
3304
|
+
[GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
|
3305
|
+
[GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
|
3306
|
+
[GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
|
3307
|
+
[GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
|
3308
|
+
[GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
|
3309
|
+
[GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
|
3310
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
|
3311
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
|
3312
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
|
3313
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
|
3314
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
|
3315
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
|
3316
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
|
3317
|
+
[GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
|
3318
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
|
3319
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
|
3320
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
|
3321
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
|
3322
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
|
3323
|
+
[GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
|
3324
|
+
[GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
|
3325
|
+
[GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
|
3326
|
+
[GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
|
3327
|
+
[GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
|
3328
|
+
[GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
|
3329
|
+
[GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
|
3330
|
+
[GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
|
3331
|
+
[GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
|
3332
|
+
[GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
|
3333
|
+
[GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
|
3334
|
+
[GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
|
3335
|
+
[GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
|
3336
|
+
[GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
|
3337
|
+
[GUMBO_LEX_COMMENT] = handle_comment_state,
|
3338
|
+
[GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
|
3339
|
+
[GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
|
3340
|
+
[GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
|
3341
|
+
[GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
|
3342
|
+
[GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
|
3343
|
+
[GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
|
3344
|
+
[GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
|
3345
|
+
[GUMBO_LEX_DOCTYPE] = handle_doctype_state,
|
3346
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
|
3347
|
+
[GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
|
3348
|
+
[GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
|
3349
|
+
[GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
|
3350
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
|
3351
|
+
[GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
|
3352
|
+
[GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
|
3353
|
+
[GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
|
3354
|
+
[GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
|
3355
|
+
[GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
|
3356
|
+
[GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
|
3357
|
+
[GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
|
3358
|
+
[GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
|
3359
|
+
[GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
|
3360
|
+
[GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
|
3361
|
+
[GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
|
3362
|
+
[GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
|
3363
|
+
[GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
|
3364
|
+
[GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
|
3365
|
+
[GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
|
3366
|
+
[GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
|
3367
|
+
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
|
3368
|
+
[GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
|
3369
|
+
[GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
|
3370
|
+
[GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
|
3371
|
+
[GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
|
3372
|
+
[GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
|
3212
3373
|
};
|
3213
3374
|
|
3214
|
-
|
3375
|
+
void gumbo_lex(GumboParser* parser, GumboToken* output) {
|
3215
3376
|
// Because of the spec requirements that...
|
3216
3377
|
//
|
3217
3378
|
// 1. Tokens be handled immediately by the parser upon emission.
|
@@ -3236,15 +3397,15 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3236
3397
|
// isn't consumed twice.
|
3237
3398
|
tokenizer->_reconsume_current_input = false;
|
3238
3399
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
3239
|
-
return
|
3400
|
+
return;
|
3240
3401
|
}
|
3241
3402
|
|
3242
|
-
if (
|
3243
|
-
return
|
3403
|
+
if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
|
3404
|
+
return;
|
3244
3405
|
}
|
3245
3406
|
|
3246
3407
|
while (1) {
|
3247
|
-
assert(!tokenizer->
|
3408
|
+
assert(!tokenizer->_resume_pos);
|
3248
3409
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
3249
3410
|
int c = utf8iterator_current(&tokenizer->_input);
|
3250
3411
|
GumboTokenizerEnum state = tokenizer->_state;
|
@@ -3255,11 +3416,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
3255
3416
|
bool should_advance = !tokenizer->_reconsume_current_input;
|
3256
3417
|
tokenizer->_reconsume_current_input = false;
|
3257
3418
|
|
3258
|
-
if (result ==
|
3259
|
-
return
|
3260
|
-
} else if (result == RETURN_ERROR) {
|
3261
|
-
return false;
|
3262
|
-
}
|
3419
|
+
if (result == EMIT_TOKEN)
|
3420
|
+
return;
|
3263
3421
|
|
3264
3422
|
if (should_advance) {
|
3265
3423
|
utf8iterator_next(&tokenizer->_input);
|
@@ -3285,12 +3443,16 @@ void gumbo_token_destroy(GumboToken* token) {
|
|
3285
3443
|
}
|
3286
3444
|
}
|
3287
3445
|
gumbo_free((void*) token->v.start_tag.attributes.data);
|
3288
|
-
if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
|
3446
|
+
if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
|
3289
3447
|
gumbo_free(token->v.start_tag.name);
|
3448
|
+
token->v.start_tag.name = NULL;
|
3449
|
+
}
|
3290
3450
|
return;
|
3291
3451
|
case GUMBO_TOKEN_END_TAG:
|
3292
|
-
if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
3452
|
+
if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
|
3293
3453
|
gumbo_free(token->v.end_tag.name);
|
3454
|
+
token->v.end_tag.name = NULL;
|
3455
|
+
}
|
3294
3456
|
break;
|
3295
3457
|
case GUMBO_TOKEN_COMMENT:
|
3296
3458
|
gumbo_free((void*) token->v.text);
|