nokogumbo 1.5.0 → 2.0.0.pre.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifndef GUMBO_TAG_LOOKUP_H_
|
2
|
+
#define GUMBO_TAG_LOOKUP_H_
|
3
|
+
|
4
|
+
#include "gumbo.h"
|
5
|
+
|
6
|
+
typedef struct {
|
7
|
+
const char *key;
|
8
|
+
const GumboTag tag;
|
9
|
+
} TagHashSlot;
|
10
|
+
|
11
|
+
const TagHashSlot *gumbo_tag_lookup(const char *str, size_t len);
|
12
|
+
|
13
|
+
#endif // GUMBO_TAG_LOOKUP_H_
|
@@ -1,26 +1,6 @@
|
|
1
|
-
// Copyright 2011 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
1
|
#ifndef GUMBO_TOKEN_TYPE_H_
|
18
2
|
#define GUMBO_TOKEN_TYPE_H_
|
19
3
|
|
20
|
-
#ifdef __cplusplus
|
21
|
-
extern "C" {
|
22
|
-
#endif
|
23
|
-
|
24
4
|
// An enum representing the type of token.
|
25
5
|
typedef enum {
|
26
6
|
GUMBO_TOKEN_DOCTYPE,
|
@@ -34,8 +14,4 @@ typedef enum {
|
|
34
14
|
GUMBO_TOKEN_EOF
|
35
15
|
} GumboTokenType;
|
36
16
|
|
37
|
-
#
|
38
|
-
} // extern C
|
39
|
-
#endif
|
40
|
-
|
41
|
-
#endif // GUMBO_TOKEN_TYPE_H_
|
17
|
+
#endif // GUMBO_TOKEN_TYPE_H_
|
@@ -1,69 +1,68 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
1
|
+
/*
|
2
|
+
Copyright 2010 Google Inc.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
*/
|
16
|
+
|
17
|
+
/*
|
18
|
+
Coding conventions specific to this file:
|
19
|
+
|
20
|
+
1. Functions that fill in a token should be named emit_*, and should be
|
21
|
+
followed immediately by a return from the tokenizer (true if no error
|
22
|
+
occurred, false if an error occurred). Sometimes the emit functions
|
23
|
+
themselves return a boolean so that they can be combined with the return
|
24
|
+
statement; in this case, they should match this convention.
|
25
|
+
2. Functions that shuffle data from temporaries to final API structures
|
26
|
+
should be named finish_*, and be called just before the tokenizer exits the
|
27
|
+
state that accumulates the temporary.
|
28
|
+
3. All internal data structures should be kept in an initialized state from
|
29
|
+
tokenizer creation onwards, ready to accept input. When a buffer's flushed
|
30
|
+
and reset, it should be deallocated and immediately reinitialized.
|
31
|
+
4. Make sure there are appropriate break statements following each state.
|
32
|
+
5. Assertions on the state of the temporary and tag buffers are usually a
|
33
|
+
good idea, and should go at the entry point of each state when added.
|
34
|
+
6. Statement order within states goes:
|
35
|
+
1. Add parse errors, if appropriate.
|
36
|
+
2. Call finish_* functions to build up tag state.
|
37
|
+
3. Switch to new state. Set _reconsume flag if appropriate.
|
38
|
+
4. Perform any other temporary buffer manipulation.
|
39
|
+
5. Emit tokens.
|
40
|
+
6. Return/break.
|
41
|
+
This order ensures that we can verify that every emit is followed by
|
42
|
+
a return, ensures that the correct state is recorded with any parse
|
43
|
+
errors, and prevents parse error position from being messed up by
|
44
|
+
possible mark/resets in temporary buffer manipulation.
|
45
|
+
*/
|
46
46
|
|
47
47
|
#include <assert.h>
|
48
|
-
#include <stdbool.h>
|
49
48
|
#include <string.h>
|
50
|
-
|
49
|
+
#include "tokenizer.h"
|
50
|
+
#include "ascii.h"
|
51
51
|
#include "attribute.h"
|
52
52
|
#include "char_ref.h"
|
53
53
|
#include "error.h"
|
54
54
|
#include "gumbo.h"
|
55
55
|
#include "parser.h"
|
56
56
|
#include "string_buffer.h"
|
57
|
-
#include "string_piece.h"
|
58
57
|
#include "token_type.h"
|
59
58
|
#include "tokenizer_states.h"
|
60
59
|
#include "utf8.h"
|
61
60
|
#include "util.h"
|
62
61
|
#include "vector.h"
|
63
62
|
|
64
|
-
// Compared against _script_data_buffer to determine if we're in
|
65
|
-
// script mode.
|
66
|
-
const GumboStringPiece kScriptTag = {"script", 6};
|
63
|
+
// Compared against _script_data_buffer to determine if we're in
|
64
|
+
// double-escaped script mode.
|
65
|
+
static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
|
67
66
|
|
68
67
|
// An enum for the return value of each individual state.
|
69
68
|
typedef enum {
|
@@ -86,31 +85,35 @@ typedef struct GumboInternalTagState {
|
|
86
85
|
// the buffer can be re-used for building up attributes.
|
87
86
|
GumboTag _tag;
|
88
87
|
|
88
|
+
// The current tag name. It's set at the same time that _tag is set if _tag
|
89
|
+
// is set to GUMBO_TAG_UNKNOWN.
|
90
|
+
char *_name;
|
91
|
+
|
89
92
|
// The starting location of the text in the buffer.
|
90
93
|
GumboSourcePosition _start_pos;
|
91
94
|
|
92
|
-
// The current list of attributes.
|
93
|
-
// transferred) to the GumboStartTag token upon completion of the tag.
|
95
|
+
// The current list of attributes. This is copied (and ownership of its data
|
96
|
+
// transferred) to the GumboStartTag token upon completion of the tag. New
|
94
97
|
// attributes are added as soon as their attribute name state is complete, and
|
95
98
|
// values are filled in by operating on _attributes.data[attributes.length-1].
|
96
99
|
GumboVector /* GumboAttribute */ _attributes;
|
97
100
|
|
98
|
-
// If true, the next attribute value to be finished should be dropped.
|
101
|
+
// If true, the next attribute value to be finished should be dropped. This
|
99
102
|
// happens if a duplicate attribute name is encountered - we want to consume
|
100
103
|
// the attribute value, but shouldn't overwrite the existing value.
|
101
104
|
bool _drop_next_attr_value;
|
102
105
|
|
103
106
|
// The state that caused the tokenizer to switch into a character reference in
|
104
|
-
// attribute value state.
|
105
|
-
// character, and is switched back to on completion.
|
107
|
+
// attribute value state. This is used to set the additional allowed
|
108
|
+
// character, and is switched back to on completion. Initialized as the
|
106
109
|
// tokenizer enters the character reference state.
|
107
110
|
GumboTokenizerEnum _attr_value_state;
|
108
111
|
|
109
|
-
// The last start tag to have been emitted by the tokenizer.
|
112
|
+
// The last start tag to have been emitted by the tokenizer. This is
|
110
113
|
// necessary to check for appropriate end tags.
|
111
114
|
GumboTag _last_start_tag;
|
112
115
|
|
113
|
-
// If true, then this is a start tag.
|
116
|
+
// If true, then this is a start tag. If false, it's an end tag. This is
|
114
117
|
// necessary to generate the appropriate token type at tag-closing time.
|
115
118
|
bool _is_start_tag;
|
116
119
|
|
@@ -121,43 +124,43 @@ typedef struct GumboInternalTagState {
|
|
121
124
|
// This is the main tokenizer state struct, containing all state used in
|
122
125
|
// tokenizing the input stream.
|
123
126
|
typedef struct GumboInternalTokenizerState {
|
124
|
-
// The current lexer state.
|
127
|
+
// The current lexer state. Starts in GUMBO_LEX_DATA.
|
125
128
|
GumboTokenizerEnum _state;
|
126
129
|
|
127
130
|
// A flag indicating whether the current input character needs to be reconsumed
|
128
131
|
// in another state, or whether the next input character should be read for
|
129
|
-
// the next iteration of the state loop.
|
132
|
+
// the next iteration of the state loop. This is set when the spec reads
|
130
133
|
// "Reconsume the current input character in..."
|
131
134
|
bool _reconsume_current_input;
|
132
135
|
|
133
|
-
// A flag indicating whether the current node is a foreign element.
|
136
|
+
// A flag indicating whether the current node is a foreign element. This is
|
134
137
|
// set by gumbo_tokenizer_set_is_current_node_foreign and checked in the
|
135
138
|
// markup declaration state.
|
136
139
|
bool _is_current_node_foreign;
|
137
140
|
|
138
|
-
// A flag indicating whether the tokenizer is in a CDATA section.
|
141
|
+
// A flag indicating whether the tokenizer is in a CDATA section. If so, then
|
139
142
|
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
140
143
|
bool _is_in_cdata;
|
141
144
|
|
142
145
|
// Certain states (notably character references) may emit two character tokens
|
143
|
-
// at once, but the contract for lex() fills in only one token at a time.
|
146
|
+
// at once, but the contract for lex() fills in only one token at a time. The
|
144
147
|
// extra character is buffered here, and then this is checked on entry to
|
145
|
-
// lex().
|
146
|
-
// returns from the lexer.
|
148
|
+
// lex(). If a character is stored here, it's immediately emitted and control
|
149
|
+
// returns from the lexer. kGumboNoChar is used to represent 'no character
|
147
150
|
// stored.'
|
148
151
|
//
|
149
152
|
// Note that characters emitted through this mechanism will have their source
|
150
153
|
// position marked as the character under the mark, i.e. multiple characters
|
151
|
-
// may be emitted with the same position.
|
152
|
-
// references, but unsuitable for many other cases.
|
154
|
+
// may be emitted with the same position. This is desirable for character
|
155
|
+
// references, but unsuitable for many other cases. Use the _temporary_buffer
|
153
156
|
// mechanism if the buffered characters must have their original positions in
|
154
157
|
// the document.
|
155
158
|
int _buffered_emit_char;
|
156
159
|
|
157
160
|
// A temporary buffer to accumulate characters, as described by the "temporary
|
158
|
-
// buffer" phrase in the tokenizer spec.
|
161
|
+
// buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
|
159
162
|
// way: we record the specific character to go into the buffer, which may
|
160
|
-
// sometimes be a lowercased version of the actual input character.
|
163
|
+
// sometimes be a lowercased version of the actual input character. However,
|
161
164
|
// we *also* use utf8iterator_mark() to record the position at tag start.
|
162
165
|
// When we start flushing the temporary buffer, we set _temporary_buffer_emit
|
163
166
|
// to the start of it, and then increment it for each call to the tokenizer.
|
@@ -167,13 +170,13 @@ typedef struct GumboInternalTokenizerState {
|
|
167
170
|
GumboStringBuffer _temporary_buffer;
|
168
171
|
|
169
172
|
// The current cursor position we're emitting from within
|
170
|
-
// _temporary_buffer.data.
|
173
|
+
// _temporary_buffer.data. NULL whenever we're not flushing the buffer.
|
171
174
|
const char* _temporary_buffer_emit;
|
172
175
|
|
173
176
|
// The temporary buffer is also used by the spec to check whether we should
|
174
177
|
// enter the script data double escaped state, but we can't use the same
|
175
178
|
// buffer for both because we have to flush out "<s" as emits while still
|
176
|
-
// maintaining the context that will eventually become "script".
|
179
|
+
// maintaining the context that will eventually become "script". This is a
|
177
180
|
// separate buffer that's used in place of the temporary buffer for states
|
178
181
|
// that may enter the script data double escape start state.
|
179
182
|
GumboStringBuffer _script_data_buffer;
|
@@ -189,7 +192,7 @@ typedef struct GumboInternalTokenizerState {
|
|
189
192
|
// Current tag state.
|
190
193
|
GumboTagState _tag_state;
|
191
194
|
|
192
|
-
// Doctype state.
|
195
|
+
// Doctype state. We use the temporary buffer to accumulate characters (it's
|
193
196
|
// not used for anything else in the doctype states), and then freshly
|
194
197
|
// allocate the strings in the doctype token, then copy it over on emit.
|
195
198
|
GumboTokenDocType _doc_type_state;
|
@@ -199,8 +202,10 @@ typedef struct GumboInternalTokenizerState {
|
|
199
202
|
} GumboTokenizerState;
|
200
203
|
|
201
204
|
// Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
|
202
|
-
static void tokenizer_add_parse_error(
|
203
|
-
|
205
|
+
static void tokenizer_add_parse_error (
|
206
|
+
GumboParser* parser,
|
207
|
+
GumboErrorType type
|
208
|
+
) {
|
204
209
|
GumboError* error = gumbo_add_error(parser);
|
205
210
|
if (!error) {
|
206
211
|
return;
|
@@ -309,14 +314,14 @@ static void tokenizer_add_parse_error(
|
|
309
314
|
}
|
310
315
|
|
311
316
|
static bool is_alpha(int c) {
|
312
|
-
// We don't use ISO C
|
313
|
-
//
|
314
|
-
// independent
|
315
|
-
return (
|
317
|
+
// We don't use the ISO C isalpha() function here because it depends
|
318
|
+
// on the current locale, whereas the behavior in the HTML5 spec is
|
319
|
+
// locale-independent.
|
320
|
+
return ((unsigned) c | 32) - 'a' < 26;
|
316
321
|
}
|
317
322
|
|
318
323
|
static int ensure_lowercase(int c) {
|
319
|
-
return c
|
324
|
+
return gumbo_ascii_tolower(c);
|
320
325
|
}
|
321
326
|
|
322
327
|
static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
|
@@ -346,7 +351,7 @@ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
|
|
346
351
|
// text that will eventually be emitted, it needs to be called a couple of
|
347
352
|
// states before the spec says "Set the temporary buffer to the empty string".
|
348
353
|
// In general, this should be called whenever there's a transition to a
|
349
|
-
// "less-than sign state".
|
354
|
+
// "less-than sign state". The initial < and possibly / then need to be
|
350
355
|
// appended to the temporary buffer, their presence needs to be accounted for in
|
351
356
|
// states that compare the temporary buffer against a literal value, and
|
352
357
|
// spec stanzas that say "emit a < and / character token along with a character
|
@@ -356,30 +361,40 @@ static void clear_temporary_buffer(GumboParser* parser) {
|
|
356
361
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
357
362
|
assert(!tokenizer->_temporary_buffer_emit);
|
358
363
|
utf8iterator_mark(&tokenizer->_input);
|
359
|
-
gumbo_string_buffer_clear(
|
364
|
+
gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
|
360
365
|
// The temporary buffer and script data buffer are the same object in the
|
361
366
|
// spec, so the script data buffer should be cleared as well.
|
362
|
-
gumbo_string_buffer_clear(
|
367
|
+
gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
|
363
368
|
}
|
364
369
|
|
365
370
|
// Appends a codepoint to the temporary buffer.
|
366
|
-
static void append_char_to_temporary_buffer(
|
367
|
-
|
368
|
-
|
369
|
-
|
371
|
+
static void append_char_to_temporary_buffer (
|
372
|
+
GumboParser* parser,
|
373
|
+
int codepoint
|
374
|
+
) {
|
375
|
+
gumbo_string_buffer_append_codepoint (
|
376
|
+
codepoint,
|
377
|
+
&parser->_tokenizer_state->_temporary_buffer
|
378
|
+
);
|
370
379
|
}
|
371
380
|
|
372
|
-
// Checks to see if the temporary buffer equals a certain string.
|
373
|
-
// Make sure this remains side-effect free; it's used in assertions.
|
374
381
|
#ifndef NDEBUG
|
375
|
-
static bool
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
return
|
382
|
-
|
382
|
+
static bool temporary_buffer_equals__ (
|
383
|
+
const GumboParser* parser,
|
384
|
+
const char* text,
|
385
|
+
size_t text_len
|
386
|
+
) {
|
387
|
+
const GumboStringBuffer* buf = &parser->_tokenizer_state->_temporary_buffer;
|
388
|
+
return
|
389
|
+
text_len == buf->length
|
390
|
+
&& memcmp(buf->data, text, text_len) == 0;
|
391
|
+
}
|
392
|
+
|
393
|
+
#define temporary_buffer_equals(parser, text) \
|
394
|
+
temporary_buffer_equals__(parser, "" text, sizeof(text) - 1)
|
395
|
+
|
396
|
+
static bool temporary_buffer_is_empty(const GumboParser* parser) {
|
397
|
+
return parser->_tokenizer_state->_temporary_buffer.length == 0;
|
383
398
|
}
|
384
399
|
#endif
|
385
400
|
|
@@ -387,9 +402,9 @@ static void doc_type_state_init(GumboParser* parser) {
|
|
387
402
|
GumboTokenDocType* doc_type_state =
|
388
403
|
&parser->_tokenizer_state->_doc_type_state;
|
389
404
|
// We initialize these to NULL here so that we don't end up leaking memory if
|
390
|
-
// we never see a doctype token.
|
405
|
+
// we never see a doctype token. When we do see a doctype token, we reset
|
391
406
|
// them to a freshly-allocated empty string so that we can present a uniform
|
392
|
-
// interface to client code and not make them check for null.
|
407
|
+
// interface to client code and not make them check for null. Ownership is
|
393
408
|
// transferred to the doctype token when it's emitted.
|
394
409
|
doc_type_state->name = NULL;
|
395
410
|
doc_type_state->public_identifier = NULL;
|
@@ -408,7 +423,7 @@ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
|
|
408
423
|
}
|
409
424
|
|
410
425
|
// Sets the tag buffer original text and start point to the current iterator
|
411
|
-
// position.
|
426
|
+
// position. This is necessary because attribute names & values may have
|
412
427
|
// whitespace preceding them, and so we can't assume that the actual token
|
413
428
|
// starting point was the end of the last tag buffer usage.
|
414
429
|
static void reset_tag_buffer_start_point(GumboParser* parser) {
|
@@ -423,15 +438,14 @@ static void reset_tag_buffer_start_point(GumboParser* parser) {
|
|
423
438
|
// and clears the temporary buffer.
|
424
439
|
static void finish_temporary_buffer(GumboParser* parser, const char** output) {
|
425
440
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
426
|
-
*output =
|
427
|
-
gumbo_string_buffer_to_string(parser, &tokenizer->_temporary_buffer);
|
441
|
+
*output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
|
428
442
|
clear_temporary_buffer(parser);
|
429
443
|
}
|
430
444
|
|
431
445
|
// Advances the iterator past the end of the token, and then fills in the
|
432
|
-
// relevant position fields.
|
446
|
+
// relevant position fields. It's assumed that after every emit, the tokenizer
|
433
447
|
// will immediately return (letting the tree-construction stage read the filled
|
434
|
-
// in Token).
|
448
|
+
// in Token). Thus, it's safe to advance the input stream here, since it will
|
435
449
|
// bypass the advance at the bottom of the state machine loop.
|
436
450
|
//
|
437
451
|
// Since this advances the iterator and resets the current input, make sure to
|
@@ -450,7 +464,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
|
|
450
464
|
if (token->original_text.length > 0 &&
|
451
465
|
token->original_text.data[token->original_text.length - 1] == '\r') {
|
452
466
|
// The UTF8 iterator will ignore carriage returns in the input stream, which
|
453
|
-
// means that the next token may start one past a \r character.
|
467
|
+
// means that the next token may start one past a \r character. The pointer
|
454
468
|
// arithmetic above results in that \r being appended to the original text
|
455
469
|
// of the preceding token, so we have to adjust its length here to chop the
|
456
470
|
// \r off.
|
@@ -463,7 +477,7 @@ static void finish_token(GumboParser* parser, GumboToken* token) {
|
|
463
477
|
static void finish_doctype_public_id(GumboParser* parser) {
|
464
478
|
GumboTokenDocType* doc_type_state =
|
465
479
|
&parser->_tokenizer_state->_doc_type_state;
|
466
|
-
|
480
|
+
gumbo_free((void*) doc_type_state->public_identifier);
|
467
481
|
finish_temporary_buffer(parser, &doc_type_state->public_identifier);
|
468
482
|
doc_type_state->has_public_identifier = true;
|
469
483
|
}
|
@@ -473,7 +487,7 @@ static void finish_doctype_public_id(GumboParser* parser) {
|
|
473
487
|
static void finish_doctype_system_id(GumboParser* parser) {
|
474
488
|
GumboTokenDocType* doc_type_state =
|
475
489
|
&parser->_tokenizer_state->_doc_type_state;
|
476
|
-
|
490
|
+
gumbo_free((void*) doc_type_state->system_identifier);
|
477
491
|
finish_temporary_buffer(parser, &doc_type_state->system_identifier);
|
478
492
|
doc_type_state->has_system_identifier = true;
|
479
493
|
}
|
@@ -495,7 +509,7 @@ static StateResult emit_replacement_char(
|
|
495
509
|
return RETURN_ERROR;
|
496
510
|
}
|
497
511
|
|
498
|
-
// Writes an EOF character token.
|
512
|
+
// Writes an EOF character token. Always returns RETURN_SUCCESS.
|
499
513
|
static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
|
500
514
|
emit_char(parser, -1, output);
|
501
515
|
return RETURN_SUCCESS;
|
@@ -520,7 +534,9 @@ static void emit_doctype(GumboParser* parser, GumboToken* output) {
|
|
520
534
|
// Debug-only function that explicitly sets the attribute vector data to NULL so
|
521
535
|
// it can be asserted on tag creation, verifying that there are no memory leaks.
|
522
536
|
static void mark_tag_state_as_empty(GumboTagState* tag_state) {
|
537
|
+
UNUSED_IF_NDEBUG(tag_state);
|
523
538
|
#ifndef NDEBUG
|
539
|
+
tag_state->_name = NULL;
|
524
540
|
tag_state->_attributes = kGumboEmptyVector;
|
525
541
|
#endif
|
526
542
|
}
|
@@ -532,6 +548,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
532
548
|
if (tag_state->_is_start_tag) {
|
533
549
|
output->type = GUMBO_TOKEN_START_TAG;
|
534
550
|
output->v.start_tag.tag = tag_state->_tag;
|
551
|
+
output->v.start_tag.name = tag_state->_name;
|
535
552
|
output->v.start_tag.attributes = tag_state->_attributes;
|
536
553
|
output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
|
537
554
|
tag_state->_last_start_tag = tag_state->_tag;
|
@@ -540,23 +557,28 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
540
557
|
"Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
541
558
|
} else {
|
542
559
|
output->type = GUMBO_TOKEN_END_TAG;
|
543
|
-
output->v.end_tag = tag_state->_tag;
|
560
|
+
output->v.end_tag.tag = tag_state->_tag;
|
561
|
+
output->v.end_tag.name = tag_state->_name;
|
562
|
+
output->v.end_tag.is_self_closing = tag_state->_is_self_closing;
|
544
563
|
// In end tags, ownership of the attributes vector is not transferred to the
|
545
564
|
// token, but it's still initialized as normal, so it must be manually
|
546
|
-
// deallocated.
|
565
|
+
// deallocated. There may also be attributes to destroy, in certain broken
|
547
566
|
// cases like </div</th> (the "th" is an attribute there).
|
548
567
|
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
549
|
-
gumbo_destroy_attribute(
|
568
|
+
gumbo_destroy_attribute(tag_state->_attributes.data[i]);
|
550
569
|
}
|
551
|
-
|
570
|
+
gumbo_free(tag_state->_attributes.data);
|
552
571
|
mark_tag_state_as_empty(tag_state);
|
553
572
|
gumbo_debug(
|
554
573
|
"Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
555
574
|
}
|
556
|
-
gumbo_string_buffer_destroy(
|
575
|
+
gumbo_string_buffer_destroy(&tag_state->_buffer);
|
557
576
|
finish_token(parser, output);
|
558
|
-
gumbo_debug(
|
559
|
-
|
577
|
+
gumbo_debug (
|
578
|
+
"Original text = %.*s.\n",
|
579
|
+
(int) output->original_text.length,
|
580
|
+
output->original_text.data
|
581
|
+
);
|
560
582
|
assert(output->original_text.length >= 2);
|
561
583
|
assert(output->original_text.data[0] == '<');
|
562
584
|
assert(output->original_text.data[output->original_text.length - 1] == '>');
|
@@ -570,26 +592,36 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
570
592
|
static void abandon_current_tag(GumboParser* parser) {
|
571
593
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
572
594
|
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
573
|
-
gumbo_destroy_attribute(
|
595
|
+
gumbo_destroy_attribute(tag_state->_attributes.data[i]);
|
574
596
|
}
|
575
|
-
|
597
|
+
gumbo_free(tag_state->_attributes.data);
|
576
598
|
mark_tag_state_as_empty(tag_state);
|
577
|
-
gumbo_string_buffer_destroy(
|
599
|
+
gumbo_string_buffer_destroy(&tag_state->_buffer);
|
578
600
|
gumbo_debug("Abandoning current tag.\n");
|
579
601
|
}
|
580
602
|
|
581
|
-
// Wraps the
|
582
|
-
// appropriate TokenizerState modifications.
|
603
|
+
// Wraps the gumbo_consume_char_ref function to handle its output and make the
|
604
|
+
// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
|
583
605
|
// error occurred, RETURN_SUCCESS otherwise.
|
584
|
-
static StateResult emit_char_ref(
|
585
|
-
|
606
|
+
static StateResult emit_char_ref (
|
607
|
+
GumboParser* parser,
|
608
|
+
int additional_allowed_char,
|
609
|
+
bool UNUSED_ARG(is_in_attribute),
|
610
|
+
GumboToken* output
|
611
|
+
) {
|
586
612
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
587
613
|
OneOrTwoCodepoints char_ref;
|
588
|
-
bool status =
|
589
|
-
|
614
|
+
bool status = gumbo_consume_char_ref (
|
615
|
+
parser,
|
616
|
+
&tokenizer->_input,
|
617
|
+
additional_allowed_char,
|
618
|
+
false,
|
619
|
+
&char_ref
|
620
|
+
);
|
590
621
|
if (char_ref.first != kGumboNoChar) {
|
591
|
-
//
|
592
|
-
// so we need to be sure not advance it again before
|
622
|
+
// gumbo_consume_char_ref ends with the iterator pointing at the next
|
623
|
+
// character, so we need to be sure not to advance it again before
|
624
|
+
// reading the next token.
|
593
625
|
tokenizer->_reconsume_current_input = true;
|
594
626
|
emit_char(parser, char_ref.first, output);
|
595
627
|
tokenizer->_buffered_emit_char = char_ref.second;
|
@@ -599,9 +631,9 @@ static StateResult emit_char_ref(GumboParser* parser,
|
|
599
631
|
return status ? RETURN_SUCCESS : RETURN_ERROR;
|
600
632
|
}
|
601
633
|
|
602
|
-
// Emits a comment token.
|
634
|
+
// Emits a comment token. Comments use the temporary buffer to accumulate their
|
603
635
|
// data, and then it's copied over and released to the 'text' field of the
|
604
|
-
// GumboToken union.
|
636
|
+
// GumboToken union. Always returns RETURN_SUCCESS.
|
605
637
|
static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
|
606
638
|
output->type = GUMBO_TOKEN_COMMENT;
|
607
639
|
finish_temporary_buffer(parser, &output->v.text);
|
@@ -626,11 +658,11 @@ static bool maybe_emit_from_temporary_buffer(
|
|
626
658
|
}
|
627
659
|
|
628
660
|
assert(*c == utf8iterator_current(&tokenizer->_input));
|
629
|
-
// emit_char also advances the input stream.
|
661
|
+
// emit_char also advances the input stream. We need to do some juggling of
|
630
662
|
// the _reconsume_current_input flag to get the proper behavior when emitting
|
631
|
-
// previous tokens.
|
663
|
+
// previous tokens. Basically, _reconsume_current_input should *never* be set
|
632
664
|
// when emitting anything from the temporary buffer, since those characters
|
633
|
-
// have already been advanced past.
|
665
|
+
// have already been advanced past. However, it should be preserved so that
|
634
666
|
// when the *next* character is encountered again, the tokenizer knows not to
|
635
667
|
// advance past it.
|
636
668
|
bool saved_reconsume_state = tokenizer->_reconsume_current_input;
|
@@ -644,7 +676,7 @@ static bool maybe_emit_from_temporary_buffer(
|
|
644
676
|
// Sets up the tokenizer to begin flushing the temporary buffer.
|
645
677
|
// This resets the input iterator stream to the start of the last tag, sets up
|
646
678
|
// _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
|
647
|
-
// the first character in it.
|
679
|
+
// the first character in it. It returns true if a character was emitted, false
|
648
680
|
// otherwise.
|
649
681
|
static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
650
682
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -654,32 +686,35 @@ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
|
654
686
|
return maybe_emit_from_temporary_buffer(parser, output);
|
655
687
|
}
|
656
688
|
|
657
|
-
// Appends a codepoint to the current tag buffer.
|
689
|
+
// Appends a codepoint to the current tag buffer. If
|
658
690
|
// reinitilize_position_on_first is set, this also initializes the tag buffer
|
659
691
|
// start point; the only time you would *not* want to pass true for this
|
660
692
|
// parameter is if you want the original_text to include a character (like an
|
661
693
|
// opening quote) that doesn't appear in the value.
|
662
|
-
static void append_char_to_tag_buffer(
|
663
|
-
|
694
|
+
static void append_char_to_tag_buffer (
|
695
|
+
GumboParser* parser,
|
696
|
+
int codepoint,
|
697
|
+
bool reinitilize_position_on_first
|
698
|
+
) {
|
664
699
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
665
700
|
if (buffer->length == 0 && reinitilize_position_on_first) {
|
666
701
|
reset_tag_buffer_start_point(parser);
|
667
702
|
}
|
668
|
-
gumbo_string_buffer_append_codepoint(
|
703
|
+
gumbo_string_buffer_append_codepoint(codepoint, buffer);
|
669
704
|
}
|
670
705
|
|
671
|
-
// (Re-)initialize the tag buffer.
|
706
|
+
// (Re-)initialize the tag buffer. This also resets the original_text pointer
|
672
707
|
// and _start_pos field to point to the current position.
|
673
708
|
static void initialize_tag_buffer(GumboParser* parser) {
|
674
709
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
675
710
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
676
711
|
|
677
|
-
gumbo_string_buffer_init(
|
712
|
+
gumbo_string_buffer_init(&tag_state->_buffer);
|
678
713
|
reset_tag_buffer_start_point(parser);
|
679
714
|
}
|
680
715
|
|
681
716
|
// Initializes the tag_state to start a new tag, keeping track of the opening
|
682
|
-
// positions and original text.
|
717
|
+
// positions and original text. Takes a boolean indicating whether this is a
|
683
718
|
// start or end tag.
|
684
719
|
static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
685
720
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -690,14 +725,15 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
690
725
|
assert(is_alpha(c));
|
691
726
|
|
692
727
|
initialize_tag_buffer(parser);
|
693
|
-
gumbo_string_buffer_append_codepoint(
|
728
|
+
gumbo_string_buffer_append_codepoint(c, &tag_state->_buffer);
|
694
729
|
|
730
|
+
assert(tag_state->_name == NULL);
|
695
731
|
assert(tag_state->_attributes.data == NULL);
|
696
732
|
// Initial size chosen by statistical analysis of a corpus of 60k webpages.
|
697
|
-
// 99.5% of elements have 0 attributes, 93% of the remainder have 1.
|
733
|
+
// 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
|
698
734
|
// numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
|
699
735
|
// for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
|
700
|
-
gumbo_vector_init(
|
736
|
+
gumbo_vector_init(1, &tag_state->_attributes);
|
701
737
|
tag_state->_drop_next_attr_value = false;
|
702
738
|
tag_state->_is_start_tag = is_start_tag;
|
703
739
|
tag_state->_is_self_closing = false;
|
@@ -708,7 +744,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
708
744
|
static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
|
709
745
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
710
746
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
711
|
-
*output = gumbo_string_buffer_to_string(
|
747
|
+
*output = gumbo_string_buffer_to_string(&tag_state->_buffer);
|
712
748
|
}
|
713
749
|
|
714
750
|
// Fills in:
|
@@ -717,9 +753,12 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
|
|
717
753
|
// * The start_pos GumboSourcePosition with the start position of the tag
|
718
754
|
// buffer.
|
719
755
|
// * The end_pos GumboSourcePosition with the current source position.
|
720
|
-
static void copy_over_original_tag_text(
|
721
|
-
|
722
|
-
|
756
|
+
static void copy_over_original_tag_text (
|
757
|
+
GumboParser* parser,
|
758
|
+
GumboStringPiece* original_text,
|
759
|
+
GumboSourcePosition* start_pos,
|
760
|
+
GumboSourcePosition* end_pos
|
761
|
+
) {
|
723
762
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
724
763
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
725
764
|
|
@@ -729,7 +768,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
|
|
729
768
|
if (original_text->data[original_text->length - 1] == '\r') {
|
730
769
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
731
770
|
// appended to the end of original text even when it's really the first part
|
732
|
-
// of the next character.
|
771
|
+
// of the next character. If we detect this situation, shrink the length of
|
733
772
|
// the original text by 1 to remove the carriage return.
|
734
773
|
--original_text->length;
|
735
774
|
}
|
@@ -739,8 +778,7 @@ static void copy_over_original_tag_text(GumboParser* parser,
|
|
739
778
|
|
740
779
|
// Releases and then re-initializes the tag buffer.
|
741
780
|
static void reinitialize_tag_buffer(GumboParser* parser) {
|
742
|
-
|
743
|
-
parser, parser->_tokenizer_state->_tag_state._buffer.data);
|
781
|
+
gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
|
744
782
|
initialize_tag_buffer(parser);
|
745
783
|
}
|
746
784
|
|
@@ -750,14 +788,24 @@ static void finish_tag_name(GumboParser* parser) {
|
|
750
788
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
751
789
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
752
790
|
|
753
|
-
tag_state->
|
754
|
-
|
791
|
+
const char *data = tag_state->_buffer.data;
|
792
|
+
size_t length = tag_state->_buffer.length;
|
793
|
+
tag_state->_tag = gumbo_tagn_enum(data, length);
|
794
|
+
if (tag_state->_tag == GUMBO_TAG_UNKNOWN) {
|
795
|
+
char *name = gumbo_alloc(length + 1);
|
796
|
+
memcpy(name, data, length);
|
797
|
+
name[length] = 0;
|
798
|
+
tag_state->_name = name;
|
799
|
+
}
|
755
800
|
reinitialize_tag_buffer(parser);
|
756
801
|
}
|
757
802
|
|
758
803
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
759
|
-
static void add_duplicate_attr_error(
|
760
|
-
|
804
|
+
static void add_duplicate_attr_error (
|
805
|
+
GumboParser* parser,
|
806
|
+
int original_index,
|
807
|
+
int new_index
|
808
|
+
) {
|
761
809
|
GumboError* error = gumbo_add_error(parser);
|
762
810
|
if (!error) {
|
763
811
|
return;
|
@@ -773,11 +821,11 @@ static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
|
|
773
821
|
}
|
774
822
|
|
775
823
|
// Creates a new attribute in the current tag, copying the current tag buffer to
|
776
|
-
// the attribute's name.
|
824
|
+
// the attribute's name. The attribute's value starts out as the empty string
|
777
825
|
// (following the "Boolean attributes" section of the spec) and is only
|
778
|
-
// overwritten on finish_attribute_value().
|
826
|
+
// overwritten on finish_attribute_value(). If the attribute has already been
|
779
827
|
// specified, the new attribute is dropped, a parse error is added, and the
|
780
|
-
// function returns false.
|
828
|
+
// function returns false. Otherwise, this returns true.
|
781
829
|
static bool finish_attribute_name(GumboParser* parser) {
|
782
830
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
783
831
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
@@ -789,30 +837,43 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
789
837
|
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
790
838
|
for (unsigned int i = 0; i < attributes->length; ++i) {
|
791
839
|
GumboAttribute* attr = attributes->data[i];
|
792
|
-
if (
|
793
|
-
|
794
|
-
|
840
|
+
if (
|
841
|
+
strlen(attr->name) == tag_state->_buffer.length
|
842
|
+
&& 0 == memcmp (
|
843
|
+
attr->name,
|
844
|
+
tag_state->_buffer.data,
|
845
|
+
tag_state->_buffer.length
|
846
|
+
)
|
847
|
+
) {
|
795
848
|
// Identical attribute; bail.
|
796
|
-
add_duplicate_attr_error(parser,
|
849
|
+
add_duplicate_attr_error(parser, i, attributes->length);
|
797
850
|
tag_state->_drop_next_attr_value = true;
|
798
851
|
return false;
|
799
852
|
}
|
800
853
|
}
|
801
854
|
|
802
|
-
GumboAttribute* attr =
|
855
|
+
GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
|
803
856
|
attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
|
804
857
|
copy_over_tag_buffer(parser, &attr->name);
|
805
|
-
copy_over_original_tag_text(
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
858
|
+
copy_over_original_tag_text (
|
859
|
+
parser,
|
860
|
+
&attr->original_name,
|
861
|
+
&attr->name_start,
|
862
|
+
&attr->name_end
|
863
|
+
);
|
864
|
+
attr->value = gumbo_strdup("");
|
865
|
+
copy_over_original_tag_text (
|
866
|
+
parser,
|
867
|
+
&attr->original_value,
|
868
|
+
&attr->name_start,
|
869
|
+
&attr->name_end
|
870
|
+
);
|
871
|
+
gumbo_vector_add(attr, attributes);
|
811
872
|
reinitialize_tag_buffer(parser);
|
812
873
|
return true;
|
813
874
|
}
|
814
875
|
|
815
|
-
// Finishes an attribute value.
|
876
|
+
// Finishes an attribute value. This sets the value of the most recently added
|
816
877
|
// attribute to the current contents of the tag buffer.
|
817
878
|
static void finish_attribute_value(GumboParser* parser) {
|
818
879
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
@@ -826,7 +887,7 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
826
887
|
|
827
888
|
GumboAttribute* attr =
|
828
889
|
tag_state->_attributes.data[tag_state->_attributes.length - 1];
|
829
|
-
|
890
|
+
gumbo_free((void*) attr->value);
|
830
891
|
copy_over_tag_buffer(parser, &attr->value);
|
831
892
|
copy_over_original_tag_text(
|
832
893
|
parser, &attr->original_value, &attr->value_start, &attr->value_end);
|
@@ -842,24 +903,27 @@ static bool is_appropriate_end_tag(GumboParser* parser) {
|
|
842
903
|
tag_state->_buffer.length);
|
843
904
|
}
|
844
905
|
|
845
|
-
void gumbo_tokenizer_state_init(
|
846
|
-
|
847
|
-
|
848
|
-
|
906
|
+
void gumbo_tokenizer_state_init (
|
907
|
+
GumboParser* parser,
|
908
|
+
const char* text,
|
909
|
+
size_t text_length
|
910
|
+
) {
|
911
|
+
GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
|
849
912
|
parser->_tokenizer_state = tokenizer;
|
850
913
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
851
914
|
tokenizer->_reconsume_current_input = false;
|
852
915
|
tokenizer->_is_current_node_foreign = false;
|
853
916
|
tokenizer->_is_in_cdata = false;
|
854
917
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
918
|
+
tokenizer->_tag_state._name = NULL;
|
855
919
|
|
856
920
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
857
|
-
gumbo_string_buffer_init(
|
921
|
+
gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
|
858
922
|
tokenizer->_temporary_buffer_emit = NULL;
|
859
923
|
|
860
924
|
mark_tag_state_as_empty(&tokenizer->_tag_state);
|
861
925
|
|
862
|
-
gumbo_string_buffer_init(
|
926
|
+
gumbo_string_buffer_init(&tokenizer->_script_data_buffer);
|
863
927
|
tokenizer->_token_start = text;
|
864
928
|
utf8iterator_init(parser, text, text_length, &tokenizer->_input);
|
865
929
|
utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
|
@@ -871,27 +935,37 @@ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
|
|
871
935
|
assert(tokenizer->_doc_type_state.name == NULL);
|
872
936
|
assert(tokenizer->_doc_type_state.public_identifier == NULL);
|
873
937
|
assert(tokenizer->_doc_type_state.system_identifier == NULL);
|
874
|
-
gumbo_string_buffer_destroy(
|
875
|
-
gumbo_string_buffer_destroy(
|
876
|
-
|
938
|
+
gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
|
939
|
+
gumbo_string_buffer_destroy(&tokenizer->_script_data_buffer);
|
940
|
+
assert(tokenizer->_tag_state._name == NULL);
|
941
|
+
assert(tokenizer->_tag_state._attributes.data == NULL);
|
942
|
+
gumbo_free(tokenizer);
|
877
943
|
}
|
878
944
|
|
879
945
|
void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
|
880
946
|
parser->_tokenizer_state->_state = state;
|
881
947
|
}
|
882
948
|
|
883
|
-
void gumbo_tokenizer_set_is_current_node_foreign(
|
884
|
-
|
949
|
+
void gumbo_tokenizer_set_is_current_node_foreign (
|
950
|
+
GumboParser* parser,
|
951
|
+
bool is_foreign
|
952
|
+
) {
|
885
953
|
if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
|
886
|
-
gumbo_debug(
|
887
|
-
|
954
|
+
gumbo_debug (
|
955
|
+
"Toggling is_current_node_foreign to %s.\n",
|
956
|
+
is_foreign ? "true" : "false"
|
957
|
+
);
|
888
958
|
}
|
889
959
|
parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
|
890
960
|
}
|
891
961
|
|
892
|
-
//
|
893
|
-
static StateResult handle_data_state(
|
894
|
-
|
962
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
963
|
+
static StateResult handle_data_state (
|
964
|
+
GumboParser* parser,
|
965
|
+
GumboTokenizerState* tokenizer,
|
966
|
+
int c,
|
967
|
+
GumboToken* output
|
968
|
+
) {
|
895
969
|
switch (c) {
|
896
970
|
case '&':
|
897
971
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
|
@@ -914,16 +988,24 @@ static StateResult handle_data_state(GumboParser* parser,
|
|
914
988
|
}
|
915
989
|
}
|
916
990
|
|
917
|
-
//
|
918
|
-
static StateResult handle_char_ref_in_data_state(
|
919
|
-
|
991
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-data-state
|
992
|
+
static StateResult handle_char_ref_in_data_state (
|
993
|
+
GumboParser* parser,
|
994
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
995
|
+
int UNUSED_ARG(c),
|
996
|
+
GumboToken* output
|
997
|
+
) {
|
920
998
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
921
999
|
return emit_char_ref(parser, ' ', false, output);
|
922
1000
|
}
|
923
1001
|
|
924
|
-
//
|
925
|
-
static StateResult handle_rcdata_state(
|
926
|
-
|
1002
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
1003
|
+
static StateResult handle_rcdata_state (
|
1004
|
+
GumboParser* parser,
|
1005
|
+
GumboTokenizerState* tokenizer,
|
1006
|
+
int c,
|
1007
|
+
GumboToken* output
|
1008
|
+
) {
|
927
1009
|
switch (c) {
|
928
1010
|
case '&':
|
929
1011
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
|
@@ -943,16 +1025,24 @@ static StateResult handle_rcdata_state(GumboParser* parser,
|
|
943
1025
|
}
|
944
1026
|
}
|
945
1027
|
|
946
|
-
//
|
947
|
-
static StateResult handle_char_ref_in_rcdata_state(
|
948
|
-
|
1028
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-rcdata-state
|
1029
|
+
static StateResult handle_char_ref_in_rcdata_state (
|
1030
|
+
GumboParser* parser,
|
1031
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1032
|
+
int UNUSED_ARG(c),
|
1033
|
+
GumboToken* output
|
1034
|
+
) {
|
949
1035
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
950
1036
|
return emit_char_ref(parser, ' ', false, output);
|
951
1037
|
}
|
952
1038
|
|
953
|
-
//
|
954
|
-
static StateResult handle_rawtext_state(
|
955
|
-
|
1039
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
1040
|
+
static StateResult handle_rawtext_state (
|
1041
|
+
GumboParser* parser,
|
1042
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1043
|
+
int c,
|
1044
|
+
GumboToken* output
|
1045
|
+
) {
|
956
1046
|
switch (c) {
|
957
1047
|
case '<':
|
958
1048
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
|
@@ -968,9 +1058,13 @@ static StateResult handle_rawtext_state(GumboParser* parser,
|
|
968
1058
|
}
|
969
1059
|
}
|
970
1060
|
|
971
|
-
//
|
972
|
-
static StateResult handle_script_state(
|
973
|
-
|
1061
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
1062
|
+
static StateResult handle_script_state (
|
1063
|
+
GumboParser* parser,
|
1064
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1065
|
+
int c,
|
1066
|
+
GumboToken* output
|
1067
|
+
) {
|
974
1068
|
switch (c) {
|
975
1069
|
case '<':
|
976
1070
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
|
@@ -986,9 +1080,13 @@ static StateResult handle_script_state(GumboParser* parser,
|
|
986
1080
|
}
|
987
1081
|
}
|
988
1082
|
|
989
|
-
//
|
990
|
-
static StateResult handle_plaintext_state(
|
991
|
-
|
1083
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
|
1084
|
+
static StateResult handle_plaintext_state (
|
1085
|
+
GumboParser* parser,
|
1086
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1087
|
+
int c,
|
1088
|
+
GumboToken* output
|
1089
|
+
) {
|
992
1090
|
switch (c) {
|
993
1091
|
case '\0':
|
994
1092
|
return emit_replacement_char(parser, output);
|
@@ -999,9 +1097,13 @@ static StateResult handle_plaintext_state(GumboParser* parser,
|
|
999
1097
|
}
|
1000
1098
|
}
|
1001
1099
|
|
1002
|
-
//
|
1003
|
-
static StateResult handle_tag_open_state(
|
1004
|
-
|
1100
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
1101
|
+
static StateResult handle_tag_open_state (
|
1102
|
+
GumboParser* parser,
|
1103
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1104
|
+
int c,
|
1105
|
+
GumboToken* output
|
1106
|
+
) {
|
1005
1107
|
assert(temporary_buffer_equals(parser, "<"));
|
1006
1108
|
switch (c) {
|
1007
1109
|
case '!':
|
@@ -1032,9 +1134,13 @@ static StateResult handle_tag_open_state(GumboParser* parser,
|
|
1032
1134
|
}
|
1033
1135
|
}
|
1034
1136
|
|
1035
|
-
//
|
1036
|
-
static StateResult handle_end_tag_open_state(
|
1037
|
-
|
1137
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
1138
|
+
static StateResult handle_end_tag_open_state (
|
1139
|
+
GumboParser* parser,
|
1140
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1141
|
+
int c,
|
1142
|
+
GumboToken* output
|
1143
|
+
) {
|
1038
1144
|
assert(temporary_buffer_equals(parser, "</"));
|
1039
1145
|
switch (c) {
|
1040
1146
|
case '>':
|
@@ -1059,9 +1165,13 @@ static StateResult handle_end_tag_open_state(GumboParser* parser,
|
|
1059
1165
|
}
|
1060
1166
|
}
|
1061
1167
|
|
1062
|
-
//
|
1063
|
-
static StateResult handle_tag_name_state(
|
1064
|
-
|
1168
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
1169
|
+
static StateResult handle_tag_name_state (
|
1170
|
+
GumboParser* parser,
|
1171
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1172
|
+
int c,
|
1173
|
+
GumboToken* output
|
1174
|
+
) {
|
1065
1175
|
switch (c) {
|
1066
1176
|
case '\t':
|
1067
1177
|
case '\n':
|
@@ -1093,9 +1203,13 @@ static StateResult handle_tag_name_state(GumboParser* parser,
|
|
1093
1203
|
}
|
1094
1204
|
}
|
1095
1205
|
|
1096
|
-
//
|
1097
|
-
static StateResult handle_rcdata_lt_state(
|
1098
|
-
|
1206
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
|
1207
|
+
static StateResult handle_rcdata_lt_state (
|
1208
|
+
GumboParser* parser,
|
1209
|
+
GumboTokenizerState* tokenizer,
|
1210
|
+
int c,
|
1211
|
+
GumboToken* output
|
1212
|
+
) {
|
1099
1213
|
assert(temporary_buffer_equals(parser, "<"));
|
1100
1214
|
if (c == '/') {
|
1101
1215
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
|
@@ -1108,9 +1222,13 @@ static StateResult handle_rcdata_lt_state(GumboParser* parser,
|
|
1108
1222
|
}
|
1109
1223
|
}
|
1110
1224
|
|
1111
|
-
//
|
1112
|
-
static StateResult handle_rcdata_end_tag_open_state(
|
1113
|
-
|
1225
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
1226
|
+
static StateResult handle_rcdata_end_tag_open_state (
|
1227
|
+
GumboParser* parser,
|
1228
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1229
|
+
int c,
|
1230
|
+
GumboToken* output
|
1231
|
+
) {
|
1114
1232
|
assert(temporary_buffer_equals(parser, "</"));
|
1115
1233
|
if (is_alpha(c)) {
|
1116
1234
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
|
@@ -1124,9 +1242,14 @@ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
|
|
1124
1242
|
return true;
|
1125
1243
|
}
|
1126
1244
|
|
1127
|
-
//
|
1128
|
-
static StateResult handle_rcdata_end_tag_name_state(
|
1129
|
-
|
1245
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
1246
|
+
static StateResult handle_rcdata_end_tag_name_state (
|
1247
|
+
GumboParser* parser,
|
1248
|
+
GumboTokenizerState* tokenizer,
|
1249
|
+
int c,
|
1250
|
+
GumboToken* output
|
1251
|
+
) {
|
1252
|
+
UNUSED_IF_NDEBUG(tokenizer);
|
1130
1253
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1131
1254
|
if (is_alpha(c)) {
|
1132
1255
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1156,9 +1279,13 @@ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
|
|
1156
1279
|
return emit_temporary_buffer(parser, output);
|
1157
1280
|
}
|
1158
1281
|
|
1159
|
-
//
|
1160
|
-
static StateResult handle_rawtext_lt_state(
|
1161
|
-
|
1282
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
1283
|
+
static StateResult handle_rawtext_lt_state (
|
1284
|
+
GumboParser* parser,
|
1285
|
+
GumboTokenizerState* tokenizer,
|
1286
|
+
int c,
|
1287
|
+
GumboToken* output
|
1288
|
+
) {
|
1162
1289
|
assert(temporary_buffer_equals(parser, "<"));
|
1163
1290
|
if (c == '/') {
|
1164
1291
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
|
@@ -1171,9 +1298,13 @@ static StateResult handle_rawtext_lt_state(GumboParser* parser,
|
|
1171
1298
|
}
|
1172
1299
|
}
|
1173
1300
|
|
1174
|
-
//
|
1175
|
-
static StateResult handle_rawtext_end_tag_open_state(
|
1176
|
-
|
1301
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
1302
|
+
static StateResult handle_rawtext_end_tag_open_state (
|
1303
|
+
GumboParser* parser,
|
1304
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1305
|
+
int c,
|
1306
|
+
GumboToken* output
|
1307
|
+
) {
|
1177
1308
|
assert(temporary_buffer_equals(parser, "</"));
|
1178
1309
|
if (is_alpha(c)) {
|
1179
1310
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
|
@@ -1186,9 +1317,13 @@ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
|
|
1186
1317
|
}
|
1187
1318
|
}
|
1188
1319
|
|
1189
|
-
//
|
1190
|
-
static StateResult handle_rawtext_end_tag_name_state(
|
1191
|
-
|
1320
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
|
1321
|
+
static StateResult handle_rawtext_end_tag_name_state (
|
1322
|
+
GumboParser* parser,
|
1323
|
+
GumboTokenizerState* tokenizer,
|
1324
|
+
int c,
|
1325
|
+
GumboToken* output
|
1326
|
+
) {
|
1192
1327
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1193
1328
|
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
|
1194
1329
|
tokenizer->_tag_state._buffer.data);
|
@@ -1221,9 +1356,13 @@ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
|
|
1221
1356
|
return emit_temporary_buffer(parser, output);
|
1222
1357
|
}
|
1223
1358
|
|
1224
|
-
//
|
1225
|
-
static StateResult handle_script_lt_state(
|
1226
|
-
|
1359
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
1360
|
+
static StateResult handle_script_lt_state (
|
1361
|
+
GumboParser* parser,
|
1362
|
+
GumboTokenizerState* tokenizer,
|
1363
|
+
int c,
|
1364
|
+
GumboToken* output
|
1365
|
+
) {
|
1227
1366
|
assert(temporary_buffer_equals(parser, "<"));
|
1228
1367
|
if (c == '/') {
|
1229
1368
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
|
@@ -1240,9 +1379,13 @@ static StateResult handle_script_lt_state(GumboParser* parser,
|
|
1240
1379
|
}
|
1241
1380
|
}
|
1242
1381
|
|
1243
|
-
//
|
1244
|
-
static StateResult handle_script_end_tag_open_state(
|
1245
|
-
|
1382
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
1383
|
+
static StateResult handle_script_end_tag_open_state (
|
1384
|
+
GumboParser* parser,
|
1385
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1386
|
+
int c,
|
1387
|
+
GumboToken* output
|
1388
|
+
) {
|
1246
1389
|
assert(temporary_buffer_equals(parser, "</"));
|
1247
1390
|
if (is_alpha(c)) {
|
1248
1391
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
|
@@ -1255,9 +1398,14 @@ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
|
|
1255
1398
|
}
|
1256
1399
|
}
|
1257
1400
|
|
1258
|
-
//
|
1259
|
-
static StateResult handle_script_end_tag_name_state(
|
1260
|
-
|
1401
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
1402
|
+
static StateResult handle_script_end_tag_name_state (
|
1403
|
+
GumboParser* parser,
|
1404
|
+
GumboTokenizerState* tokenizer,
|
1405
|
+
int c,
|
1406
|
+
GumboToken* output
|
1407
|
+
) {
|
1408
|
+
UNUSED_IF_NDEBUG(tokenizer);
|
1261
1409
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1262
1410
|
if (is_alpha(c)) {
|
1263
1411
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1287,9 +1435,13 @@ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
|
|
1287
1435
|
return emit_temporary_buffer(parser, output);
|
1288
1436
|
}
|
1289
1437
|
|
1290
|
-
//
|
1291
|
-
static StateResult handle_script_escaped_start_state(
|
1292
|
-
|
1438
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
1439
|
+
static StateResult handle_script_escaped_start_state (
|
1440
|
+
GumboParser* parser,
|
1441
|
+
GumboTokenizerState* tokenizer,
|
1442
|
+
int c,
|
1443
|
+
GumboToken* output
|
1444
|
+
) {
|
1293
1445
|
if (c == '-') {
|
1294
1446
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
|
1295
1447
|
return emit_current_char(parser, output);
|
@@ -1300,9 +1452,13 @@ static StateResult handle_script_escaped_start_state(GumboParser* parser,
|
|
1300
1452
|
}
|
1301
1453
|
}
|
1302
1454
|
|
1303
|
-
//
|
1304
|
-
static StateResult handle_script_escaped_start_dash_state(
|
1305
|
-
|
1455
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
1456
|
+
static StateResult handle_script_escaped_start_dash_state (
|
1457
|
+
GumboParser* parser,
|
1458
|
+
GumboTokenizerState* tokenizer,
|
1459
|
+
int c,
|
1460
|
+
GumboToken* output
|
1461
|
+
) {
|
1306
1462
|
if (c == '-') {
|
1307
1463
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
1308
1464
|
return emit_current_char(parser, output);
|
@@ -1313,9 +1469,13 @@ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
|
|
1313
1469
|
}
|
1314
1470
|
}
|
1315
1471
|
|
1316
|
-
//
|
1317
|
-
static StateResult handle_script_escaped_state(
|
1318
|
-
|
1472
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
1473
|
+
static StateResult handle_script_escaped_state (
|
1474
|
+
GumboParser* parser,
|
1475
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1476
|
+
int c,
|
1477
|
+
GumboToken* output
|
1478
|
+
) {
|
1319
1479
|
switch (c) {
|
1320
1480
|
case '-':
|
1321
1481
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
|
@@ -1335,9 +1495,13 @@ static StateResult handle_script_escaped_state(GumboParser* parser,
|
|
1335
1495
|
}
|
1336
1496
|
}
|
1337
1497
|
|
1338
|
-
//
|
1339
|
-
static StateResult handle_script_escaped_dash_state(
|
1340
|
-
|
1498
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
1499
|
+
static StateResult handle_script_escaped_dash_state (
|
1500
|
+
GumboParser* parser,
|
1501
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1502
|
+
int c,
|
1503
|
+
GumboToken* output
|
1504
|
+
) {
|
1341
1505
|
switch (c) {
|
1342
1506
|
case '-':
|
1343
1507
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
@@ -1360,9 +1524,13 @@ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
|
|
1360
1524
|
}
|
1361
1525
|
}
|
1362
1526
|
|
1363
|
-
//
|
1364
|
-
static StateResult handle_script_escaped_dash_dash_state(
|
1365
|
-
|
1527
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
1528
|
+
static StateResult handle_script_escaped_dash_dash_state (
|
1529
|
+
GumboParser* parser,
|
1530
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1531
|
+
int c,
|
1532
|
+
GumboToken* output
|
1533
|
+
) {
|
1366
1534
|
switch (c) {
|
1367
1535
|
case '-':
|
1368
1536
|
return emit_current_char(parser, output);
|
@@ -1387,9 +1555,13 @@ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
|
|
1387
1555
|
}
|
1388
1556
|
}
|
1389
1557
|
|
1390
|
-
//
|
1391
|
-
static StateResult handle_script_escaped_lt_state(
|
1392
|
-
|
1558
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
1559
|
+
static StateResult handle_script_escaped_lt_state (
|
1560
|
+
GumboParser* parser,
|
1561
|
+
GumboTokenizerState* tokenizer,
|
1562
|
+
int c,
|
1563
|
+
GumboToken* output
|
1564
|
+
) {
|
1393
1565
|
assert(temporary_buffer_equals(parser, "<"));
|
1394
1566
|
assert(!tokenizer->_script_data_buffer.length);
|
1395
1567
|
if (c == '/') {
|
@@ -1399,8 +1571,10 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
|
|
1399
1571
|
} else if (is_alpha(c)) {
|
1400
1572
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START);
|
1401
1573
|
append_char_to_temporary_buffer(parser, c);
|
1402
|
-
gumbo_string_buffer_append_codepoint(
|
1403
|
-
|
1574
|
+
gumbo_string_buffer_append_codepoint (
|
1575
|
+
ensure_lowercase(c),
|
1576
|
+
&tokenizer->_script_data_buffer
|
1577
|
+
);
|
1404
1578
|
return emit_temporary_buffer(parser, output);
|
1405
1579
|
} else {
|
1406
1580
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
@@ -1408,9 +1582,13 @@ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
|
|
1408
1582
|
}
|
1409
1583
|
}
|
1410
1584
|
|
1411
|
-
//
|
1412
|
-
static StateResult handle_script_escaped_end_tag_open_state(
|
1413
|
-
|
1585
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
1586
|
+
static StateResult handle_script_escaped_end_tag_open_state (
|
1587
|
+
GumboParser* parser,
|
1588
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1589
|
+
int c,
|
1590
|
+
GumboToken* output
|
1591
|
+
) {
|
1414
1592
|
assert(temporary_buffer_equals(parser, "</"));
|
1415
1593
|
if (is_alpha(c)) {
|
1416
1594
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
|
@@ -1423,9 +1601,14 @@ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
|
|
1423
1601
|
}
|
1424
1602
|
}
|
1425
1603
|
|
1426
|
-
//
|
1427
|
-
static StateResult handle_script_escaped_end_tag_name_state(
|
1428
|
-
|
1604
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
1605
|
+
static StateResult handle_script_escaped_end_tag_name_state (
|
1606
|
+
GumboParser* parser,
|
1607
|
+
GumboTokenizerState* tokenizer,
|
1608
|
+
int c,
|
1609
|
+
GumboToken* output
|
1610
|
+
) {
|
1611
|
+
UNUSED_IF_NDEBUG(tokenizer);
|
1429
1612
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1430
1613
|
if (is_alpha(c)) {
|
1431
1614
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1455,9 +1638,13 @@ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
|
|
1455
1638
|
return emit_temporary_buffer(parser, output);
|
1456
1639
|
}
|
1457
1640
|
|
1458
|
-
//
|
1459
|
-
static StateResult handle_script_double_escaped_start_state(
|
1460
|
-
|
1641
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
1642
|
+
static StateResult handle_script_double_escaped_start_state (
|
1643
|
+
GumboParser* parser,
|
1644
|
+
GumboTokenizerState* tokenizer,
|
1645
|
+
int c,
|
1646
|
+
GumboToken* output
|
1647
|
+
) {
|
1461
1648
|
switch (c) {
|
1462
1649
|
case '\t':
|
1463
1650
|
case '\n':
|
@@ -1465,16 +1652,22 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
|
|
1465
1652
|
case ' ':
|
1466
1653
|
case '/':
|
1467
1654
|
case '>':
|
1468
|
-
gumbo_tokenizer_set_state(
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1655
|
+
gumbo_tokenizer_set_state (
|
1656
|
+
parser,
|
1657
|
+
gumbo_string_equals (
|
1658
|
+
&kScriptTag,
|
1659
|
+
(GumboStringPiece*) &tokenizer->_script_data_buffer
|
1660
|
+
)
|
1661
|
+
? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
|
1662
|
+
: GUMBO_LEX_SCRIPT_ESCAPED
|
1663
|
+
);
|
1473
1664
|
return emit_current_char(parser, output);
|
1474
1665
|
default:
|
1475
1666
|
if (is_alpha(c)) {
|
1476
|
-
gumbo_string_buffer_append_codepoint(
|
1477
|
-
|
1667
|
+
gumbo_string_buffer_append_codepoint (
|
1668
|
+
ensure_lowercase(c),
|
1669
|
+
&tokenizer->_script_data_buffer
|
1670
|
+
);
|
1478
1671
|
return emit_current_char(parser, output);
|
1479
1672
|
} else {
|
1480
1673
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED);
|
@@ -1484,9 +1677,13 @@ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
|
|
1484
1677
|
}
|
1485
1678
|
}
|
1486
1679
|
|
1487
|
-
//
|
1488
|
-
static StateResult handle_script_double_escaped_state(
|
1489
|
-
|
1680
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
|
1681
|
+
static StateResult handle_script_double_escaped_state (
|
1682
|
+
GumboParser* parser,
|
1683
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1684
|
+
int c,
|
1685
|
+
GumboToken* output
|
1686
|
+
) {
|
1490
1687
|
switch (c) {
|
1491
1688
|
case '-':
|
1492
1689
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
|
@@ -1505,9 +1702,13 @@ static StateResult handle_script_double_escaped_state(GumboParser* parser,
|
|
1505
1702
|
}
|
1506
1703
|
}
|
1507
1704
|
|
1508
|
-
//
|
1509
|
-
static StateResult handle_script_double_escaped_dash_state(
|
1510
|
-
|
1705
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
|
1706
|
+
static StateResult handle_script_double_escaped_dash_state (
|
1707
|
+
GumboParser* parser,
|
1708
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1709
|
+
int c,
|
1710
|
+
GumboToken* output
|
1711
|
+
) {
|
1511
1712
|
switch (c) {
|
1512
1713
|
case '-':
|
1513
1714
|
gumbo_tokenizer_set_state(
|
@@ -1529,10 +1730,13 @@ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
|
|
1529
1730
|
}
|
1530
1731
|
}
|
1531
1732
|
|
1532
|
-
//
|
1533
|
-
static StateResult handle_script_double_escaped_dash_dash_state(
|
1534
|
-
|
1535
|
-
|
1733
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
1734
|
+
static StateResult handle_script_double_escaped_dash_dash_state (
|
1735
|
+
GumboParser* parser,
|
1736
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1737
|
+
int c,
|
1738
|
+
GumboToken* output
|
1739
|
+
) {
|
1536
1740
|
switch (c) {
|
1537
1741
|
case '-':
|
1538
1742
|
return emit_current_char(parser, output);
|
@@ -1555,12 +1759,16 @@ static StateResult handle_script_double_escaped_dash_dash_state(
|
|
1555
1759
|
}
|
1556
1760
|
}
|
1557
1761
|
|
1558
|
-
//
|
1559
|
-
static StateResult handle_script_double_escaped_lt_state(
|
1560
|
-
|
1762
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
1763
|
+
static StateResult handle_script_double_escaped_lt_state (
|
1764
|
+
GumboParser* parser,
|
1765
|
+
GumboTokenizerState* tokenizer,
|
1766
|
+
int c,
|
1767
|
+
GumboToken* output
|
1768
|
+
) {
|
1561
1769
|
if (c == '/') {
|
1562
1770
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
|
1563
|
-
gumbo_string_buffer_clear(
|
1771
|
+
gumbo_string_buffer_clear(&tokenizer->_script_data_buffer);
|
1564
1772
|
return emit_current_char(parser, output);
|
1565
1773
|
} else {
|
1566
1774
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
@@ -1569,9 +1777,13 @@ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
|
|
1569
1777
|
}
|
1570
1778
|
}
|
1571
1779
|
|
1572
|
-
//
|
1573
|
-
static StateResult handle_script_double_escaped_end_state(
|
1574
|
-
|
1780
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
1781
|
+
static StateResult handle_script_double_escaped_end_state (
|
1782
|
+
GumboParser* parser,
|
1783
|
+
GumboTokenizerState* tokenizer,
|
1784
|
+
int c,
|
1785
|
+
GumboToken* output
|
1786
|
+
) {
|
1575
1787
|
switch (c) {
|
1576
1788
|
case '\t':
|
1577
1789
|
case '\n':
|
@@ -1587,8 +1799,10 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
|
|
1587
1799
|
return emit_current_char(parser, output);
|
1588
1800
|
default:
|
1589
1801
|
if (is_alpha(c)) {
|
1590
|
-
gumbo_string_buffer_append_codepoint(
|
1591
|
-
|
1802
|
+
gumbo_string_buffer_append_codepoint (
|
1803
|
+
ensure_lowercase(c),
|
1804
|
+
&tokenizer->_script_data_buffer
|
1805
|
+
);
|
1592
1806
|
return emit_current_char(parser, output);
|
1593
1807
|
} else {
|
1594
1808
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
@@ -1598,9 +1812,13 @@ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
|
|
1598
1812
|
}
|
1599
1813
|
}
|
1600
1814
|
|
1601
|
-
//
|
1602
|
-
static StateResult handle_before_attr_name_state(
|
1603
|
-
|
1815
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
1816
|
+
static StateResult handle_before_attr_name_state (
|
1817
|
+
GumboParser* parser,
|
1818
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1819
|
+
int c,
|
1820
|
+
GumboToken* output
|
1821
|
+
) {
|
1604
1822
|
switch (c) {
|
1605
1823
|
case '\t':
|
1606
1824
|
case '\n':
|
@@ -1636,9 +1854,13 @@ static StateResult handle_before_attr_name_state(GumboParser* parser,
|
|
1636
1854
|
}
|
1637
1855
|
}
|
1638
1856
|
|
1639
|
-
//
|
1640
|
-
static StateResult handle_attr_name_state(
|
1641
|
-
|
1857
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
1858
|
+
static StateResult handle_attr_name_state (
|
1859
|
+
GumboParser* parser,
|
1860
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1861
|
+
int c,
|
1862
|
+
GumboToken* output
|
1863
|
+
) {
|
1642
1864
|
switch (c) {
|
1643
1865
|
case '\t':
|
1644
1866
|
case '\n':
|
@@ -1679,9 +1901,13 @@ static StateResult handle_attr_name_state(GumboParser* parser,
|
|
1679
1901
|
}
|
1680
1902
|
}
|
1681
1903
|
|
1682
|
-
//
|
1683
|
-
static StateResult handle_after_attr_name_state(
|
1684
|
-
|
1904
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
1905
|
+
static StateResult handle_after_attr_name_state (
|
1906
|
+
GumboParser* parser,
|
1907
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
1908
|
+
int c,
|
1909
|
+
GumboToken* output
|
1910
|
+
) {
|
1685
1911
|
switch (c) {
|
1686
1912
|
case '\t':
|
1687
1913
|
case '\n':
|
@@ -1719,9 +1945,13 @@ static StateResult handle_after_attr_name_state(GumboParser* parser,
|
|
1719
1945
|
}
|
1720
1946
|
}
|
1721
1947
|
|
1722
|
-
//
|
1723
|
-
static StateResult handle_before_attr_value_state(
|
1724
|
-
|
1948
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
|
1949
|
+
static StateResult handle_before_attr_value_state (
|
1950
|
+
GumboParser* parser,
|
1951
|
+
GumboTokenizerState* tokenizer,
|
1952
|
+
int c,
|
1953
|
+
GumboToken* output
|
1954
|
+
) {
|
1725
1955
|
switch (c) {
|
1726
1956
|
case '\t':
|
1727
1957
|
case '\n':
|
@@ -1768,9 +1998,13 @@ static StateResult handle_before_attr_value_state(GumboParser* parser,
|
|
1768
1998
|
}
|
1769
1999
|
}
|
1770
2000
|
|
1771
|
-
//
|
1772
|
-
static StateResult handle_attr_value_double_quoted_state(
|
1773
|
-
|
2001
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
|
2002
|
+
static StateResult handle_attr_value_double_quoted_state (
|
2003
|
+
GumboParser* parser,
|
2004
|
+
GumboTokenizerState* tokenizer,
|
2005
|
+
int c,
|
2006
|
+
GumboToken* UNUSED_ARG(output)
|
2007
|
+
) {
|
1774
2008
|
switch (c) {
|
1775
2009
|
case '"':
|
1776
2010
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1796,9 +2030,13 @@ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
|
|
1796
2030
|
}
|
1797
2031
|
}
|
1798
2032
|
|
1799
|
-
//
|
1800
|
-
static StateResult handle_attr_value_single_quoted_state(
|
1801
|
-
|
2033
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
|
2034
|
+
static StateResult handle_attr_value_single_quoted_state (
|
2035
|
+
GumboParser* parser,
|
2036
|
+
GumboTokenizerState* tokenizer,
|
2037
|
+
int c,
|
2038
|
+
GumboToken* UNUSED_ARG(output)
|
2039
|
+
) {
|
1802
2040
|
switch (c) {
|
1803
2041
|
case '\'':
|
1804
2042
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1824,9 +2062,13 @@ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
|
|
1824
2062
|
}
|
1825
2063
|
}
|
1826
2064
|
|
1827
|
-
//
|
1828
|
-
static StateResult handle_attr_value_unquoted_state(
|
1829
|
-
|
2065
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
|
2066
|
+
static StateResult handle_attr_value_unquoted_state (
|
2067
|
+
GumboParser* parser,
|
2068
|
+
GumboTokenizerState* tokenizer,
|
2069
|
+
int c,
|
2070
|
+
GumboToken* output
|
2071
|
+
) {
|
1830
2072
|
switch (c) {
|
1831
2073
|
case '\t':
|
1832
2074
|
case '\n':
|
@@ -1867,9 +2109,13 @@ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
|
|
1867
2109
|
}
|
1868
2110
|
}
|
1869
2111
|
|
1870
|
-
//
|
1871
|
-
static StateResult handle_char_ref_in_attr_value_state(
|
1872
|
-
|
2112
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-in-attribute-value-state
|
2113
|
+
static StateResult handle_char_ref_in_attr_value_state (
|
2114
|
+
GumboParser* parser,
|
2115
|
+
GumboTokenizerState* tokenizer,
|
2116
|
+
int UNUSED_ARG(c),
|
2117
|
+
GumboToken* UNUSED_ARG(output)
|
2118
|
+
) {
|
1873
2119
|
OneOrTwoCodepoints char_ref;
|
1874
2120
|
int allowed_char;
|
1875
2121
|
bool is_unquoted = false;
|
@@ -1893,9 +2139,15 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
|
|
1893
2139
|
|
1894
2140
|
// Ignore the status, since we don't have a convenient way of signalling that
|
1895
2141
|
// a parser error has occurred when the error occurs in the middle of a
|
1896
|
-
// multi-state token.
|
2142
|
+
// multi-state token. We'd need a flag inside the TokenizerState to do this,
|
1897
2143
|
// but that's a low priority fix.
|
1898
|
-
|
2144
|
+
gumbo_consume_char_ref (
|
2145
|
+
parser,
|
2146
|
+
&tokenizer->_input,
|
2147
|
+
allowed_char,
|
2148
|
+
true,
|
2149
|
+
&char_ref
|
2150
|
+
);
|
1899
2151
|
if (char_ref.first != kGumboNoChar) {
|
1900
2152
|
tokenizer->_reconsume_current_input = true;
|
1901
2153
|
append_char_to_tag_buffer(parser, char_ref.first, is_unquoted);
|
@@ -1909,9 +2161,13 @@ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
|
|
1909
2161
|
return NEXT_CHAR;
|
1910
2162
|
}
|
1911
2163
|
|
1912
|
-
//
|
1913
|
-
static StateResult handle_after_attr_value_quoted_state(
|
1914
|
-
|
2164
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
|
2165
|
+
static StateResult handle_after_attr_value_quoted_state (
|
2166
|
+
GumboParser* parser,
|
2167
|
+
GumboTokenizerState* tokenizer,
|
2168
|
+
int c,
|
2169
|
+
GumboToken* output
|
2170
|
+
) {
|
1915
2171
|
finish_attribute_value(parser);
|
1916
2172
|
switch (c) {
|
1917
2173
|
case '\t':
|
@@ -1940,9 +2196,13 @@ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
|
|
1940
2196
|
}
|
1941
2197
|
}
|
1942
2198
|
|
1943
|
-
//
|
1944
|
-
static StateResult handle_self_closing_start_tag_state(
|
1945
|
-
|
2199
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
|
2200
|
+
static StateResult handle_self_closing_start_tag_state (
|
2201
|
+
GumboParser* parser,
|
2202
|
+
GumboTokenizerState* tokenizer,
|
2203
|
+
int c,
|
2204
|
+
GumboToken* output
|
2205
|
+
) {
|
1946
2206
|
switch (c) {
|
1947
2207
|
case '>':
|
1948
2208
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -1961,11 +2221,16 @@ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
|
|
1961
2221
|
}
|
1962
2222
|
}
|
1963
2223
|
|
1964
|
-
//
|
1965
|
-
static StateResult handle_bogus_comment_state(
|
1966
|
-
|
2224
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
2225
|
+
static StateResult handle_bogus_comment_state (
|
2226
|
+
GumboParser* parser,
|
2227
|
+
GumboTokenizerState* tokenizer,
|
2228
|
+
int c,
|
2229
|
+
GumboToken* output
|
2230
|
+
) {
|
1967
2231
|
while (c != '>' && c != -1) {
|
1968
2232
|
if (c == '\0') {
|
2233
|
+
tokenizer_add_parse_error(parser, GUMBO_ERR_UTF8_NULL);
|
1969
2234
|
c = 0xFFFD;
|
1970
2235
|
}
|
1971
2236
|
append_char_to_temporary_buffer(parser, c);
|
@@ -1976,29 +2241,48 @@ static StateResult handle_bogus_comment_state(GumboParser* parser,
|
|
1976
2241
|
return emit_comment(parser, output);
|
1977
2242
|
}
|
1978
2243
|
|
1979
|
-
//
|
1980
|
-
static StateResult handle_markup_declaration_state(
|
1981
|
-
|
1982
|
-
|
1983
|
-
|
2244
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
2245
|
+
static StateResult handle_markup_declaration_state (
|
2246
|
+
GumboParser* parser,
|
2247
|
+
GumboTokenizerState* tokenizer,
|
2248
|
+
int UNUSED_ARG(c),
|
2249
|
+
GumboToken* UNUSED_ARG(output)
|
2250
|
+
) {
|
2251
|
+
if (
|
2252
|
+
utf8iterator_maybe_consume_match (
|
2253
|
+
&tokenizer->_input,
|
2254
|
+
"--",
|
2255
|
+
sizeof("--") - 1,
|
2256
|
+
true
|
2257
|
+
)
|
2258
|
+
) {
|
1984
2259
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
|
1985
2260
|
tokenizer->_reconsume_current_input = true;
|
1986
|
-
} else if (
|
1987
|
-
|
2261
|
+
} else if (
|
2262
|
+
utf8iterator_maybe_consume_match (
|
2263
|
+
&tokenizer->_input,
|
2264
|
+
"DOCTYPE",
|
2265
|
+
sizeof("DOCTYPE") - 1,
|
2266
|
+
false
|
2267
|
+
)
|
2268
|
+
) {
|
1988
2269
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
|
1989
2270
|
tokenizer->_reconsume_current_input = true;
|
1990
2271
|
// If we get here, we know we'll eventually emit a doctype token, so now is
|
1991
|
-
// the time to initialize the doctype strings.
|
2272
|
+
// the time to initialize the doctype strings. (Not in doctype_state_init,
|
1992
2273
|
// since then they'll leak if ownership never gets transferred to the
|
1993
2274
|
// doctype token.
|
1994
|
-
tokenizer->_doc_type_state.name =
|
1995
|
-
tokenizer->_doc_type_state.public_identifier =
|
1996
|
-
|
1997
|
-
|
1998
|
-
|
1999
|
-
|
2000
|
-
|
2001
|
-
|
2275
|
+
tokenizer->_doc_type_state.name = gumbo_strdup("");
|
2276
|
+
tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
|
2277
|
+
tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
|
2278
|
+
} else if (
|
2279
|
+
tokenizer->_is_current_node_foreign
|
2280
|
+
&& utf8iterator_maybe_consume_match (
|
2281
|
+
&tokenizer->_input,
|
2282
|
+
"[CDATA[", sizeof("[CDATA[") - 1,
|
2283
|
+
true
|
2284
|
+
)
|
2285
|
+
) {
|
2002
2286
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
|
2003
2287
|
tokenizer->_is_in_cdata = true;
|
2004
2288
|
tokenizer->_reconsume_current_input = true;
|
@@ -2011,9 +2295,13 @@ static StateResult handle_markup_declaration_state(GumboParser* parser,
|
|
2011
2295
|
return NEXT_CHAR;
|
2012
2296
|
}
|
2013
2297
|
|
2014
|
-
//
|
2015
|
-
static StateResult handle_comment_start_state(
|
2016
|
-
|
2298
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
2299
|
+
static StateResult handle_comment_start_state (
|
2300
|
+
GumboParser* parser,
|
2301
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2302
|
+
int c,
|
2303
|
+
GumboToken* output
|
2304
|
+
) {
|
2017
2305
|
switch (c) {
|
2018
2306
|
case '-':
|
2019
2307
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
|
@@ -2040,9 +2328,13 @@ static StateResult handle_comment_start_state(GumboParser* parser,
|
|
2040
2328
|
}
|
2041
2329
|
}
|
2042
2330
|
|
2043
|
-
//
|
2044
|
-
static StateResult handle_comment_start_dash_state(
|
2045
|
-
|
2331
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
|
2332
|
+
static StateResult handle_comment_start_dash_state (
|
2333
|
+
GumboParser* parser,
|
2334
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2335
|
+
int c,
|
2336
|
+
GumboToken* output
|
2337
|
+
) {
|
2046
2338
|
switch (c) {
|
2047
2339
|
case '-':
|
2048
2340
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2071,9 +2363,13 @@ static StateResult handle_comment_start_dash_state(GumboParser* parser,
|
|
2071
2363
|
}
|
2072
2364
|
}
|
2073
2365
|
|
2074
|
-
//
|
2075
|
-
static StateResult handle_comment_state(
|
2076
|
-
|
2366
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-state
|
2367
|
+
static StateResult handle_comment_state (
|
2368
|
+
GumboParser* parser,
|
2369
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2370
|
+
int c,
|
2371
|
+
GumboToken* output
|
2372
|
+
) {
|
2077
2373
|
switch (c) {
|
2078
2374
|
case '-':
|
2079
2375
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2093,9 +2389,13 @@ static StateResult handle_comment_state(GumboParser* parser,
|
|
2093
2389
|
}
|
2094
2390
|
}
|
2095
2391
|
|
2096
|
-
//
|
2097
|
-
static StateResult handle_comment_end_dash_state(
|
2098
|
-
|
2392
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
|
2393
|
+
static StateResult handle_comment_end_dash_state (
|
2394
|
+
GumboParser* parser,
|
2395
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2396
|
+
int c,
|
2397
|
+
GumboToken* output
|
2398
|
+
) {
|
2099
2399
|
switch (c) {
|
2100
2400
|
case '-':
|
2101
2401
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2119,9 +2419,13 @@ static StateResult handle_comment_end_dash_state(GumboParser* parser,
|
|
2119
2419
|
}
|
2120
2420
|
}
|
2121
2421
|
|
2122
|
-
//
|
2123
|
-
static StateResult handle_comment_end_state(
|
2124
|
-
|
2422
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
|
2423
|
+
static StateResult handle_comment_end_state (
|
2424
|
+
GumboParser* parser,
|
2425
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2426
|
+
int c,
|
2427
|
+
GumboToken* output
|
2428
|
+
) {
|
2125
2429
|
switch (c) {
|
2126
2430
|
case '>':
|
2127
2431
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -2158,9 +2462,13 @@ static StateResult handle_comment_end_state(GumboParser* parser,
|
|
2158
2462
|
}
|
2159
2463
|
}
|
2160
2464
|
|
2161
|
-
//
|
2162
|
-
static StateResult handle_comment_end_bang_state(
|
2163
|
-
|
2465
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
|
2466
|
+
static StateResult handle_comment_end_bang_state (
|
2467
|
+
GumboParser* parser,
|
2468
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
2469
|
+
int c,
|
2470
|
+
GumboToken* output
|
2471
|
+
) {
|
2164
2472
|
switch (c) {
|
2165
2473
|
case '-':
|
2166
2474
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2194,9 +2502,13 @@ static StateResult handle_comment_end_bang_state(GumboParser* parser,
|
|
2194
2502
|
}
|
2195
2503
|
}
|
2196
2504
|
|
2197
|
-
//
|
2198
|
-
static StateResult handle_doctype_state(
|
2199
|
-
|
2505
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
|
2506
|
+
static StateResult handle_doctype_state (
|
2507
|
+
GumboParser* parser,
|
2508
|
+
GumboTokenizerState* tokenizer,
|
2509
|
+
int c,
|
2510
|
+
GumboToken* output
|
2511
|
+
) {
|
2200
2512
|
assert(!tokenizer->_temporary_buffer.length);
|
2201
2513
|
switch (c) {
|
2202
2514
|
case '\t':
|
@@ -2220,9 +2532,13 @@ static StateResult handle_doctype_state(GumboParser* parser,
|
|
2220
2532
|
}
|
2221
2533
|
}
|
2222
2534
|
|
2223
|
-
//
|
2224
|
-
static StateResult handle_before_doctype_name_state(
|
2225
|
-
|
2535
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
|
2536
|
+
static StateResult handle_before_doctype_name_state (
|
2537
|
+
GumboParser* parser,
|
2538
|
+
GumboTokenizerState* tokenizer,
|
2539
|
+
int c,
|
2540
|
+
GumboToken* output
|
2541
|
+
) {
|
2226
2542
|
switch (c) {
|
2227
2543
|
case '\t':
|
2228
2544
|
case '\n':
|
@@ -2255,21 +2571,25 @@ static StateResult handle_before_doctype_name_state(GumboParser* parser,
|
|
2255
2571
|
}
|
2256
2572
|
}
|
2257
2573
|
|
2258
|
-
//
|
2259
|
-
static StateResult handle_doctype_name_state(
|
2260
|
-
|
2574
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
|
2575
|
+
static StateResult handle_doctype_name_state (
|
2576
|
+
GumboParser* parser,
|
2577
|
+
GumboTokenizerState* tokenizer,
|
2578
|
+
int c,
|
2579
|
+
GumboToken* output
|
2580
|
+
) {
|
2261
2581
|
switch (c) {
|
2262
2582
|
case '\t':
|
2263
2583
|
case '\n':
|
2264
2584
|
case '\f':
|
2265
2585
|
case ' ':
|
2266
2586
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
|
2267
|
-
|
2587
|
+
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2268
2588
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2269
2589
|
return NEXT_CHAR;
|
2270
2590
|
case '>':
|
2271
2591
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2272
|
-
|
2592
|
+
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2273
2593
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2274
2594
|
emit_doctype(parser, output);
|
2275
2595
|
return RETURN_SUCCESS;
|
@@ -2281,7 +2601,7 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
|
|
2281
2601
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_EOF);
|
2282
2602
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2283
2603
|
tokenizer->_doc_type_state.force_quirks = true;
|
2284
|
-
|
2604
|
+
gumbo_free((void*) tokenizer->_doc_type_state.name);
|
2285
2605
|
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2286
2606
|
emit_doctype(parser, output);
|
2287
2607
|
return RETURN_ERROR;
|
@@ -2293,9 +2613,13 @@ static StateResult handle_doctype_name_state(GumboParser* parser,
|
|
2293
2613
|
}
|
2294
2614
|
}
|
2295
2615
|
|
2296
|
-
//
|
2297
|
-
static StateResult handle_after_doctype_name_state(
|
2298
|
-
|
2616
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
|
2617
|
+
static StateResult handle_after_doctype_name_state (
|
2618
|
+
GumboParser* parser,
|
2619
|
+
GumboTokenizerState* tokenizer,
|
2620
|
+
int c,
|
2621
|
+
GumboToken* output
|
2622
|
+
) {
|
2299
2623
|
switch (c) {
|
2300
2624
|
case '\t':
|
2301
2625
|
case '\n':
|
@@ -2333,10 +2657,13 @@ static StateResult handle_after_doctype_name_state(GumboParser* parser,
|
|
2333
2657
|
}
|
2334
2658
|
}
|
2335
2659
|
|
2336
|
-
//
|
2337
|
-
static StateResult handle_after_doctype_public_keyword_state(
|
2338
|
-
|
2339
|
-
|
2660
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
|
2661
|
+
static StateResult handle_after_doctype_public_keyword_state (
|
2662
|
+
GumboParser* parser,
|
2663
|
+
GumboTokenizerState* tokenizer,
|
2664
|
+
int c,
|
2665
|
+
GumboToken* output
|
2666
|
+
) {
|
2340
2667
|
switch (c) {
|
2341
2668
|
case '\t':
|
2342
2669
|
case '\n':
|
@@ -2346,13 +2673,13 @@ static StateResult handle_after_doctype_public_keyword_state(
|
|
2346
2673
|
return NEXT_CHAR;
|
2347
2674
|
case '"':
|
2348
2675
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2349
|
-
assert(
|
2676
|
+
assert(temporary_buffer_is_empty(parser));
|
2350
2677
|
gumbo_tokenizer_set_state(
|
2351
2678
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2352
2679
|
return NEXT_CHAR;
|
2353
2680
|
case '\'':
|
2354
2681
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2355
|
-
assert(
|
2682
|
+
assert(temporary_buffer_is_empty(parser));
|
2356
2683
|
gumbo_tokenizer_set_state(
|
2357
2684
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2358
2685
|
return NEXT_CHAR;
|
@@ -2377,9 +2704,13 @@ static StateResult handle_after_doctype_public_keyword_state(
|
|
2377
2704
|
}
|
2378
2705
|
}
|
2379
2706
|
|
2380
|
-
//
|
2381
|
-
static StateResult handle_before_doctype_public_id_state(
|
2382
|
-
|
2707
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
|
2708
|
+
static StateResult handle_before_doctype_public_id_state (
|
2709
|
+
GumboParser* parser,
|
2710
|
+
GumboTokenizerState* tokenizer,
|
2711
|
+
int c,
|
2712
|
+
GumboToken* output
|
2713
|
+
) {
|
2383
2714
|
switch (c) {
|
2384
2715
|
case '\t':
|
2385
2716
|
case '\n':
|
@@ -2387,12 +2718,12 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
|
|
2387
2718
|
case ' ':
|
2388
2719
|
return NEXT_CHAR;
|
2389
2720
|
case '"':
|
2390
|
-
assert(
|
2721
|
+
assert(temporary_buffer_is_empty(parser));
|
2391
2722
|
gumbo_tokenizer_set_state(
|
2392
2723
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
|
2393
2724
|
return NEXT_CHAR;
|
2394
2725
|
case '\'':
|
2395
|
-
assert(
|
2726
|
+
assert(temporary_buffer_is_empty(parser));
|
2396
2727
|
gumbo_tokenizer_set_state(
|
2397
2728
|
parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
|
2398
2729
|
return NEXT_CHAR;
|
@@ -2417,10 +2748,13 @@ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
|
|
2417
2748
|
}
|
2418
2749
|
}
|
2419
2750
|
|
2420
|
-
//
|
2421
|
-
static StateResult handle_doctype_public_id_double_quoted_state(
|
2422
|
-
|
2423
|
-
|
2751
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
|
2752
|
+
static StateResult handle_doctype_public_id_double_quoted_state (
|
2753
|
+
GumboParser* parser,
|
2754
|
+
GumboTokenizerState* tokenizer,
|
2755
|
+
int c,
|
2756
|
+
GumboToken* output
|
2757
|
+
) {
|
2424
2758
|
switch (c) {
|
2425
2759
|
case '"':
|
2426
2760
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2450,10 +2784,13 @@ static StateResult handle_doctype_public_id_double_quoted_state(
|
|
2450
2784
|
}
|
2451
2785
|
}
|
2452
2786
|
|
2453
|
-
//
|
2454
|
-
static StateResult handle_doctype_public_id_single_quoted_state(
|
2455
|
-
|
2456
|
-
|
2787
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
|
2788
|
+
static StateResult handle_doctype_public_id_single_quoted_state (
|
2789
|
+
GumboParser* parser,
|
2790
|
+
GumboTokenizerState* tokenizer,
|
2791
|
+
int c,
|
2792
|
+
GumboToken* output
|
2793
|
+
) {
|
2457
2794
|
switch (c) {
|
2458
2795
|
case '\'':
|
2459
2796
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2483,9 +2820,13 @@ static StateResult handle_doctype_public_id_single_quoted_state(
|
|
2483
2820
|
}
|
2484
2821
|
}
|
2485
2822
|
|
2486
|
-
//
|
2487
|
-
static StateResult handle_after_doctype_public_id_state(
|
2488
|
-
|
2823
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
|
2824
|
+
static StateResult handle_after_doctype_public_id_state (
|
2825
|
+
GumboParser* parser,
|
2826
|
+
GumboTokenizerState* tokenizer,
|
2827
|
+
int c,
|
2828
|
+
GumboToken* output
|
2829
|
+
) {
|
2489
2830
|
switch (c) {
|
2490
2831
|
case '\t':
|
2491
2832
|
case '\n':
|
@@ -2500,13 +2841,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
|
|
2500
2841
|
return RETURN_SUCCESS;
|
2501
2842
|
case '"':
|
2502
2843
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2503
|
-
assert(
|
2844
|
+
assert(temporary_buffer_is_empty(parser));
|
2504
2845
|
gumbo_tokenizer_set_state(
|
2505
2846
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2506
2847
|
return NEXT_CHAR;
|
2507
2848
|
case '\'':
|
2508
2849
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2509
|
-
assert(
|
2850
|
+
assert(temporary_buffer_is_empty(parser));
|
2510
2851
|
gumbo_tokenizer_set_state(
|
2511
2852
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2512
2853
|
return NEXT_CHAR;
|
@@ -2525,10 +2866,13 @@ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
|
|
2525
2866
|
}
|
2526
2867
|
}
|
2527
2868
|
|
2528
|
-
//
|
2529
|
-
static StateResult handle_between_doctype_public_system_id_state(
|
2530
|
-
|
2531
|
-
|
2869
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
|
2870
|
+
static StateResult handle_between_doctype_public_system_id_state (
|
2871
|
+
GumboParser* parser,
|
2872
|
+
GumboTokenizerState* tokenizer,
|
2873
|
+
int c,
|
2874
|
+
GumboToken* output
|
2875
|
+
) {
|
2532
2876
|
switch (c) {
|
2533
2877
|
case '\t':
|
2534
2878
|
case '\n':
|
@@ -2540,12 +2884,12 @@ static StateResult handle_between_doctype_public_system_id_state(
|
|
2540
2884
|
emit_doctype(parser, output);
|
2541
2885
|
return RETURN_SUCCESS;
|
2542
2886
|
case '"':
|
2543
|
-
assert(
|
2887
|
+
assert(temporary_buffer_is_empty(parser));
|
2544
2888
|
gumbo_tokenizer_set_state(
|
2545
2889
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2546
2890
|
return NEXT_CHAR;
|
2547
2891
|
case '\'':
|
2548
|
-
assert(
|
2892
|
+
assert(temporary_buffer_is_empty(parser));
|
2549
2893
|
gumbo_tokenizer_set_state(
|
2550
2894
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2551
2895
|
return NEXT_CHAR;
|
@@ -2564,10 +2908,13 @@ static StateResult handle_between_doctype_public_system_id_state(
|
|
2564
2908
|
}
|
2565
2909
|
}
|
2566
2910
|
|
2567
|
-
//
|
2568
|
-
static StateResult handle_after_doctype_system_keyword_state(
|
2569
|
-
|
2570
|
-
|
2911
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
|
2912
|
+
static StateResult handle_after_doctype_system_keyword_state (
|
2913
|
+
GumboParser* parser,
|
2914
|
+
GumboTokenizerState* tokenizer,
|
2915
|
+
int c,
|
2916
|
+
GumboToken* output
|
2917
|
+
) {
|
2571
2918
|
switch (c) {
|
2572
2919
|
case '\t':
|
2573
2920
|
case '\n':
|
@@ -2577,13 +2924,13 @@ static StateResult handle_after_doctype_system_keyword_state(
|
|
2577
2924
|
return NEXT_CHAR;
|
2578
2925
|
case '"':
|
2579
2926
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2580
|
-
assert(
|
2927
|
+
assert(temporary_buffer_is_empty(parser));
|
2581
2928
|
gumbo_tokenizer_set_state(
|
2582
2929
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2583
2930
|
return NEXT_CHAR;
|
2584
2931
|
case '\'':
|
2585
2932
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2586
|
-
assert(
|
2933
|
+
assert(temporary_buffer_is_empty(parser));
|
2587
2934
|
gumbo_tokenizer_set_state(
|
2588
2935
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2589
2936
|
return NEXT_CHAR;
|
@@ -2607,9 +2954,13 @@ static StateResult handle_after_doctype_system_keyword_state(
|
|
2607
2954
|
}
|
2608
2955
|
}
|
2609
2956
|
|
2610
|
-
//
|
2611
|
-
static StateResult handle_before_doctype_system_id_state(
|
2612
|
-
|
2957
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
|
2958
|
+
static StateResult handle_before_doctype_system_id_state (
|
2959
|
+
GumboParser* parser,
|
2960
|
+
GumboTokenizerState* tokenizer,
|
2961
|
+
int c,
|
2962
|
+
GumboToken* output
|
2963
|
+
) {
|
2613
2964
|
switch (c) {
|
2614
2965
|
case '\t':
|
2615
2966
|
case '\n':
|
@@ -2617,12 +2968,12 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
|
|
2617
2968
|
case ' ':
|
2618
2969
|
return NEXT_CHAR;
|
2619
2970
|
case '"':
|
2620
|
-
assert(
|
2971
|
+
assert(temporary_buffer_is_empty(parser));
|
2621
2972
|
gumbo_tokenizer_set_state(
|
2622
2973
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
|
2623
2974
|
return NEXT_CHAR;
|
2624
2975
|
case '\'':
|
2625
|
-
assert(
|
2976
|
+
assert(temporary_buffer_is_empty(parser));
|
2626
2977
|
gumbo_tokenizer_set_state(
|
2627
2978
|
parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
|
2628
2979
|
return NEXT_CHAR;
|
@@ -2646,10 +2997,13 @@ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
|
|
2646
2997
|
}
|
2647
2998
|
}
|
2648
2999
|
|
2649
|
-
//
|
2650
|
-
static StateResult handle_doctype_system_id_double_quoted_state(
|
2651
|
-
|
2652
|
-
|
3000
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
|
3001
|
+
static StateResult handle_doctype_system_id_double_quoted_state (
|
3002
|
+
GumboParser* parser,
|
3003
|
+
GumboTokenizerState* tokenizer,
|
3004
|
+
int c,
|
3005
|
+
GumboToken* output
|
3006
|
+
) {
|
2653
3007
|
switch (c) {
|
2654
3008
|
case '"':
|
2655
3009
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2679,10 +3033,13 @@ static StateResult handle_doctype_system_id_double_quoted_state(
|
|
2679
3033
|
}
|
2680
3034
|
}
|
2681
3035
|
|
2682
|
-
//
|
2683
|
-
static StateResult handle_doctype_system_id_single_quoted_state(
|
2684
|
-
|
2685
|
-
|
3036
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
|
3037
|
+
static StateResult handle_doctype_system_id_single_quoted_state (
|
3038
|
+
GumboParser* parser,
|
3039
|
+
GumboTokenizerState* tokenizer,
|
3040
|
+
int c,
|
3041
|
+
GumboToken* output
|
3042
|
+
) {
|
2686
3043
|
switch (c) {
|
2687
3044
|
case '\'':
|
2688
3045
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2712,9 +3069,13 @@ static StateResult handle_doctype_system_id_single_quoted_state(
|
|
2712
3069
|
}
|
2713
3070
|
}
|
2714
3071
|
|
2715
|
-
//
|
2716
|
-
static StateResult handle_after_doctype_system_id_state(
|
2717
|
-
|
3072
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
|
3073
|
+
static StateResult handle_after_doctype_system_id_state (
|
3074
|
+
GumboParser* parser,
|
3075
|
+
GumboTokenizerState* tokenizer,
|
3076
|
+
int c,
|
3077
|
+
GumboToken* output
|
3078
|
+
) {
|
2718
3079
|
switch (c) {
|
2719
3080
|
case '\t':
|
2720
3081
|
case '\n':
|
@@ -2738,9 +3099,13 @@ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
|
|
2738
3099
|
}
|
2739
3100
|
}
|
2740
3101
|
|
2741
|
-
//
|
2742
|
-
static StateResult handle_bogus_doctype_state(
|
2743
|
-
|
3102
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
|
3103
|
+
static StateResult handle_bogus_doctype_state (
|
3104
|
+
GumboParser* parser,
|
3105
|
+
GumboTokenizerState* UNUSED_ARG(tokenizer),
|
3106
|
+
int c,
|
3107
|
+
GumboToken* output
|
3108
|
+
) {
|
2744
3109
|
if (c == '>' || c == -1) {
|
2745
3110
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2746
3111
|
emit_doctype(parser, output);
|
@@ -2749,9 +3114,13 @@ static StateResult handle_bogus_doctype_state(GumboParser* parser,
|
|
2749
3114
|
return NEXT_CHAR;
|
2750
3115
|
}
|
2751
3116
|
|
2752
|
-
//
|
2753
|
-
static StateResult handle_cdata_state(
|
2754
|
-
|
3117
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
3118
|
+
static StateResult handle_cdata_state (
|
3119
|
+
GumboParser* parser,
|
3120
|
+
GumboTokenizerState* tokenizer,
|
3121
|
+
int c,
|
3122
|
+
GumboToken* output
|
3123
|
+
) {
|
2755
3124
|
if (c == -1 || utf8iterator_maybe_consume_match(
|
2756
3125
|
&tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
|
2757
3126
|
tokenizer->_reconsume_current_input = true;
|
@@ -2764,50 +3133,83 @@ static StateResult handle_cdata_state(GumboParser* parser,
|
|
2764
3133
|
}
|
2765
3134
|
}
|
2766
3135
|
|
2767
|
-
typedef StateResult (*GumboLexerStateFunction)(
|
2768
|
-
|
2769
|
-
|
2770
|
-
|
2771
|
-
|
2772
|
-
|
2773
|
-
|
2774
|
-
|
2775
|
-
|
2776
|
-
|
2777
|
-
|
2778
|
-
|
2779
|
-
|
2780
|
-
|
2781
|
-
|
2782
|
-
|
2783
|
-
|
2784
|
-
|
2785
|
-
|
2786
|
-
|
2787
|
-
|
2788
|
-
|
2789
|
-
|
2790
|
-
|
2791
|
-
|
2792
|
-
|
2793
|
-
|
2794
|
-
|
2795
|
-
|
2796
|
-
|
2797
|
-
|
2798
|
-
|
2799
|
-
|
2800
|
-
|
2801
|
-
|
2802
|
-
|
2803
|
-
|
2804
|
-
|
2805
|
-
|
2806
|
-
|
2807
|
-
|
2808
|
-
|
2809
|
-
|
2810
|
-
|
3136
|
+
typedef StateResult (*GumboLexerStateFunction) (
|
3137
|
+
GumboParser* parser,
|
3138
|
+
GumboTokenizerState* tokenizer,
|
3139
|
+
int c,
|
3140
|
+
GumboToken* output
|
3141
|
+
);
|
3142
|
+
|
3143
|
+
static GumboLexerStateFunction dispatch_table[] = {
|
3144
|
+
handle_data_state,
|
3145
|
+
handle_char_ref_in_data_state,
|
3146
|
+
handle_rcdata_state,
|
3147
|
+
handle_char_ref_in_rcdata_state,
|
3148
|
+
handle_rawtext_state,
|
3149
|
+
handle_script_state,
|
3150
|
+
handle_plaintext_state,
|
3151
|
+
handle_tag_open_state,
|
3152
|
+
handle_end_tag_open_state,
|
3153
|
+
handle_tag_name_state,
|
3154
|
+
handle_rcdata_lt_state,
|
3155
|
+
handle_rcdata_end_tag_open_state,
|
3156
|
+
handle_rcdata_end_tag_name_state,
|
3157
|
+
handle_rawtext_lt_state,
|
3158
|
+
handle_rawtext_end_tag_open_state,
|
3159
|
+
handle_rawtext_end_tag_name_state,
|
3160
|
+
handle_script_lt_state,
|
3161
|
+
handle_script_end_tag_open_state,
|
3162
|
+
handle_script_end_tag_name_state,
|
3163
|
+
handle_script_escaped_start_state,
|
3164
|
+
handle_script_escaped_start_dash_state,
|
3165
|
+
handle_script_escaped_state,
|
3166
|
+
handle_script_escaped_dash_state,
|
3167
|
+
handle_script_escaped_dash_dash_state,
|
3168
|
+
handle_script_escaped_lt_state,
|
3169
|
+
handle_script_escaped_end_tag_open_state,
|
3170
|
+
handle_script_escaped_end_tag_name_state,
|
3171
|
+
handle_script_double_escaped_start_state,
|
3172
|
+
handle_script_double_escaped_state,
|
3173
|
+
handle_script_double_escaped_dash_state,
|
3174
|
+
handle_script_double_escaped_dash_dash_state,
|
3175
|
+
handle_script_double_escaped_lt_state,
|
3176
|
+
handle_script_double_escaped_end_state,
|
3177
|
+
handle_before_attr_name_state,
|
3178
|
+
handle_attr_name_state,
|
3179
|
+
handle_after_attr_name_state,
|
3180
|
+
handle_before_attr_value_state,
|
3181
|
+
handle_attr_value_double_quoted_state,
|
3182
|
+
handle_attr_value_single_quoted_state,
|
3183
|
+
handle_attr_value_unquoted_state,
|
3184
|
+
handle_char_ref_in_attr_value_state,
|
3185
|
+
handle_after_attr_value_quoted_state,
|
3186
|
+
handle_self_closing_start_tag_state,
|
3187
|
+
handle_bogus_comment_state,
|
3188
|
+
handle_markup_declaration_state,
|
3189
|
+
handle_comment_start_state,
|
3190
|
+
handle_comment_start_dash_state,
|
3191
|
+
handle_comment_state,
|
3192
|
+
handle_comment_end_dash_state,
|
3193
|
+
handle_comment_end_state,
|
3194
|
+
handle_comment_end_bang_state,
|
3195
|
+
handle_doctype_state,
|
3196
|
+
handle_before_doctype_name_state,
|
3197
|
+
handle_doctype_name_state,
|
3198
|
+
handle_after_doctype_name_state,
|
3199
|
+
handle_after_doctype_public_keyword_state,
|
3200
|
+
handle_before_doctype_public_id_state,
|
3201
|
+
handle_doctype_public_id_double_quoted_state,
|
3202
|
+
handle_doctype_public_id_single_quoted_state,
|
3203
|
+
handle_after_doctype_public_id_state,
|
3204
|
+
handle_between_doctype_public_system_id_state,
|
3205
|
+
handle_after_doctype_system_keyword_state,
|
3206
|
+
handle_before_doctype_system_id_state,
|
3207
|
+
handle_doctype_system_id_double_quoted_state,
|
3208
|
+
handle_doctype_system_id_single_quoted_state,
|
3209
|
+
handle_after_doctype_system_id_state,
|
3210
|
+
handle_bogus_doctype_state,
|
3211
|
+
handle_cdata_state
|
3212
|
+
};
|
2811
3213
|
|
2812
3214
|
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
2813
3215
|
// Because of the spec requirements that...
|
@@ -2819,9 +3221,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2819
3221
|
// state.
|
2820
3222
|
//
|
2821
3223
|
// ...all state must be held in the GumboTokenizer struct instead of in local
|
2822
|
-
// variables in this function.
|
3224
|
+
// variables in this function. That allows us to return from this method with
|
2823
3225
|
// a token, and then immediately jump back to the same state with the same
|
2824
|
-
// input if we need to return a different token.
|
3226
|
+
// input if we need to return a different token. The various emit_* functions
|
2825
3227
|
// are responsible for changing state (eg. flushing the chardata buffer,
|
2826
3228
|
// reading the next input character) to avoid an infinite loop.
|
2827
3229
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
@@ -2845,10 +3247,9 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2845
3247
|
assert(!tokenizer->_temporary_buffer_emit);
|
2846
3248
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
2847
3249
|
int c = utf8iterator_current(&tokenizer->_input);
|
2848
|
-
|
2849
|
-
|
2850
|
-
StateResult result =
|
2851
|
-
dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
|
3250
|
+
GumboTokenizerEnum state = tokenizer->_state;
|
3251
|
+
gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state);
|
3252
|
+
StateResult result = dispatch_table[state](parser, tokenizer, c, output);
|
2852
3253
|
// We need to clear reconsume_current_input before returning to prevent
|
2853
3254
|
// certain infinite loop states.
|
2854
3255
|
bool should_advance = !tokenizer->_reconsume_current_input;
|
@@ -2866,30 +3267,33 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2866
3267
|
}
|
2867
3268
|
}
|
2868
3269
|
|
2869
|
-
void gumbo_token_destroy(
|
3270
|
+
void gumbo_token_destroy(GumboToken* token) {
|
2870
3271
|
if (!token) return;
|
2871
3272
|
|
2872
3273
|
switch (token->type) {
|
2873
3274
|
case GUMBO_TOKEN_DOCTYPE:
|
2874
|
-
|
2875
|
-
|
2876
|
-
|
2877
|
-
gumbo_parser_deallocate(
|
2878
|
-
parser, (void*) token->v.doc_type.system_identifier);
|
3275
|
+
gumbo_free((void*) token->v.doc_type.name);
|
3276
|
+
gumbo_free((void*) token->v.doc_type.public_identifier);
|
3277
|
+
gumbo_free((void*) token->v.doc_type.system_identifier);
|
2879
3278
|
return;
|
2880
3279
|
case GUMBO_TOKEN_START_TAG:
|
2881
3280
|
for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
|
2882
3281
|
GumboAttribute* attr = token->v.start_tag.attributes.data[i];
|
2883
3282
|
if (attr) {
|
2884
3283
|
// May have been nulled out if this token was merged with another.
|
2885
|
-
gumbo_destroy_attribute(
|
3284
|
+
gumbo_destroy_attribute(attr);
|
2886
3285
|
}
|
2887
3286
|
}
|
2888
|
-
|
2889
|
-
|
3287
|
+
gumbo_free((void*) token->v.start_tag.attributes.data);
|
3288
|
+
if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN)
|
3289
|
+
gumbo_free(token->v.start_tag.name);
|
2890
3290
|
return;
|
3291
|
+
case GUMBO_TOKEN_END_TAG:
|
3292
|
+
if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN)
|
3293
|
+
gumbo_free(token->v.end_tag.name);
|
3294
|
+
break;
|
2891
3295
|
case GUMBO_TOKEN_COMMENT:
|
2892
|
-
|
3296
|
+
gumbo_free((void*) token->v.text);
|
2893
3297
|
return;
|
2894
3298
|
default:
|
2895
3299
|
return;
|