nokogumbo 2.0.0.pre.alpha → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
@@ -44,7 +44,6 @@ typedef struct GumboInternalTokenEndTag {
|
|
44
44
|
GumboTag tag;
|
45
45
|
// NULL unless tag is GUMBO_TAG_UNKNOWN
|
46
46
|
char *name;
|
47
|
-
bool is_self_closing;
|
48
47
|
} GumboTokenEndTag;
|
49
48
|
|
50
49
|
// A data structure representing a single token in the input stream. This
|
@@ -83,11 +82,12 @@ void gumbo_tokenizer_set_state (
|
|
83
82
|
GumboTokenizerEnum state
|
84
83
|
);
|
85
84
|
|
86
|
-
// Flags whether the current node is a foreign content element. This
|
87
|
-
// necessary for the markup declaration open state, where the tokenizer
|
88
|
-
// aware of the state of the parser to properly tokenize bad comment
|
85
|
+
// Flags whether the adjusted current node is a foreign content element. This
|
86
|
+
// is necessary for the markup declaration open state, where the tokenizer
|
87
|
+
// must be aware of the state of the parser to properly tokenize bad comment
|
88
|
+
// tags.
|
89
89
|
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
90
|
-
void
|
90
|
+
void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
91
91
|
struct GumboInternalParser* parser,
|
92
92
|
bool is_foreign
|
93
93
|
);
|
@@ -14,74 +14,326 @@
|
|
14
14
|
// The ordering of this enum is also used to build the dispatch table for the
|
15
15
|
// tokenizer state machine, so if it is changed, be sure to update that too.
|
16
16
|
typedef enum {
|
17
|
+
// 12.2.5.1 Data state
|
18
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
17
19
|
GUMBO_LEX_DATA,
|
18
|
-
|
20
|
+
|
21
|
+
// 12.2.5.2 RCDATA state
|
22
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
19
23
|
GUMBO_LEX_RCDATA,
|
20
|
-
|
24
|
+
|
25
|
+
// 12.2.5.3 RAWTEXT state
|
26
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
21
27
|
GUMBO_LEX_RAWTEXT,
|
22
|
-
|
28
|
+
|
29
|
+
// 12.2.5.4 Script data state
|
30
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
31
|
+
GUMBO_LEX_SCRIPT_DATA,
|
32
|
+
|
33
|
+
// 12.2.5.5 PLAINTEXT state
|
34
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
|
23
35
|
GUMBO_LEX_PLAINTEXT,
|
36
|
+
|
37
|
+
// 12.2.5.6 Tag open state
|
38
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
24
39
|
GUMBO_LEX_TAG_OPEN,
|
40
|
+
|
41
|
+
// 12.2.5.7 End tag open state
|
42
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
25
43
|
GUMBO_LEX_END_TAG_OPEN,
|
44
|
+
|
45
|
+
// 12.2.5.8 Tag name state
|
46
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
26
47
|
GUMBO_LEX_TAG_NAME,
|
48
|
+
|
49
|
+
// 12.2.5.9 RCDATA less-than sign state
|
50
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
|
27
51
|
GUMBO_LEX_RCDATA_LT,
|
52
|
+
|
53
|
+
// 12.2.5.10 RCDATA end tag open state
|
54
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
28
55
|
GUMBO_LEX_RCDATA_END_TAG_OPEN,
|
56
|
+
|
57
|
+
// 12.2.5.11 RCDATA end tag name state
|
58
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
29
59
|
GUMBO_LEX_RCDATA_END_TAG_NAME,
|
60
|
+
|
61
|
+
// 12.2.5.12 RAWTEXT less-than sign state
|
62
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
30
63
|
GUMBO_LEX_RAWTEXT_LT,
|
64
|
+
|
65
|
+
// 12.2.5.13 RAWTEXT end tag open state
|
66
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
31
67
|
GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
|
68
|
+
|
69
|
+
// 12.2.5.14 RAWTEXT end tag name state
|
70
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
|
32
71
|
GUMBO_LEX_RAWTEXT_END_TAG_NAME,
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
72
|
+
|
73
|
+
// 12.2.5.15 Script data less-than sign state
|
74
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
75
|
+
GUMBO_LEX_SCRIPT_DATA_LT,
|
76
|
+
|
77
|
+
// 12.2.5.16 Script data end tag open state
|
78
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
79
|
+
GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
|
80
|
+
|
81
|
+
// 12.2.5.17 Script data end tag name state
|
82
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
83
|
+
GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
|
84
|
+
|
85
|
+
// 12.2.5.18 Script data escape start state
|
86
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
87
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
|
88
|
+
|
89
|
+
// 12.2.5.19 Script data escape start dash state
|
90
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
91
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
|
92
|
+
|
93
|
+
// 12.2.5.20 Script data escaped state
|
94
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
95
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED,
|
96
|
+
|
97
|
+
// 12.2.5.21 Script data escaped dash state
|
98
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
99
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
|
100
|
+
|
101
|
+
// 12.2.5.22 Script data escaped dash dash state
|
102
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
103
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
|
104
|
+
|
105
|
+
// 12.2.5.23 Script data escaped less than sign state
|
106
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
107
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
|
108
|
+
|
109
|
+
// 12.2.5.24 Script data escaped end tag open state
|
110
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
111
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
|
112
|
+
|
113
|
+
// 12.2.5.25 Script data escaped end tag name state
|
114
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
115
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
|
116
|
+
|
117
|
+
// 12.2.5.26 Script data double escape start state
|
118
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
119
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
|
120
|
+
|
121
|
+
// 12.2.5.27 Script data double escaped state
|
122
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
|
123
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
|
124
|
+
|
125
|
+
// 12.2.5.28 Script data double escaped dash state
|
126
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
|
127
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
|
128
|
+
|
129
|
+
// 12.2.5.29 Script data double escaped dash dash state
|
130
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
131
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
|
132
|
+
|
133
|
+
// 12.2.5.30 Script data double escaped less-than sign state
|
134
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
135
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
|
136
|
+
|
137
|
+
// 12.2.5.31 Script data double escape end state (XXX: spec bug with the
|
138
|
+
// name?)
|
139
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
140
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
|
141
|
+
|
142
|
+
// 12.2.5.32 Before attribute name state
|
143
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
50
144
|
GUMBO_LEX_BEFORE_ATTR_NAME,
|
145
|
+
|
146
|
+
// 12.2.5.33 Attribute name state
|
147
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
51
148
|
GUMBO_LEX_ATTR_NAME,
|
149
|
+
|
150
|
+
// 12.2.5.34 After attribute name state
|
151
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
52
152
|
GUMBO_LEX_AFTER_ATTR_NAME,
|
153
|
+
|
154
|
+
// 12.2.5.35 Before attribute value state
|
155
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
|
53
156
|
GUMBO_LEX_BEFORE_ATTR_VALUE,
|
157
|
+
|
158
|
+
// 12.2.5.36 Attribute value (double-quoted) state
|
159
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
|
54
160
|
GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
|
161
|
+
|
162
|
+
// 12.2.5.37 Attribute value (single-quoted) state
|
163
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
|
55
164
|
GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
|
165
|
+
|
166
|
+
// 12.2.5.38 Attribute value (unquoted) state
|
167
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
|
56
168
|
GUMBO_LEX_ATTR_VALUE_UNQUOTED,
|
57
|
-
|
169
|
+
|
170
|
+
// 12.2.5.39 After attribute value (quoted) state
|
171
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
|
58
172
|
GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
|
173
|
+
|
174
|
+
// 12.2.5.40 Self-closing start tag state
|
175
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
|
59
176
|
GUMBO_LEX_SELF_CLOSING_START_TAG,
|
177
|
+
|
178
|
+
// 12.2.5.41 Bogus comment state
|
179
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
60
180
|
GUMBO_LEX_BOGUS_COMMENT,
|
61
|
-
|
181
|
+
|
182
|
+
// 12.2.5.42 Markup declaration open state
|
183
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
184
|
+
GUMBO_LEX_MARKUP_DECLARATION_OPEN,
|
185
|
+
|
186
|
+
// 12.2.5.43 Comment start state
|
187
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
62
188
|
GUMBO_LEX_COMMENT_START,
|
189
|
+
|
190
|
+
// 12.2.5.44 Comment start dash state
|
191
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
|
63
192
|
GUMBO_LEX_COMMENT_START_DASH,
|
193
|
+
|
194
|
+
// 12.2.5.45 Comment state
|
195
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-state
|
64
196
|
GUMBO_LEX_COMMENT,
|
197
|
+
|
198
|
+
// 12.2.5.46 Comment less-than sign state
|
199
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
|
200
|
+
GUMBO_LEX_COMMENT_LT,
|
201
|
+
|
202
|
+
// 12.2.5.47 Comment less-than sign bang state
|
203
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
|
204
|
+
GUMBO_LEX_COMMENT_LT_BANG,
|
205
|
+
|
206
|
+
// 12.2.5.48 Comment less-than sign bang dash state
|
207
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
|
208
|
+
GUMBO_LEX_COMMENT_LT_BANG_DASH,
|
209
|
+
|
210
|
+
// 12.2.5.49 Comment less-than sign bang dash dash state
|
211
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
|
212
|
+
GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
|
213
|
+
|
214
|
+
// 12.2.5.50 Comment end dash state
|
215
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
|
65
216
|
GUMBO_LEX_COMMENT_END_DASH,
|
217
|
+
|
218
|
+
// 12.2.5.51 Comment end state
|
219
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
|
66
220
|
GUMBO_LEX_COMMENT_END,
|
221
|
+
|
222
|
+
// 12.2.5.52 Comment end bang state
|
223
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
|
67
224
|
GUMBO_LEX_COMMENT_END_BANG,
|
225
|
+
|
226
|
+
// 12.2.5.53 DOCTYPE state
|
227
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
|
68
228
|
GUMBO_LEX_DOCTYPE,
|
229
|
+
|
230
|
+
// 12.2.5.54 Before DOCTYPE name state
|
231
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
|
69
232
|
GUMBO_LEX_BEFORE_DOCTYPE_NAME,
|
233
|
+
|
234
|
+
// 12.2.5.55 DOCTYPE name state
|
235
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
|
70
236
|
GUMBO_LEX_DOCTYPE_NAME,
|
237
|
+
|
238
|
+
// 12.2.5.56 After DOCTYPE name state
|
239
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
|
71
240
|
GUMBO_LEX_AFTER_DOCTYPE_NAME,
|
241
|
+
|
242
|
+
// 12.2.5.57 After DOCTYPE public keyword state
|
243
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
|
72
244
|
GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
|
245
|
+
|
246
|
+
// 12.2.5.58 Before DOCTYPE public identifier state
|
247
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
|
73
248
|
GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
|
249
|
+
|
250
|
+
// 12.2.5.59 DOCTYPE public identifier (double-quoted) state
|
251
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
|
74
252
|
GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
|
253
|
+
|
254
|
+
// 12.2.5.60 DOCTYPE public identifier (single-quoted) state
|
255
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
|
75
256
|
GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
|
257
|
+
|
258
|
+
// 12.2.5.61 After DOCTYPE public identifier state
|
259
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
|
76
260
|
GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
|
261
|
+
|
262
|
+
// 12.2.5.62 Between DOCTYPE public and system identifiers state
|
263
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
|
77
264
|
GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
|
265
|
+
|
266
|
+
// 12.2.5.63 After DOCTYPE system keyword state
|
267
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
|
78
268
|
GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
|
269
|
+
|
270
|
+
// 12.2.5.64 Before DOCTYPE system identifier state
|
271
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
|
79
272
|
GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
|
273
|
+
|
274
|
+
// 12.2.5.65 DOCTYPE system identifier (double-quoted) state
|
275
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
|
80
276
|
GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
|
277
|
+
|
278
|
+
// 12.2.5.66 DOCTYPE system identifier (single-quoted) state
|
279
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
|
81
280
|
GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
|
281
|
+
|
282
|
+
// 12.2.5.67 After DOCTYPE system identifier state
|
283
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
|
82
284
|
GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
|
285
|
+
|
286
|
+
// 12.2.5.68 Bogus DOCTYPE state
|
287
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
|
83
288
|
GUMBO_LEX_BOGUS_DOCTYPE,
|
84
|
-
|
289
|
+
|
290
|
+
// 12.2.5.69 CDATA section state
|
291
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
292
|
+
GUMBO_LEX_CDATA_SECTION,
|
293
|
+
|
294
|
+
// 12.2.5.70 CDATA section bracket state
|
295
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
|
296
|
+
GUMBO_LEX_CDATA_SECTION_BRACKET,
|
297
|
+
|
298
|
+
// 12.2.5.71 CDATA section end state
|
299
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
|
300
|
+
GUMBO_LEX_CDATA_SECTION_END,
|
301
|
+
|
302
|
+
// 12.2.5.72 Character reference state
|
303
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
|
304
|
+
GUMBO_LEX_CHARACTER_REFERENCE,
|
305
|
+
|
306
|
+
// 12.2.5.73 Named character reference state
|
307
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
308
|
+
GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
|
309
|
+
|
310
|
+
// 12.2.5.74 Ambiguous ampersand state
|
311
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
|
312
|
+
GUMBO_LEX_AMBIGUOUS_AMPERSAND,
|
313
|
+
|
314
|
+
// 12.2.5.75 Numeric character reference state
|
315
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
|
316
|
+
GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
|
317
|
+
|
318
|
+
// 12.2.5.76 Hexadecimal character reference start state
|
319
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
|
320
|
+
GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
|
321
|
+
|
322
|
+
// 12.2.5.77 Decimal character reference start state
|
323
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
|
324
|
+
GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
|
325
|
+
|
326
|
+
// 12.2.5.78 Hexadecimal character reference state
|
327
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
|
328
|
+
GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
|
329
|
+
|
330
|
+
// 12.2.5.79 Decimal character reference state
|
331
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
|
332
|
+
GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
|
333
|
+
|
334
|
+
// 12.2.5.80 Numeric character reference end state
|
335
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
336
|
+
GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
|
85
337
|
} GumboTokenizerEnum;
|
86
338
|
|
87
339
|
#endif // GUMBO_TOKENIZER_STATES_H_
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -27,8 +27,6 @@
|
|
27
27
|
#include "ascii.h"
|
28
28
|
#include "vector.h"
|
29
29
|
|
30
|
-
const int kUtf8ReplacementChar = 0xFFFD;
|
31
|
-
|
32
30
|
// References:
|
33
31
|
// * https://tools.ietf.org/html/rfc3629
|
34
32
|
// * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
|
@@ -100,16 +98,9 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
|
|
100
98
|
}
|
101
99
|
error->type = type;
|
102
100
|
error->position = iter->_pos;
|
103
|
-
error->original_text = iter->_start;
|
104
|
-
|
105
|
-
|
106
|
-
// yet (and can't be, because it's invalid), so we need to build up the raw
|
107
|
-
// hex value from the bytes under the cursor.
|
108
|
-
uint32_t code_point = 0;
|
109
|
-
for (size_t i = 0; i < iter->_width; ++i) {
|
110
|
-
code_point = (code_point << 8) | (unsigned char) iter->_start[i];
|
111
|
-
}
|
112
|
-
error->v.codepoint = code_point;
|
101
|
+
error->original_text.data = iter->_start;
|
102
|
+
error->original_text.length = iter->_width;
|
103
|
+
error->v.tokenizer.codepoint = iter->_current;
|
113
104
|
}
|
114
105
|
|
115
106
|
// Reads the next UTF-8 character in the iter.
|
@@ -147,13 +138,15 @@ static void read_char(Utf8Iterator* iter) {
|
|
147
138
|
}
|
148
139
|
code_point = '\n';
|
149
140
|
}
|
150
|
-
if (utf8_is_invalid_code_point(code_point)) {
|
151
|
-
// Invalid code points are errors, but they are not replaced by
|
152
|
-
// U+FFFD.
|
153
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
|
154
|
-
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
155
|
-
}
|
156
141
|
iter->_current = code_point;
|
142
|
+
if (utf8_is_surrogate(code_point)) {
|
143
|
+
add_error(iter, GUMBO_ERR_SURROGATE_IN_INPUT_STREAM);
|
144
|
+
} else if (utf8_is_noncharacter(code_point)) {
|
145
|
+
add_error(iter, GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM);
|
146
|
+
} else if (utf8_is_control(code_point)
|
147
|
+
&& !(gumbo_ascii_isspace(code_point) || code_point == 0)) {
|
148
|
+
add_error(iter, GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM);
|
149
|
+
}
|
157
150
|
return;
|
158
151
|
} else if (state == UTF8_REJECT) {
|
159
152
|
// We don't want to consume the invalid continuation byte of a multi-byte
|
@@ -169,8 +162,8 @@ static void read_char(Utf8Iterator* iter) {
|
|
169
162
|
// rest of the iterator, and emit a replacement character. The next time we
|
170
163
|
// enter this method, it will detect that there's no input to consume and
|
171
164
|
// output an EOF.
|
172
|
-
iter->_current = kUtf8ReplacementChar;
|
173
165
|
iter->_width = iter->_end - iter->_start;
|
166
|
+
iter->_current = kUtf8ReplacementChar;
|
174
167
|
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
175
168
|
}
|
176
169
|
|
@@ -187,20 +180,6 @@ static void update_position(Utf8Iterator* iter) {
|
|
187
180
|
}
|
188
181
|
}
|
189
182
|
|
190
|
-
// Returns true if this Unicode code point is in the list of characters
|
191
|
-
// forbidden by the HTML5 spec, such as undefined control chars.
|
192
|
-
bool utf8_is_invalid_code_point(int c) {
|
193
|
-
return
|
194
|
-
(c >= 0x1 && c <= 0x8)
|
195
|
-
|| c == 0xB
|
196
|
-
|| (c >= 0xE && c <= 0x1F)
|
197
|
-
|| (c >= 0x7F && c <= 0x9F)
|
198
|
-
|| (c >= 0xFDD0 && c <= 0xFDEF)
|
199
|
-
|| ((c & 0xFFFF) == 0xFFFE)
|
200
|
-
|| ((c & 0xFFFF) == 0xFFFF)
|
201
|
-
;
|
202
|
-
}
|
203
|
-
|
204
183
|
void utf8iterator_init (
|
205
184
|
GumboParser* parser,
|
206
185
|
const char* source,
|
@@ -224,25 +203,6 @@ void utf8iterator_next(Utf8Iterator* iter) {
|
|
224
203
|
read_char(iter);
|
225
204
|
}
|
226
205
|
|
227
|
-
int utf8iterator_current(const Utf8Iterator* iter) {
|
228
|
-
return iter->_current;
|
229
|
-
}
|
230
|
-
|
231
|
-
void utf8iterator_get_position (
|
232
|
-
const Utf8Iterator* iter,
|
233
|
-
GumboSourcePosition* output
|
234
|
-
) {
|
235
|
-
*output = iter->_pos;
|
236
|
-
}
|
237
|
-
|
238
|
-
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
239
|
-
return iter->_start;
|
240
|
-
}
|
241
|
-
|
242
|
-
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
243
|
-
return iter->_end;
|
244
|
-
}
|
245
|
-
|
246
206
|
bool utf8iterator_maybe_consume_match (
|
247
207
|
Utf8Iterator* iter,
|
248
208
|
const char* prefix,
|
@@ -278,10 +238,3 @@ void utf8iterator_reset(Utf8Iterator* iter) {
|
|
278
238
|
iter->_pos = iter->_mark_pos;
|
279
239
|
read_char(iter);
|
280
240
|
}
|
281
|
-
|
282
|
-
// Sets the position and original text fields of an error to the value at the
|
283
|
-
// mark.
|
284
|
-
void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
|
285
|
-
error->position = iter->_mark_pos;
|
286
|
-
error->original_text = iter->_mark;
|
287
|
-
}
|