nokogumbo 2.0.0.pre.alpha → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
@@ -44,7 +44,6 @@ typedef struct GumboInternalTokenEndTag {
|
|
44
44
|
GumboTag tag;
|
45
45
|
// NULL unless tag is GUMBO_TAG_UNKNOWN
|
46
46
|
char *name;
|
47
|
-
bool is_self_closing;
|
48
47
|
} GumboTokenEndTag;
|
49
48
|
|
50
49
|
// A data structure representing a single token in the input stream. This
|
@@ -83,11 +82,12 @@ void gumbo_tokenizer_set_state (
|
|
83
82
|
GumboTokenizerEnum state
|
84
83
|
);
|
85
84
|
|
86
|
-
// Flags whether the current node is a foreign content element. This
|
87
|
-
// necessary for the markup declaration open state, where the tokenizer
|
88
|
-
// aware of the state of the parser to properly tokenize bad comment
|
85
|
+
// Flags whether the adjusted current node is a foreign content element. This
|
86
|
+
// is necessary for the markup declaration open state, where the tokenizer
|
87
|
+
// must be aware of the state of the parser to properly tokenize bad comment
|
88
|
+
// tags.
|
89
89
|
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
90
|
-
void
|
90
|
+
void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
|
91
91
|
struct GumboInternalParser* parser,
|
92
92
|
bool is_foreign
|
93
93
|
);
|
@@ -14,74 +14,326 @@
|
|
14
14
|
// The ordering of this enum is also used to build the dispatch table for the
|
15
15
|
// tokenizer state machine, so if it is changed, be sure to update that too.
|
16
16
|
typedef enum {
|
17
|
+
// 12.2.5.1 Data state
|
18
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
17
19
|
GUMBO_LEX_DATA,
|
18
|
-
|
20
|
+
|
21
|
+
// 12.2.5.2 RCDATA state
|
22
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
19
23
|
GUMBO_LEX_RCDATA,
|
20
|
-
|
24
|
+
|
25
|
+
// 12.2.5.3 RAWTEXT state
|
26
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
21
27
|
GUMBO_LEX_RAWTEXT,
|
22
|
-
|
28
|
+
|
29
|
+
// 12.2.5.4 Script data state
|
30
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
31
|
+
GUMBO_LEX_SCRIPT_DATA,
|
32
|
+
|
33
|
+
// 12.2.5.5 PLAINTEXT state
|
34
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
|
23
35
|
GUMBO_LEX_PLAINTEXT,
|
36
|
+
|
37
|
+
// 12.2.5.6 Tag open state
|
38
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
24
39
|
GUMBO_LEX_TAG_OPEN,
|
40
|
+
|
41
|
+
// 12.2.5.7 End tag open state
|
42
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
25
43
|
GUMBO_LEX_END_TAG_OPEN,
|
44
|
+
|
45
|
+
// 12.2.5.8 Tag name state
|
46
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
26
47
|
GUMBO_LEX_TAG_NAME,
|
48
|
+
|
49
|
+
// 12.2.5.9 RCDATA less-than sign state
|
50
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
|
27
51
|
GUMBO_LEX_RCDATA_LT,
|
52
|
+
|
53
|
+
// 12.2.5.10 RCDATA end tag open state
|
54
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
28
55
|
GUMBO_LEX_RCDATA_END_TAG_OPEN,
|
56
|
+
|
57
|
+
// 12.2.5.11 RCDATA end tag name state
|
58
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
29
59
|
GUMBO_LEX_RCDATA_END_TAG_NAME,
|
60
|
+
|
61
|
+
// 12.2.5.12 RAWTEXT less-than sign state
|
62
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
30
63
|
GUMBO_LEX_RAWTEXT_LT,
|
64
|
+
|
65
|
+
// 12.2.5.13 RAWTEXT end tag open state
|
66
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
31
67
|
GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
|
68
|
+
|
69
|
+
// 12.2.5.14 RAWTEXT end tag name state
|
70
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
|
32
71
|
GUMBO_LEX_RAWTEXT_END_TAG_NAME,
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
72
|
+
|
73
|
+
// 12.2.5.15 Script data less-than sign state
|
74
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
75
|
+
GUMBO_LEX_SCRIPT_DATA_LT,
|
76
|
+
|
77
|
+
// 12.2.5.16 Script data end tag open state
|
78
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
79
|
+
GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
|
80
|
+
|
81
|
+
// 12.2.5.17 Script data end tag name state
|
82
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
83
|
+
GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
|
84
|
+
|
85
|
+
// 12.2.5.18 Script data escape start state
|
86
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
87
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
|
88
|
+
|
89
|
+
// 12.2.5.19 Script data escape start dash state
|
90
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
91
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
|
92
|
+
|
93
|
+
// 12.2.5.20 Script data escaped state
|
94
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
95
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED,
|
96
|
+
|
97
|
+
// 12.2.5.21 Script data escaped dash state
|
98
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
99
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
|
100
|
+
|
101
|
+
// 12.2.5.22 Script data escaped dash dash state
|
102
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
103
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
|
104
|
+
|
105
|
+
// 12.2.5.23 Script data escaped less than sign state
|
106
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
107
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
|
108
|
+
|
109
|
+
// 12.2.5.24 Script data escaped end tag open state
|
110
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
111
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
|
112
|
+
|
113
|
+
// 12.2.5.25 Script data escaped end tag name state
|
114
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
115
|
+
GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
|
116
|
+
|
117
|
+
// 12.2.5.26 Script data double escape start state
|
118
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
119
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
|
120
|
+
|
121
|
+
// 12.2.5.27 Script data double escaped state
|
122
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
|
123
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
|
124
|
+
|
125
|
+
// 12.2.5.28 Script data double escaped dash state
|
126
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
|
127
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
|
128
|
+
|
129
|
+
// 12.2.5.29 Script data double escaped dash dash state
|
130
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
131
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
|
132
|
+
|
133
|
+
// 12.2.5.30 Script data double escaped less-than sign state
|
134
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
135
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
|
136
|
+
|
137
|
+
// 12.2.5.31 Script data double escape end state (XXX: spec bug with the
|
138
|
+
// name?)
|
139
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
140
|
+
GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
|
141
|
+
|
142
|
+
// 12.2.5.32 Before attribute name state
|
143
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
50
144
|
GUMBO_LEX_BEFORE_ATTR_NAME,
|
145
|
+
|
146
|
+
// 12.2.5.33 Attribute name state
|
147
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
51
148
|
GUMBO_LEX_ATTR_NAME,
|
149
|
+
|
150
|
+
// 12.2.5.34 After attribute name state
|
151
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
52
152
|
GUMBO_LEX_AFTER_ATTR_NAME,
|
153
|
+
|
154
|
+
// 12.2.5.35 Before attribute value state
|
155
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
|
53
156
|
GUMBO_LEX_BEFORE_ATTR_VALUE,
|
157
|
+
|
158
|
+
// 12.2.5.36 Attribute value (double-quoted) state
|
159
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
|
54
160
|
GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
|
161
|
+
|
162
|
+
// 12.2.5.37 Attribute value (single-quoted) state
|
163
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
|
55
164
|
GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
|
165
|
+
|
166
|
+
// 12.2.5.38 Attribute value (unquoted) state
|
167
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
|
56
168
|
GUMBO_LEX_ATTR_VALUE_UNQUOTED,
|
57
|
-
|
169
|
+
|
170
|
+
// 12.2.5.39 After attribute value (quoted) state
|
171
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
|
58
172
|
GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
|
173
|
+
|
174
|
+
// 12.2.5.40 Self-closing start tag state
|
175
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
|
59
176
|
GUMBO_LEX_SELF_CLOSING_START_TAG,
|
177
|
+
|
178
|
+
// 12.2.5.41 Bogus comment state
|
179
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
60
180
|
GUMBO_LEX_BOGUS_COMMENT,
|
61
|
-
|
181
|
+
|
182
|
+
// 12.2.5.42 Markup declaration open state
|
183
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
184
|
+
GUMBO_LEX_MARKUP_DECLARATION_OPEN,
|
185
|
+
|
186
|
+
// 12.2.5.43 Comment start state
|
187
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
62
188
|
GUMBO_LEX_COMMENT_START,
|
189
|
+
|
190
|
+
// 12.2.5.44 Comment start dash state
|
191
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
|
63
192
|
GUMBO_LEX_COMMENT_START_DASH,
|
193
|
+
|
194
|
+
// 12.2.5.45 Comment state
|
195
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-state
|
64
196
|
GUMBO_LEX_COMMENT,
|
197
|
+
|
198
|
+
// 12.2.5.46 Comment less-than sign state
|
199
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
|
200
|
+
GUMBO_LEX_COMMENT_LT,
|
201
|
+
|
202
|
+
// 12.2.5.47 Comment less-than sign bang state
|
203
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
|
204
|
+
GUMBO_LEX_COMMENT_LT_BANG,
|
205
|
+
|
206
|
+
// 12.2.5.48 Comment less-than sign bang dash state
|
207
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
|
208
|
+
GUMBO_LEX_COMMENT_LT_BANG_DASH,
|
209
|
+
|
210
|
+
// 12.2.5.49 Comment less-than sign bang dash dash state
|
211
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
|
212
|
+
GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
|
213
|
+
|
214
|
+
// 12.2.5.50 Comment end dash state
|
215
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
|
65
216
|
GUMBO_LEX_COMMENT_END_DASH,
|
217
|
+
|
218
|
+
// 12.2.5.51 Comment end state
|
219
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
|
66
220
|
GUMBO_LEX_COMMENT_END,
|
221
|
+
|
222
|
+
// 12.2.5.52 Comment end bang state
|
223
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
|
67
224
|
GUMBO_LEX_COMMENT_END_BANG,
|
225
|
+
|
226
|
+
// 12.2.5.53 DOCTYPE state
|
227
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
|
68
228
|
GUMBO_LEX_DOCTYPE,
|
229
|
+
|
230
|
+
// 12.2.5.54 Before DOCTYPE name state
|
231
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
|
69
232
|
GUMBO_LEX_BEFORE_DOCTYPE_NAME,
|
233
|
+
|
234
|
+
// 12.2.5.55 DOCTYPE name state
|
235
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
|
70
236
|
GUMBO_LEX_DOCTYPE_NAME,
|
237
|
+
|
238
|
+
// 12.2.5.56 After DOCTYPE name state
|
239
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
|
71
240
|
GUMBO_LEX_AFTER_DOCTYPE_NAME,
|
241
|
+
|
242
|
+
// 12.2.5.57 After DOCTYPE public keyword state
|
243
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
|
72
244
|
GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
|
245
|
+
|
246
|
+
// 12.2.5.58 Before DOCTYPE public identifier state
|
247
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
|
73
248
|
GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
|
249
|
+
|
250
|
+
// 12.2.5.59 DOCTYPE public identifier (double-quoted) state
|
251
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
|
74
252
|
GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
|
253
|
+
|
254
|
+
// 12.2.5.60 DOCTYPE public identifier (single-quoted) state
|
255
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
|
75
256
|
GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
|
257
|
+
|
258
|
+
// 12.2.5.61 After DOCTYPE public identifier state
|
259
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
|
76
260
|
GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
|
261
|
+
|
262
|
+
// 12.2.5.62 Between DOCTYPE public and system identifiers state
|
263
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
|
77
264
|
GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
|
265
|
+
|
266
|
+
// 12.2.5.63 After DOCTYPE system keyword state
|
267
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
|
78
268
|
GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
|
269
|
+
|
270
|
+
// 12.2.5.64 Before DOCTYPE system identifier state
|
271
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
|
79
272
|
GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
|
273
|
+
|
274
|
+
// 12.2.5.65 DOCTYPE system identifier (double-quoted) state
|
275
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
|
80
276
|
GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
|
277
|
+
|
278
|
+
// 12.2.5.66 DOCTYPE system identifier (single-quoted) state
|
279
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
|
81
280
|
GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
|
281
|
+
|
282
|
+
// 12.2.5.67 After DOCTYPE system identifier state
|
283
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
|
82
284
|
GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
|
285
|
+
|
286
|
+
// 12.2.5.68 Bogus DOCTYPE state
|
287
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
|
83
288
|
GUMBO_LEX_BOGUS_DOCTYPE,
|
84
|
-
|
289
|
+
|
290
|
+
// 12.2.5.69 CDATA section state
|
291
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
292
|
+
GUMBO_LEX_CDATA_SECTION,
|
293
|
+
|
294
|
+
// 12.2.5.70 CDATA section bracket state
|
295
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
|
296
|
+
GUMBO_LEX_CDATA_SECTION_BRACKET,
|
297
|
+
|
298
|
+
// 12.2.5.71 CDATA section end state
|
299
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
|
300
|
+
GUMBO_LEX_CDATA_SECTION_END,
|
301
|
+
|
302
|
+
// 12.2.5.72 Character reference state
|
303
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
|
304
|
+
GUMBO_LEX_CHARACTER_REFERENCE,
|
305
|
+
|
306
|
+
// 12.2.5.73 Named character reference state
|
307
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
308
|
+
GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
|
309
|
+
|
310
|
+
// 12.2.5.74 Ambiguous ampersand state
|
311
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
|
312
|
+
GUMBO_LEX_AMBIGUOUS_AMPERSAND,
|
313
|
+
|
314
|
+
// 12.2.5.75 Numeric character reference state
|
315
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
|
316
|
+
GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
|
317
|
+
|
318
|
+
// 12.2.5.76 Hexadecimal character reference start state
|
319
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
|
320
|
+
GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
|
321
|
+
|
322
|
+
// 12.2.5.77 Decimal character reference start state
|
323
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
|
324
|
+
GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
|
325
|
+
|
326
|
+
// 12.2.5.78 Hexadecimal character reference state
|
327
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
|
328
|
+
GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
|
329
|
+
|
330
|
+
// 12.2.5.79 Decimal character reference state
|
331
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
|
332
|
+
GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
|
333
|
+
|
334
|
+
// 12.2.5.80 Numeric character reference end state
|
335
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
|
336
|
+
GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
|
85
337
|
} GumboTokenizerEnum;
|
86
338
|
|
87
339
|
#endif // GUMBO_TOKENIZER_STATES_H_
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -27,8 +27,6 @@
|
|
27
27
|
#include "ascii.h"
|
28
28
|
#include "vector.h"
|
29
29
|
|
30
|
-
const int kUtf8ReplacementChar = 0xFFFD;
|
31
|
-
|
32
30
|
// References:
|
33
31
|
// * https://tools.ietf.org/html/rfc3629
|
34
32
|
// * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
|
@@ -100,16 +98,9 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
|
|
100
98
|
}
|
101
99
|
error->type = type;
|
102
100
|
error->position = iter->_pos;
|
103
|
-
error->original_text = iter->_start;
|
104
|
-
|
105
|
-
|
106
|
-
// yet (and can't be, because it's invalid), so we need to build up the raw
|
107
|
-
// hex value from the bytes under the cursor.
|
108
|
-
uint32_t code_point = 0;
|
109
|
-
for (size_t i = 0; i < iter->_width; ++i) {
|
110
|
-
code_point = (code_point << 8) | (unsigned char) iter->_start[i];
|
111
|
-
}
|
112
|
-
error->v.codepoint = code_point;
|
101
|
+
error->original_text.data = iter->_start;
|
102
|
+
error->original_text.length = iter->_width;
|
103
|
+
error->v.tokenizer.codepoint = iter->_current;
|
113
104
|
}
|
114
105
|
|
115
106
|
// Reads the next UTF-8 character in the iter.
|
@@ -147,13 +138,15 @@ static void read_char(Utf8Iterator* iter) {
|
|
147
138
|
}
|
148
139
|
code_point = '\n';
|
149
140
|
}
|
150
|
-
if (utf8_is_invalid_code_point(code_point)) {
|
151
|
-
// Invalid code points are errors, but they are not replaced by
|
152
|
-
// U+FFFD.
|
153
|
-
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
|
154
|
-
add_error(iter, GUMBO_ERR_UTF8_INVALID);
|
155
|
-
}
|
156
141
|
iter->_current = code_point;
|
142
|
+
if (utf8_is_surrogate(code_point)) {
|
143
|
+
add_error(iter, GUMBO_ERR_SURROGATE_IN_INPUT_STREAM);
|
144
|
+
} else if (utf8_is_noncharacter(code_point)) {
|
145
|
+
add_error(iter, GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM);
|
146
|
+
} else if (utf8_is_control(code_point)
|
147
|
+
&& !(gumbo_ascii_isspace(code_point) || code_point == 0)) {
|
148
|
+
add_error(iter, GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM);
|
149
|
+
}
|
157
150
|
return;
|
158
151
|
} else if (state == UTF8_REJECT) {
|
159
152
|
// We don't want to consume the invalid continuation byte of a multi-byte
|
@@ -169,8 +162,8 @@ static void read_char(Utf8Iterator* iter) {
|
|
169
162
|
// rest of the iterator, and emit a replacement character. The next time we
|
170
163
|
// enter this method, it will detect that there's no input to consume and
|
171
164
|
// output an EOF.
|
172
|
-
iter->_current = kUtf8ReplacementChar;
|
173
165
|
iter->_width = iter->_end - iter->_start;
|
166
|
+
iter->_current = kUtf8ReplacementChar;
|
174
167
|
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
175
168
|
}
|
176
169
|
|
@@ -187,20 +180,6 @@ static void update_position(Utf8Iterator* iter) {
|
|
187
180
|
}
|
188
181
|
}
|
189
182
|
|
190
|
-
// Returns true if this Unicode code point is in the list of characters
|
191
|
-
// forbidden by the HTML5 spec, such as undefined control chars.
|
192
|
-
bool utf8_is_invalid_code_point(int c) {
|
193
|
-
return
|
194
|
-
(c >= 0x1 && c <= 0x8)
|
195
|
-
|| c == 0xB
|
196
|
-
|| (c >= 0xE && c <= 0x1F)
|
197
|
-
|| (c >= 0x7F && c <= 0x9F)
|
198
|
-
|| (c >= 0xFDD0 && c <= 0xFDEF)
|
199
|
-
|| ((c & 0xFFFF) == 0xFFFE)
|
200
|
-
|| ((c & 0xFFFF) == 0xFFFF)
|
201
|
-
;
|
202
|
-
}
|
203
|
-
|
204
183
|
void utf8iterator_init (
|
205
184
|
GumboParser* parser,
|
206
185
|
const char* source,
|
@@ -224,25 +203,6 @@ void utf8iterator_next(Utf8Iterator* iter) {
|
|
224
203
|
read_char(iter);
|
225
204
|
}
|
226
205
|
|
227
|
-
int utf8iterator_current(const Utf8Iterator* iter) {
|
228
|
-
return iter->_current;
|
229
|
-
}
|
230
|
-
|
231
|
-
void utf8iterator_get_position (
|
232
|
-
const Utf8Iterator* iter,
|
233
|
-
GumboSourcePosition* output
|
234
|
-
) {
|
235
|
-
*output = iter->_pos;
|
236
|
-
}
|
237
|
-
|
238
|
-
const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
|
239
|
-
return iter->_start;
|
240
|
-
}
|
241
|
-
|
242
|
-
const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
243
|
-
return iter->_end;
|
244
|
-
}
|
245
|
-
|
246
206
|
bool utf8iterator_maybe_consume_match (
|
247
207
|
Utf8Iterator* iter,
|
248
208
|
const char* prefix,
|
@@ -278,10 +238,3 @@ void utf8iterator_reset(Utf8Iterator* iter) {
|
|
278
238
|
iter->_pos = iter->_mark_pos;
|
279
239
|
read_char(iter);
|
280
240
|
}
|
281
|
-
|
282
|
-
// Sets the position and original text fields of an error to the value at the
|
283
|
-
// mark.
|
284
|
-
void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
|
285
|
-
error->position = iter->_mark_pos;
|
286
|
-
error->original_text = iter->_mark;
|
287
|
-
}
|