nokogumbo 2.0.0.pre.alpha → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,7 +44,6 @@ typedef struct GumboInternalTokenEndTag {
44
44
  GumboTag tag;
45
45
  // NULL unless tag is GUMBO_TAG_UNKNOWN
46
46
  char *name;
47
- bool is_self_closing;
48
47
  } GumboTokenEndTag;
49
48
 
50
49
  // A data structure representing a single token in the input stream. This
@@ -83,11 +82,12 @@ void gumbo_tokenizer_set_state (
83
82
  GumboTokenizerEnum state
84
83
  );
85
84
 
86
- // Flags whether the current node is a foreign content element. This is
87
- // necessary for the markup declaration open state, where the tokenizer must be
88
- // aware of the state of the parser to properly tokenize bad comment tags.
85
+ // Flags whether the adjusted current node is a foreign content element. This
86
+ // is necessary for the markup declaration open state, where the tokenizer
87
+ // must be aware of the state of the parser to properly tokenize bad comment
88
+ // tags.
89
89
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
- void gumbo_tokenizer_set_is_current_node_foreign (
90
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
91
91
  struct GumboInternalParser* parser,
92
92
  bool is_foreign
93
93
  );
@@ -14,74 +14,326 @@
14
14
  // The ordering of this enum is also used to build the dispatch table for the
15
15
  // tokenizer state machine, so if it is changed, be sure to update that too.
16
16
  typedef enum {
17
+ // 12.2.5.1 Data state
18
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
17
19
  GUMBO_LEX_DATA,
18
- GUMBO_LEX_CHAR_REF_IN_DATA,
20
+
21
+ // 12.2.5.2 RCDATA state
22
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
19
23
  GUMBO_LEX_RCDATA,
20
- GUMBO_LEX_CHAR_REF_IN_RCDATA,
24
+
25
+ // 12.2.5.3 RAWTEXT state
26
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
21
27
  GUMBO_LEX_RAWTEXT,
22
- GUMBO_LEX_SCRIPT,
28
+
29
+ // 12.2.5.4 Script data state
30
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
31
+ GUMBO_LEX_SCRIPT_DATA,
32
+
33
+ // 12.2.5.5 PLAINTEXT state
34
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
23
35
  GUMBO_LEX_PLAINTEXT,
36
+
37
+ // 12.2.5.6 Tag open state
38
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
24
39
  GUMBO_LEX_TAG_OPEN,
40
+
41
+ // 12.2.5.7 End tag open state
42
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
25
43
  GUMBO_LEX_END_TAG_OPEN,
44
+
45
+ // 12.2.5.8 Tag name state
46
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
26
47
  GUMBO_LEX_TAG_NAME,
48
+
49
+ // 12.2.5.9 RCDATA less-than sign state
50
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
27
51
  GUMBO_LEX_RCDATA_LT,
52
+
53
+ // 12.2.5.10 RCDATA end tag open state
54
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
28
55
  GUMBO_LEX_RCDATA_END_TAG_OPEN,
56
+
57
+ // 12.2.5.11 RCDATA end tag name state
58
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
29
59
  GUMBO_LEX_RCDATA_END_TAG_NAME,
60
+
61
+ // 12.2.5.12 RAWTEXT less-than sign state
62
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
30
63
  GUMBO_LEX_RAWTEXT_LT,
64
+
65
+ // 12.2.5.13 RAWTEXT end tag open state
66
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
31
67
  GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
68
+
69
+ // 12.2.5.14 RAWTEXT end tag name state
70
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
32
71
  GUMBO_LEX_RAWTEXT_END_TAG_NAME,
33
- GUMBO_LEX_SCRIPT_LT,
34
- GUMBO_LEX_SCRIPT_END_TAG_OPEN,
35
- GUMBO_LEX_SCRIPT_END_TAG_NAME,
36
- GUMBO_LEX_SCRIPT_ESCAPED_START,
37
- GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
38
- GUMBO_LEX_SCRIPT_ESCAPED,
39
- GUMBO_LEX_SCRIPT_ESCAPED_DASH,
40
- GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
41
- GUMBO_LEX_SCRIPT_ESCAPED_LT,
42
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
43
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
44
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
45
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
46
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
47
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
48
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
49
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
72
+
73
+ // 12.2.5.15 Script data less-than sign state
74
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
75
+ GUMBO_LEX_SCRIPT_DATA_LT,
76
+
77
+ // 12.2.5.16 Script data end tag open state
78
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
79
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
80
+
81
+ // 12.2.5.17 Script data end tag name state
82
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
83
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
84
+
85
+ // 12.2.5.18 Script data escape start state
86
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
87
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
88
+
89
+ // 12.2.5.19 Script data escape start dash state
90
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
91
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
92
+
93
+ // 12.2.5.20 Script data escaped state
94
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
95
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED,
96
+
97
+ // 12.2.5.21 Script data escaped dash state
98
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
99
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
100
+
101
+ // 12.2.5.22 Script data escaped dash dash state
102
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
103
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
104
+
105
+ // 12.2.5.23 Script data escaped less-than sign state
106
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
107
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
108
+
109
+ // 12.2.5.24 Script data escaped end tag open state
110
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
111
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
112
+
113
+ // 12.2.5.25 Script data escaped end tag name state
114
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
115
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
116
+
117
+ // 12.2.5.26 Script data double escape start state
118
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
119
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
120
+
121
+ // 12.2.5.27 Script data double escaped state
122
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
123
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
124
+
125
+ // 12.2.5.28 Script data double escaped dash state
126
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
127
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
128
+
129
+ // 12.2.5.29 Script data double escaped dash dash state
130
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
131
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
132
+
133
+ // 12.2.5.30 Script data double escaped less-than sign state
134
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
135
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
136
+
137
+ // 12.2.5.31 Script data double escape end state (XXX: spec bug with the
138
+ // name?)
139
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
140
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
141
+
142
+ // 12.2.5.32 Before attribute name state
143
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
50
144
  GUMBO_LEX_BEFORE_ATTR_NAME,
145
+
146
+ // 12.2.5.33 Attribute name state
147
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
51
148
  GUMBO_LEX_ATTR_NAME,
149
+
150
+ // 12.2.5.34 After attribute name state
151
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
52
152
  GUMBO_LEX_AFTER_ATTR_NAME,
153
+
154
+ // 12.2.5.35 Before attribute value state
155
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
53
156
  GUMBO_LEX_BEFORE_ATTR_VALUE,
157
+
158
+ // 12.2.5.36 Attribute value (double-quoted) state
159
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
54
160
  GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
161
+
162
+ // 12.2.5.37 Attribute value (single-quoted) state
163
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
55
164
  GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
165
+
166
+ // 12.2.5.38 Attribute value (unquoted) state
167
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
56
168
  GUMBO_LEX_ATTR_VALUE_UNQUOTED,
57
- GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
169
+
170
+ // 12.2.5.39 After attribute value (quoted) state
171
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
58
172
  GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
173
+
174
+ // 12.2.5.40 Self-closing start tag state
175
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
59
176
  GUMBO_LEX_SELF_CLOSING_START_TAG,
177
+
178
+ // 12.2.5.41 Bogus comment state
179
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
60
180
  GUMBO_LEX_BOGUS_COMMENT,
61
- GUMBO_LEX_MARKUP_DECLARATION,
181
+
182
+ // 12.2.5.42 Markup declaration open state
183
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
184
+ GUMBO_LEX_MARKUP_DECLARATION_OPEN,
185
+
186
+ // 12.2.5.43 Comment start state
187
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
62
188
  GUMBO_LEX_COMMENT_START,
189
+
190
+ // 12.2.5.44 Comment start dash state
191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
63
192
  GUMBO_LEX_COMMENT_START_DASH,
193
+
194
+ // 12.2.5.45 Comment state
195
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
64
196
  GUMBO_LEX_COMMENT,
197
+
198
+ // 12.2.5.46 Comment less-than sign state
199
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
200
+ GUMBO_LEX_COMMENT_LT,
201
+
202
+ // 12.2.5.47 Comment less-than sign bang state
203
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
204
+ GUMBO_LEX_COMMENT_LT_BANG,
205
+
206
+ // 12.2.5.48 Comment less-than sign bang dash state
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
208
+ GUMBO_LEX_COMMENT_LT_BANG_DASH,
209
+
210
+ // 12.2.5.49 Comment less-than sign bang dash dash state
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
212
+ GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
213
+
214
+ // 12.2.5.50 Comment end dash state
215
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
65
216
  GUMBO_LEX_COMMENT_END_DASH,
217
+
218
+ // 12.2.5.51 Comment end state
219
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
66
220
  GUMBO_LEX_COMMENT_END,
221
+
222
+ // 12.2.5.52 Comment end bang state
223
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
67
224
  GUMBO_LEX_COMMENT_END_BANG,
225
+
226
+ // 12.2.5.53 DOCTYPE state
227
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
68
228
  GUMBO_LEX_DOCTYPE,
229
+
230
+ // 12.2.5.54 Before DOCTYPE name state
231
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
69
232
  GUMBO_LEX_BEFORE_DOCTYPE_NAME,
233
+
234
+ // 12.2.5.55 DOCTYPE name state
235
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
70
236
  GUMBO_LEX_DOCTYPE_NAME,
237
+
238
+ // 12.2.5.56 After DOCTYPE name state
239
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
71
240
  GUMBO_LEX_AFTER_DOCTYPE_NAME,
241
+
242
+ // 12.2.5.57 After DOCTYPE public keyword state
243
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
72
244
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
245
+
246
+ // 12.2.5.58 Before DOCTYPE public identifier state
247
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
73
248
  GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
249
+
250
+ // 12.2.5.59 DOCTYPE public identifier (double-quoted) state
251
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
74
252
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
253
+
254
+ // 12.2.5.60 DOCTYPE public identifier (single-quoted) state
255
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
75
256
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
257
+
258
+ // 12.2.5.61 After DOCTYPE public identifier state
259
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
76
260
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
261
+
262
+ // 12.2.5.62 Between DOCTYPE public and system identifiers state
263
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
77
264
  GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
265
+
266
+ // 12.2.5.63 After DOCTYPE system keyword state
267
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
78
268
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
269
+
270
+ // 12.2.5.64 Before DOCTYPE system identifier state
271
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
79
272
  GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
273
+
274
+ // 12.2.5.65 DOCTYPE system identifier (double-quoted) state
275
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
80
276
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
277
+
278
+ // 12.2.5.66 DOCTYPE system identifier (single-quoted) state
279
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
81
280
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
281
+
282
+ // 12.2.5.67 After DOCTYPE system identifier state
283
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
82
284
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
285
+
286
+ // 12.2.5.68 Bogus DOCTYPE state
287
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
83
288
  GUMBO_LEX_BOGUS_DOCTYPE,
84
- GUMBO_LEX_CDATA
289
+
290
+ // 12.2.5.69 CDATA section state
291
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
292
+ GUMBO_LEX_CDATA_SECTION,
293
+
294
+ // 12.2.5.70 CDATA section bracket state
295
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
296
+ GUMBO_LEX_CDATA_SECTION_BRACKET,
297
+
298
+ // 12.2.5.71 CDATA section end state
299
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
300
+ GUMBO_LEX_CDATA_SECTION_END,
301
+
302
+ // 12.2.5.72 Character reference state
303
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
304
+ GUMBO_LEX_CHARACTER_REFERENCE,
305
+
306
+ // 12.2.5.73 Named character reference state
307
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
308
+ GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
309
+
310
+ // 12.2.5.74 Ambiguous ampersand state
311
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
312
+ GUMBO_LEX_AMBIGUOUS_AMPERSAND,
313
+
314
+ // 12.2.5.75 Numeric character reference state
315
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
316
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
317
+
318
+ // 12.2.5.76 Hexadecimal character reference start state
319
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
320
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
321
+
322
+ // 12.2.5.77 Decimal character reference start state
323
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
324
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
325
+
326
+ // 12.2.5.78 Hexadecimal character reference state
327
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
328
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
329
+
330
+ // 12.2.5.79 Decimal character reference state
331
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
332
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
333
+
334
+ // 12.2.5.80 Numeric character reference end state
335
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
336
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
85
337
  } GumboTokenizerEnum;
86
338
 
87
339
  #endif // GUMBO_TOKENIZER_STATES_H_
@@ -27,8 +27,6 @@
27
27
  #include "ascii.h"
28
28
  #include "vector.h"
29
29
 
30
- const int kUtf8ReplacementChar = 0xFFFD;
31
-
32
30
  // References:
33
31
  // * https://tools.ietf.org/html/rfc3629
34
32
  // * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
@@ -100,16 +98,9 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
100
98
  }
101
99
  error->type = type;
102
100
  error->position = iter->_pos;
103
- error->original_text = iter->_start;
104
-
105
- // At the point the error is recorded, the code point hasn't been computed
106
- // yet (and can't be, because it's invalid), so we need to build up the raw
107
- // hex value from the bytes under the cursor.
108
- uint32_t code_point = 0;
109
- for (size_t i = 0; i < iter->_width; ++i) {
110
- code_point = (code_point << 8) | (unsigned char) iter->_start[i];
111
- }
112
- error->v.codepoint = code_point;
101
+ error->original_text.data = iter->_start;
102
+ error->original_text.length = iter->_width;
103
+ error->v.tokenizer.codepoint = iter->_current;
113
104
  }
114
105
 
115
106
  // Reads the next UTF-8 character in the iter.
@@ -147,13 +138,15 @@ static void read_char(Utf8Iterator* iter) {
147
138
  }
148
139
  code_point = '\n';
149
140
  }
150
- if (utf8_is_invalid_code_point(code_point)) {
151
- // Invalid code points are errors, but they are not replaced by
152
- // U+FFFD.
153
- // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
154
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
155
- }
156
141
  iter->_current = code_point;
142
+ if (utf8_is_surrogate(code_point)) {
143
+ add_error(iter, GUMBO_ERR_SURROGATE_IN_INPUT_STREAM);
144
+ } else if (utf8_is_noncharacter(code_point)) {
145
+ add_error(iter, GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM);
146
+ } else if (utf8_is_control(code_point)
147
+ && !(gumbo_ascii_isspace(code_point) || code_point == 0)) {
148
+ add_error(iter, GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM);
149
+ }
157
150
  return;
158
151
  } else if (state == UTF8_REJECT) {
159
152
  // We don't want to consume the invalid continuation byte of a multi-byte
@@ -169,8 +162,8 @@ static void read_char(Utf8Iterator* iter) {
169
162
  // rest of the iterator, and emit a replacement character. The next time we
170
163
  // enter this method, it will detect that there's no input to consume and
171
164
  // output an EOF.
172
- iter->_current = kUtf8ReplacementChar;
173
165
  iter->_width = iter->_end - iter->_start;
166
+ iter->_current = kUtf8ReplacementChar;
174
167
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
175
168
  }
176
169
 
@@ -187,20 +180,6 @@ static void update_position(Utf8Iterator* iter) {
187
180
  }
188
181
  }
189
182
 
190
- // Returns true if this Unicode code point is in the list of characters
191
- // forbidden by the HTML5 spec, such as undefined control chars.
192
- bool utf8_is_invalid_code_point(int c) {
193
- return
194
- (c >= 0x1 && c <= 0x8)
195
- || c == 0xB
196
- || (c >= 0xE && c <= 0x1F)
197
- || (c >= 0x7F && c <= 0x9F)
198
- || (c >= 0xFDD0 && c <= 0xFDEF)
199
- || ((c & 0xFFFF) == 0xFFFE)
200
- || ((c & 0xFFFF) == 0xFFFF)
201
- ;
202
- }
203
-
204
183
  void utf8iterator_init (
205
184
  GumboParser* parser,
206
185
  const char* source,
@@ -224,25 +203,6 @@ void utf8iterator_next(Utf8Iterator* iter) {
224
203
  read_char(iter);
225
204
  }
226
205
 
227
- int utf8iterator_current(const Utf8Iterator* iter) {
228
- return iter->_current;
229
- }
230
-
231
- void utf8iterator_get_position (
232
- const Utf8Iterator* iter,
233
- GumboSourcePosition* output
234
- ) {
235
- *output = iter->_pos;
236
- }
237
-
238
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
239
- return iter->_start;
240
- }
241
-
242
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
243
- return iter->_end;
244
- }
245
-
246
206
  bool utf8iterator_maybe_consume_match (
247
207
  Utf8Iterator* iter,
248
208
  const char* prefix,
@@ -278,10 +238,3 @@ void utf8iterator_reset(Utf8Iterator* iter) {
278
238
  iter->_pos = iter->_mark_pos;
279
239
  read_char(iter);
280
240
  }
281
-
282
- // Sets the position and original text fields of an error to the value at the
283
- // mark.
284
- void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
285
- error->position = iter->_mark_pos;
286
- error->original_text = iter->_mark;
287
- }