nokogumbo 2.0.0.pre.alpha → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -44,7 +44,6 @@ typedef struct GumboInternalTokenEndTag {
44
44
  GumboTag tag;
45
45
  // NULL unless tag is GUMBO_TAG_UNKNOWN
46
46
  char *name;
47
- bool is_self_closing;
48
47
  } GumboTokenEndTag;
49
48
 
50
49
  // A data structure representing a single token in the input stream. This
@@ -83,11 +82,12 @@ void gumbo_tokenizer_set_state (
83
82
  GumboTokenizerEnum state
84
83
  );
85
84
 
86
- // Flags whether the current node is a foreign content element. This is
87
- // necessary for the markup declaration open state, where the tokenizer must be
88
- // aware of the state of the parser to properly tokenize bad comment tags.
85
+ // Flags whether the adjusted current node is a foreign content element. This
86
+ // is necessary for the markup declaration open state, where the tokenizer
87
+ // must be aware of the state of the parser to properly tokenize bad comment
88
+ // tags.
89
89
  // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
90
- void gumbo_tokenizer_set_is_current_node_foreign (
90
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
91
91
  struct GumboInternalParser* parser,
92
92
  bool is_foreign
93
93
  );
@@ -14,74 +14,326 @@
14
14
  // The ordering of this enum is also used to build the dispatch table for the
15
15
  // tokenizer state machine, so if it is changed, be sure to update that too.
16
16
  typedef enum {
17
+ // 12.2.5.1 Data state
18
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
17
19
  GUMBO_LEX_DATA,
18
- GUMBO_LEX_CHAR_REF_IN_DATA,
20
+
21
+ // 12.2.5.2 RCDATA state
22
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
19
23
  GUMBO_LEX_RCDATA,
20
- GUMBO_LEX_CHAR_REF_IN_RCDATA,
24
+
25
+ // 12.2.5.3 RAWTEXT state
26
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
21
27
  GUMBO_LEX_RAWTEXT,
22
- GUMBO_LEX_SCRIPT,
28
+
29
+ // 12.2.5.4 Script data state
30
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
31
+ GUMBO_LEX_SCRIPT_DATA,
32
+
33
+ // 12.2.5.5 PLAINTEXT state
34
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
23
35
  GUMBO_LEX_PLAINTEXT,
36
+
37
+ // 12.2.5.6 Tag open state
38
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
24
39
  GUMBO_LEX_TAG_OPEN,
40
+
41
+ // 12.2.5.7 End tag open state
42
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
25
43
  GUMBO_LEX_END_TAG_OPEN,
44
+
45
+ // 12.2.5.8 Tag name state
46
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
26
47
  GUMBO_LEX_TAG_NAME,
48
+
49
+ // 12.2.5.9 RCDATA less-than sign state
50
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
27
51
  GUMBO_LEX_RCDATA_LT,
52
+
53
+ // 12.2.5.10 RCDATA end tag open state
54
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
28
55
  GUMBO_LEX_RCDATA_END_TAG_OPEN,
56
+
57
+ // 12.2.5.11 RCDATA end tag name state
58
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
29
59
  GUMBO_LEX_RCDATA_END_TAG_NAME,
60
+
61
+ // 12.2.5.12 RAWTEXT less-than sign state
62
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
30
63
  GUMBO_LEX_RAWTEXT_LT,
64
+
65
+ // 12.2.5.13 RAWTEXT end tag open state
66
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
31
67
  GUMBO_LEX_RAWTEXT_END_TAG_OPEN,
68
+
69
+ // 12.2.5.14 RAWTEXT end tag name state
70
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
32
71
  GUMBO_LEX_RAWTEXT_END_TAG_NAME,
33
- GUMBO_LEX_SCRIPT_LT,
34
- GUMBO_LEX_SCRIPT_END_TAG_OPEN,
35
- GUMBO_LEX_SCRIPT_END_TAG_NAME,
36
- GUMBO_LEX_SCRIPT_ESCAPED_START,
37
- GUMBO_LEX_SCRIPT_ESCAPED_START_DASH,
38
- GUMBO_LEX_SCRIPT_ESCAPED,
39
- GUMBO_LEX_SCRIPT_ESCAPED_DASH,
40
- GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH,
41
- GUMBO_LEX_SCRIPT_ESCAPED_LT,
42
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_OPEN,
43
- GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME,
44
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_START,
45
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED,
46
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH,
47
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH_DASH,
48
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_LT,
49
- GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END,
72
+
73
+ // 12.2.5.15 Script data less-than sign state
74
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
75
+ GUMBO_LEX_SCRIPT_DATA_LT,
76
+
77
+ // 12.2.5.16 Script data end tag open state
78
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
79
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN,
80
+
81
+ // 12.2.5.17 Script data end tag name state
82
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
83
+ GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME,
84
+
85
+ // 12.2.5.18 Script data escape start state
86
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
87
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START,
88
+
89
+ // 12.2.5.19 Script data escape start dash state
90
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
91
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH,
92
+
93
+ // 12.2.5.20 Script data escaped state
94
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
95
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED,
96
+
97
+ // 12.2.5.21 Script data escaped dash state
98
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
99
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH,
100
+
101
+ // 12.2.5.22 Script data escaped dash dash state
102
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
103
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH,
104
+
105
+ // 12.2.5.23 Script data escaped less-than sign state
106
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
107
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT,
108
+
109
+ // 12.2.5.24 Script data escaped end tag open state
110
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
111
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN,
112
+
113
+ // 12.2.5.25 Script data escaped end tag name state
114
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
115
+ GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME,
116
+
117
+ // 12.2.5.26 Script data double escape start state
118
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
119
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START,
120
+
121
+ // 12.2.5.27 Script data double escaped state
122
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
123
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED,
124
+
125
+ // 12.2.5.28 Script data double escaped dash state
126
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
127
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH,
128
+
129
+ // 12.2.5.29 Script data double escaped dash dash state
130
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
131
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH,
132
+
133
+ // 12.2.5.30 Script data double escaped less-than sign state
134
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
135
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT,
136
+
137
+ // 12.2.5.31 Script data double escape end state (XXX: spec bug with the
138
+ // name?)
139
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
140
+ GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END,
141
+
142
+ // 12.2.5.32 Before attribute name state
143
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
50
144
  GUMBO_LEX_BEFORE_ATTR_NAME,
145
+
146
+ // 12.2.5.33 Attribute name state
147
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
51
148
  GUMBO_LEX_ATTR_NAME,
149
+
150
+ // 12.2.5.34 After attribute name state
151
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
52
152
  GUMBO_LEX_AFTER_ATTR_NAME,
153
+
154
+ // 12.2.5.35 Before attribute value state
155
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
53
156
  GUMBO_LEX_BEFORE_ATTR_VALUE,
157
+
158
+ // 12.2.5.36 Attribute value (double-quoted) state
159
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
54
160
  GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED,
161
+
162
+ // 12.2.5.37 Attribute value (single-quoted) state
163
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
55
164
  GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED,
165
+
166
+ // 12.2.5.38 Attribute value (unquoted) state
167
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
56
168
  GUMBO_LEX_ATTR_VALUE_UNQUOTED,
57
- GUMBO_LEX_CHAR_REF_IN_ATTR_VALUE,
169
+
170
+ // 12.2.5.39 After attribute value (quoted) state
171
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
58
172
  GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED,
173
+
174
+ // 12.2.5.40 Self-closing start tag state
175
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
59
176
  GUMBO_LEX_SELF_CLOSING_START_TAG,
177
+
178
+ // 12.2.5.41 Bogus comment state
179
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
60
180
  GUMBO_LEX_BOGUS_COMMENT,
61
- GUMBO_LEX_MARKUP_DECLARATION,
181
+
182
+ // 12.2.5.42 Markup declaration open state
183
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
184
+ GUMBO_LEX_MARKUP_DECLARATION_OPEN,
185
+
186
+ // 12.2.5.43 Comment start state
187
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
62
188
  GUMBO_LEX_COMMENT_START,
189
+
190
+ // 12.2.5.44 Comment start dash state
191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
63
192
  GUMBO_LEX_COMMENT_START_DASH,
193
+
194
+ // 12.2.5.45 Comment state
195
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
64
196
  GUMBO_LEX_COMMENT,
197
+
198
+ // 12.2.5.46 Comment less-than sign state
199
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
200
+ GUMBO_LEX_COMMENT_LT,
201
+
202
+ // 12.2.5.47 Comment less-than sign bang state
203
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
204
+ GUMBO_LEX_COMMENT_LT_BANG,
205
+
206
+ // 12.2.5.48 Comment less-than sign bang dash state
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
208
+ GUMBO_LEX_COMMENT_LT_BANG_DASH,
209
+
210
+ // 12.2.5.49 Comment less-than sign bang dash dash state
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
212
+ GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH,
213
+
214
+ // 12.2.5.50 Comment end dash state
215
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
65
216
  GUMBO_LEX_COMMENT_END_DASH,
217
+
218
+ // 12.2.5.51 Comment end state
219
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
66
220
  GUMBO_LEX_COMMENT_END,
221
+
222
+ // 12.2.5.52 Comment end bang state
223
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
67
224
  GUMBO_LEX_COMMENT_END_BANG,
225
+
226
+ // 12.2.5.53 DOCTYPE state
227
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
68
228
  GUMBO_LEX_DOCTYPE,
229
+
230
+ // 12.2.5.54 Before DOCTYPE name state
231
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
69
232
  GUMBO_LEX_BEFORE_DOCTYPE_NAME,
233
+
234
+ // 12.2.5.55 DOCTYPE name state
235
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
70
236
  GUMBO_LEX_DOCTYPE_NAME,
237
+
238
+ // 12.2.5.56 After DOCTYPE name state
239
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
71
240
  GUMBO_LEX_AFTER_DOCTYPE_NAME,
241
+
242
+ // 12.2.5.57 After DOCTYPE public keyword state
243
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
72
244
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD,
245
+
246
+ // 12.2.5.58 Before DOCTYPE public identifier state
247
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
73
248
  GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID,
249
+
250
+ // 12.2.5.59 DOCTYPE public identifier (double-quoted) state
251
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
74
252
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED,
253
+
254
+ // 12.2.5.60 DOCTYPE public identifier (single-quoted) state
255
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
75
256
  GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED,
257
+
258
+ // 12.2.5.61 After DOCTYPE public identifier state
259
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
76
260
  GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID,
261
+
262
+ // 12.2.5.62 Between DOCTYPE public and system identifiers state
263
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
77
264
  GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID,
265
+
266
+ // 12.2.5.63 After DOCTYPE system keyword state
267
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
78
268
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD,
269
+
270
+ // 12.2.5.64 Before DOCTYPE system identifier state
271
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
79
272
  GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID,
273
+
274
+ // 12.2.5.65 DOCTYPE system identifier (double-quoted) state
275
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
80
276
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED,
277
+
278
+ // 12.2.5.66 DOCTYPE system identifier (single-quoted) state
279
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
81
280
  GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED,
281
+
282
+ // 12.2.5.67 After DOCTYPE system identifier state
283
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
82
284
  GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID,
285
+
286
+ // 12.2.5.68 Bogus DOCTYPE state
287
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
83
288
  GUMBO_LEX_BOGUS_DOCTYPE,
84
- GUMBO_LEX_CDATA
289
+
290
+ // 12.2.5.69 CDATA section state
291
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
292
+ GUMBO_LEX_CDATA_SECTION,
293
+
294
+ // 12.2.5.70 CDATA section bracket state
295
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
296
+ GUMBO_LEX_CDATA_SECTION_BRACKET,
297
+
298
+ // 12.2.5.71 CDATA section end state
299
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
300
+ GUMBO_LEX_CDATA_SECTION_END,
301
+
302
+ // 12.2.5.72 Character reference state
303
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
304
+ GUMBO_LEX_CHARACTER_REFERENCE,
305
+
306
+ // 12.2.5.73 Named character reference state
307
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
308
+ GUMBO_LEX_NAMED_CHARACTER_REFERENCE,
309
+
310
+ // 12.2.5.74 Ambiguous ampersand state
311
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
312
+ GUMBO_LEX_AMBIGUOUS_AMPERSAND,
313
+
314
+ // 12.2.5.75 Numeric character reference state
315
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
316
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE,
317
+
318
+ // 12.2.5.76 Hexadecimal character reference start state
319
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
320
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START,
321
+
322
+ // 12.2.5.77 Decimal character reference start state
323
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
324
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START,
325
+
326
+ // 12.2.5.78 Hexadecimal character reference state
327
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
328
+ GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE,
329
+
330
+ // 12.2.5.79 Decimal character reference state
331
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
332
+ GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE,
333
+
334
+ // 12.2.5.80 Numeric character reference end state
335
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
336
+ GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END
85
337
  } GumboTokenizerEnum;
86
338
 
87
339
  #endif // GUMBO_TOKENIZER_STATES_H_
@@ -27,8 +27,6 @@
27
27
  #include "ascii.h"
28
28
  #include "vector.h"
29
29
 
30
- const int kUtf8ReplacementChar = 0xFFFD;
31
-
32
30
  // References:
33
31
  // * https://tools.ietf.org/html/rfc3629
34
32
  // * https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
@@ -100,16 +98,9 @@ static void add_error(Utf8Iterator* iter, GumboErrorType type) {
100
98
  }
101
99
  error->type = type;
102
100
  error->position = iter->_pos;
103
- error->original_text = iter->_start;
104
-
105
- // At the point the error is recorded, the code point hasn't been computed
106
- // yet (and can't be, because it's invalid), so we need to build up the raw
107
- // hex value from the bytes under the cursor.
108
- uint32_t code_point = 0;
109
- for (size_t i = 0; i < iter->_width; ++i) {
110
- code_point = (code_point << 8) | (unsigned char) iter->_start[i];
111
- }
112
- error->v.codepoint = code_point;
101
+ error->original_text.data = iter->_start;
102
+ error->original_text.length = iter->_width;
103
+ error->v.tokenizer.codepoint = iter->_current;
113
104
  }
114
105
 
115
106
  // Reads the next UTF-8 character in the iter.
@@ -147,13 +138,15 @@ static void read_char(Utf8Iterator* iter) {
147
138
  }
148
139
  code_point = '\n';
149
140
  }
150
- if (utf8_is_invalid_code_point(code_point)) {
151
- // Invalid code points are errors, but they are not replaced by
152
- // U+FFFD.
153
- // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
154
- add_error(iter, GUMBO_ERR_UTF8_INVALID);
155
- }
156
141
  iter->_current = code_point;
142
+ if (utf8_is_surrogate(code_point)) {
143
+ add_error(iter, GUMBO_ERR_SURROGATE_IN_INPUT_STREAM);
144
+ } else if (utf8_is_noncharacter(code_point)) {
145
+ add_error(iter, GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM);
146
+ } else if (utf8_is_control(code_point)
147
+ && !(gumbo_ascii_isspace(code_point) || code_point == 0)) {
148
+ add_error(iter, GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM);
149
+ }
157
150
  return;
158
151
  } else if (state == UTF8_REJECT) {
159
152
  // We don't want to consume the invalid continuation byte of a multi-byte
@@ -169,8 +162,8 @@ static void read_char(Utf8Iterator* iter) {
169
162
  // rest of the iterator, and emit a replacement character. The next time we
170
163
  // enter this method, it will detect that there's no input to consume and
171
164
  // output an EOF.
172
- iter->_current = kUtf8ReplacementChar;
173
165
  iter->_width = iter->_end - iter->_start;
166
+ iter->_current = kUtf8ReplacementChar;
174
167
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
175
168
  }
176
169
 
@@ -187,20 +180,6 @@ static void update_position(Utf8Iterator* iter) {
187
180
  }
188
181
  }
189
182
 
190
- // Returns true if this Unicode code point is in the list of characters
191
- // forbidden by the HTML5 spec, such as undefined control chars.
192
- bool utf8_is_invalid_code_point(int c) {
193
- return
194
- (c >= 0x1 && c <= 0x8)
195
- || c == 0xB
196
- || (c >= 0xE && c <= 0x1F)
197
- || (c >= 0x7F && c <= 0x9F)
198
- || (c >= 0xFDD0 && c <= 0xFDEF)
199
- || ((c & 0xFFFF) == 0xFFFE)
200
- || ((c & 0xFFFF) == 0xFFFF)
201
- ;
202
- }
203
-
204
183
  void utf8iterator_init (
205
184
  GumboParser* parser,
206
185
  const char* source,
@@ -224,25 +203,6 @@ void utf8iterator_next(Utf8Iterator* iter) {
224
203
  read_char(iter);
225
204
  }
226
205
 
227
- int utf8iterator_current(const Utf8Iterator* iter) {
228
- return iter->_current;
229
- }
230
-
231
- void utf8iterator_get_position (
232
- const Utf8Iterator* iter,
233
- GumboSourcePosition* output
234
- ) {
235
- *output = iter->_pos;
236
- }
237
-
238
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
239
- return iter->_start;
240
- }
241
-
242
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
243
- return iter->_end;
244
- }
245
-
246
206
  bool utf8iterator_maybe_consume_match (
247
207
  Utf8Iterator* iter,
248
208
  const char* prefix,
@@ -278,10 +238,3 @@ void utf8iterator_reset(Utf8Iterator* iter) {
278
238
  iter->_pos = iter->_mark_pos;
279
239
  read_char(iter);
280
240
  }
281
-
282
- // Sets the position and original text fields of an error to the value at the
283
- // mark.
284
- void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
285
- error->position = iter->_mark_pos;
286
- error->original_text = iter->_mark;
287
- }