nokogumbo 2.0.0.pre.alpha → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +81 -10
- data/ext/nokogumbo/extconf.rb +6 -1
- data/ext/nokogumbo/nokogumbo.c +579 -233
- data/gumbo-parser/src/ascii.c +42 -0
- data/gumbo-parser/src/ascii.h +91 -7
- data/gumbo-parser/src/char_ref.c +5973 -4601
- data/gumbo-parser/src/char_ref.h +13 -28
- data/gumbo-parser/src/error.c +376 -120
- data/gumbo-parser/src/error.h +63 -125
- data/gumbo-parser/src/gumbo.h +47 -4
- data/gumbo-parser/src/parser.c +849 -709
- data/gumbo-parser/src/string_buffer.c +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/tokenizer.c +1426 -1261
- data/gumbo-parser/src/tokenizer.h +5 -5
- data/gumbo-parser/src/tokenizer_states.h +275 -23
- data/gumbo-parser/src/utf8.c +12 -59
- data/gumbo-parser/src/utf8.h +51 -16
- data/lib/nokogumbo.rb +0 -1
- data/lib/nokogumbo/html5.rb +2 -1
- data/lib/nokogumbo/html5/document.rb +12 -1
- data/lib/nokogumbo/html5/document_fragment.rb +35 -20
- data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
- data/lib/nokogumbo/version.rb +1 -1
- metadata +16 -9
- data/CHANGELOG.md +0 -56
data/gumbo-parser/src/char_ref.h
CHANGED
@@ -1,40 +1,25 @@
|
|
1
1
|
#ifndef GUMBO_CHAR_REF_H_
|
2
2
|
#define GUMBO_CHAR_REF_H_
|
3
3
|
|
4
|
-
#include <
|
4
|
+
#include <stdlib.h>
|
5
5
|
|
6
6
|
#ifdef __cplusplus
|
7
7
|
extern "C" {
|
8
8
|
#endif
|
9
9
|
|
10
|
-
struct GumboInternalParser;
|
11
|
-
struct GumboInternalUtf8Iterator;
|
12
|
-
|
13
10
|
// Value that indicates no character was produced.
|
14
|
-
|
15
|
-
|
16
|
-
//
|
17
|
-
//
|
18
|
-
//
|
19
|
-
//
|
20
|
-
//
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
// Implements the "consume a character reference" section of the spec.
|
27
|
-
// This reads in characters from the input as necessary, and fills in a
|
28
|
-
// OneOrTwoCodepoints struct containing the characters read. It may add parse
|
29
|
-
// errors to the GumboParser's errors vector, if the spec calls for it. Pass a
|
30
|
-
// space for the "additional allowed char" when the spec says "with no
|
31
|
-
// additional allowed char". Returns false on parse error, true otherwise.
|
32
|
-
bool gumbo_consume_char_ref (
|
33
|
-
struct GumboInternalParser* parser,
|
34
|
-
struct GumboInternalUtf8Iterator* input,
|
35
|
-
int additional_allowed_char,
|
36
|
-
bool is_in_attribute,
|
37
|
-
OneOrTwoCodepoints* output
|
11
|
+
#define kGumboNoChar (-1)
|
12
|
+
|
13
|
+
// On input, str points to the start of the string to match and size is the
|
14
|
+
// size of the string.
|
15
|
+
//
|
16
|
+
// Returns the length of the match or 0 if there is no match.
|
17
|
+
// output[0] contains the first codepoint and output[1] contains the second if
|
18
|
+
// there are two, otherwise output[1] contains kGumboNoChar.
|
19
|
+
size_t match_named_char_ref (
|
20
|
+
const char *str,
|
21
|
+
size_t size,
|
22
|
+
int output[2]
|
38
23
|
);
|
39
24
|
|
40
25
|
#ifdef __cplusplus
|
data/gumbo-parser/src/error.c
CHANGED
@@ -19,6 +19,7 @@
|
|
19
19
|
#include <stdarg.h>
|
20
20
|
#include <stdio.h>
|
21
21
|
#include <string.h>
|
22
|
+
#include "ascii.h"
|
22
23
|
#include "error.h"
|
23
24
|
#include "gumbo.h"
|
24
25
|
#include "macros.h"
|
@@ -45,13 +46,13 @@ static int PRINTF(2) print_message (
|
|
45
46
|
args
|
46
47
|
);
|
47
48
|
va_end(args);
|
48
|
-
#
|
49
|
+
#if _MSC_VER && _MSC_VER < 1900
|
49
50
|
if (bytes_written == -1) {
|
50
|
-
// vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
|
51
|
-
// returning the number of bytes that would've been written had there been
|
52
|
-
// enough. In this case, we'll double the buffer size and hope it fits when
|
53
|
-
// we retry (letting it fail and returning 0 if it doesn't), since there's
|
54
|
-
// no way to smartly resize the buffer.
|
51
|
+
// vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
|
52
|
+
// instead of returning the number of bytes that would've been written had
|
53
|
+
// there been enough. In this case, we'll double the buffer size and hope
|
54
|
+
// it fits when we retry (letting it fail and returning 0 if it doesn't),
|
55
|
+
// since there's no way to smartly resize the buffer.
|
55
56
|
gumbo_string_buffer_reserve(output->capacity * 2, output);
|
56
57
|
va_start(args, format);
|
57
58
|
int result = vsnprintf (
|
@@ -101,6 +102,214 @@ static void print_tag_stack (
|
|
101
102
|
gumbo_string_buffer_append_codepoint('.', output);
|
102
103
|
}
|
103
104
|
|
105
|
+
static void handle_tokenizer_error (
|
106
|
+
const GumboError* error,
|
107
|
+
GumboStringBuffer* output
|
108
|
+
) {
|
109
|
+
switch (error->type) {
|
110
|
+
case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
|
111
|
+
print_message(output, "Empty comment abruptly closed by '%s', use '-->'.",
|
112
|
+
error->v.tokenizer.state == GUMBO_LEX_COMMENT_START? ">" : "->");
|
113
|
+
break;
|
114
|
+
case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
|
115
|
+
print_message (
|
116
|
+
output,
|
117
|
+
"DOCTYPE public identifier missing closing %s.",
|
118
|
+
error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED?
|
119
|
+
"quotation mark (\")" : "apostrophe (')"
|
120
|
+
);
|
121
|
+
break;
|
122
|
+
case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
|
123
|
+
print_message (
|
124
|
+
output,
|
125
|
+
"DOCTYPE system identifier missing closing %s.",
|
126
|
+
error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED?
|
127
|
+
"quotation mark (\")" : "apostrophe (')"
|
128
|
+
);
|
129
|
+
break;
|
130
|
+
case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
|
131
|
+
print_message (
|
132
|
+
output,
|
133
|
+
"Numeric character reference '%.*s' does not contain any %sdigits.",
|
134
|
+
(int)error->original_text.length, error->original_text.data,
|
135
|
+
error->v.tokenizer.state == GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START? "hexadecimal " : ""
|
136
|
+
);
|
137
|
+
break;
|
138
|
+
case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
|
139
|
+
print_message(output, "CDATA section outside foreign (SVG or MathML) content.");
|
140
|
+
break;
|
141
|
+
case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
|
142
|
+
print_message (
|
143
|
+
output,
|
144
|
+
"Numeric character reference '%.*s' references a code point that is outside the valid Unicode range.",
|
145
|
+
(int)error->original_text.length, error->original_text.data
|
146
|
+
);
|
147
|
+
break;
|
148
|
+
case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
|
149
|
+
print_message (
|
150
|
+
output,
|
151
|
+
"Input contains prohibited control code point U+%04X.",
|
152
|
+
error->v.tokenizer.codepoint
|
153
|
+
);
|
154
|
+
break;
|
155
|
+
case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
|
156
|
+
print_message (
|
157
|
+
output,
|
158
|
+
"Numeric character reference '%.*s' references prohibited control code point U+%04X.",
|
159
|
+
(int)error->original_text.length, error->original_text.data,
|
160
|
+
error->v.tokenizer.codepoint
|
161
|
+
);
|
162
|
+
break;
|
163
|
+
case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
|
164
|
+
print_message(output, "End tag contains attributes.");
|
165
|
+
break;
|
166
|
+
case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
|
167
|
+
print_message(output, "Tag contains multiple attributes with the same name.");
|
168
|
+
break;
|
169
|
+
case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
|
170
|
+
print_message(output, "End tag ends with '/>', use '>'.");
|
171
|
+
break;
|
172
|
+
case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
|
173
|
+
print_message(output, "End of input where a tag name is expected.");
|
174
|
+
break;
|
175
|
+
case GUMBO_ERR_EOF_IN_CDATA:
|
176
|
+
print_message(output, "End of input in CDATA section.");
|
177
|
+
break;
|
178
|
+
case GUMBO_ERR_EOF_IN_COMMENT:
|
179
|
+
print_message(output, "End of input in comment.");
|
180
|
+
break;
|
181
|
+
case GUMBO_ERR_EOF_IN_DOCTYPE:
|
182
|
+
print_message(output, "End of input in DOCTYPE.");
|
183
|
+
break;
|
184
|
+
case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
|
185
|
+
print_message(output, "End of input in text that resembles an HTML comment inside script element content.");
|
186
|
+
break;
|
187
|
+
case GUMBO_ERR_EOF_IN_TAG:
|
188
|
+
print_message(output, "End of input in tag.");
|
189
|
+
break;
|
190
|
+
case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
|
191
|
+
print_message(output, "Comment closed incorrectly by '--!>', use '-->'.");
|
192
|
+
break;
|
193
|
+
case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
|
194
|
+
print_message(output, "Comment, DOCTYPE, or CDATA opened incorrectly, use '<!--', '<!DOCTYPE', or '<![CDATA['.");
|
195
|
+
break;
|
196
|
+
case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
|
197
|
+
print_message(output, "Invalid character sequence after DOCTYPE name, expected 'PUBLIC', 'SYSTEM', or '>'.");
|
198
|
+
break;
|
199
|
+
case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
|
200
|
+
if (gumbo_ascii_isascii(error->v.tokenizer.codepoint)
|
201
|
+
&& !gumbo_ascii_iscntrl(error->v.tokenizer.codepoint))
|
202
|
+
print_message(output, "Invalid first character of tag name '%c'.", error->v.tokenizer.codepoint);
|
203
|
+
else
|
204
|
+
print_message(output, "Invalid first code point of tag name U+%04X.", error->v.tokenizer.codepoint);
|
205
|
+
break;
|
206
|
+
case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
|
207
|
+
print_message(output, "Missing attribute value.");
|
208
|
+
break;
|
209
|
+
case GUMBO_ERR_MISSING_DOCTYPE_NAME:
|
210
|
+
print_message(output, "Missing DOCTYPE name.");
|
211
|
+
break;
|
212
|
+
case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
|
213
|
+
print_message(output, "Missing DOCTYPE public identifier.");
|
214
|
+
break;
|
215
|
+
case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
|
216
|
+
print_message(output, "Missing DOCTYPE system identifier.");
|
217
|
+
break;
|
218
|
+
case GUMBO_ERR_MISSING_END_TAG_NAME:
|
219
|
+
print_message(output, "Missing end tag name.");
|
220
|
+
break;
|
221
|
+
case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
|
222
|
+
print_message(output, "Missing quote before DOCTYPE public identifier.");
|
223
|
+
break;
|
224
|
+
case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
|
225
|
+
print_message(output, "Missing quote before DOCTYPE system identifier.");
|
226
|
+
break;
|
227
|
+
case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
|
228
|
+
print_message(output, "Missing semicolon after character reference '%.*s'.",
|
229
|
+
(int)error->original_text.length, error->original_text.data);
|
230
|
+
break;
|
231
|
+
case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
|
232
|
+
print_message(output, "Missing whitespace after 'PUBLIC' keyword.");
|
233
|
+
break;
|
234
|
+
case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
|
235
|
+
print_message(output, "Missing whitespace after 'SYSTEM' keyword.");
|
236
|
+
break;
|
237
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
|
238
|
+
print_message(output, "Missing whitespace between 'DOCTYPE' keyword and DOCTYPE name.");
|
239
|
+
break;
|
240
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
|
241
|
+
print_message(output, "Missing whitespace between attributes.");
|
242
|
+
break;
|
243
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
|
244
|
+
print_message(output, "Missing whitespace between DOCTYPE public and system identifiers.");
|
245
|
+
break;
|
246
|
+
case GUMBO_ERR_NESTED_COMMENT:
|
247
|
+
print_message(output, "Nested comment.");
|
248
|
+
break;
|
249
|
+
case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
|
250
|
+
print_message (
|
251
|
+
output,
|
252
|
+
"Numeric character reference '%.*s' references noncharacter U+%04X.",
|
253
|
+
(int)error->original_text.length, error->original_text.data,
|
254
|
+
error->v.tokenizer.codepoint
|
255
|
+
);
|
256
|
+
break;
|
257
|
+
case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
|
258
|
+
print_message(output, "Input contains noncharacter U+%04X.", error->v.tokenizer.codepoint);
|
259
|
+
break;
|
260
|
+
case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
|
261
|
+
print_message(output, "Start tag of nonvoid HTML element ends with '/>', use '>'.");
|
262
|
+
break;
|
263
|
+
case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
|
264
|
+
print_message(output, "Numeric character reference '%.*s' references U+0000.",
|
265
|
+
(int)error->original_text.length, error->original_text.data);
|
266
|
+
break;
|
267
|
+
case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
|
268
|
+
print_message (
|
269
|
+
output,
|
270
|
+
"Numeric character reference '%.*s' references surrogate U+%4X.",
|
271
|
+
(int)error->original_text.length, error->original_text.data,
|
272
|
+
error->v.tokenizer.codepoint
|
273
|
+
);
|
274
|
+
break;
|
275
|
+
case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
|
276
|
+
print_message(output, "Input contains surrogate U+%04X.", error->v.tokenizer.codepoint);
|
277
|
+
break;
|
278
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
|
279
|
+
print_message(output, "Unexpected character after DOCTYPE system identifier.");
|
280
|
+
break;
|
281
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
|
282
|
+
print_message(output, "Unexpected character (%c) in attribute name.", error->v.tokenizer.codepoint);
|
283
|
+
break;
|
284
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
|
285
|
+
print_message(output, "Unexpected character (%c) in unquoted attribute value.", error->v.tokenizer.codepoint);
|
286
|
+
break;
|
287
|
+
case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
|
288
|
+
print_message(output, "Unexpected '=' before an attribute name.");
|
289
|
+
break;
|
290
|
+
case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
|
291
|
+
print_message(output, "Input contains unexpected U+0000.");
|
292
|
+
break;
|
293
|
+
case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
|
294
|
+
print_message(output, "Unexpected '?' where start tag name is expected.");
|
295
|
+
break;
|
296
|
+
case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
|
297
|
+
print_message(output, "Unexpected '/' in tag.");
|
298
|
+
break;
|
299
|
+
case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
|
300
|
+
print_message(output, "Unknown named character reference '%.*s'.",
|
301
|
+
(int)error->original_text.length, error->original_text.data);
|
302
|
+
break;
|
303
|
+
case GUMBO_ERR_UTF8_INVALID:
|
304
|
+
print_message(output, "Invalid UTF8 encoding.");
|
305
|
+
break;
|
306
|
+
case GUMBO_ERR_UTF8_TRUNCATED:
|
307
|
+
print_message(output, "UTF8 character truncated.");
|
308
|
+
break;
|
309
|
+
case GUMBO_ERR_PARSER:
|
310
|
+
assert(0 && "Unreachable.");
|
311
|
+
}
|
312
|
+
}
|
104
313
|
static void handle_parser_error (
|
105
314
|
const GumboParserError* error,
|
106
315
|
GumboStringBuffer* output
|
@@ -111,7 +320,7 @@ static void handle_parser_error (
|
|
111
320
|
) {
|
112
321
|
print_message (
|
113
322
|
output,
|
114
|
-
"
|
323
|
+
"Expected a doctype token"
|
115
324
|
);
|
116
325
|
return;
|
117
326
|
}
|
@@ -191,122 +400,155 @@ GumboError* gumbo_add_error(GumboParser* parser) {
|
|
191
400
|
return error;
|
192
401
|
}
|
193
402
|
|
194
|
-
|
403
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error) {
|
404
|
+
return error->position;
|
405
|
+
}
|
406
|
+
|
407
|
+
const char* gumbo_error_code(const GumboError* error) {
|
408
|
+
switch (error->type) {
|
409
|
+
// Defined tokenizer errors.
|
410
|
+
case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
|
411
|
+
return "abrupt-closing-of-empty-comment";
|
412
|
+
case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
|
413
|
+
return "abrupt-doctype-public-identifier";
|
414
|
+
case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
|
415
|
+
return "abrupt-doctype-system-identifier";
|
416
|
+
case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
|
417
|
+
return "absence-of-digits-in-numeric-character-reference";
|
418
|
+
case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
|
419
|
+
return "cdata-in-html-content";
|
420
|
+
case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
|
421
|
+
return "character-reference-outside-unicode-range";
|
422
|
+
case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
|
423
|
+
return "control-character-in-input-stream";
|
424
|
+
case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
|
425
|
+
return "control-character-reference";
|
426
|
+
case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
|
427
|
+
return "end-tag-with-attributes";
|
428
|
+
case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
|
429
|
+
return "duplicate-attribute";
|
430
|
+
case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
|
431
|
+
return "end-tag-with-trailing-solidus";
|
432
|
+
case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
|
433
|
+
return "eof-before-tag-name";
|
434
|
+
case GUMBO_ERR_EOF_IN_CDATA:
|
435
|
+
return "eof-in-cdata";
|
436
|
+
case GUMBO_ERR_EOF_IN_COMMENT:
|
437
|
+
return "eof-in-comment";
|
438
|
+
case GUMBO_ERR_EOF_IN_DOCTYPE:
|
439
|
+
return "eof-in-doctype";
|
440
|
+
case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
|
441
|
+
return "eof-in-script-html-comment-like-text";
|
442
|
+
case GUMBO_ERR_EOF_IN_TAG:
|
443
|
+
return "eof-in-tag";
|
444
|
+
case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
|
445
|
+
return "incorrectly-closed-comment";
|
446
|
+
case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
|
447
|
+
return "incorrectly-opened-comment";
|
448
|
+
case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
|
449
|
+
return "invalid-character-sequence-after-doctype-name";
|
450
|
+
case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
|
451
|
+
return "invalid-first-character-of-tag-name";
|
452
|
+
case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
|
453
|
+
return "missing-attribute-value";
|
454
|
+
case GUMBO_ERR_MISSING_DOCTYPE_NAME:
|
455
|
+
return "missing-doctype-name";
|
456
|
+
case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
|
457
|
+
return "missing-doctype-public-identifier";
|
458
|
+
case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
|
459
|
+
return "missing-doctype-system-identifier";
|
460
|
+
case GUMBO_ERR_MISSING_END_TAG_NAME:
|
461
|
+
return "missing-end-tag-name";
|
462
|
+
case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
|
463
|
+
return "missing-quote-before-doctype-public-identifier";
|
464
|
+
case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
|
465
|
+
return "missing-quote-before-doctype-system-identifier";
|
466
|
+
case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
|
467
|
+
return "missing-semicolon-after-character-reference";
|
468
|
+
case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
|
469
|
+
return "missing-whitespace-after-doctype-public-keyword";
|
470
|
+
case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
|
471
|
+
return "missing-whitespace-after-doctype-system-keyword";
|
472
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
|
473
|
+
return "missing-whitespace-before-doctype-name";
|
474
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
|
475
|
+
return "missing-whitespace-between-attributes";
|
476
|
+
case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
|
477
|
+
return "missing-whitespace-between-doctype-public-and-system-identifiers";
|
478
|
+
case GUMBO_ERR_NESTED_COMMENT:
|
479
|
+
return "nested-comment";
|
480
|
+
case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
|
481
|
+
return "noncharacter-character-reference";
|
482
|
+
case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
|
483
|
+
return "noncharacter-in-input-stream";
|
484
|
+
case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
|
485
|
+
return "non-void-html-element-start-tag-with-trailing-solidus";
|
486
|
+
case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
|
487
|
+
return "null-character-reference";
|
488
|
+
case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
|
489
|
+
return "surrogate-character-reference";
|
490
|
+
case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
|
491
|
+
return "surrogate-in-input-stream";
|
492
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
|
493
|
+
return "unexpected-character-after-doctype-system-identifier";
|
494
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
|
495
|
+
return "unexpected-character-in-attribute-name";
|
496
|
+
case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
|
497
|
+
return "unexpected-character-in-unquoted-attribute-value";
|
498
|
+
case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
|
499
|
+
return "unexpected-equals-sign-before-attribute-name";
|
500
|
+
case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
|
501
|
+
return "unexpected-null-character";
|
502
|
+
case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
|
503
|
+
return "unexpected-question-mark-instead-of-tag-name";
|
504
|
+
case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
|
505
|
+
return "unexpected-solidus-in-tag";
|
506
|
+
case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
|
507
|
+
return "unknown-named-character-reference";
|
508
|
+
|
509
|
+
// Encoding errors.
|
510
|
+
case GUMBO_ERR_UTF8_INVALID:
|
511
|
+
return "utf8-invalid";
|
512
|
+
case GUMBO_ERR_UTF8_TRUNCATED:
|
513
|
+
return "utf8-truncated";
|
514
|
+
|
515
|
+
// Generic parser error.
|
516
|
+
case GUMBO_ERR_PARSER:
|
517
|
+
return "generic-parser";
|
518
|
+
}
|
519
|
+
// Silence warning about control reaching end of non-void function.
|
520
|
+
// All errors _should_ be handled in the switch statement.
|
521
|
+
return "generic-parser";
|
522
|
+
}
|
523
|
+
|
524
|
+
static void error_to_string (
|
195
525
|
const GumboError* error,
|
196
526
|
GumboStringBuffer* output
|
197
527
|
) {
|
198
|
-
|
199
|
-
output
|
200
|
-
|
201
|
-
error->
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
);
|
211
|
-
break;
|
212
|
-
case GUMBO_ERR_UTF8_TRUNCATED:
|
213
|
-
print_message (
|
214
|
-
output,
|
215
|
-
"Input stream ends with a truncated UTF8 character 0x%" PRIx32,
|
216
|
-
error->v.codepoint
|
217
|
-
);
|
218
|
-
break;
|
219
|
-
case GUMBO_ERR_UTF8_NULL:
|
220
|
-
print_message (
|
221
|
-
output,
|
222
|
-
"Unexpected NULL character in the input stream"
|
223
|
-
);
|
224
|
-
break;
|
225
|
-
case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
|
226
|
-
print_message (
|
227
|
-
output,
|
228
|
-
"No digits after &# in numeric character reference"
|
229
|
-
);
|
230
|
-
break;
|
231
|
-
case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
|
232
|
-
print_message (
|
233
|
-
output,
|
234
|
-
"The numeric character reference &#%" PRIu32 " should be followed "
|
235
|
-
"by a semicolon",
|
236
|
-
error->v.codepoint
|
237
|
-
);
|
238
|
-
break;
|
239
|
-
case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
|
240
|
-
print_message (
|
241
|
-
output,
|
242
|
-
"The numeric character reference &#%" PRIu32 "; encodes an invalid "
|
243
|
-
"unicode codepoint",
|
244
|
-
error->v.codepoint
|
245
|
-
);
|
246
|
-
break;
|
247
|
-
case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
|
248
|
-
// The textual data came from one of the literal strings in the table, and
|
249
|
-
// so it'll be null-terminated.
|
250
|
-
print_message (
|
251
|
-
output,
|
252
|
-
"The named character reference &%.*s should be followed by a "
|
253
|
-
"semicolon",
|
254
|
-
(int) error->v.text.length,
|
255
|
-
error->v.text.data
|
256
|
-
);
|
257
|
-
break;
|
258
|
-
case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
|
259
|
-
print_message (
|
260
|
-
output,
|
261
|
-
"The named character reference &%.*s; is not a valid entity name",
|
262
|
-
(int) error->v.text.length,
|
263
|
-
error->v.text.data
|
264
|
-
);
|
265
|
-
break;
|
266
|
-
case GUMBO_ERR_DUPLICATE_ATTR:
|
267
|
-
print_message (
|
268
|
-
output,
|
269
|
-
"Attribute %s occurs multiple times, at positions %u and %u",
|
270
|
-
error->v.duplicate_attr.name,
|
271
|
-
error->v.duplicate_attr.original_index,
|
272
|
-
error->v.duplicate_attr.new_index
|
273
|
-
);
|
274
|
-
break;
|
275
|
-
case GUMBO_ERR_DASHES_OR_DOCTYPE:
|
276
|
-
print_message (
|
277
|
-
output,
|
278
|
-
"Incorrectly opened comment; expected '--', 'DOCTYPE', or '[CDATA['"
|
279
|
-
);
|
280
|
-
break;
|
281
|
-
case GUMBO_ERR_PARSER:
|
282
|
-
handle_parser_error(&error->v.parser, output);
|
283
|
-
break;
|
284
|
-
case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
|
285
|
-
case GUMBO_ERR_SELF_CLOSING_END_TAG:
|
286
|
-
print_message (
|
287
|
-
output,
|
288
|
-
"Tag cannot be self-closing");
|
289
|
-
break;
|
290
|
-
default:
|
291
|
-
print_message (
|
292
|
-
output,
|
293
|
-
"Tokenizer error with an unimplemented error message"
|
294
|
-
);
|
295
|
-
break;
|
296
|
-
}
|
297
|
-
gumbo_string_buffer_append_codepoint('.', output);
|
528
|
+
if (error->type < GUMBO_ERR_PARSER)
|
529
|
+
handle_tokenizer_error(error, output);
|
530
|
+
else
|
531
|
+
handle_parser_error(&error->v.parser, output);
|
532
|
+
}
|
533
|
+
|
534
|
+
size_t gumbo_error_to_string(const GumboError* error, char** output) {
|
535
|
+
GumboStringBuffer sb;
|
536
|
+
gumbo_string_buffer_init(&sb);
|
537
|
+
error_to_string(error, &sb);
|
538
|
+
*output = sb.data;
|
539
|
+
return sb.length;
|
298
540
|
}
|
299
541
|
|
300
|
-
void gumbo_caret_diagnostic_to_string (
|
542
|
+
void caret_diagnostic_to_string (
|
301
543
|
const GumboError* error,
|
302
544
|
const char* source_text,
|
303
545
|
size_t source_length,
|
304
546
|
GumboStringBuffer* output
|
305
547
|
) {
|
306
|
-
|
548
|
+
error_to_string(error, output);
|
307
549
|
|
308
|
-
const char* line_start = find_prev_newline(source_text, error->original_text);
|
309
|
-
const char* line_end = find_next_newline(source_text + source_length, error->original_text);
|
550
|
+
const char* line_start = find_prev_newline(source_text, error->original_text.data);
|
551
|
+
const char* line_end = find_next_newline(source_text + source_length, error->original_text.data);
|
310
552
|
GumboStringPiece original_line;
|
311
553
|
original_line.data = line_start;
|
312
554
|
original_line.length = line_end - line_start;
|
@@ -324,6 +566,19 @@ void gumbo_caret_diagnostic_to_string (
|
|
324
566
|
gumbo_string_buffer_append_codepoint('\n', output);
|
325
567
|
}
|
326
568
|
|
569
|
+
size_t gumbo_caret_diagnostic_to_string (
|
570
|
+
const GumboError* error,
|
571
|
+
const char* source_text,
|
572
|
+
size_t source_length,
|
573
|
+
char **output
|
574
|
+
) {
|
575
|
+
GumboStringBuffer sb;
|
576
|
+
gumbo_string_buffer_init(&sb);
|
577
|
+
caret_diagnostic_to_string(error, source_text, source_length, &sb);
|
578
|
+
*output = sb.data;
|
579
|
+
return sb.length;
|
580
|
+
}
|
581
|
+
|
327
582
|
void gumbo_print_caret_diagnostic (
|
328
583
|
const GumboError* error,
|
329
584
|
const char* source_text,
|
@@ -331,20 +586,21 @@ void gumbo_print_caret_diagnostic (
|
|
331
586
|
) {
|
332
587
|
GumboStringBuffer text;
|
333
588
|
gumbo_string_buffer_init(&text);
|
334
|
-
|
589
|
+
print_message (
|
590
|
+
&text,
|
591
|
+
"%lu:%lu: ",
|
592
|
+
(unsigned long)error->position.line,
|
593
|
+
(unsigned long)error->position.column
|
594
|
+
);
|
595
|
+
|
596
|
+
caret_diagnostic_to_string(error, source_text, source_length, &text);
|
335
597
|
printf("%.*s", (int) text.length, text.data);
|
336
598
|
gumbo_string_buffer_destroy(&text);
|
337
599
|
}
|
338
600
|
|
339
601
|
void gumbo_error_destroy(GumboError* error) {
|
340
|
-
if (
|
341
|
-
error->type == GUMBO_ERR_PARSER
|
342
|
-
|| error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG
|
343
|
-
|| error->type == GUMBO_ERR_SELF_CLOSING_END_TAG
|
344
|
-
) {
|
602
|
+
if (error->type == GUMBO_ERR_PARSER) {
|
345
603
|
gumbo_vector_destroy(&error->v.parser.tag_stack);
|
346
|
-
} else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
|
347
|
-
gumbo_free((void*) error->v.duplicate_attr.name);
|
348
604
|
}
|
349
605
|
gumbo_free(error);
|
350
606
|
}
|