nokogumbo 2.0.0.pre.alpha → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,40 +1,25 @@
1
1
  #ifndef GUMBO_CHAR_REF_H_
2
2
  #define GUMBO_CHAR_REF_H_
3
3
 
4
- #include <stdbool.h>
4
+ #include <stdlib.h>
5
5
 
6
6
  #ifdef __cplusplus
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
- struct GumboInternalParser;
11
- struct GumboInternalUtf8Iterator;
12
-
13
10
  // Value that indicates no character was produced.
14
- extern const int kGumboNoChar;
15
-
16
- // Certain named character references generate two codepoints, not one, and so
17
- // the gumbo_consume_char_ref subroutine needs to return this instead of an int.
18
- // The first field will be kGumboNoChar if no character reference was found; the
19
- // second field will be kGumboNoChar if that is the case or if the character
20
- // reference returns only a single codepoint.
21
- typedef struct {
22
- int first;
23
- int second;
24
- } OneOrTwoCodepoints;
25
-
26
- // Implements the "consume a character reference" section of the spec.
27
- // This reads in characters from the input as necessary, and fills in a
28
- // OneOrTwoCodepoints struct containing the characters read. It may add parse
29
- // errors to the GumboParser's errors vector, if the spec calls for it. Pass a
30
- // space for the "additional allowed char" when the spec says "with no
31
- // additional allowed char". Returns false on parse error, true otherwise.
32
- bool gumbo_consume_char_ref (
33
- struct GumboInternalParser* parser,
34
- struct GumboInternalUtf8Iterator* input,
35
- int additional_allowed_char,
36
- bool is_in_attribute,
37
- OneOrTwoCodepoints* output
11
+ #define kGumboNoChar (-1)
12
+
13
+ // On input, str points to the start of the string to match and size is the
14
+ // size of the string.
15
+ //
16
+ // Returns the length of the match or 0 if there is no match.
17
+ // output[0] contains the first codepoint and output[1] contains the second if
18
+ // there are two, otherwise output[1] contains kGumboNoChar.
19
+ size_t match_named_char_ref (
20
+ const char *str,
21
+ size_t size,
22
+ int output[2]
38
23
  );
39
24
 
40
25
  #ifdef __cplusplus
@@ -19,6 +19,7 @@
19
19
  #include <stdarg.h>
20
20
  #include <stdio.h>
21
21
  #include <string.h>
22
+ #include "ascii.h"
22
23
  #include "error.h"
23
24
  #include "gumbo.h"
24
25
  #include "macros.h"
@@ -45,13 +46,13 @@ static int PRINTF(2) print_message (
45
46
  args
46
47
  );
47
48
  va_end(args);
48
- #ifdef _MSC_VER
49
+ #if _MSC_VER && _MSC_VER < 1900
49
50
  if (bytes_written == -1) {
50
- // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
51
- // returning the number of bytes that would've been written had there been
52
- // enough. In this case, we'll double the buffer size and hope it fits when
53
- // we retry (letting it fail and returning 0 if it doesn't), since there's
54
- // no way to smartly resize the buffer.
51
+ // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
52
+ // instead of returning the number of bytes that would've been written had
53
+ // there been enough. In this case, we'll double the buffer size and hope
54
+ // it fits when we retry (letting it fail and returning 0 if it doesn't),
55
+ // since there's no way to smartly resize the buffer.
55
56
  gumbo_string_buffer_reserve(output->capacity * 2, output);
56
57
  va_start(args, format);
57
58
  int result = vsnprintf (
@@ -101,6 +102,214 @@ static void print_tag_stack (
101
102
  gumbo_string_buffer_append_codepoint('.', output);
102
103
  }
103
104
 
105
+ static void handle_tokenizer_error (
106
+ const GumboError* error,
107
+ GumboStringBuffer* output
108
+ ) {
109
+ switch (error->type) {
110
+ case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
111
+ print_message(output, "Empty comment abruptly closed by '%s', use '-->'.",
112
+ error->v.tokenizer.state == GUMBO_LEX_COMMENT_START? ">" : "->");
113
+ break;
114
+ case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
115
+ print_message (
116
+ output,
117
+ "DOCTYPE public identifier missing closing %s.",
118
+ error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED?
119
+ "quotation mark (\")" : "apostrophe (')"
120
+ );
121
+ break;
122
+ case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
123
+ print_message (
124
+ output,
125
+ "DOCTYPE system identifier missing closing %s.",
126
+ error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED?
127
+ "quotation mark (\")" : "apostrophe (')"
128
+ );
129
+ break;
130
+ case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
131
+ print_message (
132
+ output,
133
+ "Numeric character reference '%.*s' does not contain any %sdigits.",
134
+ (int)error->original_text.length, error->original_text.data,
135
+ error->v.tokenizer.state == GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START? "hexadecimal " : ""
136
+ );
137
+ break;
138
+ case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
139
+ print_message(output, "CDATA section outside foreign (SVG or MathML) content.");
140
+ break;
141
+ case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
142
+ print_message (
143
+ output,
144
+ "Numeric character reference '%.*s' references a code point that is outside the valid Unicode range.",
145
+ (int)error->original_text.length, error->original_text.data
146
+ );
147
+ break;
148
+ case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
149
+ print_message (
150
+ output,
151
+ "Input contains prohibited control code point U+%04X.",
152
+ error->v.tokenizer.codepoint
153
+ );
154
+ break;
155
+ case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
156
+ print_message (
157
+ output,
158
+ "Numeric character reference '%.*s' references prohibited control code point U+%04X.",
159
+ (int)error->original_text.length, error->original_text.data,
160
+ error->v.tokenizer.codepoint
161
+ );
162
+ break;
163
+ case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
164
+ print_message(output, "End tag contains attributes.");
165
+ break;
166
+ case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
167
+ print_message(output, "Tag contains multiple attributes with the same name.");
168
+ break;
169
+ case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
170
+ print_message(output, "End tag ends with '/>', use '>'.");
171
+ break;
172
+ case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
173
+ print_message(output, "End of input where a tag name is expected.");
174
+ break;
175
+ case GUMBO_ERR_EOF_IN_CDATA:
176
+ print_message(output, "End of input in CDATA section.");
177
+ break;
178
+ case GUMBO_ERR_EOF_IN_COMMENT:
179
+ print_message(output, "End of input in comment.");
180
+ break;
181
+ case GUMBO_ERR_EOF_IN_DOCTYPE:
182
+ print_message(output, "End of input in DOCTYPE.");
183
+ break;
184
+ case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
185
+ print_message(output, "End of input in text that resembles an HTML comment inside script element content.");
186
+ break;
187
+ case GUMBO_ERR_EOF_IN_TAG:
188
+ print_message(output, "End of input in tag.");
189
+ break;
190
+ case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
191
+ print_message(output, "Comment closed incorrectly by '--!>', use '-->'.");
192
+ break;
193
+ case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
194
+ print_message(output, "Comment, DOCTYPE, or CDATA opened incorrectly, use '<!--', '<!DOCTYPE', or '<![CDATA['.");
195
+ break;
196
+ case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
197
+ print_message(output, "Invalid character sequence after DOCTYPE name, expected 'PUBLIC', 'SYSTEM', or '>'.");
198
+ break;
199
+ case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
200
+ if (gumbo_ascii_isascii(error->v.tokenizer.codepoint)
201
+ && !gumbo_ascii_iscntrl(error->v.tokenizer.codepoint))
202
+ print_message(output, "Invalid first character of tag name '%c'.", error->v.tokenizer.codepoint);
203
+ else
204
+ print_message(output, "Invalid first code point of tag name U+%04X.", error->v.tokenizer.codepoint);
205
+ break;
206
+ case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
207
+ print_message(output, "Missing attribute value.");
208
+ break;
209
+ case GUMBO_ERR_MISSING_DOCTYPE_NAME:
210
+ print_message(output, "Missing DOCTYPE name.");
211
+ break;
212
+ case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
213
+ print_message(output, "Missing DOCTYPE public identifier.");
214
+ break;
215
+ case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
216
+ print_message(output, "Missing DOCTYPE system identifier.");
217
+ break;
218
+ case GUMBO_ERR_MISSING_END_TAG_NAME:
219
+ print_message(output, "Missing end tag name.");
220
+ break;
221
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
222
+ print_message(output, "Missing quote before DOCTYPE public identifier.");
223
+ break;
224
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
225
+ print_message(output, "Missing quote before DOCTYPE system identifier.");
226
+ break;
227
+ case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
228
+ print_message(output, "Missing semicolon after character reference '%.*s'.",
229
+ (int)error->original_text.length, error->original_text.data);
230
+ break;
231
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
232
+ print_message(output, "Missing whitespace after 'PUBLIC' keyword.");
233
+ break;
234
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
235
+ print_message(output, "Missing whitespace after 'SYSTEM' keyword.");
236
+ break;
237
+ case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
238
+ print_message(output, "Missing whitespace between 'DOCTYPE' keyword and DOCTYPE name.");
239
+ break;
240
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
241
+ print_message(output, "Missing whitespace between attributes.");
242
+ break;
243
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
244
+ print_message(output, "Missing whitespace between DOCTYPE public and system identifiers.");
245
+ break;
246
+ case GUMBO_ERR_NESTED_COMMENT:
247
+ print_message(output, "Nested comment.");
248
+ break;
249
+ case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
250
+ print_message (
251
+ output,
252
+ "Numeric character reference '%.*s' references noncharacter U+%04X.",
253
+ (int)error->original_text.length, error->original_text.data,
254
+ error->v.tokenizer.codepoint
255
+ );
256
+ break;
257
+ case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
258
+ print_message(output, "Input contains noncharacter U+%04X.", error->v.tokenizer.codepoint);
259
+ break;
260
+ case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
261
+ print_message(output, "Start tag of nonvoid HTML element ends with '/>', use '>'.");
262
+ break;
263
+ case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
264
+ print_message(output, "Numeric character reference '%.*s' references U+0000.",
265
+ (int)error->original_text.length, error->original_text.data);
266
+ break;
267
+ case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
268
+ print_message (
269
+ output,
270
+ "Numeric character reference '%.*s' references surrogate U+%4X.",
271
+ (int)error->original_text.length, error->original_text.data,
272
+ error->v.tokenizer.codepoint
273
+ );
274
+ break;
275
+ case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
276
+ print_message(output, "Input contains surrogate U+%04X.", error->v.tokenizer.codepoint);
277
+ break;
278
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
279
+ print_message(output, "Unexpected character after DOCTYPE system identifier.");
280
+ break;
281
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
282
+ print_message(output, "Unexpected character (%c) in attribute name.", error->v.tokenizer.codepoint);
283
+ break;
284
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
285
+ print_message(output, "Unexpected character (%c) in unquoted attribute value.", error->v.tokenizer.codepoint);
286
+ break;
287
+ case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
288
+ print_message(output, "Unexpected '=' before an attribute name.");
289
+ break;
290
+ case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
291
+ print_message(output, "Input contains unexpected U+0000.");
292
+ break;
293
+ case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
294
+ print_message(output, "Unexpected '?' where start tag name is expected.");
295
+ break;
296
+ case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
297
+ print_message(output, "Unexpected '/' in tag.");
298
+ break;
299
+ case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
300
+ print_message(output, "Unknown named character reference '%.*s'.",
301
+ (int)error->original_text.length, error->original_text.data);
302
+ break;
303
+ case GUMBO_ERR_UTF8_INVALID:
304
+ print_message(output, "Invalid UTF8 encoding.");
305
+ break;
306
+ case GUMBO_ERR_UTF8_TRUNCATED:
307
+ print_message(output, "UTF8 character truncated.");
308
+ break;
309
+ case GUMBO_ERR_PARSER:
310
+ assert(0 && "Unreachable.");
311
+ }
312
+ }
104
313
  static void handle_parser_error (
105
314
  const GumboParserError* error,
106
315
  GumboStringBuffer* output
@@ -111,7 +320,7 @@ static void handle_parser_error (
111
320
  ) {
112
321
  print_message (
113
322
  output,
114
- "The doctype must be the first token in the document"
323
+ "Expected a doctype token"
115
324
  );
116
325
  return;
117
326
  }
@@ -191,122 +400,155 @@ GumboError* gumbo_add_error(GumboParser* parser) {
191
400
  return error;
192
401
  }
193
402
 
194
- void gumbo_error_to_string (
403
+ GumboSourcePosition gumbo_error_position(const GumboError* error) {
404
+ return error->position;
405
+ }
406
+
407
+ const char* gumbo_error_code(const GumboError* error) {
408
+ switch (error->type) {
409
+ // Defined tokenizer errors.
410
+ case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
411
+ return "abrupt-closing-of-empty-comment";
412
+ case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
413
+ return "abrupt-doctype-public-identifier";
414
+ case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
415
+ return "abrupt-doctype-system-identifier";
416
+ case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
417
+ return "absence-of-digits-in-numeric-character-reference";
418
+ case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
419
+ return "cdata-in-html-content";
420
+ case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
421
+ return "character-reference-outside-unicode-range";
422
+ case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
423
+ return "control-character-in-input-stream";
424
+ case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
425
+ return "control-character-reference";
426
+ case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
427
+ return "end-tag-with-attributes";
428
+ case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
429
+ return "duplicate-attribute";
430
+ case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
431
+ return "end-tag-with-trailing-solidus";
432
+ case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
433
+ return "eof-before-tag-name";
434
+ case GUMBO_ERR_EOF_IN_CDATA:
435
+ return "eof-in-cdata";
436
+ case GUMBO_ERR_EOF_IN_COMMENT:
437
+ return "eof-in-comment";
438
+ case GUMBO_ERR_EOF_IN_DOCTYPE:
439
+ return "eof-in-doctype";
440
+ case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
441
+ return "eof-in-script-html-comment-like-text";
442
+ case GUMBO_ERR_EOF_IN_TAG:
443
+ return "eof-in-tag";
444
+ case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
445
+ return "incorrectly-closed-comment";
446
+ case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
447
+ return "incorrectly-opened-comment";
448
+ case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
449
+ return "invalid-character-sequence-after-doctype-name";
450
+ case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
451
+ return "invalid-first-character-of-tag-name";
452
+ case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
453
+ return "missing-attribute-value";
454
+ case GUMBO_ERR_MISSING_DOCTYPE_NAME:
455
+ return "missing-doctype-name";
456
+ case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
457
+ return "missing-doctype-public-identifier";
458
+ case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
459
+ return "missing-doctype-system-identifier";
460
+ case GUMBO_ERR_MISSING_END_TAG_NAME:
461
+ return "missing-end-tag-name";
462
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
463
+ return "missing-quote-before-doctype-public-identifier";
464
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
465
+ return "missing-quote-before-doctype-system-identifier";
466
+ case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
467
+ return "missing-semicolon-after-character-reference";
468
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
469
+ return "missing-whitespace-after-doctype-public-keyword";
470
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
471
+ return "missing-whitespace-after-doctype-system-keyword";
472
+ case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
473
+ return "missing-whitespace-before-doctype-name";
474
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
475
+ return "missing-whitespace-between-attributes";
476
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
477
+ return "missing-whitespace-between-doctype-public-and-system-identifiers";
478
+ case GUMBO_ERR_NESTED_COMMENT:
479
+ return "nested-comment";
480
+ case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
481
+ return "noncharacter-character-reference";
482
+ case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
483
+ return "noncharacter-in-input-stream";
484
+ case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
485
+ return "non-void-html-element-start-tag-with-trailing-solidus";
486
+ case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
487
+ return "null-character-reference";
488
+ case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
489
+ return "surrogate-character-reference";
490
+ case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
491
+ return "surrogate-in-input-stream";
492
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
493
+ return "unexpected-character-after-doctype-system-identifier";
494
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
495
+ return "unexpected-character-in-attribute-name";
496
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
497
+ return "unexpected-character-in-unquoted-attribute-value";
498
+ case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
499
+ return "unexpected-equals-sign-before-attribute-name";
500
+ case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
501
+ return "unexpected-null-character";
502
+ case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
503
+ return "unexpected-question-mark-instead-of-tag-name";
504
+ case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
505
+ return "unexpected-solidus-in-tag";
506
+ case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
507
+ return "unknown-named-character-reference";
508
+
509
+ // Encoding errors.
510
+ case GUMBO_ERR_UTF8_INVALID:
511
+ return "utf8-invalid";
512
+ case GUMBO_ERR_UTF8_TRUNCATED:
513
+ return "utf8-truncated";
514
+
515
+ // Generic parser error.
516
+ case GUMBO_ERR_PARSER:
517
+ return "generic-parser";
518
+ }
519
+ // Silence warning about control reaching end of non-void function.
520
+ // All errors _should_ be handled in the switch statement.
521
+ return "generic-parser";
522
+ }
523
+
524
+ static void error_to_string (
195
525
  const GumboError* error,
196
526
  GumboStringBuffer* output
197
527
  ) {
198
- print_message (
199
- output,
200
- "@%zu:%zu: ",
201
- error->position.line,
202
- error->position.column
203
- );
204
- switch (error->type) {
205
- case GUMBO_ERR_UTF8_INVALID:
206
- print_message (
207
- output,
208
- "Invalid UTF8 character 0x%" PRIx32,
209
- error->v.codepoint
210
- );
211
- break;
212
- case GUMBO_ERR_UTF8_TRUNCATED:
213
- print_message (
214
- output,
215
- "Input stream ends with a truncated UTF8 character 0x%" PRIx32,
216
- error->v.codepoint
217
- );
218
- break;
219
- case GUMBO_ERR_UTF8_NULL:
220
- print_message (
221
- output,
222
- "Unexpected NULL character in the input stream"
223
- );
224
- break;
225
- case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
226
- print_message (
227
- output,
228
- "No digits after &# in numeric character reference"
229
- );
230
- break;
231
- case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
232
- print_message (
233
- output,
234
- "The numeric character reference &#%" PRIu32 " should be followed "
235
- "by a semicolon",
236
- error->v.codepoint
237
- );
238
- break;
239
- case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
240
- print_message (
241
- output,
242
- "The numeric character reference &#%" PRIu32 "; encodes an invalid "
243
- "unicode codepoint",
244
- error->v.codepoint
245
- );
246
- break;
247
- case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
248
- // The textual data came from one of the literal strings in the table, and
249
- // so it'll be null-terminated.
250
- print_message (
251
- output,
252
- "The named character reference &%.*s should be followed by a "
253
- "semicolon",
254
- (int) error->v.text.length,
255
- error->v.text.data
256
- );
257
- break;
258
- case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
259
- print_message (
260
- output,
261
- "The named character reference &%.*s; is not a valid entity name",
262
- (int) error->v.text.length,
263
- error->v.text.data
264
- );
265
- break;
266
- case GUMBO_ERR_DUPLICATE_ATTR:
267
- print_message (
268
- output,
269
- "Attribute %s occurs multiple times, at positions %u and %u",
270
- error->v.duplicate_attr.name,
271
- error->v.duplicate_attr.original_index,
272
- error->v.duplicate_attr.new_index
273
- );
274
- break;
275
- case GUMBO_ERR_DASHES_OR_DOCTYPE:
276
- print_message (
277
- output,
278
- "Incorrectly opened comment; expected '--', 'DOCTYPE', or '[CDATA['"
279
- );
280
- break;
281
- case GUMBO_ERR_PARSER:
282
- handle_parser_error(&error->v.parser, output);
283
- break;
284
- case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
285
- case GUMBO_ERR_SELF_CLOSING_END_TAG:
286
- print_message (
287
- output,
288
- "Tag cannot be self-closing");
289
- break;
290
- default:
291
- print_message (
292
- output,
293
- "Tokenizer error with an unimplemented error message"
294
- );
295
- break;
296
- }
297
- gumbo_string_buffer_append_codepoint('.', output);
528
+ if (error->type < GUMBO_ERR_PARSER)
529
+ handle_tokenizer_error(error, output);
530
+ else
531
+ handle_parser_error(&error->v.parser, output);
532
+ }
533
+
534
+ size_t gumbo_error_to_string(const GumboError* error, char** output) {
535
+ GumboStringBuffer sb;
536
+ gumbo_string_buffer_init(&sb);
537
+ error_to_string(error, &sb);
538
+ *output = sb.data;
539
+ return sb.length;
298
540
  }
299
541
 
300
- void gumbo_caret_diagnostic_to_string (
542
+ void caret_diagnostic_to_string (
301
543
  const GumboError* error,
302
544
  const char* source_text,
303
545
  size_t source_length,
304
546
  GumboStringBuffer* output
305
547
  ) {
306
- gumbo_error_to_string(error, output);
548
+ error_to_string(error, output);
307
549
 
308
- const char* line_start = find_prev_newline(source_text, error->original_text);
309
- const char* line_end = find_next_newline(source_text + source_length, error->original_text);
550
+ const char* line_start = find_prev_newline(source_text, error->original_text.data);
551
+ const char* line_end = find_next_newline(source_text + source_length, error->original_text.data);
310
552
  GumboStringPiece original_line;
311
553
  original_line.data = line_start;
312
554
  original_line.length = line_end - line_start;
@@ -324,6 +566,19 @@ void gumbo_caret_diagnostic_to_string (
324
566
  gumbo_string_buffer_append_codepoint('\n', output);
325
567
  }
326
568
 
569
+ size_t gumbo_caret_diagnostic_to_string (
570
+ const GumboError* error,
571
+ const char* source_text,
572
+ size_t source_length,
573
+ char **output
574
+ ) {
575
+ GumboStringBuffer sb;
576
+ gumbo_string_buffer_init(&sb);
577
+ caret_diagnostic_to_string(error, source_text, source_length, &sb);
578
+ *output = sb.data;
579
+ return sb.length;
580
+ }
581
+
327
582
  void gumbo_print_caret_diagnostic (
328
583
  const GumboError* error,
329
584
  const char* source_text,
@@ -331,20 +586,21 @@ void gumbo_print_caret_diagnostic (
331
586
  ) {
332
587
  GumboStringBuffer text;
333
588
  gumbo_string_buffer_init(&text);
334
- gumbo_caret_diagnostic_to_string(error, source_text, source_length, &text);
589
+ print_message (
590
+ &text,
591
+ "%lu:%lu: ",
592
+ (unsigned long)error->position.line,
593
+ (unsigned long)error->position.column
594
+ );
595
+
596
+ caret_diagnostic_to_string(error, source_text, source_length, &text);
335
597
  printf("%.*s", (int) text.length, text.data);
336
598
  gumbo_string_buffer_destroy(&text);
337
599
  }
338
600
 
339
601
  void gumbo_error_destroy(GumboError* error) {
340
- if (
341
- error->type == GUMBO_ERR_PARSER
342
- || error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG
343
- || error->type == GUMBO_ERR_SELF_CLOSING_END_TAG
344
- ) {
602
+ if (error->type == GUMBO_ERR_PARSER) {
345
603
  gumbo_vector_destroy(&error->v.parser.tag_stack);
346
- } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
347
- gumbo_free((void*) error->v.duplicate_attr.name);
348
604
  }
349
605
  gumbo_free(error);
350
606
  }