nokogumbo 1.5.0 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +144 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2128 -1562
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +18 -170
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +40 -21
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,60 +1,29 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Internal header for character reference handling; this should not be exposed
18
- // transitively by any public API header. This is why the functions aren't
19
- // namespaced.
20
-
21
1
  #ifndef GUMBO_CHAR_REF_H_
22
2
  #define GUMBO_CHAR_REF_H_
23
3
 
24
- #include <stdbool.h>
4
+ #include <stdlib.h>
25
5
 
26
6
  #ifdef __cplusplus
27
7
  extern "C" {
28
8
  #endif
29
9
 
30
- struct GumboInternalParser;
31
- struct GumboInternalUtf8Iterator;
32
-
33
10
  // Value that indicates no character was produced.
34
- extern const int kGumboNoChar;
11
+ #define kGumboNoChar (-1)
35
12
 
36
- // Certain named character references generate two codepoints, not one, and so
37
- // the consume_char_ref subroutine needs to return this instead of an int. The
38
- // first field will be kGumboNoChar if no character reference was found; the
39
- // second field will be kGumboNoChar if that is the case or if the character
40
- // reference returns only a single codepoint.
41
- typedef struct {
42
- int first;
43
- int second;
44
- } OneOrTwoCodepoints;
45
-
46
- // Implements the "consume a character reference" section of the spec.
47
- // This reads in characters from the input as necessary, and fills in a
48
- // OneOrTwoCodepoints struct containing the characters read. It may add parse
49
- // errors to the GumboParser's errors vector, if the spec calls for it. Pass a
50
- // space for the "additional allowed char" when the spec says "with no
51
- // additional allowed char". Returns false on parse error, true otherwise.
52
- bool consume_char_ref(struct GumboInternalParser* parser,
53
- struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
54
- bool is_in_attribute, OneOrTwoCodepoints* output);
13
+ // On input, str points to the start of the string to match and size is the
14
+ // size of the string.
15
+ //
16
+ // Returns the length of the match or 0 if there is no match.
17
+ // output[0] contains the first codepoint and output[1] contains the second if
18
+ // there are two, otherwise output[1] contains kGumboNoChar.
19
+ size_t match_named_char_ref (
20
+ const char *str,
21
+ size_t size,
22
+ int output[2]
23
+ );
55
24
 
56
25
  #ifdef __cplusplus
57
26
  }
58
27
  #endif
59
28
 
60
- #endif // GUMBO_CHAR_REF_H_
29
+ #endif // GUMBO_CHAR_REF_H_
@@ -1,279 +1,626 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
1
+ /*
2
+ Copyright 2010 Google Inc.
16
3
 
17
- #include "error.h"
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ https://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
18
16
 
19
17
  #include <assert.h>
18
+ #include <inttypes.h>
20
19
  #include <stdarg.h>
21
20
  #include <stdio.h>
22
21
  #include <string.h>
23
-
22
+ #include "ascii.h"
23
+ #include "error.h"
24
24
  #include "gumbo.h"
25
+ #include "macros.h"
25
26
  #include "parser.h"
26
27
  #include "string_buffer.h"
27
28
  #include "util.h"
28
29
  #include "vector.h"
29
30
 
30
- // Prints a formatted message to a StringBuffer. This automatically resizes the
31
- // StringBuffer as necessary to fit the message. Returns the number of bytes
31
+ // Prints a formatted message to a StringBuffer. This automatically resizes the
32
+ // StringBuffer as necessary to fit the message. Returns the number of bytes
32
33
  // written.
33
- static int print_message(
34
- GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
34
+ static int PRINTF(2) print_message (
35
+ GumboStringBuffer* output,
36
+ const char* format,
37
+ ...
38
+ ) {
35
39
  va_list args;
36
40
  int remaining_capacity = output->capacity - output->length;
37
41
  va_start(args, format);
38
- int bytes_written = vsnprintf(
39
- output->data + output->length, remaining_capacity, format, args);
42
+ int bytes_written = vsnprintf (
43
+ output->data + output->length,
44
+ remaining_capacity,
45
+ format,
46
+ args
47
+ );
40
48
  va_end(args);
41
- #ifdef _MSC_VER
49
+ #if _MSC_VER && _MSC_VER < 1900
42
50
  if (bytes_written == -1) {
43
- // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
44
- // returning the number of bytes that would've been written had there been
45
- // enough. In this case, we'll double the buffer size and hope it fits when
46
- // we retry (letting it fail and returning 0 if it doesn't), since there's
47
- // no way to smartly resize the buffer.
48
- gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
51
+ // vsnprintf returns -1 on older MSVC++ if there's not enough capacity,
52
+ // instead of returning the number of bytes that would've been written had
53
+ // there been enough. In this case, we'll double the buffer size and hope
54
+ // it fits when we retry (letting it fail and returning 0 if it doesn't),
55
+ // since there's no way to smartly resize the buffer.
56
+ gumbo_string_buffer_reserve(output->capacity * 2, output);
49
57
  va_start(args, format);
50
- int result = vsnprintf(
51
- output->data + output->length, remaining_capacity, format, args);
58
+ int result = vsnprintf (
59
+ output->data + output->length,
60
+ remaining_capacity,
61
+ format,
62
+ args
63
+ );
52
64
  va_end(args);
53
65
  return result == -1 ? 0 : result;
54
66
  }
55
67
  #else
56
- // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
68
+ // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
69
  if (bytes_written == -1) {
58
70
  return 0;
59
71
  }
60
72
  #endif
61
73
 
62
- if (bytes_written > remaining_capacity) {
63
- gumbo_string_buffer_reserve(
64
- parser, output->capacity + bytes_written, output);
74
+ if (bytes_written >= remaining_capacity) {
75
+ gumbo_string_buffer_reserve(output->capacity + bytes_written, output);
65
76
  remaining_capacity = output->capacity - output->length;
66
77
  va_start(args, format);
67
- bytes_written = vsnprintf(
68
- output->data + output->length, remaining_capacity, format, args);
78
+ bytes_written = vsnprintf (
79
+ output->data + output->length,
80
+ remaining_capacity,
81
+ format,
82
+ args
83
+ );
69
84
  va_end(args);
70
85
  }
71
86
  output->length += bytes_written;
72
87
  return bytes_written;
73
88
  }
74
89
 
75
- static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
76
- GumboStringBuffer* output) {
77
- print_message(parser, output, " Currently open tags: ");
90
+ static void print_tag_stack (
91
+ const GumboParserError* error,
92
+ GumboStringBuffer* output
93
+ ) {
94
+ print_message(output, " Currently open tags: ");
78
95
  for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
79
96
  if (i) {
80
- print_message(parser, output, ", ");
97
+ print_message(output, ", ");
81
98
  }
82
99
  GumboTag tag = (GumboTag) error->tag_stack.data[i];
83
- print_message(parser, output, gumbo_normalized_tagname(tag));
100
+ print_message(output, "%s", gumbo_normalized_tagname(tag));
84
101
  }
85
- gumbo_string_buffer_append_codepoint(parser, '.', output);
102
+ gumbo_string_buffer_append_codepoint('.', output);
86
103
  }
87
104
 
88
- static void handle_parser_error(GumboParser* parser,
89
- const GumboParserError* error, GumboStringBuffer* output) {
90
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
91
- error->input_type != GUMBO_TOKEN_DOCTYPE) {
92
- print_message(
93
- parser, output, "The doctype must be the first token in the document");
105
+ static void handle_tokenizer_error (
106
+ const GumboError* error,
107
+ GumboStringBuffer* output
108
+ ) {
109
+ switch (error->type) {
110
+ case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
111
+ print_message(output, "Empty comment abruptly closed by '%s', use '-->'.",
112
+ error->v.tokenizer.state == GUMBO_LEX_COMMENT_START? ">" : "->");
113
+ break;
114
+ case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
115
+ print_message (
116
+ output,
117
+ "DOCTYPE public identifier missing closing %s.",
118
+ error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED?
119
+ "quotation mark (\")" : "apostrophe (')"
120
+ );
121
+ break;
122
+ case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
123
+ print_message (
124
+ output,
125
+ "DOCTYPE system identifier missing closing %s.",
126
+ error->v.tokenizer.state == GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED?
127
+ "quotation mark (\")" : "apostrophe (')"
128
+ );
129
+ break;
130
+ case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
131
+ print_message (
132
+ output,
133
+ "Numeric character reference '%.*s' does not contain any %sdigits.",
134
+ (int)error->original_text.length, error->original_text.data,
135
+ error->v.tokenizer.state == GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START? "hexadecimal " : ""
136
+ );
137
+ break;
138
+ case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
139
+ print_message(output, "CDATA section outside foreign (SVG or MathML) content.");
140
+ break;
141
+ case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
142
+ print_message (
143
+ output,
144
+ "Numeric character reference '%.*s' references a code point that is outside the valid Unicode range.",
145
+ (int)error->original_text.length, error->original_text.data
146
+ );
147
+ break;
148
+ case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
149
+ print_message (
150
+ output,
151
+ "Input contains prohibited control code point U+%04X.",
152
+ error->v.tokenizer.codepoint
153
+ );
154
+ break;
155
+ case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
156
+ print_message (
157
+ output,
158
+ "Numeric character reference '%.*s' references prohibited control code point U+%04X.",
159
+ (int)error->original_text.length, error->original_text.data,
160
+ error->v.tokenizer.codepoint
161
+ );
162
+ break;
163
+ case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
164
+ print_message(output, "End tag contains attributes.");
165
+ break;
166
+ case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
167
+ print_message(output, "Tag contains multiple attributes with the same name.");
168
+ break;
169
+ case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
170
+ print_message(output, "End tag ends with '/>', use '>'.");
171
+ break;
172
+ case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
173
+ print_message(output, "End of input where a tag name is expected.");
174
+ break;
175
+ case GUMBO_ERR_EOF_IN_CDATA:
176
+ print_message(output, "End of input in CDATA section.");
177
+ break;
178
+ case GUMBO_ERR_EOF_IN_COMMENT:
179
+ print_message(output, "End of input in comment.");
180
+ break;
181
+ case GUMBO_ERR_EOF_IN_DOCTYPE:
182
+ print_message(output, "End of input in DOCTYPE.");
183
+ break;
184
+ case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
185
+ print_message(output, "End of input in text that resembles an HTML comment inside script element content.");
186
+ break;
187
+ case GUMBO_ERR_EOF_IN_TAG:
188
+ print_message(output, "End of input in tag.");
189
+ break;
190
+ case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
191
+ print_message(output, "Comment closed incorrectly by '--!>', use '-->'.");
192
+ break;
193
+ case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
194
+ print_message(output, "Comment, DOCTYPE, or CDATA opened incorrectly, use '<!--', '<!DOCTYPE', or '<![CDATA['.");
195
+ break;
196
+ case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
197
+ print_message(output, "Invalid character sequence after DOCTYPE name, expected 'PUBLIC', 'SYSTEM', or '>'.");
198
+ break;
199
+ case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
200
+ if (gumbo_ascii_isascii(error->v.tokenizer.codepoint)
201
+ && !gumbo_ascii_iscntrl(error->v.tokenizer.codepoint))
202
+ print_message(output, "Invalid first character of tag name '%c'.", error->v.tokenizer.codepoint);
203
+ else
204
+ print_message(output, "Invalid first code point of tag name U+%04X.", error->v.tokenizer.codepoint);
205
+ break;
206
+ case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
207
+ print_message(output, "Missing attribute value.");
208
+ break;
209
+ case GUMBO_ERR_MISSING_DOCTYPE_NAME:
210
+ print_message(output, "Missing DOCTYPE name.");
211
+ break;
212
+ case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
213
+ print_message(output, "Missing DOCTYPE public identifier.");
214
+ break;
215
+ case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
216
+ print_message(output, "Missing DOCTYPE system identifier.");
217
+ break;
218
+ case GUMBO_ERR_MISSING_END_TAG_NAME:
219
+ print_message(output, "Missing end tag name.");
220
+ break;
221
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
222
+ print_message(output, "Missing quote before DOCTYPE public identifier.");
223
+ break;
224
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
225
+ print_message(output, "Missing quote before DOCTYPE system identifier.");
226
+ break;
227
+ case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
228
+ print_message(output, "Missing semicolon after character reference '%.*s'.",
229
+ (int)error->original_text.length, error->original_text.data);
230
+ break;
231
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
232
+ print_message(output, "Missing whitespace after 'PUBLIC' keyword.");
233
+ break;
234
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
235
+ print_message(output, "Missing whitespace after 'SYSTEM' keyword.");
236
+ break;
237
+ case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
238
+ print_message(output, "Missing whitespace between 'DOCTYPE' keyword and DOCTYPE name.");
239
+ break;
240
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
241
+ print_message(output, "Missing whitespace between attributes.");
242
+ break;
243
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
244
+ print_message(output, "Missing whitespace between DOCTYPE public and system identifiers.");
245
+ break;
246
+ case GUMBO_ERR_NESTED_COMMENT:
247
+ print_message(output, "Nested comment.");
248
+ break;
249
+ case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
250
+ print_message (
251
+ output,
252
+ "Numeric character reference '%.*s' references noncharacter U+%04X.",
253
+ (int)error->original_text.length, error->original_text.data,
254
+ error->v.tokenizer.codepoint
255
+ );
256
+ break;
257
+ case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
258
+ print_message(output, "Input contains noncharacter U+%04X.", error->v.tokenizer.codepoint);
259
+ break;
260
+ case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
261
+ print_message(output, "Start tag of nonvoid HTML element ends with '/>', use '>'.");
262
+ break;
263
+ case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
264
+ print_message(output, "Numeric character reference '%.*s' references U+0000.",
265
+ (int)error->original_text.length, error->original_text.data);
266
+ break;
267
+ case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
268
+ print_message (
269
+ output,
270
+ "Numeric character reference '%.*s' references surrogate U+%4X.",
271
+ (int)error->original_text.length, error->original_text.data,
272
+ error->v.tokenizer.codepoint
273
+ );
274
+ break;
275
+ case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
276
+ print_message(output, "Input contains surrogate U+%04X.", error->v.tokenizer.codepoint);
277
+ break;
278
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
279
+ print_message(output, "Unexpected character after DOCTYPE system identifier.");
280
+ break;
281
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
282
+ print_message(output, "Unexpected character (%c) in attribute name.", error->v.tokenizer.codepoint);
283
+ break;
284
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
285
+ print_message(output, "Unexpected character (%c) in unquoted attribute value.", error->v.tokenizer.codepoint);
286
+ break;
287
+ case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
288
+ print_message(output, "Unexpected '=' before an attribute name.");
289
+ break;
290
+ case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
291
+ print_message(output, "Input contains unexpected U+0000.");
292
+ break;
293
+ case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
294
+ print_message(output, "Unexpected '?' where start tag name is expected.");
295
+ break;
296
+ case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
297
+ print_message(output, "Unexpected '/' in tag.");
298
+ break;
299
+ case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
300
+ print_message(output, "Unknown named character reference '%.*s'.",
301
+ (int)error->original_text.length, error->original_text.data);
302
+ break;
303
+ case GUMBO_ERR_UTF8_INVALID:
304
+ print_message(output, "Invalid UTF8 encoding.");
305
+ break;
306
+ case GUMBO_ERR_UTF8_TRUNCATED:
307
+ print_message(output, "UTF8 character truncated.");
308
+ break;
309
+ case GUMBO_ERR_PARSER:
310
+ assert(0 && "Unreachable.");
311
+ }
312
+ }
313
+ static void handle_parser_error (
314
+ const GumboParserError* error,
315
+ GumboStringBuffer* output
316
+ ) {
317
+ if (
318
+ error->parser_state == GUMBO_INSERTION_MODE_INITIAL
319
+ && error->input_type != GUMBO_TOKEN_DOCTYPE
320
+ ) {
321
+ print_message (
322
+ output,
323
+ "Expected a doctype token"
324
+ );
94
325
  return;
95
326
  }
96
327
 
97
328
  switch (error->input_type) {
98
329
  case GUMBO_TOKEN_DOCTYPE:
99
- print_message(parser, output, "This is not a legal doctype");
330
+ print_message(output, "This is not a legal doctype");
100
331
  return;
101
332
  case GUMBO_TOKEN_COMMENT:
102
333
  // Should never happen; comments are always legal.
103
334
  assert(0);
104
335
  // But just in case...
105
- print_message(parser, output, "Comments aren't legal here");
336
+ print_message(output, "Comments aren't legal here");
106
337
  return;
107
338
  case GUMBO_TOKEN_CDATA:
108
339
  case GUMBO_TOKEN_WHITESPACE:
109
340
  case GUMBO_TOKEN_CHARACTER:
110
- print_message(parser, output, "Character tokens aren't legal here");
341
+ print_message(output, "Character tokens aren't legal here");
111
342
  return;
112
343
  case GUMBO_TOKEN_NULL:
113
- print_message(parser, output, "Null bytes are not allowed in HTML5");
344
+ print_message(output, "Null bytes are not allowed in HTML5");
114
345
  return;
115
346
  case GUMBO_TOKEN_EOF:
116
347
  if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
117
- print_message(parser, output, "You must provide a doctype");
348
+ print_message(output, "You must provide a doctype");
118
349
  } else {
119
- print_message(parser, output, "Premature end of file");
120
- print_tag_stack(parser, error, output);
350
+ print_message(output, "Premature end of file");
351
+ print_tag_stack(error, output);
121
352
  }
122
353
  return;
123
354
  case GUMBO_TOKEN_START_TAG:
124
355
  case GUMBO_TOKEN_END_TAG:
125
- print_message(parser, output, "That tag isn't allowed here");
126
- print_tag_stack(parser, error, output);
356
+ print_message(output, "That tag isn't allowed here");
357
+ print_tag_stack(error, output);
127
358
  // TODO(jdtang): Give more specific messaging.
128
359
  return;
129
360
  }
130
361
  }
131
362
 
132
363
  // Finds the preceding newline in an original source buffer from a given byte
133
- // location. Returns a character pointer to the character after that, or a
364
+ // location. Returns a character pointer to the character after that, or a
134
365
  // pointer to the beginning of the string if this is the first line.
135
- static const char* find_last_newline(
136
- const char* original_text, const char* error_location) {
137
- assert(error_location >= original_text);
366
+ static const char* find_prev_newline (
367
+ const char* source_text,
368
+ size_t source_length,
369
+ const char* error_location
370
+ ) {
371
+ const char* source_end = source_text + source_length;
372
+ assert(error_location >= source_text);
373
+ assert(error_location <= source_end);
138
374
  const char* c = error_location;
139
- for (; c != original_text && *c != '\n'; --c) {
140
- // There may be an error at EOF, which would be a nul byte.
141
- assert(*c || c == error_location);
142
- }
143
- return c == original_text ? c : c + 1;
375
+ if (c != source_text && (error_location == source_end || *c == '\n'))
376
+ --c;
377
+ while (c != source_text && *c != '\n')
378
+ --c;
379
+ return c == source_text ? c : c + 1;
144
380
  }
145
381
 
146
382
  // Finds the next newline in the original source buffer from a given byte
147
- // location. Returns a character pointer to that newline, or a pointer to the
148
- // terminating null byte if this is the last line.
383
+ // location. Returns a character pointer to that newline, or a pointer to
384
+ // source_text + source_length if this is the last line.
149
385
  static const char* find_next_newline(
150
- const char* original_text, const char* error_location) {
386
+ const char* source_text,
387
+ size_t source_length,
388
+ const char* error_location
389
+ ) {
390
+ const char* source_end = source_text + source_length;
391
+ assert(error_location >= source_text);
392
+ assert(error_location <= source_end);
151
393
  const char* c = error_location;
152
- for (; *c && *c != '\n'; ++c)
153
- ;
394
+ while (c != source_end && *c != '\n')
395
+ ++c;
154
396
  return c;
155
397
  }
156
398
 
157
399
  GumboError* gumbo_add_error(GumboParser* parser) {
400
+ parser->_output->document_error = true;
401
+
158
402
  int max_errors = parser->_options->max_errors;
159
403
  if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
160
404
  return NULL;
161
405
  }
162
- GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
163
- gumbo_vector_add(parser, error, &parser->_output->errors);
406
+ GumboError* error = gumbo_alloc(sizeof(GumboError));
407
+ gumbo_vector_add(error, &parser->_output->errors);
164
408
  return error;
165
409
  }
166
410
 
167
- void gumbo_error_to_string(
168
- GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
169
- print_message(
170
- parser, output, "@%d:%d: ", error->position.line, error->position.column);
411
+ GumboSourcePosition gumbo_error_position(const GumboError* error) {
412
+ return error->position;
413
+ }
414
+
415
+ const char* gumbo_error_code(const GumboError* error) {
171
416
  switch (error->type) {
172
- case GUMBO_ERR_UTF8_INVALID:
173
- print_message(
174
- parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
175
- break;
176
- case GUMBO_ERR_UTF8_TRUNCATED:
177
- print_message(parser, output,
178
- "Input stream ends with a truncated UTF8 character 0x%x",
179
- error->v.codepoint);
180
- break;
181
- case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
182
- print_message(
183
- parser, output, "No digits after &# in numeric character reference");
184
- break;
185
- case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
186
- print_message(parser, output,
187
- "The numeric character reference &#%d should be followed "
188
- "by a semicolon",
189
- error->v.codepoint);
190
- break;
191
- case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
192
- print_message(parser, output,
193
- "The numeric character reference &#%d; encodes an invalid "
194
- "unicode codepoint",
195
- error->v.codepoint);
196
- break;
197
- case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
198
- // The textual data came from one of the literal strings in the table, and
199
- // so it'll be null-terminated.
200
- print_message(parser, output,
201
- "The named character reference &%.*s should be followed by a "
202
- "semicolon",
203
- (int) error->v.text.length, error->v.text.data);
204
- break;
205
- case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
206
- print_message(parser, output,
207
- "The named character reference &%.*s; is not a valid entity name",
208
- (int) error->v.text.length, error->v.text.data);
209
- break;
210
- case GUMBO_ERR_DUPLICATE_ATTR:
211
- print_message(parser, output,
212
- "Attribute %s occurs multiple times, at positions %d and %d",
213
- error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
214
- error->v.duplicate_attr.new_index);
215
- break;
216
- case GUMBO_ERR_PARSER:
217
- case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
218
- handle_parser_error(parser, &error->v.parser, output);
219
- break;
220
- default:
221
- print_message(parser, output,
222
- "Tokenizer error with an unimplemented error message");
223
- break;
417
+ // Defined tokenizer errors.
418
+ case GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT:
419
+ return "abrupt-closing-of-empty-comment";
420
+ case GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER:
421
+ return "abrupt-doctype-public-identifier";
422
+ case GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER:
423
+ return "abrupt-doctype-system-identifier";
424
+ case GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE:
425
+ return "absence-of-digits-in-numeric-character-reference";
426
+ case GUMBO_ERR_CDATA_IN_HTML_CONTENT:
427
+ return "cdata-in-html-content";
428
+ case GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE:
429
+ return "character-reference-outside-unicode-range";
430
+ case GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM:
431
+ return "control-character-in-input-stream";
432
+ case GUMBO_ERR_CONTROL_CHARACTER_REFERENCE:
433
+ return "control-character-reference";
434
+ case GUMBO_ERR_END_TAG_WITH_ATTRIBUTES:
435
+ return "end-tag-with-attributes";
436
+ case GUMBO_ERR_DUPLICATE_ATTRIBUTE:
437
+ return "duplicate-attribute";
438
+ case GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS:
439
+ return "end-tag-with-trailing-solidus";
440
+ case GUMBO_ERR_EOF_BEFORE_TAG_NAME:
441
+ return "eof-before-tag-name";
442
+ case GUMBO_ERR_EOF_IN_CDATA:
443
+ return "eof-in-cdata";
444
+ case GUMBO_ERR_EOF_IN_COMMENT:
445
+ return "eof-in-comment";
446
+ case GUMBO_ERR_EOF_IN_DOCTYPE:
447
+ return "eof-in-doctype";
448
+ case GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT:
449
+ return "eof-in-script-html-comment-like-text";
450
+ case GUMBO_ERR_EOF_IN_TAG:
451
+ return "eof-in-tag";
452
+ case GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT:
453
+ return "incorrectly-closed-comment";
454
+ case GUMBO_ERR_INCORRECTLY_OPENED_COMMENT:
455
+ return "incorrectly-opened-comment";
456
+ case GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME:
457
+ return "invalid-character-sequence-after-doctype-name";
458
+ case GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME:
459
+ return "invalid-first-character-of-tag-name";
460
+ case GUMBO_ERR_MISSING_ATTRIBUTE_VALUE:
461
+ return "missing-attribute-value";
462
+ case GUMBO_ERR_MISSING_DOCTYPE_NAME:
463
+ return "missing-doctype-name";
464
+ case GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER:
465
+ return "missing-doctype-public-identifier";
466
+ case GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER:
467
+ return "missing-doctype-system-identifier";
468
+ case GUMBO_ERR_MISSING_END_TAG_NAME:
469
+ return "missing-end-tag-name";
470
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
471
+ return "missing-quote-before-doctype-public-identifier";
472
+ case GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
473
+ return "missing-quote-before-doctype-system-identifier";
474
+ case GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE:
475
+ return "missing-semicolon-after-character-reference";
476
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD:
477
+ return "missing-whitespace-after-doctype-public-keyword";
478
+ case GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD:
479
+ return "missing-whitespace-after-doctype-system-keyword";
480
+ case GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME:
481
+ return "missing-whitespace-before-doctype-name";
482
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES:
483
+ return "missing-whitespace-between-attributes";
484
+ case GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
485
+ return "missing-whitespace-between-doctype-public-and-system-identifiers";
486
+ case GUMBO_ERR_NESTED_COMMENT:
487
+ return "nested-comment";
488
+ case GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE:
489
+ return "noncharacter-character-reference";
490
+ case GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM:
491
+ return "noncharacter-in-input-stream";
492
+ case GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS:
493
+ return "non-void-html-element-start-tag-with-trailing-solidus";
494
+ case GUMBO_ERR_NULL_CHARACTER_REFERENCE:
495
+ return "null-character-reference";
496
+ case GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE:
497
+ return "surrogate-character-reference";
498
+ case GUMBO_ERR_SURROGATE_IN_INPUT_STREAM:
499
+ return "surrogate-in-input-stream";
500
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
501
+ return "unexpected-character-after-doctype-system-identifier";
502
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME:
503
+ return "unexpected-character-in-attribute-name";
504
+ case GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE:
505
+ return "unexpected-character-in-unquoted-attribute-value";
506
+ case GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME:
507
+ return "unexpected-equals-sign-before-attribute-name";
508
+ case GUMBO_ERR_UNEXPECTED_NULL_CHARACTER:
509
+ return "unexpected-null-character";
510
+ case GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME:
511
+ return "unexpected-question-mark-instead-of-tag-name";
512
+ case GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG:
513
+ return "unexpected-solidus-in-tag";
514
+ case GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE:
515
+ return "unknown-named-character-reference";
516
+
517
+ // Encoding errors.
518
+ case GUMBO_ERR_UTF8_INVALID:
519
+ return "utf8-invalid";
520
+ case GUMBO_ERR_UTF8_TRUNCATED:
521
+ return "utf8-truncated";
522
+
523
+ // Generic parser error.
524
+ case GUMBO_ERR_PARSER:
525
+ return "generic-parser";
224
526
  }
225
- gumbo_string_buffer_append_codepoint(parser, '.', output);
527
+ // Silence warning about control reaching end of non-void function.
528
+ // All errors _should_ be handled in the switch statement.
529
+ return "generic-parser";
226
530
  }
227
531
 
228
- void gumbo_caret_diagnostic_to_string(GumboParser* parser,
229
- const GumboError* error, const char* source_text,
230
- GumboStringBuffer* output) {
231
- gumbo_error_to_string(parser, error, output);
532
+ static void error_to_string (
533
+ const GumboError* error,
534
+ GumboStringBuffer* output
535
+ ) {
536
+ if (error->type < GUMBO_ERR_PARSER)
537
+ handle_tokenizer_error(error, output);
538
+ else
539
+ handle_parser_error(&error->v.parser, output);
540
+ }
541
+
542
+ size_t gumbo_error_to_string(const GumboError* error, char** output) {
543
+ GumboStringBuffer sb;
544
+ gumbo_string_buffer_init(&sb);
545
+ error_to_string(error, &sb);
546
+ *output = sb.data;
547
+ return sb.length;
548
+ }
232
549
 
233
- const char* line_start = find_last_newline(source_text, error->original_text);
234
- const char* line_end = find_next_newline(source_text, error->original_text);
550
+ void caret_diagnostic_to_string (
551
+ const GumboError* error,
552
+ const char* source_text,
553
+ size_t source_length,
554
+ GumboStringBuffer* output
555
+ ) {
556
+ error_to_string(error, output);
557
+
558
+ const char* error_text = error->original_text.data;
559
+ const char* line_start = find_prev_newline(source_text, source_length, error_text);
560
+ const char* line_end = find_next_newline(source_text, source_length, error_text);
235
561
  GumboStringPiece original_line;
236
562
  original_line.data = line_start;
237
563
  original_line.length = line_end - line_start;
238
564
 
239
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
240
- gumbo_string_buffer_append_string(parser, &original_line, output);
241
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
242
- gumbo_string_buffer_reserve(
243
- parser, output->length + error->position.column, output);
244
- int num_spaces = error->position.column - 1;
245
- memset(output->data + output->length, ' ', num_spaces);
246
- output->length += num_spaces;
247
- gumbo_string_buffer_append_codepoint(parser, '^', output);
248
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
565
+ gumbo_string_buffer_append_codepoint('\n', output);
566
+ gumbo_string_buffer_append_string(&original_line, output);
567
+ gumbo_string_buffer_append_codepoint('\n', output);
568
+ gumbo_string_buffer_reserve(output->length + error->position.column, output);
569
+ if (error->position.column >= 2) {
570
+ size_t num_spaces = error->position.column - 1;
571
+ memset(output->data + output->length, ' ', num_spaces);
572
+ output->length += num_spaces;
573
+ }
574
+ gumbo_string_buffer_append_codepoint('^', output);
575
+ gumbo_string_buffer_append_codepoint('\n', output);
249
576
  }
250
577
 
251
- void gumbo_print_caret_diagnostic(
252
- GumboParser* parser, const GumboError* error, const char* source_text) {
578
+ size_t gumbo_caret_diagnostic_to_string (
579
+ const GumboError* error,
580
+ const char* source_text,
581
+ size_t source_length,
582
+ char **output
583
+ ) {
584
+ GumboStringBuffer sb;
585
+ gumbo_string_buffer_init(&sb);
586
+ caret_diagnostic_to_string(error, source_text, source_length, &sb);
587
+ *output = sb.data;
588
+ return sb.length;
589
+ }
590
+
591
+ void gumbo_print_caret_diagnostic (
592
+ const GumboError* error,
593
+ const char* source_text,
594
+ size_t source_length
595
+ ) {
253
596
  GumboStringBuffer text;
254
- gumbo_string_buffer_init(parser, &text);
255
- gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
597
+ gumbo_string_buffer_init(&text);
598
+ print_message (
599
+ &text,
600
+ "%lu:%lu: ",
601
+ (unsigned long)error->position.line,
602
+ (unsigned long)error->position.column
603
+ );
604
+
605
+ caret_diagnostic_to_string(error, source_text, source_length, &text);
256
606
  printf("%.*s", (int) text.length, text.data);
257
- gumbo_string_buffer_destroy(parser, &text);
607
+ gumbo_string_buffer_destroy(&text);
258
608
  }
259
609
 
260
- void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
261
- if (error->type == GUMBO_ERR_PARSER ||
262
- error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
263
- gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
264
- } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
265
- gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
610
+ void gumbo_error_destroy(GumboError* error) {
611
+ if (error->type == GUMBO_ERR_PARSER) {
612
+ gumbo_vector_destroy(&error->v.parser.tag_stack);
266
613
  }
267
- gumbo_parser_deallocate(parser, error);
614
+ gumbo_free(error);
268
615
  }
269
616
 
270
617
  void gumbo_init_errors(GumboParser* parser) {
271
- gumbo_vector_init(parser, 5, &parser->_output->errors);
618
+ gumbo_vector_init(5, &parser->_output->errors);
272
619
  }
273
620
 
274
621
  void gumbo_destroy_errors(GumboParser* parser) {
275
622
  for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
276
- gumbo_error_destroy(parser, parser->_output->errors.data[i]);
623
+ gumbo_error_destroy(parser->_output->errors.data[i]);
277
624
  }
278
- gumbo_vector_destroy(parser, &parser->_output->errors);
625
+ gumbo_vector_destroy(&parser->_output->errors);
279
626
  }