ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,61 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Internal header for character reference handling; this should not be exposed
18
+ // transitively by any public API header. This is why the functions aren't
19
+ // namespaced.
20
+
21
+ #ifndef GUMBO_CHAR_REF_H_
22
+ #define GUMBO_CHAR_REF_H_
23
+
24
+ #include <stdbool.h>
25
+
26
+ #ifdef __cplusplus
27
+ extern "C" {
28
+ #endif
29
+
30
+ struct GumboInternalParser;
31
+ struct GumboInternalUtf8Iterator;
32
+
33
+ // Value that indicates no character was produced.
34
+ extern const int kGumboNoChar;
35
+
36
+ // Certain named character references generate two codepoints, not one, and so
37
+ // the consume_char_ref subroutine needs to return this instead of an int. The
38
+ // first field will be kGumboNoChar if no character reference was found; the
39
+ // second field will be kGumboNoChar if that is the case or if the character
40
+ // reference returns only a single codepoint.
41
+ typedef struct {
42
+ int first;
43
+ int second;
44
+ } OneOrTwoCodepoints;
45
+
46
+ // Implements the "consume a character reference" section of the spec.
47
+ // This reads in characters from the input as necessary, and fills in a
48
+ // OneOrTwoCodepoints struct containing the characters read. It may add parse
49
+ // errors to the GumboParser's errors vector, if the spec calls for it. Pass a
50
+ // space for the "additional allowed char" when the spec says "with no
51
+ // additional allowed char". Returns false on parse error, true otherwise.
52
+ bool consume_char_ref(
53
+ struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
54
+ int additional_allowed_char, bool is_in_attribute,
55
+ OneOrTwoCodepoints* output);
56
+
57
+ #ifdef __cplusplus
58
+ }
59
+ #endif
60
+
61
+ #endif // GUMBO_CHAR_REF_H_
@@ -0,0 +1,258 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "error.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdarg.h>
21
+ #include <stdio.h>
22
+ #include <string.h>
23
+
24
+ #include "gumbo.h"
25
+ #include "parser.h"
26
+ #include "string_buffer.h"
27
+ #include "util.h"
28
+ #include "vector.h"
29
+
30
+ static const size_t kMessageBufferSize = 256;
31
+
32
+ // Prints a formatted message to a StringBuffer. This automatically resizes the
33
+ // StringBuffer as necessary to fit the message. Returns the number of bytes
34
+ // written.
35
+ static int print_message(GumboParser* parser, GumboStringBuffer* output,
36
+ const char* format, ...) {
37
+ va_list args;
38
+ va_start(args, format);
39
+ int remaining_capacity = output->capacity - output->length;
40
+ int bytes_written = vsnprintf(output->data + output->length,
41
+ remaining_capacity, format, args);
42
+ if (bytes_written > remaining_capacity) {
43
+ gumbo_string_buffer_reserve(
44
+ parser, output->capacity + bytes_written, output);
45
+ remaining_capacity = output->capacity - output->length;
46
+ bytes_written = vsnprintf(output->data + output->length,
47
+ remaining_capacity, format, args);
48
+ }
49
+ output->length += bytes_written;
50
+ va_end(args);
51
+ return bytes_written;
52
+ }
53
+
54
+ static void print_tag_stack(
55
+ GumboParser* parser, const GumboParserError* error,
56
+ GumboStringBuffer* output) {
57
+ print_message(parser, output, " Currently open tags: ");
58
+ for (int i = 0; i < error->tag_stack.length; ++i) {
59
+ if (i) {
60
+ print_message(parser, output, ", ");
61
+ }
62
+ GumboTag tag = (GumboTag) error->tag_stack.data[i];
63
+ print_message(parser, output, gumbo_normalized_tagname(tag));
64
+ }
65
+ gumbo_string_buffer_append_codepoint(parser, '.', output);
66
+ }
67
+
68
+ static void handle_parser_error(GumboParser* parser,
69
+ const GumboParserError* error,
70
+ GumboStringBuffer* output) {
71
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
72
+ error->input_type != GUMBO_TOKEN_DOCTYPE) {
73
+ print_message(parser, output,
74
+ "The doctype must be the first token in the document");
75
+ return;
76
+ }
77
+
78
+ switch (error->input_type) {
79
+ case GUMBO_TOKEN_DOCTYPE:
80
+ print_message(parser, output, "This is not a legal doctype");
81
+ return;
82
+ case GUMBO_TOKEN_COMMENT:
83
+ // Should never happen; comments are always legal.
84
+ assert(0);
85
+ // But just in case...
86
+ print_message(parser, output, "Comments aren't legal here");
87
+ return;
88
+ case GUMBO_TOKEN_WHITESPACE:
89
+ case GUMBO_TOKEN_CHARACTER:
90
+ print_message(parser, output, "Character tokens aren't legal here");
91
+ return;
92
+ case GUMBO_TOKEN_NULL:
93
+ print_message(parser, output, "Null bytes are not allowed in HTML5");
94
+ return;
95
+ case GUMBO_TOKEN_EOF:
96
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
97
+ print_message(parser, output, "You must provide a doctype");
98
+ } else {
99
+ print_message(parser, output, "Premature end of file");
100
+ print_tag_stack(parser, error, output);
101
+ }
102
+ return;
103
+ case GUMBO_TOKEN_START_TAG:
104
+ case GUMBO_TOKEN_END_TAG:
105
+ print_message(parser, output, "That tag isn't allowed here");
106
+ print_tag_stack(parser, error, output);
107
+ // TODO(jdtang): Give more specific messaging.
108
+ return;
109
+ }
110
+ }
111
+
112
// Finds the start of the line containing a given byte location in the original
// source buffer. Returns a pointer to the character just after the newline
// that precedes error_location, or a pointer to the beginning of the string if
// error_location is on the first line.
//
// The search starts at the byte *before* error_location, so:
//   * when error_location itself points at a '\n', the result is the start of
//     the line that newline terminates (never a pointer past error_location);
//   * when the preceding newline is the very first byte of the buffer, the
//     result is the byte after it rather than the newline itself.
static const char* find_last_newline(
    const char* original_text, const char* error_location) {
  assert(error_location >= original_text);
  const char* c = error_location;
  while (c != original_text && c[-1] != '\n') {
    --c;
    // Only the error location itself may be a nul byte (an error at EOF);
    // the text walked over before it is expected to be nul-free.
    assert(*c || c == original_text);
  }
  return c;
}
125
+
126
// Scans forward from error_location for the end of the current line. Returns
// a pointer to the first '\n' at or after error_location, or to the
// terminating nul byte when no newline follows. original_text is accepted for
// symmetry with find_last_newline but is not consulted.
static const char* find_next_newline(
    const char* original_text, const char* error_location) {
  const char* cursor = error_location;
  while (*cursor != '\0' && *cursor != '\n') {
    ++cursor;
  }
  return cursor;
}
135
+
136
+ GumboError* gumbo_add_error(GumboParser* parser) {
137
+ int max_errors = parser->_options->max_errors;
138
+ if (max_errors >= 0 && parser->_output->errors.length >= max_errors) {
139
+ return NULL;
140
+ }
141
+ GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
142
+ gumbo_vector_add(parser, error, &parser->_output->errors);
143
+ return error;
144
+ }
145
+
146
+ void gumbo_error_to_string(
147
+ GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
148
+ print_message(parser, output, "@%d:%d: ",
149
+ error->position.line, error->position.column);
150
+ switch (error->type) {
151
+ case GUMBO_ERR_UTF8_INVALID:
152
+ print_message(parser, output, "Invalid UTF8 character 0x%x",
153
+ error->v.codepoint);
154
+ break;
155
+ case GUMBO_ERR_UTF8_TRUNCATED:
156
+ print_message(parser, output,
157
+ "Input stream ends with a truncated UTF8 character 0x%x",
158
+ error->v.codepoint);
159
+ break;
160
+ case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
161
+ print_message(parser, output,
162
+ "No digits after &# in numeric character reference");
163
+ break;
164
+ case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
165
+ print_message(parser, output,
166
+ "The numeric character reference &#%d should be followed "
167
+ "by a semicolon", error->v.codepoint);
168
+ break;
169
+ case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
170
+ print_message(parser, output,
171
+ "The numeric character reference &#%d; encodes an invalid "
172
+ "unicode codepoint", error->v.codepoint);
173
+ break;
174
+ case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
175
+ // The textual data came from one of the literal strings in the table, and
176
+ // so it'll be null-terminated.
177
+ print_message(parser, output,
178
+ "The named character reference &%.*s should be followed by a "
179
+ "semicolon", (int) error->v.text.length, error->v.text.data);
180
+ break;
181
+ case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
182
+ print_message(parser, output,
183
+ "The named character reference &%.*s; is not a valid entity name",
184
+ (int) error->v.text.length, error->v.text.data);
185
+ break;
186
+ case GUMBO_ERR_DUPLICATE_ATTR:
187
+ print_message(parser, output,
188
+ "Attribute %s occurs multiple times, at positions %d and %d",
189
+ error->v.duplicate_attr.name,
190
+ error->v.duplicate_attr.original_index,
191
+ error->v.duplicate_attr.new_index);
192
+ break;
193
+ case GUMBO_ERR_PARSER:
194
+ case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
195
+ handle_parser_error(parser, &error->v.parser, output);
196
+ break;
197
+ default:
198
+ print_message(parser, output,
199
+ "Tokenizer error with an unimplemented error message");
200
+ break;
201
+ }
202
+ gumbo_string_buffer_append_codepoint(parser, '.', output);
203
+ }
204
+
205
+ void gumbo_caret_diagnostic_to_string(
206
+ GumboParser* parser, const GumboError* error,
207
+ const char* source_text, GumboStringBuffer* output) {
208
+ gumbo_error_to_string(parser, error, output);
209
+
210
+ const char* line_start =
211
+ find_last_newline(source_text, error->original_text);
212
+ const char* line_end =
213
+ find_next_newline(source_text, error->original_text);
214
+ GumboStringPiece original_line;
215
+ original_line.data = line_start;
216
+ original_line.length = line_end - line_start;
217
+
218
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
219
+ gumbo_string_buffer_append_string(parser, &original_line, output);
220
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
221
+ gumbo_string_buffer_reserve(
222
+ parser, output->length + error->position.column, output);
223
+ int num_spaces = error->position.column - 1;
224
+ memset(output->data + output->length, ' ', num_spaces);
225
+ output->length += num_spaces;
226
+ gumbo_string_buffer_append_codepoint(parser, '^', output);
227
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
228
+ }
229
+
230
+ void gumbo_print_caret_diagnostic(
231
+ GumboParser* parser, const GumboError* error, const char* source_text) {
232
+ GumboStringBuffer text;
233
+ gumbo_string_buffer_init(parser, &text);
234
+ gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
235
+ printf("%.*s", (int) text.length, text.data);
236
+ gumbo_string_buffer_destroy(parser, &text);
237
+ }
238
+
239
+ void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
240
+ if (error->type == GUMBO_ERR_PARSER ||
241
+ error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
242
+ gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
243
+ } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
244
+ gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
245
+ }
246
+ gumbo_parser_deallocate(parser, error);
247
+ }
248
+
249
+ void gumbo_init_errors(GumboParser* parser) {
250
+ gumbo_vector_init(parser, 5, &parser->_output->errors);
251
+ }
252
+
253
+ void gumbo_destroy_errors(GumboParser* parser) {
254
+ for (int i = 0; i < parser->_output->errors.length; ++i) {
255
+ gumbo_error_destroy(parser, parser->_output->errors.data[i]);
256
+ }
257
+ gumbo_vector_destroy(parser, &parser->_output->errors);
258
+ }
@@ -0,0 +1,227 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Error types, enums, and handling functions.
18
+
19
+ #ifndef GUMBO_ERROR_H_
20
+ #define GUMBO_ERROR_H_
21
+ #ifdef _MSC_VER
22
+ #define _CRT_SECURE_NO_WARNINGS
23
+ #endif
24
+ #include <stdint.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "insertion_mode.h"
28
+ #include "string_buffer.h"
29
+ #include "token_type.h"
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ struct GumboInternalParser;
36
+
37
+ typedef enum {
38
+ GUMBO_ERR_UTF8_INVALID,
39
+ GUMBO_ERR_UTF8_TRUNCATED,
40
+ GUMBO_ERR_UTF8_NULL,
41
+ GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
42
+ GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
43
+ GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
44
+ GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
45
+ GUMBO_ERR_NAMED_CHAR_REF_INVALID,
46
+ GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
47
+ GUMBO_ERR_TAG_EOF,
48
+ GUMBO_ERR_TAG_INVALID,
49
+ GUMBO_ERR_CLOSE_TAG_EMPTY,
50
+ GUMBO_ERR_CLOSE_TAG_EOF,
51
+ GUMBO_ERR_CLOSE_TAG_INVALID,
52
+ GUMBO_ERR_SCRIPT_EOF,
53
+ GUMBO_ERR_ATTR_NAME_EOF,
54
+ GUMBO_ERR_ATTR_NAME_INVALID,
55
+ GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
56
+ GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
57
+ GUMBO_ERR_ATTR_UNQUOTED_EOF,
58
+ GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
59
+ GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
60
+ GUMBO_ERR_ATTR_AFTER_EOF,
61
+ GUMBO_ERR_ATTR_AFTER_INVALID,
62
+ GUMBO_ERR_DUPLICATE_ATTR,
63
+ GUMBO_ERR_SOLIDUS_EOF,
64
+ GUMBO_ERR_SOLIDUS_INVALID,
65
+ GUMBO_ERR_DASHES_OR_DOCTYPE,
66
+ GUMBO_ERR_COMMENT_EOF,
67
+ GUMBO_ERR_COMMENT_INVALID,
68
+ GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
69
+ GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
70
+ GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
71
+ GUMBO_ERR_COMMENT_END_BANG_EOF,
72
+ GUMBO_ERR_DOCTYPE_EOF,
73
+ GUMBO_ERR_DOCTYPE_INVALID,
74
+ GUMBO_ERR_DOCTYPE_SPACE,
75
+ GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
76
+ GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
77
+ GUMBO_ERR_DOCTYPE_END,
78
+ GUMBO_ERR_PARSER,
79
+ GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
80
+ } GumboErrorType;
81
+
82
+ // Additional data for duplicated attributes.
83
+ typedef struct GumboInternalDuplicateAttrError {
84
+ // The name of the attribute. Owned by this struct.
85
+ const char* name;
86
+
87
+ // The (0-based) index within the attributes vector of the original
88
+ // occurrence.
89
+ unsigned int original_index;
90
+
91
+ // The (0-based) index where the new occurrence would be.
92
+ unsigned int new_index;
93
+ } GumboDuplicateAttrError;
94
+
95
+ // A simplified representation of the tokenizer state, designed to be more
96
+ // useful to clients of this library than the internal representation. This
97
+ // condenses the actual states used in the tokenizer state machine into a few
98
+ // values that will be familiar to users of HTML.
99
+ typedef enum {
100
+ GUMBO_ERR_TOKENIZER_DATA,
101
+ GUMBO_ERR_TOKENIZER_CHAR_REF,
102
+ GUMBO_ERR_TOKENIZER_RCDATA,
103
+ GUMBO_ERR_TOKENIZER_RAWTEXT,
104
+ GUMBO_ERR_TOKENIZER_PLAINTEXT,
105
+ GUMBO_ERR_TOKENIZER_SCRIPT,
106
+ GUMBO_ERR_TOKENIZER_TAG,
107
+ GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
108
+ GUMBO_ERR_TOKENIZER_ATTR_NAME,
109
+ GUMBO_ERR_TOKENIZER_ATTR_VALUE,
110
+ GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
111
+ GUMBO_ERR_TOKENIZER_COMMENT,
112
+ GUMBO_ERR_TOKENIZER_DOCTYPE,
113
+ GUMBO_ERR_TOKENIZER_CDATA,
114
+ } GumboTokenizerErrorState;
115
+
116
+ // Additional data for tokenizer errors.
117
+ // This records the current state and codepoint encountered - this is usually
118
+ // enough to reconstruct what went wrong and provide a friendly error message.
119
+ typedef struct GumboInternalTokenizerError {
120
+ // The bad codepoint encountered.
121
+ int codepoint;
122
+
123
+ // The state that the tokenizer was in at the time.
124
+ GumboTokenizerErrorState state;
125
+ } GumboTokenizerError;
126
+
127
+ // Additional data for parse errors.
128
+ typedef struct GumboInternalParserError {
129
+ // The type of input token that resulted in this error.
130
+ GumboTokenType input_type;
131
+
132
+ // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
133
+ GumboTag input_tag;
134
+
135
+ // The insertion mode that the parser was in at the time.
136
+ GumboInsertionMode parser_state;
137
+
138
+ // The tag stack at the point of the error. Note that this is a GumboVector
139
+ // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
140
+ // get at the tag.
141
+ GumboVector /* GumboTag */ tag_stack;
142
+ } GumboParserError;
143
+
144
+ // The overall error struct representing an error in decoding/tokenizing/parsing
145
+ // the HTML. This contains an enumerated type flag, a source position, and then
146
+ // a union of fields containing data specific to the error.
147
+ typedef struct GumboInternalError {
148
+ // The type of error.
149
+ GumboErrorType type;
150
+
151
+ // The position within the source file where the error occurred.
152
+ GumboSourcePosition position;
153
+
154
+ // A pointer to the byte within the original source file text where the error
155
+ // occurred (note that this is not the same as position.offset, as that gives
156
+ // character-based instead of byte-based offsets).
157
+ const char* original_text;
158
+
159
+ // Type-specific error information.
160
+ union {
161
+ // The code point we encountered, for:
162
+ // * GUMBO_ERR_UTF8_INVALID
163
+ // * GUMBO_ERR_UTF8_TRUNCATED
164
+ // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
+ // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
+ uint64_t codepoint;
167
+
168
+ // Tokenizer errors.
169
+ GumboTokenizerError tokenizer;
170
+
171
+ // Short textual data, for:
172
+ // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173
+ // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174
+ GumboStringPiece text;
175
+
176
+ // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177
+ GumboDuplicateAttrError duplicate_attr;
178
+
179
+ // Parser state, for GUMBO_ERR_PARSER and
180
+ // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
181
+ struct GumboInternalParserError parser;
182
+ } v;
183
+ } GumboError;
184
+
185
+ // Adds a new error to the parser's error list, and returns a pointer to it so
186
+ // that clients can fill out the rest of its fields. May return NULL if we're
187
+ // already over the max_errors field specified in GumboOptions.
188
+ GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
+
190
// Initializes the errors vector in the parser's output. The argument is the
// parser itself, not an errors vector.
void gumbo_init_errors(struct GumboInternalParser* parser);
192
+
193
// Frees every error stored in the parser's output errors vector, and then the
// vector itself. The argument is the parser, not an errors vector.
void gumbo_destroy_errors(struct GumboInternalParser* parser);
195
+
196
+ // Frees the memory used for a single GumboError.
197
+ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
+
199
+ // Prints an error to a string. This appends the error message text to an
200
+ // already-initialized GumboStringBuffer, growing it as needed. The caller is
201
+ // responsible for destroying the buffer, e.g. with
202
+ // gumbo_string_buffer_destroy(). (Note that the buffer is allocated with the
203
+ // allocator specified in the GumboParser config.)
204
+ void gumbo_error_to_string(
205
+ struct GumboInternalParser* parser, const GumboError* error,
206
+ GumboStringBuffer* output);
207
+
208
+ // Prints a caret diagnostic to a string. This appends the error message, the
209
+ // offending source line, and a caret marker line to an already-initialized
210
+ // GumboStringBuffer, growing it as needed. The caller is responsible for
211
+ // destroying the buffer, e.g. with gumbo_string_buffer_destroy(). (Note that
212
+ // the buffer is allocated with the allocator specified in the GumboParser config.)
213
+ void gumbo_caret_diagnostic_to_string(
214
+ struct GumboInternalParser* parser, const GumboError* error,
215
+ const char* source_text, GumboStringBuffer* output);
216
+
217
+ // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
218
+ // of writing to a string.
219
+ void gumbo_print_caret_diagnostic(
220
+ struct GumboInternalParser* parser, const GumboError* error,
221
+ const char* source_text);
222
+
223
+ #ifdef __cplusplus
224
+ }
225
+ #endif
226
+
227
+ #endif // GUMBO_ERROR_H_