nokogumbo 1.4.7 → 1.4.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,279 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "error.h"
18
-
19
- #include <assert.h>
20
- #include <stdarg.h>
21
- #include <stdio.h>
22
- #include <string.h>
23
-
24
- #include "gumbo.h"
25
- #include "parser.h"
26
- #include "string_buffer.h"
27
- #include "util.h"
28
- #include "vector.h"
29
-
30
- // Prints a formatted message to a StringBuffer. This automatically resizes the
31
- // StringBuffer as necessary to fit the message. Returns the number of bytes
32
- // written.
33
- static int print_message(
34
- GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
35
- va_list args;
36
- int remaining_capacity = output->capacity - output->length;
37
- va_start(args, format);
38
- int bytes_written = vsnprintf(
39
- output->data + output->length, remaining_capacity, format, args);
40
- va_end(args);
41
- #ifdef _MSC_VER
42
- if (bytes_written == -1) {
43
- // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
44
- // returning the number of bytes that would've been written had there been
45
- // enough. In this case, we'll double the buffer size and hope it fits when
46
- // we retry (letting it fail and returning 0 if it doesn't), since there's
47
- // no way to smartly resize the buffer.
48
- gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
49
- va_start(args, format);
50
- int result = vsnprintf(
51
- output->data + output->length, remaining_capacity, format, args);
52
- va_end(args);
53
- return result == -1 ? 0 : result;
54
- }
55
- #else
56
- // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
- if (bytes_written == -1) {
58
- return 0;
59
- }
60
- #endif
61
-
62
- if (bytes_written > remaining_capacity) {
63
- gumbo_string_buffer_reserve(
64
- parser, output->capacity + bytes_written, output);
65
- remaining_capacity = output->capacity - output->length;
66
- va_start(args, format);
67
- bytes_written = vsnprintf(
68
- output->data + output->length, remaining_capacity, format, args);
69
- va_end(args);
70
- }
71
- output->length += bytes_written;
72
- return bytes_written;
73
- }
74
-
75
- static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
76
- GumboStringBuffer* output) {
77
- print_message(parser, output, " Currently open tags: ");
78
- for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
79
- if (i) {
80
- print_message(parser, output, ", ");
81
- }
82
- GumboTag tag = (GumboTag) error->tag_stack.data[i];
83
- print_message(parser, output, gumbo_normalized_tagname(tag));
84
- }
85
- gumbo_string_buffer_append_codepoint(parser, '.', output);
86
- }
87
-
88
- static void handle_parser_error(GumboParser* parser,
89
- const GumboParserError* error, GumboStringBuffer* output) {
90
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
91
- error->input_type != GUMBO_TOKEN_DOCTYPE) {
92
- print_message(
93
- parser, output, "The doctype must be the first token in the document");
94
- return;
95
- }
96
-
97
- switch (error->input_type) {
98
- case GUMBO_TOKEN_DOCTYPE:
99
- print_message(parser, output, "This is not a legal doctype");
100
- return;
101
- case GUMBO_TOKEN_COMMENT:
102
- // Should never happen; comments are always legal.
103
- assert(0);
104
- // But just in case...
105
- print_message(parser, output, "Comments aren't legal here");
106
- return;
107
- case GUMBO_TOKEN_CDATA:
108
- case GUMBO_TOKEN_WHITESPACE:
109
- case GUMBO_TOKEN_CHARACTER:
110
- print_message(parser, output, "Character tokens aren't legal here");
111
- return;
112
- case GUMBO_TOKEN_NULL:
113
- print_message(parser, output, "Null bytes are not allowed in HTML5");
114
- return;
115
- case GUMBO_TOKEN_EOF:
116
- if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
117
- print_message(parser, output, "You must provide a doctype");
118
- } else {
119
- print_message(parser, output, "Premature end of file");
120
- print_tag_stack(parser, error, output);
121
- }
122
- return;
123
- case GUMBO_TOKEN_START_TAG:
124
- case GUMBO_TOKEN_END_TAG:
125
- print_message(parser, output, "That tag isn't allowed here");
126
- print_tag_stack(parser, error, output);
127
- // TODO(jdtang): Give more specific messaging.
128
- return;
129
- }
130
- }
131
-
132
// Finds the start of the line containing a given byte location in the original
// source buffer. Returns a pointer to the character after the preceding
// newline, or a pointer to the beginning of the string if this is the first
// line. An error located on a newline character is treated as belonging to the
// line that the newline terminates, so the result never lies past
// `error_location`.
static const char* find_last_newline(
    const char* original_text, const char* error_location) {
  assert(error_location >= original_text);
  const char* c = error_location;
  // If the error sits on a newline, step over it first; otherwise the scan
  // would stop immediately and report a line start *after* the error.
  if (c != original_text && *c == '\n') {
    --c;
  }
  for (; c != original_text && *c != '\n'; --c) {
    // There may be an error at EOF, which would be a nul byte.
    assert(*c || c == error_location);
  }
  // Only skip past the character we stopped on if it really is a preceding
  // newline (this also covers a newline at the very start of the buffer).
  return (*c == '\n' && c != error_location) ? c + 1 : c;
}
145
-
146
// Scans forward from `error_location` and returns a pointer to the first
// newline found, or to the terminating nul byte when the location is on the
// last line. `original_text` is accepted but not consulted.
static const char* find_next_newline(
    const char* original_text, const char* error_location) {
  const char* cursor = error_location;
  while (*cursor != '\0' && *cursor != '\n') {
    ++cursor;
  }
  return cursor;
}
156
-
157
- GumboError* gumbo_add_error(GumboParser* parser) {
158
- int max_errors = parser->_options->max_errors;
159
- if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
160
- return NULL;
161
- }
162
- GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
163
- gumbo_vector_add(parser, error, &parser->_output->errors);
164
- return error;
165
- }
166
-
167
- void gumbo_error_to_string(
168
- GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
169
- print_message(
170
- parser, output, "@%d:%d: ", error->position.line, error->position.column);
171
- switch (error->type) {
172
- case GUMBO_ERR_UTF8_INVALID:
173
- print_message(
174
- parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
175
- break;
176
- case GUMBO_ERR_UTF8_TRUNCATED:
177
- print_message(parser, output,
178
- "Input stream ends with a truncated UTF8 character 0x%x",
179
- error->v.codepoint);
180
- break;
181
- case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
182
- print_message(
183
- parser, output, "No digits after &# in numeric character reference");
184
- break;
185
- case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
186
- print_message(parser, output,
187
- "The numeric character reference &#%d should be followed "
188
- "by a semicolon",
189
- error->v.codepoint);
190
- break;
191
- case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
192
- print_message(parser, output,
193
- "The numeric character reference &#%d; encodes an invalid "
194
- "unicode codepoint",
195
- error->v.codepoint);
196
- break;
197
- case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
198
- // The textual data came from one of the literal strings in the table, and
199
- // so it'll be null-terminated.
200
- print_message(parser, output,
201
- "The named character reference &%.*s should be followed by a "
202
- "semicolon",
203
- (int) error->v.text.length, error->v.text.data);
204
- break;
205
- case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
206
- print_message(parser, output,
207
- "The named character reference &%.*s; is not a valid entity name",
208
- (int) error->v.text.length, error->v.text.data);
209
- break;
210
- case GUMBO_ERR_DUPLICATE_ATTR:
211
- print_message(parser, output,
212
- "Attribute %s occurs multiple times, at positions %d and %d",
213
- error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
214
- error->v.duplicate_attr.new_index);
215
- break;
216
- case GUMBO_ERR_PARSER:
217
- case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
218
- handle_parser_error(parser, &error->v.parser, output);
219
- break;
220
- default:
221
- print_message(parser, output,
222
- "Tokenizer error with an unimplemented error message");
223
- break;
224
- }
225
- gumbo_string_buffer_append_codepoint(parser, '.', output);
226
- }
227
-
228
- void gumbo_caret_diagnostic_to_string(GumboParser* parser,
229
- const GumboError* error, const char* source_text,
230
- GumboStringBuffer* output) {
231
- gumbo_error_to_string(parser, error, output);
232
-
233
- const char* line_start = find_last_newline(source_text, error->original_text);
234
- const char* line_end = find_next_newline(source_text, error->original_text);
235
- GumboStringPiece original_line;
236
- original_line.data = line_start;
237
- original_line.length = line_end - line_start;
238
-
239
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
240
- gumbo_string_buffer_append_string(parser, &original_line, output);
241
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
242
- gumbo_string_buffer_reserve(
243
- parser, output->length + error->position.column, output);
244
- int num_spaces = error->position.column - 1;
245
- memset(output->data + output->length, ' ', num_spaces);
246
- output->length += num_spaces;
247
- gumbo_string_buffer_append_codepoint(parser, '^', output);
248
- gumbo_string_buffer_append_codepoint(parser, '\n', output);
249
- }
250
-
251
- void gumbo_print_caret_diagnostic(
252
- GumboParser* parser, const GumboError* error, const char* source_text) {
253
- GumboStringBuffer text;
254
- gumbo_string_buffer_init(parser, &text);
255
- gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
256
- printf("%.*s", (int) text.length, text.data);
257
- gumbo_string_buffer_destroy(parser, &text);
258
- }
259
-
260
- void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
261
- if (error->type == GUMBO_ERR_PARSER ||
262
- error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
263
- gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
264
- } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
265
- gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
266
- }
267
- gumbo_parser_deallocate(parser, error);
268
- }
269
-
270
- void gumbo_init_errors(GumboParser* parser) {
271
- gumbo_vector_init(parser, 5, &parser->_output->errors);
272
- }
273
-
274
- void gumbo_destroy_errors(GumboParser* parser) {
275
- for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
276
- gumbo_error_destroy(parser, parser->_output->errors.data[i]);
277
- }
278
- gumbo_vector_destroy(parser, &parser->_output->errors);
279
- }
@@ -1,225 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Error types, enums, and handling functions.
18
-
19
- #ifndef GUMBO_ERROR_H_
20
- #define GUMBO_ERROR_H_
21
- #ifdef _MSC_VER
22
- #define _CRT_SECURE_NO_WARNINGS
23
- #endif
24
- #include <stdint.h>
25
-
26
- #include "gumbo.h"
27
- #include "insertion_mode.h"
28
- #include "string_buffer.h"
29
- #include "token_type.h"
30
-
31
- #ifdef __cplusplus
32
- extern "C" {
33
- #endif
34
-
35
- struct GumboInternalParser;
36
- // The kind of error a GumboError records (stored in its `type` field); the comments on the GumboError `v` union list which member applies to which kinds.
37
- typedef enum {
38
- GUMBO_ERR_UTF8_INVALID,
39
- GUMBO_ERR_UTF8_TRUNCATED,
40
- GUMBO_ERR_UTF8_NULL,
41
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
42
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
43
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
44
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
45
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
46
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
47
- GUMBO_ERR_TAG_EOF,
48
- GUMBO_ERR_TAG_INVALID,
49
- GUMBO_ERR_CLOSE_TAG_EMPTY,
50
- GUMBO_ERR_CLOSE_TAG_EOF,
51
- GUMBO_ERR_CLOSE_TAG_INVALID,
52
- GUMBO_ERR_SCRIPT_EOF,
53
- GUMBO_ERR_ATTR_NAME_EOF,
54
- GUMBO_ERR_ATTR_NAME_INVALID,
55
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
56
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
57
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
58
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
59
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
60
- GUMBO_ERR_ATTR_AFTER_EOF,
61
- GUMBO_ERR_ATTR_AFTER_INVALID,
62
- GUMBO_ERR_DUPLICATE_ATTR,
63
- GUMBO_ERR_SOLIDUS_EOF,
64
- GUMBO_ERR_SOLIDUS_INVALID,
65
- GUMBO_ERR_DASHES_OR_DOCTYPE,
66
- GUMBO_ERR_COMMENT_EOF,
67
- GUMBO_ERR_COMMENT_INVALID,
68
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
69
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
70
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
71
- GUMBO_ERR_COMMENT_END_BANG_EOF,
72
- GUMBO_ERR_DOCTYPE_EOF,
73
- GUMBO_ERR_DOCTYPE_INVALID,
74
- GUMBO_ERR_DOCTYPE_SPACE,
75
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
76
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
77
- GUMBO_ERR_DOCTYPE_END,
78
- GUMBO_ERR_PARSER,
79
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
80
- } GumboErrorType;
81
-
82
- // Additional data for duplicated attributes (the payload of a GUMBO_ERR_DUPLICATE_ATTR error).
83
- typedef struct GumboInternalDuplicateAttrError {
84
- // The name of the attribute. Owned by this struct; gumbo_error_destroy() deallocates it.
85
- const char* name;
86
-
87
- // The (0-based) index within the attributes vector of the original
88
- // occurrence.
89
- unsigned int original_index;
90
-
91
- // The (0-based) index where the new occurrence would be.
92
- unsigned int new_index;
93
- } GumboDuplicateAttrError;
94
-
95
- // A simplified representation of the tokenizer state, designed to be more
96
- // useful to clients of this library than the internal representation. This
97
- // condenses the actual states used in the tokenizer state machine into a few
98
- // values that will be familiar to users of HTML. Stored in GumboTokenizerError.state.
99
- typedef enum {
100
- GUMBO_ERR_TOKENIZER_DATA,
101
- GUMBO_ERR_TOKENIZER_CHAR_REF,
102
- GUMBO_ERR_TOKENIZER_RCDATA,
103
- GUMBO_ERR_TOKENIZER_RAWTEXT,
104
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
105
- GUMBO_ERR_TOKENIZER_SCRIPT,
106
- GUMBO_ERR_TOKENIZER_TAG,
107
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
108
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
109
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
110
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
111
- GUMBO_ERR_TOKENIZER_COMMENT,
112
- GUMBO_ERR_TOKENIZER_DOCTYPE,
113
- GUMBO_ERR_TOKENIZER_CDATA,
114
- } GumboTokenizerErrorState;
115
-
116
- // Additional data for tokenizer errors (the `tokenizer` member of the GumboError union).
117
- // This records the current state and codepoint encountered - this is usually
118
- // enough to reconstruct what went wrong and provide a friendly error message.
119
- typedef struct GumboInternalTokenizerError {
120
- // The bad codepoint encountered.
121
- int codepoint;
122
-
123
- // The (simplified) state that the tokenizer was in at the time.
124
- GumboTokenizerErrorState state;
125
- } GumboTokenizerError;
126
-
127
- // Additional data for parse errors (GUMBO_ERR_PARSER and GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG).
128
- typedef struct GumboInternalParserError {
129
- // The type of input token that resulted in this error.
130
- GumboTokenType input_type;
131
-
132
- // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
133
- GumboTag input_tag;
134
-
135
- // The insertion mode that the parser was in at the time.
136
- GumboInsertionMode parser_state;
137
-
138
- // The tag stack at the point of the error. Note that this is a GumboVector
139
- // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
140
- // get at the tag. Released by gumbo_error_destroy().
141
- GumboVector /* GumboTag */ tag_stack;
142
- } GumboParserError;
143
-
144
- // The overall error struct representing an error in decoding/tokenizing/parsing
145
- // the HTML. This contains an enumerated type flag, a source position, and then
146
- // a union of fields containing data specific to the error.
147
- typedef struct GumboInternalError {
148
- // The type of error; selects which member of `v` below is meaningful.
149
- GumboErrorType type;
150
-
151
- // The position within the source file where the error occurred.
152
- GumboSourcePosition position;
153
-
154
- // A pointer to the byte within the original source file text where the error
155
- // occurred (note that this is not the same as position.offset, as that gives
156
- // character-based instead of byte-based offsets).
157
- const char* original_text;
158
-
159
- // Type-specific error information.
160
- union {
161
- // The code point we encountered, for:
162
- // * GUMBO_ERR_UTF8_INVALID
163
- // * GUMBO_ERR_UTF8_TRUNCATED
164
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
- uint64_t codepoint;
167
-
168
- // Tokenizer errors.
169
- GumboTokenizerError tokenizer;
170
-
171
- // Short textual data, for:
172
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174
- GumboStringPiece text;
175
-
176
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177
- GumboDuplicateAttrError duplicate_attr;
178
-
179
- // Parser state, for GUMBO_ERR_PARSER and
180
- // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
181
- struct GumboInternalParserError parser;
182
- } v;
183
- } GumboError;
184
-
185
- // Adds a new error to the parser's error list, and returns a pointer to it so
186
- // that clients can fill out the rest of its fields. Returns NULL if the list
187
- // has already reached the max_errors limit specified in GumboOptions.
188
- GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
-
190
- // Initializes the errors vector in the parser's output.
191
- void gumbo_init_errors(struct GumboInternalParser* errors);
192
-
193
- // Frees all the errors in the parser's output errors vector, and the vector itself.
194
- void gumbo_destroy_errors(struct GumboInternalParser* errors);
195
-
196
- // Frees the memory used for a single GumboError, including any data it owns.
197
- void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
-
199
- // Prints an error to a string. This appends the error message text to the
200
- // given GumboStringBuffer, growing it as needed. The caller is
201
- // responsible for deleting the buffer. (Note that the buffer is allocated with
202
- // the allocator specified in the GumboParser config and hence should be freed
203
- // by gumbo_parser_deallocate().)
204
- void gumbo_error_to_string(struct GumboInternalParser* parser,
205
- const GumboError* error, GumboStringBuffer* output);
206
-
207
- // Prints a caret diagnostic to a string. This appends the error message, the
208
- // offending source line, and a caret marking the error column to the buffer. The
209
- // caller is responsible for deleting the buffer. (Note that the buffer is
210
- // allocated with the allocator specified in the GumboParser config and hence
211
- // should be freed by gumbo_parser_deallocate().)
212
- void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213
- const GumboError* error, const char* source_text,
214
- GumboStringBuffer* output);
215
-
216
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217
- // of writing to a string.
218
- void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219
- const GumboError* error, const char* source_text);
220
-
221
- #ifdef __cplusplus
222
- }
223
- #endif
224
-
225
- #endif // GUMBO_ERROR_H_