nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,279 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "error.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdarg.h>
21
+ #include <stdio.h>
22
+ #include <string.h>
23
+
24
+ #include "gumbo.h"
25
+ #include "parser.h"
26
+ #include "string_buffer.h"
27
+ #include "util.h"
28
+ #include "vector.h"
29
+
30
+ // Prints a formatted message to a StringBuffer. This automatically resizes the
31
+ // StringBuffer as necessary to fit the message. Returns the number of bytes
32
+ // written.
33
+ static int print_message(
34
+ GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
35
+ va_list args;
36
+ int remaining_capacity = output->capacity - output->length;
37
+ va_start(args, format);
38
+ int bytes_written = vsnprintf(
39
+ output->data + output->length, remaining_capacity, format, args);
40
+ va_end(args);
41
+ #ifdef _MSC_VER
42
+ if (bytes_written == -1) {
43
+ // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
44
+ // returning the number of bytes that would've been written had there been
45
+ // enough. In this case, we'll double the buffer size and hope it fits when
46
+ // we retry (letting it fail and returning 0 if it doesn't), since there's
47
+ // no way to smartly resize the buffer.
48
+ gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
49
+ va_start(args, format);
50
+ int result = vsnprintf(
51
+ output->data + output->length, remaining_capacity, format, args);
52
+ va_end(args);
53
+ return result == -1 ? 0 : result;
54
+ }
55
+ #else
56
+ // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
+ if (bytes_written == -1) {
58
+ return 0;
59
+ }
60
+ #endif
61
+
62
+ if (bytes_written > remaining_capacity) {
63
+ gumbo_string_buffer_reserve(
64
+ parser, output->capacity + bytes_written, output);
65
+ remaining_capacity = output->capacity - output->length;
66
+ va_start(args, format);
67
+ bytes_written = vsnprintf(
68
+ output->data + output->length, remaining_capacity, format, args);
69
+ va_end(args);
70
+ }
71
+ output->length += bytes_written;
72
+ return bytes_written;
73
+ }
74
+
75
+ static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
76
+ GumboStringBuffer* output) {
77
+ print_message(parser, output, " Currently open tags: ");
78
+ for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
79
+ if (i) {
80
+ print_message(parser, output, ", ");
81
+ }
82
+ GumboTag tag = (GumboTag) error->tag_stack.data[i];
83
+ print_message(parser, output, gumbo_normalized_tagname(tag));
84
+ }
85
+ gumbo_string_buffer_append_codepoint(parser, '.', output);
86
+ }
87
+
88
+ static void handle_parser_error(GumboParser* parser,
89
+ const GumboParserError* error, GumboStringBuffer* output) {
90
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
91
+ error->input_type != GUMBO_TOKEN_DOCTYPE) {
92
+ print_message(
93
+ parser, output, "The doctype must be the first token in the document");
94
+ return;
95
+ }
96
+
97
+ switch (error->input_type) {
98
+ case GUMBO_TOKEN_DOCTYPE:
99
+ print_message(parser, output, "This is not a legal doctype");
100
+ return;
101
+ case GUMBO_TOKEN_COMMENT:
102
+ // Should never happen; comments are always legal.
103
+ assert(0);
104
+ // But just in case...
105
+ print_message(parser, output, "Comments aren't legal here");
106
+ return;
107
+ case GUMBO_TOKEN_CDATA:
108
+ case GUMBO_TOKEN_WHITESPACE:
109
+ case GUMBO_TOKEN_CHARACTER:
110
+ print_message(parser, output, "Character tokens aren't legal here");
111
+ return;
112
+ case GUMBO_TOKEN_NULL:
113
+ print_message(parser, output, "Null bytes are not allowed in HTML5");
114
+ return;
115
+ case GUMBO_TOKEN_EOF:
116
+ if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL) {
117
+ print_message(parser, output, "You must provide a doctype");
118
+ } else {
119
+ print_message(parser, output, "Premature end of file");
120
+ print_tag_stack(parser, error, output);
121
+ }
122
+ return;
123
+ case GUMBO_TOKEN_START_TAG:
124
+ case GUMBO_TOKEN_END_TAG:
125
+ print_message(parser, output, "That tag isn't allowed here");
126
+ print_tag_stack(parser, error, output);
127
+ // TODO(jdtang): Give more specific messaging.
128
+ return;
129
+ }
130
+ }
131
+
// Finds the start of the line that contains `error_location` within
// `original_text`. Returns a pointer to the first byte of that line: either
// the byte right after the preceding '\n', or `original_text` itself when the
// location is on the first line.
//
// A '\n' is considered part of the line it terminates, so when
// `error_location` points at a newline the result is the start of the line
// that newline ends (never a pointer past `error_location`).
static const char* find_last_newline(
    const char* original_text, const char* error_location) {
  assert(error_location >= original_text);
  const char* c = error_location;
  // Walk back until the byte *before* c is a newline (or there is no byte
  // before c). Looking at c[-1] instead of *c keeps the result correct both
  // when the error sits on a newline and when the text begins with one.
  while (c != original_text && c[-1] != '\n') {
    --c;
  }
  return c;
}
145
+
// Locates the end of the line containing `error_location`: the first '\n' at
// or after that byte, or the string's terminating NUL when no newline
// follows. `original_text` is accepted for symmetry with find_last_newline
// but is not consulted.
static const char* find_next_newline(
    const char* original_text, const char* error_location) {
  (void) original_text;
  const char* newline = strchr(error_location, '\n');
  if (newline != NULL) {
    return newline;
  }
  // No newline ahead: the line runs to the end of the string.
  return error_location + strlen(error_location);
}
156
+
157
+ GumboError* gumbo_add_error(GumboParser* parser) {
158
+ int max_errors = parser->_options->max_errors;
159
+ if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
160
+ return NULL;
161
+ }
162
+ GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
163
+ gumbo_vector_add(parser, error, &parser->_output->errors);
164
+ return error;
165
+ }
166
+
167
+ void gumbo_error_to_string(
168
+ GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
169
+ print_message(
170
+ parser, output, "@%d:%d: ", error->position.line, error->position.column);
171
+ switch (error->type) {
172
+ case GUMBO_ERR_UTF8_INVALID:
173
+ print_message(
174
+ parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
175
+ break;
176
+ case GUMBO_ERR_UTF8_TRUNCATED:
177
+ print_message(parser, output,
178
+ "Input stream ends with a truncated UTF8 character 0x%x",
179
+ error->v.codepoint);
180
+ break;
181
+ case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
182
+ print_message(
183
+ parser, output, "No digits after &# in numeric character reference");
184
+ break;
185
+ case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
186
+ print_message(parser, output,
187
+ "The numeric character reference &#%d should be followed "
188
+ "by a semicolon",
189
+ error->v.codepoint);
190
+ break;
191
+ case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
192
+ print_message(parser, output,
193
+ "The numeric character reference &#%d; encodes an invalid "
194
+ "unicode codepoint",
195
+ error->v.codepoint);
196
+ break;
197
+ case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
198
+ // The textual data came from one of the literal strings in the table, and
199
+ // so it'll be null-terminated.
200
+ print_message(parser, output,
201
+ "The named character reference &%.*s should be followed by a "
202
+ "semicolon",
203
+ (int) error->v.text.length, error->v.text.data);
204
+ break;
205
+ case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
206
+ print_message(parser, output,
207
+ "The named character reference &%.*s; is not a valid entity name",
208
+ (int) error->v.text.length, error->v.text.data);
209
+ break;
210
+ case GUMBO_ERR_DUPLICATE_ATTR:
211
+ print_message(parser, output,
212
+ "Attribute %s occurs multiple times, at positions %d and %d",
213
+ error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
214
+ error->v.duplicate_attr.new_index);
215
+ break;
216
+ case GUMBO_ERR_PARSER:
217
+ case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
218
+ handle_parser_error(parser, &error->v.parser, output);
219
+ break;
220
+ default:
221
+ print_message(parser, output,
222
+ "Tokenizer error with an unimplemented error message");
223
+ break;
224
+ }
225
+ gumbo_string_buffer_append_codepoint(parser, '.', output);
226
+ }
227
+
228
+ void gumbo_caret_diagnostic_to_string(GumboParser* parser,
229
+ const GumboError* error, const char* source_text,
230
+ GumboStringBuffer* output) {
231
+ gumbo_error_to_string(parser, error, output);
232
+
233
+ const char* line_start = find_last_newline(source_text, error->original_text);
234
+ const char* line_end = find_next_newline(source_text, error->original_text);
235
+ GumboStringPiece original_line;
236
+ original_line.data = line_start;
237
+ original_line.length = line_end - line_start;
238
+
239
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
240
+ gumbo_string_buffer_append_string(parser, &original_line, output);
241
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
242
+ gumbo_string_buffer_reserve(
243
+ parser, output->length + error->position.column, output);
244
+ int num_spaces = error->position.column - 1;
245
+ memset(output->data + output->length, ' ', num_spaces);
246
+ output->length += num_spaces;
247
+ gumbo_string_buffer_append_codepoint(parser, '^', output);
248
+ gumbo_string_buffer_append_codepoint(parser, '\n', output);
249
+ }
250
+
251
+ void gumbo_print_caret_diagnostic(
252
+ GumboParser* parser, const GumboError* error, const char* source_text) {
253
+ GumboStringBuffer text;
254
+ gumbo_string_buffer_init(parser, &text);
255
+ gumbo_caret_diagnostic_to_string(parser, error, source_text, &text);
256
+ printf("%.*s", (int) text.length, text.data);
257
+ gumbo_string_buffer_destroy(parser, &text);
258
+ }
259
+
260
+ void gumbo_error_destroy(GumboParser* parser, GumboError* error) {
261
+ if (error->type == GUMBO_ERR_PARSER ||
262
+ error->type == GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG) {
263
+ gumbo_vector_destroy(parser, &error->v.parser.tag_stack);
264
+ } else if (error->type == GUMBO_ERR_DUPLICATE_ATTR) {
265
+ gumbo_parser_deallocate(parser, (void*) error->v.duplicate_attr.name);
266
+ }
267
+ gumbo_parser_deallocate(parser, error);
268
+ }
269
+
270
+ void gumbo_init_errors(GumboParser* parser) {
271
+ gumbo_vector_init(parser, 5, &parser->_output->errors);
272
+ }
273
+
274
+ void gumbo_destroy_errors(GumboParser* parser) {
275
+ for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
276
+ gumbo_error_destroy(parser, parser->_output->errors.data[i]);
277
+ }
278
+ gumbo_vector_destroy(parser, &parser->_output->errors);
279
+ }
@@ -0,0 +1,225 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Error types, enums, and handling functions.
18
+
19
+ #ifndef GUMBO_ERROR_H_
20
+ #define GUMBO_ERROR_H_
21
+ #ifdef _MSC_VER
22
+ #define _CRT_SECURE_NO_WARNINGS
23
+ #endif
24
+ #include <stdint.h>
25
+
26
+ #include "gumbo.h"
27
+ #include "insertion_mode.h"
28
+ #include "string_buffer.h"
29
+ #include "token_type.h"
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ struct GumboInternalParser;
36
+
// Discriminator for GumboError: identifies what went wrong and which member
// of the GumboError payload union (if any) carries extra data. Enumerators
// are grouped below by name prefix; their order determines their values.
typedef enum {
  // UTF-8 input.
  GUMBO_ERR_UTF8_INVALID,
  GUMBO_ERR_UTF8_TRUNCATED,
  GUMBO_ERR_UTF8_NULL,

  // Numeric and named character references.
  GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
  GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
  GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
  GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
  GUMBO_ERR_NAMED_CHAR_REF_INVALID,

  // Tags, close tags and script.
  GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
  GUMBO_ERR_TAG_EOF,
  GUMBO_ERR_TAG_INVALID,
  GUMBO_ERR_CLOSE_TAG_EMPTY,
  GUMBO_ERR_CLOSE_TAG_EOF,
  GUMBO_ERR_CLOSE_TAG_INVALID,
  GUMBO_ERR_SCRIPT_EOF,

  // Attributes.
  GUMBO_ERR_ATTR_NAME_EOF,
  GUMBO_ERR_ATTR_NAME_INVALID,
  GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_EOF,
  GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
  GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
  GUMBO_ERR_ATTR_AFTER_EOF,
  GUMBO_ERR_ATTR_AFTER_INVALID,
  GUMBO_ERR_DUPLICATE_ATTR,

  // Solidus.
  GUMBO_ERR_SOLIDUS_EOF,
  GUMBO_ERR_SOLIDUS_INVALID,

  // Comments.
  GUMBO_ERR_DASHES_OR_DOCTYPE,
  GUMBO_ERR_COMMENT_EOF,
  GUMBO_ERR_COMMENT_INVALID,
  GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
  GUMBO_ERR_COMMENT_END_BANG_EOF,

  // Doctype.
  GUMBO_ERR_DOCTYPE_EOF,
  GUMBO_ERR_DOCTYPE_INVALID,
  GUMBO_ERR_DOCTYPE_SPACE,
  GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
  GUMBO_ERR_DOCTYPE_END,

  // Errors whose payload is a GumboParserError.
  GUMBO_ERR_PARSER,
  GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
} GumboErrorType;
81
+
// Payload for GUMBO_ERR_DUPLICATE_ATTR: which attribute was repeated and
// where both occurrences sit.
typedef struct GumboInternalDuplicateAttrError {
  // Attribute name. This struct owns the string.
  const char* name;

  // 0-based index, within the attributes vector, of the first occurrence.
  unsigned int original_index;

  // 0-based index the repeated occurrence would have had.
  unsigned int new_index;
} GumboDuplicateAttrError;
94
+
// Coarse, client-facing view of the tokenizer state. The tokenizer's real
// state machine has many more states; they are condensed here into a handful
// of values that should be recognisable to anyone who knows HTML.
typedef enum {
  // Text content models.
  GUMBO_ERR_TOKENIZER_DATA,
  GUMBO_ERR_TOKENIZER_CHAR_REF,
  GUMBO_ERR_TOKENIZER_RCDATA,
  GUMBO_ERR_TOKENIZER_RAWTEXT,
  GUMBO_ERR_TOKENIZER_PLAINTEXT,
  GUMBO_ERR_TOKENIZER_SCRIPT,

  // Tags and attributes.
  GUMBO_ERR_TOKENIZER_TAG,
  GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
  GUMBO_ERR_TOKENIZER_ATTR_NAME,
  GUMBO_ERR_TOKENIZER_ATTR_VALUE,

  // Markup declarations.
  GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
  GUMBO_ERR_TOKENIZER_COMMENT,
  GUMBO_ERR_TOKENIZER_DOCTYPE,
  GUMBO_ERR_TOKENIZER_CDATA,
} GumboTokenizerErrorState;
115
+
116
+ // Additional data for tokenizer errors.
117
+ // This records the current state and codepoint encountered - this is usually
118
+ // enough to reconstruct what went wrong and provide a friendly error message.
119
+ typedef struct GumboInternalTokenizerError {
120
+ // The bad codepoint encountered.
121
+ int codepoint;
122
+
123
+ // The state that the tokenizer was in at the time.
124
+ GumboTokenizerErrorState state;
125
+ } GumboTokenizerError;
126
+
127
+ // Additional data for parse errors.
128
+ typedef struct GumboInternalParserError {
129
+ // The type of input token that resulted in this error.
130
+ GumboTokenType input_type;
131
+
132
+ // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
133
+ GumboTag input_tag;
134
+
135
+ // The insertion mode that the parser was in at the time.
136
+ GumboInsertionMode parser_state;
137
+
138
+ // The tag stack at the point of the error. Note that this is an GumboVector
139
+ // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
140
+ // get at the tag.
141
+ GumboVector /* GumboTag */ tag_stack;
142
+ } GumboParserError;
143
+
144
+ // The overall error struct representing an error in decoding/tokenizing/parsing
145
+ // the HTML. This contains an enumerated type flag, a source position, and then
146
+ // a union of fields containing data specific to the error.
147
+ typedef struct GumboInternalError {
148
+ // The type of error.
149
+ GumboErrorType type;
150
+
151
+ // The position within the source file where the error occurred.
152
+ GumboSourcePosition position;
153
+
154
+ // A pointer to the byte within the original source file text where the error
155
+ // occurred (note that this is not the same as position.offset, as that gives
156
+ // character-based instead of byte-based offsets).
157
+ const char* original_text;
158
+
159
+ // Type-specific error information.
160
+ union {
161
+ // The code point we encountered, for:
162
+ // * GUMBO_ERR_UTF8_INVALID
163
+ // * GUMBO_ERR_UTF8_TRUNCATED
164
+ // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
+ // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
+ uint64_t codepoint;
167
+
168
+ // Tokenizer errors.
169
+ GumboTokenizerError tokenizer;
170
+
171
+ // Short textual data, for:
172
+ // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173
+ // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174
+ GumboStringPiece text;
175
+
176
+ // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177
+ GumboDuplicateAttrError duplicate_attr;
178
+
179
+ // Parser state, for GUMBO_ERR_PARSER and
180
+ // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
181
+ struct GumboInternalParserError parser;
182
+ } v;
183
+ } GumboError;
184
+
185
+ // Adds a new error to the parser's error list, and returns a pointer to it so
186
+ // that clients can fill out the rest of its fields. May return NULL if we're
187
+ // already over the max_errors field specified in GumboOptions.
188
+ GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
+
190
+ // Initializes the errors vector in the parser.
191
+ void gumbo_init_errors(struct GumboInternalParser* errors);
192
+
193
+ // Frees all the errors in the 'errors_' field of the parser.
194
+ void gumbo_destroy_errors(struct GumboInternalParser* errors);
195
+
196
+ // Frees the memory used for a single GumboError.
197
+ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
+
199
+ // Prints an error to a string. This fills an empty GumboStringBuffer with a
200
+ // freshly-allocated buffer containing the error message text. The caller is
201
+ // responsible for deleting the buffer. (Note that the buffer is allocated with
202
+ // the allocator specified in the GumboParser config and hence should be freed
203
+ // by gumbo_parser_deallocate().)
204
+ void gumbo_error_to_string(struct GumboInternalParser* parser,
205
+ const GumboError* error, GumboStringBuffer* output);
206
+
207
+ // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
208
+ // with a freshly-allocated buffer containing the error message text. The
209
+ // caller is responsible for deleting the buffer. (Note that the buffer is
210
+ // allocated with the allocator specified in the GumboParser config and hence
211
+ // should be freed by gumbo_parser_deallocate().)
212
+ void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213
+ const GumboError* error, const char* source_text,
214
+ GumboStringBuffer* output);
215
+
216
+ // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217
+ // of writing to a string.
218
+ void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219
+ const GumboError* error, const char* source_text);
220
+
221
+ #ifdef __cplusplus
222
+ }
223
+ #endif
224
+
225
+ #endif // GUMBO_ERROR_H_