nokogumbo 1.3.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
data/gumbo-parser/src/char_ref.h
CHANGED
@@ -49,10 +49,9 @@ typedef struct {
|
|
49
49
|
// errors to the GumboParser's errors vector, if the spec calls for it. Pass a
|
50
50
|
// space for the "additional allowed char" when the spec says "with no
|
51
51
|
// additional allowed char". Returns false on parse error, true otherwise.
|
52
|
-
bool consume_char_ref(
|
53
|
-
struct
|
54
|
-
|
55
|
-
OneOrTwoCodepoints* output);
|
52
|
+
bool consume_char_ref(struct GumboInternalParser* parser,
|
53
|
+
struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
|
54
|
+
bool is_in_attribute, OneOrTwoCodepoints* output);
|
56
55
|
|
57
56
|
#ifdef __cplusplus
|
58
57
|
}
|
@@ -2464,7 +2464,9 @@ valid_named_ref := |*
|
|
2464
2464
|
*|;
|
2465
2465
|
}%%
|
2466
2466
|
|
2467
|
-
|
2467
|
+
// clang-format off
|
2468
|
+
%% write data noerror nofinal;
|
2469
|
+
// clang-format on
|
2468
2470
|
|
2469
2471
|
static bool consume_named_ref(
|
2470
2472
|
struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
|
@@ -2477,13 +2479,16 @@ static bool consume_named_ref(
|
|
2477
2479
|
const char *ts, *start;
|
2478
2480
|
int cs, act;
|
2479
2481
|
|
2482
|
+
// clang-format off
|
2480
2483
|
%% write init;
|
2481
2484
|
// Avoid unused variable warnings.
|
2482
2485
|
(void) act;
|
2483
2486
|
(void) ts;
|
2487
|
+
(void) char_ref_en_valid_named_ref;
|
2484
2488
|
|
2485
2489
|
start = p;
|
2486
2490
|
%% write exec;
|
2491
|
+
// clang-format on
|
2487
2492
|
|
2488
2493
|
if (cs >= %%{ write first_final; }%%) {
|
2489
2494
|
assert(output->first != kGumboNoChar);
|
data/gumbo-parser/src/error.c
CHANGED
@@ -27,18 +27,17 @@
|
|
27
27
|
#include "util.h"
|
28
28
|
#include "vector.h"
|
29
29
|
|
30
|
-
static const size_t kMessageBufferSize = 256;
|
31
|
-
|
32
30
|
// Prints a formatted message to a StringBuffer. This automatically resizes the
|
33
31
|
// StringBuffer as necessary to fit the message. Returns the number of bytes
|
34
32
|
// written.
|
35
|
-
static int print_message(
|
36
|
-
|
33
|
+
static int print_message(
|
34
|
+
GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
|
37
35
|
va_list args;
|
38
|
-
va_start(args, format);
|
39
36
|
int remaining_capacity = output->capacity - output->length;
|
40
|
-
|
41
|
-
|
37
|
+
va_start(args, format);
|
38
|
+
int bytes_written = vsnprintf(
|
39
|
+
output->data + output->length, remaining_capacity, format, args);
|
40
|
+
va_end(args);
|
42
41
|
#ifdef _MSC_VER
|
43
42
|
if (bytes_written == -1) {
|
44
43
|
// vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
|
@@ -47,15 +46,15 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
47
46
|
// we retry (letting it fail and returning 0 if it doesn't), since there's
|
48
47
|
// no way to smartly resize the buffer.
|
49
48
|
gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
|
50
|
-
|
51
|
-
|
49
|
+
va_start(args, format);
|
50
|
+
int result = vsnprintf(
|
51
|
+
output->data + output->length, remaining_capacity, format, args);
|
52
52
|
va_end(args);
|
53
53
|
return result == -1 ? 0 : result;
|
54
54
|
}
|
55
55
|
#else
|
56
56
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
57
57
|
if (bytes_written == -1) {
|
58
|
-
va_end(args);
|
59
58
|
return 0;
|
60
59
|
}
|
61
60
|
#endif
|
@@ -64,19 +63,19 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
64
63
|
gumbo_string_buffer_reserve(
|
65
64
|
parser, output->capacity + bytes_written, output);
|
66
65
|
remaining_capacity = output->capacity - output->length;
|
67
|
-
|
68
|
-
|
66
|
+
va_start(args, format);
|
67
|
+
bytes_written = vsnprintf(
|
68
|
+
output->data + output->length, remaining_capacity, format, args);
|
69
|
+
va_end(args);
|
69
70
|
}
|
70
71
|
output->length += bytes_written;
|
71
|
-
va_end(args);
|
72
72
|
return bytes_written;
|
73
73
|
}
|
74
74
|
|
75
|
-
static void print_tag_stack(
|
76
|
-
GumboParser* parser, const GumboParserError* error,
|
75
|
+
static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
|
77
76
|
GumboStringBuffer* output) {
|
78
77
|
print_message(parser, output, " Currently open tags: ");
|
79
|
-
for (int i = 0; i < error->tag_stack.length; ++i) {
|
78
|
+
for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
|
80
79
|
if (i) {
|
81
80
|
print_message(parser, output, ", ");
|
82
81
|
}
|
@@ -87,12 +86,11 @@ static void print_tag_stack(
|
|
87
86
|
}
|
88
87
|
|
89
88
|
static void handle_parser_error(GumboParser* parser,
|
90
|
-
|
91
|
-
GumboStringBuffer* output) {
|
89
|
+
const GumboParserError* error, GumboStringBuffer* output) {
|
92
90
|
if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
|
93
91
|
error->input_type != GUMBO_TOKEN_DOCTYPE) {
|
94
|
-
print_message(
|
95
|
-
|
92
|
+
print_message(
|
93
|
+
parser, output, "The doctype must be the first token in the document");
|
96
94
|
return;
|
97
95
|
}
|
98
96
|
|
@@ -106,6 +104,7 @@ static void handle_parser_error(GumboParser* parser,
|
|
106
104
|
// But just in case...
|
107
105
|
print_message(parser, output, "Comments aren't legal here");
|
108
106
|
return;
|
107
|
+
case GUMBO_TOKEN_CDATA:
|
109
108
|
case GUMBO_TOKEN_WHITESPACE:
|
110
109
|
case GUMBO_TOKEN_CHARACTER:
|
111
110
|
print_message(parser, output, "Character tokens aren't legal here");
|
@@ -150,13 +149,14 @@ static const char* find_last_newline(
|
|
150
149
|
static const char* find_next_newline(
|
151
150
|
const char* original_text, const char* error_location) {
|
152
151
|
const char* c = error_location;
|
153
|
-
for (; *c && *c != '\n'; ++c)
|
152
|
+
for (; *c && *c != '\n'; ++c)
|
153
|
+
;
|
154
154
|
return c;
|
155
155
|
}
|
156
156
|
|
157
157
|
GumboError* gumbo_add_error(GumboParser* parser) {
|
158
158
|
int max_errors = parser->_options->max_errors;
|
159
|
-
if (max_errors >= 0 && parser->_output->errors.length >= max_errors) {
|
159
|
+
if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
|
160
160
|
return NULL;
|
161
161
|
}
|
162
162
|
GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
|
@@ -166,50 +166,52 @@ GumboError* gumbo_add_error(GumboParser* parser) {
|
|
166
166
|
|
167
167
|
void gumbo_error_to_string(
|
168
168
|
GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
|
169
|
-
print_message(
|
170
|
-
|
169
|
+
print_message(
|
170
|
+
parser, output, "@%d:%d: ", error->position.line, error->position.column);
|
171
171
|
switch (error->type) {
|
172
172
|
case GUMBO_ERR_UTF8_INVALID:
|
173
|
-
print_message(
|
174
|
-
|
173
|
+
print_message(
|
174
|
+
parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
|
175
175
|
break;
|
176
176
|
case GUMBO_ERR_UTF8_TRUNCATED:
|
177
177
|
print_message(parser, output,
|
178
|
-
|
179
|
-
|
178
|
+
"Input stream ends with a truncated UTF8 character 0x%x",
|
179
|
+
error->v.codepoint);
|
180
180
|
break;
|
181
181
|
case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
|
182
|
-
print_message(
|
183
|
-
|
182
|
+
print_message(
|
183
|
+
parser, output, "No digits after &# in numeric character reference");
|
184
184
|
break;
|
185
185
|
case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
|
186
186
|
print_message(parser, output,
|
187
|
-
|
188
|
-
|
187
|
+
"The numeric character reference &#%d should be followed "
|
188
|
+
"by a semicolon",
|
189
|
+
error->v.codepoint);
|
189
190
|
break;
|
190
191
|
case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
|
191
192
|
print_message(parser, output,
|
192
|
-
|
193
|
-
|
193
|
+
"The numeric character reference &#%d; encodes an invalid "
|
194
|
+
"unicode codepoint",
|
195
|
+
error->v.codepoint);
|
194
196
|
break;
|
195
197
|
case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
|
196
198
|
// The textual data came from one of the literal strings in the table, and
|
197
199
|
// so it'll be null-terminated.
|
198
200
|
print_message(parser, output,
|
199
|
-
|
200
|
-
|
201
|
+
"The named character reference &%.*s should be followed by a "
|
202
|
+
"semicolon",
|
203
|
+
(int) error->v.text.length, error->v.text.data);
|
201
204
|
break;
|
202
205
|
case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
|
203
206
|
print_message(parser, output,
|
204
|
-
|
205
|
-
|
207
|
+
"The named character reference &%.*s; is not a valid entity name",
|
208
|
+
(int) error->v.text.length, error->v.text.data);
|
206
209
|
break;
|
207
210
|
case GUMBO_ERR_DUPLICATE_ATTR:
|
208
211
|
print_message(parser, output,
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
error->v.duplicate_attr.new_index);
|
212
|
+
"Attribute %s occurs multiple times, at positions %d and %d",
|
213
|
+
error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
|
214
|
+
error->v.duplicate_attr.new_index);
|
213
215
|
break;
|
214
216
|
case GUMBO_ERR_PARSER:
|
215
217
|
case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
|
@@ -217,21 +219,19 @@ void gumbo_error_to_string(
|
|
217
219
|
break;
|
218
220
|
default:
|
219
221
|
print_message(parser, output,
|
220
|
-
|
222
|
+
"Tokenizer error with an unimplemented error message");
|
221
223
|
break;
|
222
224
|
}
|
223
225
|
gumbo_string_buffer_append_codepoint(parser, '.', output);
|
224
226
|
}
|
225
227
|
|
226
|
-
void gumbo_caret_diagnostic_to_string(
|
227
|
-
|
228
|
-
|
228
|
+
void gumbo_caret_diagnostic_to_string(GumboParser* parser,
|
229
|
+
const GumboError* error, const char* source_text,
|
230
|
+
GumboStringBuffer* output) {
|
229
231
|
gumbo_error_to_string(parser, error, output);
|
230
232
|
|
231
|
-
const char* line_start =
|
232
|
-
|
233
|
-
const char* line_end =
|
234
|
-
find_next_newline(source_text, error->original_text);
|
233
|
+
const char* line_start = find_last_newline(source_text, error->original_text);
|
234
|
+
const char* line_end = find_next_newline(source_text, error->original_text);
|
235
235
|
GumboStringPiece original_line;
|
236
236
|
original_line.data = line_start;
|
237
237
|
original_line.length = line_end - line_start;
|
@@ -272,7 +272,7 @@ void gumbo_init_errors(GumboParser* parser) {
|
|
272
272
|
}
|
273
273
|
|
274
274
|
void gumbo_destroy_errors(GumboParser* parser) {
|
275
|
-
for (int i = 0; i < parser->_output->errors.length; ++i) {
|
275
|
+
for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
|
276
276
|
gumbo_error_destroy(parser, parser->_output->errors.data[i]);
|
277
277
|
}
|
278
278
|
gumbo_vector_destroy(parser, &parser->_output->errors);
|
data/gumbo-parser/src/error.h
CHANGED
@@ -201,24 +201,22 @@ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
|
|
201
201
|
// responsible for deleting the buffer. (Note that the buffer is allocated with
|
202
202
|
// the allocator specified in the GumboParser config and hence should be freed
|
203
203
|
// by gumbo_parser_deallocate().)
|
204
|
-
void gumbo_error_to_string(
|
205
|
-
|
206
|
-
GumboStringBuffer* output);
|
204
|
+
void gumbo_error_to_string(struct GumboInternalParser* parser,
|
205
|
+
const GumboError* error, GumboStringBuffer* output);
|
207
206
|
|
208
207
|
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
209
208
|
// with a freshly-allocated buffer containing the error message text. The
|
210
209
|
// caller is responsible for deleting the buffer. (Note that the buffer is
|
211
210
|
// allocated with the allocator specified in the GumboParser config and hence
|
212
211
|
// should be freed by gumbo_parser_deallocate().)
|
213
|
-
void gumbo_caret_diagnostic_to_string(
|
214
|
-
|
215
|
-
|
212
|
+
void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
|
213
|
+
const GumboError* error, const char* source_text,
|
214
|
+
GumboStringBuffer* output);
|
216
215
|
|
217
216
|
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
218
217
|
// of writing to a string.
|
219
|
-
void gumbo_print_caret_diagnostic(
|
220
|
-
|
221
|
-
const char* source_text);
|
218
|
+
void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
|
219
|
+
const GumboError* error, const char* source_text);
|
222
220
|
|
223
221
|
#ifdef __cplusplus
|
224
222
|
}
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -76,7 +76,6 @@ typedef struct {
|
|
76
76
|
*/
|
77
77
|
extern const GumboSourcePosition kGumboEmptySourcePosition;
|
78
78
|
|
79
|
-
|
80
79
|
/**
|
81
80
|
* A struct representing a string or part of a string. Strings within the
|
82
81
|
* parser are represented by a char* and a length; the char* points into
|
@@ -111,7 +110,6 @@ bool gumbo_string_equals(
|
|
111
110
|
bool gumbo_string_equals_ignore_case(
|
112
111
|
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
113
112
|
|
114
|
-
|
115
113
|
/**
|
116
114
|
* A simple vector implementation. This stores a pointer to a data array and a
|
117
115
|
* length. All elements are stored as void*; client code must cast to the
|
@@ -141,8 +139,7 @@ extern const GumboVector kGumboEmptyVector;
|
|
141
139
|
* Returns the first index at which an element appears in this vector (testing
|
142
140
|
* by pointer equality), or -1 if it never does.
|
143
141
|
*/
|
144
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element);
|
145
|
-
|
142
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
146
143
|
|
147
144
|
/**
|
148
145
|
* An enum for all the tags defined in the HTML5 standard. These correspond to
|
@@ -157,172 +154,10 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
|
|
157
154
|
* strings.
|
158
155
|
*/
|
159
156
|
typedef enum {
|
160
|
-
|
161
|
-
|
162
|
-
//
|
163
|
-
|
164
|
-
GUMBO_TAG_TITLE,
|
165
|
-
GUMBO_TAG_BASE,
|
166
|
-
GUMBO_TAG_LINK,
|
167
|
-
GUMBO_TAG_META,
|
168
|
-
GUMBO_TAG_STYLE,
|
169
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
170
|
-
GUMBO_TAG_SCRIPT,
|
171
|
-
GUMBO_TAG_NOSCRIPT,
|
172
|
-
GUMBO_TAG_TEMPLATE,
|
173
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
174
|
-
GUMBO_TAG_BODY,
|
175
|
-
GUMBO_TAG_ARTICLE,
|
176
|
-
GUMBO_TAG_SECTION,
|
177
|
-
GUMBO_TAG_NAV,
|
178
|
-
GUMBO_TAG_ASIDE,
|
179
|
-
GUMBO_TAG_H1,
|
180
|
-
GUMBO_TAG_H2,
|
181
|
-
GUMBO_TAG_H3,
|
182
|
-
GUMBO_TAG_H4,
|
183
|
-
GUMBO_TAG_H5,
|
184
|
-
GUMBO_TAG_H6,
|
185
|
-
GUMBO_TAG_HGROUP,
|
186
|
-
GUMBO_TAG_HEADER,
|
187
|
-
GUMBO_TAG_FOOTER,
|
188
|
-
GUMBO_TAG_ADDRESS,
|
189
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
|
190
|
-
GUMBO_TAG_P,
|
191
|
-
GUMBO_TAG_HR,
|
192
|
-
GUMBO_TAG_PRE,
|
193
|
-
GUMBO_TAG_BLOCKQUOTE,
|
194
|
-
GUMBO_TAG_OL,
|
195
|
-
GUMBO_TAG_UL,
|
196
|
-
GUMBO_TAG_LI,
|
197
|
-
GUMBO_TAG_DL,
|
198
|
-
GUMBO_TAG_DT,
|
199
|
-
GUMBO_TAG_DD,
|
200
|
-
GUMBO_TAG_FIGURE,
|
201
|
-
GUMBO_TAG_FIGCAPTION,
|
202
|
-
GUMBO_TAG_MAIN,
|
203
|
-
GUMBO_TAG_DIV,
|
204
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
205
|
-
GUMBO_TAG_A,
|
206
|
-
GUMBO_TAG_EM,
|
207
|
-
GUMBO_TAG_STRONG,
|
208
|
-
GUMBO_TAG_SMALL,
|
209
|
-
GUMBO_TAG_S,
|
210
|
-
GUMBO_TAG_CITE,
|
211
|
-
GUMBO_TAG_Q,
|
212
|
-
GUMBO_TAG_DFN,
|
213
|
-
GUMBO_TAG_ABBR,
|
214
|
-
GUMBO_TAG_DATA,
|
215
|
-
GUMBO_TAG_TIME,
|
216
|
-
GUMBO_TAG_CODE,
|
217
|
-
GUMBO_TAG_VAR,
|
218
|
-
GUMBO_TAG_SAMP,
|
219
|
-
GUMBO_TAG_KBD,
|
220
|
-
GUMBO_TAG_SUB,
|
221
|
-
GUMBO_TAG_SUP,
|
222
|
-
GUMBO_TAG_I,
|
223
|
-
GUMBO_TAG_B,
|
224
|
-
GUMBO_TAG_U,
|
225
|
-
GUMBO_TAG_MARK,
|
226
|
-
GUMBO_TAG_RUBY,
|
227
|
-
GUMBO_TAG_RT,
|
228
|
-
GUMBO_TAG_RP,
|
229
|
-
GUMBO_TAG_BDI,
|
230
|
-
GUMBO_TAG_BDO,
|
231
|
-
GUMBO_TAG_SPAN,
|
232
|
-
GUMBO_TAG_BR,
|
233
|
-
GUMBO_TAG_WBR,
|
234
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
|
235
|
-
GUMBO_TAG_INS,
|
236
|
-
GUMBO_TAG_DEL,
|
237
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
|
238
|
-
GUMBO_TAG_IMAGE,
|
239
|
-
GUMBO_TAG_IMG,
|
240
|
-
GUMBO_TAG_IFRAME,
|
241
|
-
GUMBO_TAG_EMBED,
|
242
|
-
GUMBO_TAG_OBJECT,
|
243
|
-
GUMBO_TAG_PARAM,
|
244
|
-
GUMBO_TAG_VIDEO,
|
245
|
-
GUMBO_TAG_AUDIO,
|
246
|
-
GUMBO_TAG_SOURCE,
|
247
|
-
GUMBO_TAG_TRACK,
|
248
|
-
GUMBO_TAG_CANVAS,
|
249
|
-
GUMBO_TAG_MAP,
|
250
|
-
GUMBO_TAG_AREA,
|
251
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
|
252
|
-
GUMBO_TAG_MATH,
|
253
|
-
GUMBO_TAG_MI,
|
254
|
-
GUMBO_TAG_MO,
|
255
|
-
GUMBO_TAG_MN,
|
256
|
-
GUMBO_TAG_MS,
|
257
|
-
GUMBO_TAG_MTEXT,
|
258
|
-
GUMBO_TAG_MGLYPH,
|
259
|
-
GUMBO_TAG_MALIGNMARK,
|
260
|
-
GUMBO_TAG_ANNOTATION_XML,
|
261
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
|
262
|
-
GUMBO_TAG_SVG,
|
263
|
-
GUMBO_TAG_FOREIGNOBJECT,
|
264
|
-
GUMBO_TAG_DESC,
|
265
|
-
// SVG title tags will have GUMBO_TAG_TITLE as with HTML.
|
266
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
|
267
|
-
GUMBO_TAG_TABLE,
|
268
|
-
GUMBO_TAG_CAPTION,
|
269
|
-
GUMBO_TAG_COLGROUP,
|
270
|
-
GUMBO_TAG_COL,
|
271
|
-
GUMBO_TAG_TBODY,
|
272
|
-
GUMBO_TAG_THEAD,
|
273
|
-
GUMBO_TAG_TFOOT,
|
274
|
-
GUMBO_TAG_TR,
|
275
|
-
GUMBO_TAG_TD,
|
276
|
-
GUMBO_TAG_TH,
|
277
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
|
278
|
-
GUMBO_TAG_FORM,
|
279
|
-
GUMBO_TAG_FIELDSET,
|
280
|
-
GUMBO_TAG_LEGEND,
|
281
|
-
GUMBO_TAG_LABEL,
|
282
|
-
GUMBO_TAG_INPUT,
|
283
|
-
GUMBO_TAG_BUTTON,
|
284
|
-
GUMBO_TAG_SELECT,
|
285
|
-
GUMBO_TAG_DATALIST,
|
286
|
-
GUMBO_TAG_OPTGROUP,
|
287
|
-
GUMBO_TAG_OPTION,
|
288
|
-
GUMBO_TAG_TEXTAREA,
|
289
|
-
GUMBO_TAG_KEYGEN,
|
290
|
-
GUMBO_TAG_OUTPUT,
|
291
|
-
GUMBO_TAG_PROGRESS,
|
292
|
-
GUMBO_TAG_METER,
|
293
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
294
|
-
GUMBO_TAG_DETAILS,
|
295
|
-
GUMBO_TAG_SUMMARY,
|
296
|
-
GUMBO_TAG_MENU,
|
297
|
-
GUMBO_TAG_MENUITEM,
|
298
|
-
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
299
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
300
|
-
GUMBO_TAG_APPLET,
|
301
|
-
GUMBO_TAG_ACRONYM,
|
302
|
-
GUMBO_TAG_BGSOUND,
|
303
|
-
GUMBO_TAG_DIR,
|
304
|
-
GUMBO_TAG_FRAME,
|
305
|
-
GUMBO_TAG_FRAMESET,
|
306
|
-
GUMBO_TAG_NOFRAMES,
|
307
|
-
GUMBO_TAG_ISINDEX,
|
308
|
-
GUMBO_TAG_LISTING,
|
309
|
-
GUMBO_TAG_XMP,
|
310
|
-
GUMBO_TAG_NEXTID,
|
311
|
-
GUMBO_TAG_NOEMBED,
|
312
|
-
GUMBO_TAG_PLAINTEXT,
|
313
|
-
GUMBO_TAG_RB,
|
314
|
-
GUMBO_TAG_STRIKE,
|
315
|
-
GUMBO_TAG_BASEFONT,
|
316
|
-
GUMBO_TAG_BIG,
|
317
|
-
GUMBO_TAG_BLINK,
|
318
|
-
GUMBO_TAG_CENTER,
|
319
|
-
GUMBO_TAG_FONT,
|
320
|
-
GUMBO_TAG_MARQUEE,
|
321
|
-
GUMBO_TAG_MULTICOL,
|
322
|
-
GUMBO_TAG_NOBR,
|
323
|
-
GUMBO_TAG_SPACER,
|
324
|
-
GUMBO_TAG_TT,
|
325
|
-
// Used for all tags that don't have special handling in HTML.
|
157
|
+
// Load all the tags from an external source, generated from tag.in.
|
158
|
+
#include "tag_enum.h"
|
159
|
+
// Used for all tags that don't have special handling in HTML. Add new tags
|
160
|
+
// to the end of tag.in so as to preserve backwards-compatibility.
|
326
161
|
GUMBO_TAG_UNKNOWN,
|
327
162
|
// A marker value to indicate the end of the enum, for iterating over it.
|
328
163
|
// Also used as the terminator for varargs functions that take tags.
|
@@ -364,9 +199,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
|
364
199
|
|
365
200
|
/**
|
366
201
|
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
367
|
-
* enum.
|
202
|
+
* enum. The `tag` version expects `tagname` to be NULL-terminated
|
368
203
|
*/
|
369
204
|
GumboTag gumbo_tag_enum(const char* tagname);
|
205
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
370
206
|
|
371
207
|
/**
|
372
208
|
* Attribute namespaces.
|
@@ -461,10 +297,16 @@ typedef enum {
|
|
461
297
|
GUMBO_NODE_TEXT,
|
462
298
|
/** CDATA node. v will be a GumboText. */
|
463
299
|
GUMBO_NODE_CDATA,
|
464
|
-
/** Comment node. v
|
300
|
+
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
465
301
|
GUMBO_NODE_COMMENT,
|
466
302
|
/** Text node, where all contents is whitespace. v will be a GumboText. */
|
467
|
-
GUMBO_NODE_WHITESPACE
|
303
|
+
GUMBO_NODE_WHITESPACE,
|
304
|
+
/** Template node. This is separate from GUMBO_NODE_ELEMENT because many
|
305
|
+
* client libraries will want to ignore the contents of template nodes, as
|
306
|
+
* the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
|
307
|
+
* here, while clients that want to include template contents should also
|
308
|
+
* check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
|
309
|
+
GUMBO_NODE_TEMPLATE
|
468
310
|
} GumboNodeType;
|
469
311
|
|
470
312
|
/**
|
@@ -473,7 +315,9 @@ typedef enum {
|
|
473
315
|
*/
|
474
316
|
typedef struct GumboInternalNode GumboNode;
|
475
317
|
|
476
|
-
/**
|
318
|
+
/**
|
319
|
+
* http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
|
320
|
+
*/
|
477
321
|
typedef enum {
|
478
322
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
479
323
|
GUMBO_DOCTYPE_QUIRKS,
|
@@ -571,7 +415,6 @@ typedef enum {
|
|
571
415
|
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
572
416
|
} GumboParseFlags;
|
573
417
|
|
574
|
-
|
575
418
|
/**
|
576
419
|
* Information specific to document nodes.
|
577
420
|
*/
|
@@ -690,9 +533,9 @@ struct GumboInternalNode {
|
|
690
533
|
|
691
534
|
/** The actual node data. */
|
692
535
|
union {
|
693
|
-
GumboDocument document;
|
694
|
-
GumboElement element;
|
695
|
-
GumboText text;
|
536
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
537
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT.
|
538
|
+
GumboText text; // For everything else.
|
696
539
|
} v;
|
697
540
|
};
|
698
541
|
|
@@ -750,6 +593,29 @@ typedef struct GumboInternalOptions {
|
|
750
593
|
* Default: -1
|
751
594
|
*/
|
752
595
|
int max_errors;
|
596
|
+
|
597
|
+
/**
|
598
|
+
* The fragment context for parsing:
|
599
|
+
* https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
|
600
|
+
*
|
601
|
+
* If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
|
602
|
+
* the regular parsing algorithm. Otherwise, pass the tag enum for the
|
603
|
+
* intended parent of the parsed fragment. We use just the tag enum rather
|
604
|
+
* than a full node because that's enough to set all the parsing context we
|
605
|
+
* need, and it provides some additional flexibility for client code to act as
|
606
|
+
* if parsing a fragment even when a full HTML tree isn't available.
|
607
|
+
*
|
608
|
+
* Default: GUMBO_TAG_LAST
|
609
|
+
*/
|
610
|
+
GumboTag fragment_context;
|
611
|
+
|
612
|
+
/**
|
613
|
+
* The namespace for the fragment context. This lets client code
|
614
|
+
* differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
|
615
|
+
* HTML.
|
616
|
+
* Default: GUMBO_NAMESPACE_HTML
|
617
|
+
*/
|
618
|
+
GumboNamespaceEnum fragment_namespace;
|
753
619
|
} GumboOptions;
|
754
620
|
|
755
621
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
@@ -796,9 +662,7 @@ GumboOutput* gumbo_parse_with_options(
|
|
796
662
|
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
797
663
|
|
798
664
|
/** Release the memory used for the parse tree & parse errors. */
|
799
|
-
void gumbo_destroy_output(
|
800
|
-
const GumboOptions* options, GumboOutput* output);
|
801
|
-
|
665
|
+
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
|
802
666
|
|
803
667
|
#ifdef __cplusplus
|
804
668
|
}
|