nokogumbo 1.3.0 → 1.5.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
data/gumbo-parser/src/char_ref.h
CHANGED
@@ -49,10 +49,9 @@ typedef struct {
|
|
49
49
|
// errors to the GumboParser's errors vector, if the spec calls for it. Pass a
|
50
50
|
// space for the "additional allowed char" when the spec says "with no
|
51
51
|
// additional allowed char". Returns false on parse error, true otherwise.
|
52
|
-
bool consume_char_ref(
|
53
|
-
struct
|
54
|
-
|
55
|
-
OneOrTwoCodepoints* output);
|
52
|
+
bool consume_char_ref(struct GumboInternalParser* parser,
|
53
|
+
struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
|
54
|
+
bool is_in_attribute, OneOrTwoCodepoints* output);
|
56
55
|
|
57
56
|
#ifdef __cplusplus
|
58
57
|
}
|
@@ -2464,7 +2464,9 @@ valid_named_ref := |*
|
|
2464
2464
|
*|;
|
2465
2465
|
}%%
|
2466
2466
|
|
2467
|
-
|
2467
|
+
// clang-format off
|
2468
|
+
%% write data noerror nofinal;
|
2469
|
+
// clang-format on
|
2468
2470
|
|
2469
2471
|
static bool consume_named_ref(
|
2470
2472
|
struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
|
@@ -2477,13 +2479,16 @@ static bool consume_named_ref(
|
|
2477
2479
|
const char *ts, *start;
|
2478
2480
|
int cs, act;
|
2479
2481
|
|
2482
|
+
// clang-format off
|
2480
2483
|
%% write init;
|
2481
2484
|
// Avoid unused variable warnings.
|
2482
2485
|
(void) act;
|
2483
2486
|
(void) ts;
|
2487
|
+
(void) char_ref_en_valid_named_ref;
|
2484
2488
|
|
2485
2489
|
start = p;
|
2486
2490
|
%% write exec;
|
2491
|
+
// clang-format on
|
2487
2492
|
|
2488
2493
|
if (cs >= %%{ write first_final; }%%) {
|
2489
2494
|
assert(output->first != kGumboNoChar);
|
data/gumbo-parser/src/error.c
CHANGED
@@ -27,18 +27,17 @@
|
|
27
27
|
#include "util.h"
|
28
28
|
#include "vector.h"
|
29
29
|
|
30
|
-
static const size_t kMessageBufferSize = 256;
|
31
|
-
|
32
30
|
// Prints a formatted message to a StringBuffer. This automatically resizes the
|
33
31
|
// StringBuffer as necessary to fit the message. Returns the number of bytes
|
34
32
|
// written.
|
35
|
-
static int print_message(
|
36
|
-
|
33
|
+
static int print_message(
|
34
|
+
GumboParser* parser, GumboStringBuffer* output, const char* format, ...) {
|
37
35
|
va_list args;
|
38
|
-
va_start(args, format);
|
39
36
|
int remaining_capacity = output->capacity - output->length;
|
40
|
-
|
41
|
-
|
37
|
+
va_start(args, format);
|
38
|
+
int bytes_written = vsnprintf(
|
39
|
+
output->data + output->length, remaining_capacity, format, args);
|
40
|
+
va_end(args);
|
42
41
|
#ifdef _MSC_VER
|
43
42
|
if (bytes_written == -1) {
|
44
43
|
// vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
|
@@ -47,15 +46,15 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
47
46
|
// we retry (letting it fail and returning 0 if it doesn't), since there's
|
48
47
|
// no way to smartly resize the buffer.
|
49
48
|
gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
|
50
|
-
|
51
|
-
|
49
|
+
va_start(args, format);
|
50
|
+
int result = vsnprintf(
|
51
|
+
output->data + output->length, remaining_capacity, format, args);
|
52
52
|
va_end(args);
|
53
53
|
return result == -1 ? 0 : result;
|
54
54
|
}
|
55
55
|
#else
|
56
56
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
57
57
|
if (bytes_written == -1) {
|
58
|
-
va_end(args);
|
59
58
|
return 0;
|
60
59
|
}
|
61
60
|
#endif
|
@@ -64,19 +63,19 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
64
63
|
gumbo_string_buffer_reserve(
|
65
64
|
parser, output->capacity + bytes_written, output);
|
66
65
|
remaining_capacity = output->capacity - output->length;
|
67
|
-
|
68
|
-
|
66
|
+
va_start(args, format);
|
67
|
+
bytes_written = vsnprintf(
|
68
|
+
output->data + output->length, remaining_capacity, format, args);
|
69
|
+
va_end(args);
|
69
70
|
}
|
70
71
|
output->length += bytes_written;
|
71
|
-
va_end(args);
|
72
72
|
return bytes_written;
|
73
73
|
}
|
74
74
|
|
75
|
-
static void print_tag_stack(
|
76
|
-
GumboParser* parser, const GumboParserError* error,
|
75
|
+
static void print_tag_stack(GumboParser* parser, const GumboParserError* error,
|
77
76
|
GumboStringBuffer* output) {
|
78
77
|
print_message(parser, output, " Currently open tags: ");
|
79
|
-
for (int i = 0; i < error->tag_stack.length; ++i) {
|
78
|
+
for (unsigned int i = 0; i < error->tag_stack.length; ++i) {
|
80
79
|
if (i) {
|
81
80
|
print_message(parser, output, ", ");
|
82
81
|
}
|
@@ -87,12 +86,11 @@ static void print_tag_stack(
|
|
87
86
|
}
|
88
87
|
|
89
88
|
static void handle_parser_error(GumboParser* parser,
|
90
|
-
|
91
|
-
GumboStringBuffer* output) {
|
89
|
+
const GumboParserError* error, GumboStringBuffer* output) {
|
92
90
|
if (error->parser_state == GUMBO_INSERTION_MODE_INITIAL &&
|
93
91
|
error->input_type != GUMBO_TOKEN_DOCTYPE) {
|
94
|
-
print_message(
|
95
|
-
|
92
|
+
print_message(
|
93
|
+
parser, output, "The doctype must be the first token in the document");
|
96
94
|
return;
|
97
95
|
}
|
98
96
|
|
@@ -106,6 +104,7 @@ static void handle_parser_error(GumboParser* parser,
|
|
106
104
|
// But just in case...
|
107
105
|
print_message(parser, output, "Comments aren't legal here");
|
108
106
|
return;
|
107
|
+
case GUMBO_TOKEN_CDATA:
|
109
108
|
case GUMBO_TOKEN_WHITESPACE:
|
110
109
|
case GUMBO_TOKEN_CHARACTER:
|
111
110
|
print_message(parser, output, "Character tokens aren't legal here");
|
@@ -150,13 +149,14 @@ static const char* find_last_newline(
|
|
150
149
|
static const char* find_next_newline(
|
151
150
|
const char* original_text, const char* error_location) {
|
152
151
|
const char* c = error_location;
|
153
|
-
for (; *c && *c != '\n'; ++c)
|
152
|
+
for (; *c && *c != '\n'; ++c)
|
153
|
+
;
|
154
154
|
return c;
|
155
155
|
}
|
156
156
|
|
157
157
|
GumboError* gumbo_add_error(GumboParser* parser) {
|
158
158
|
int max_errors = parser->_options->max_errors;
|
159
|
-
if (max_errors >= 0 && parser->_output->errors.length >= max_errors) {
|
159
|
+
if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
|
160
160
|
return NULL;
|
161
161
|
}
|
162
162
|
GumboError* error = gumbo_parser_allocate(parser, sizeof(GumboError));
|
@@ -166,50 +166,52 @@ GumboError* gumbo_add_error(GumboParser* parser) {
|
|
166
166
|
|
167
167
|
void gumbo_error_to_string(
|
168
168
|
GumboParser* parser, const GumboError* error, GumboStringBuffer* output) {
|
169
|
-
print_message(
|
170
|
-
|
169
|
+
print_message(
|
170
|
+
parser, output, "@%d:%d: ", error->position.line, error->position.column);
|
171
171
|
switch (error->type) {
|
172
172
|
case GUMBO_ERR_UTF8_INVALID:
|
173
|
-
print_message(
|
174
|
-
|
173
|
+
print_message(
|
174
|
+
parser, output, "Invalid UTF8 character 0x%x", error->v.codepoint);
|
175
175
|
break;
|
176
176
|
case GUMBO_ERR_UTF8_TRUNCATED:
|
177
177
|
print_message(parser, output,
|
178
|
-
|
179
|
-
|
178
|
+
"Input stream ends with a truncated UTF8 character 0x%x",
|
179
|
+
error->v.codepoint);
|
180
180
|
break;
|
181
181
|
case GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS:
|
182
|
-
print_message(
|
183
|
-
|
182
|
+
print_message(
|
183
|
+
parser, output, "No digits after &# in numeric character reference");
|
184
184
|
break;
|
185
185
|
case GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON:
|
186
186
|
print_message(parser, output,
|
187
|
-
|
188
|
-
|
187
|
+
"The numeric character reference &#%d should be followed "
|
188
|
+
"by a semicolon",
|
189
|
+
error->v.codepoint);
|
189
190
|
break;
|
190
191
|
case GUMBO_ERR_NUMERIC_CHAR_REF_INVALID:
|
191
192
|
print_message(parser, output,
|
192
|
-
|
193
|
-
|
193
|
+
"The numeric character reference &#%d; encodes an invalid "
|
194
|
+
"unicode codepoint",
|
195
|
+
error->v.codepoint);
|
194
196
|
break;
|
195
197
|
case GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON:
|
196
198
|
// The textual data came from one of the literal strings in the table, and
|
197
199
|
// so it'll be null-terminated.
|
198
200
|
print_message(parser, output,
|
199
|
-
|
200
|
-
|
201
|
+
"The named character reference &%.*s should be followed by a "
|
202
|
+
"semicolon",
|
203
|
+
(int) error->v.text.length, error->v.text.data);
|
201
204
|
break;
|
202
205
|
case GUMBO_ERR_NAMED_CHAR_REF_INVALID:
|
203
206
|
print_message(parser, output,
|
204
|
-
|
205
|
-
|
207
|
+
"The named character reference &%.*s; is not a valid entity name",
|
208
|
+
(int) error->v.text.length, error->v.text.data);
|
206
209
|
break;
|
207
210
|
case GUMBO_ERR_DUPLICATE_ATTR:
|
208
211
|
print_message(parser, output,
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
error->v.duplicate_attr.new_index);
|
212
|
+
"Attribute %s occurs multiple times, at positions %d and %d",
|
213
|
+
error->v.duplicate_attr.name, error->v.duplicate_attr.original_index,
|
214
|
+
error->v.duplicate_attr.new_index);
|
213
215
|
break;
|
214
216
|
case GUMBO_ERR_PARSER:
|
215
217
|
case GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG:
|
@@ -217,21 +219,19 @@ void gumbo_error_to_string(
|
|
217
219
|
break;
|
218
220
|
default:
|
219
221
|
print_message(parser, output,
|
220
|
-
|
222
|
+
"Tokenizer error with an unimplemented error message");
|
221
223
|
break;
|
222
224
|
}
|
223
225
|
gumbo_string_buffer_append_codepoint(parser, '.', output);
|
224
226
|
}
|
225
227
|
|
226
|
-
void gumbo_caret_diagnostic_to_string(
|
227
|
-
|
228
|
-
|
228
|
+
void gumbo_caret_diagnostic_to_string(GumboParser* parser,
|
229
|
+
const GumboError* error, const char* source_text,
|
230
|
+
GumboStringBuffer* output) {
|
229
231
|
gumbo_error_to_string(parser, error, output);
|
230
232
|
|
231
|
-
const char* line_start =
|
232
|
-
|
233
|
-
const char* line_end =
|
234
|
-
find_next_newline(source_text, error->original_text);
|
233
|
+
const char* line_start = find_last_newline(source_text, error->original_text);
|
234
|
+
const char* line_end = find_next_newline(source_text, error->original_text);
|
235
235
|
GumboStringPiece original_line;
|
236
236
|
original_line.data = line_start;
|
237
237
|
original_line.length = line_end - line_start;
|
@@ -272,7 +272,7 @@ void gumbo_init_errors(GumboParser* parser) {
|
|
272
272
|
}
|
273
273
|
|
274
274
|
void gumbo_destroy_errors(GumboParser* parser) {
|
275
|
-
for (int i = 0; i < parser->_output->errors.length; ++i) {
|
275
|
+
for (unsigned int i = 0; i < parser->_output->errors.length; ++i) {
|
276
276
|
gumbo_error_destroy(parser, parser->_output->errors.data[i]);
|
277
277
|
}
|
278
278
|
gumbo_vector_destroy(parser, &parser->_output->errors);
|
data/gumbo-parser/src/error.h
CHANGED
@@ -201,24 +201,22 @@ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
|
|
201
201
|
// responsible for deleting the buffer. (Note that the buffer is allocated with
|
202
202
|
// the allocator specified in the GumboParser config and hence should be freed
|
203
203
|
// by gumbo_parser_deallocate().)
|
204
|
-
void gumbo_error_to_string(
|
205
|
-
|
206
|
-
GumboStringBuffer* output);
|
204
|
+
void gumbo_error_to_string(struct GumboInternalParser* parser,
|
205
|
+
const GumboError* error, GumboStringBuffer* output);
|
207
206
|
|
208
207
|
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
209
208
|
// with a freshly-allocated buffer containing the error message text. The
|
210
209
|
// caller is responsible for deleting the buffer. (Note that the buffer is
|
211
210
|
// allocated with the allocator specified in the GumboParser config and hence
|
212
211
|
// should be freed by gumbo_parser_deallocate().)
|
213
|
-
void gumbo_caret_diagnostic_to_string(
|
214
|
-
|
215
|
-
|
212
|
+
void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
|
213
|
+
const GumboError* error, const char* source_text,
|
214
|
+
GumboStringBuffer* output);
|
216
215
|
|
217
216
|
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
218
217
|
// of writing to a string.
|
219
|
-
void gumbo_print_caret_diagnostic(
|
220
|
-
|
221
|
-
const char* source_text);
|
218
|
+
void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
|
219
|
+
const GumboError* error, const char* source_text);
|
222
220
|
|
223
221
|
#ifdef __cplusplus
|
224
222
|
}
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -76,7 +76,6 @@ typedef struct {
|
|
76
76
|
*/
|
77
77
|
extern const GumboSourcePosition kGumboEmptySourcePosition;
|
78
78
|
|
79
|
-
|
80
79
|
/**
|
81
80
|
* A struct representing a string or part of a string. Strings within the
|
82
81
|
* parser are represented by a char* and a length; the char* points into
|
@@ -111,7 +110,6 @@ bool gumbo_string_equals(
|
|
111
110
|
bool gumbo_string_equals_ignore_case(
|
112
111
|
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
113
112
|
|
114
|
-
|
115
113
|
/**
|
116
114
|
* A simple vector implementation. This stores a pointer to a data array and a
|
117
115
|
* length. All elements are stored as void*; client code must cast to the
|
@@ -141,8 +139,7 @@ extern const GumboVector kGumboEmptyVector;
|
|
141
139
|
* Returns the first index at which an element appears in this vector (testing
|
142
140
|
* by pointer equality), or -1 if it never does.
|
143
141
|
*/
|
144
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element);
|
145
|
-
|
142
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
146
143
|
|
147
144
|
/**
|
148
145
|
* An enum for all the tags defined in the HTML5 standard. These correspond to
|
@@ -157,172 +154,10 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
|
|
157
154
|
* strings.
|
158
155
|
*/
|
159
156
|
typedef enum {
|
160
|
-
|
161
|
-
|
162
|
-
//
|
163
|
-
|
164
|
-
GUMBO_TAG_TITLE,
|
165
|
-
GUMBO_TAG_BASE,
|
166
|
-
GUMBO_TAG_LINK,
|
167
|
-
GUMBO_TAG_META,
|
168
|
-
GUMBO_TAG_STYLE,
|
169
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
170
|
-
GUMBO_TAG_SCRIPT,
|
171
|
-
GUMBO_TAG_NOSCRIPT,
|
172
|
-
GUMBO_TAG_TEMPLATE,
|
173
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
174
|
-
GUMBO_TAG_BODY,
|
175
|
-
GUMBO_TAG_ARTICLE,
|
176
|
-
GUMBO_TAG_SECTION,
|
177
|
-
GUMBO_TAG_NAV,
|
178
|
-
GUMBO_TAG_ASIDE,
|
179
|
-
GUMBO_TAG_H1,
|
180
|
-
GUMBO_TAG_H2,
|
181
|
-
GUMBO_TAG_H3,
|
182
|
-
GUMBO_TAG_H4,
|
183
|
-
GUMBO_TAG_H5,
|
184
|
-
GUMBO_TAG_H6,
|
185
|
-
GUMBO_TAG_HGROUP,
|
186
|
-
GUMBO_TAG_HEADER,
|
187
|
-
GUMBO_TAG_FOOTER,
|
188
|
-
GUMBO_TAG_ADDRESS,
|
189
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
|
190
|
-
GUMBO_TAG_P,
|
191
|
-
GUMBO_TAG_HR,
|
192
|
-
GUMBO_TAG_PRE,
|
193
|
-
GUMBO_TAG_BLOCKQUOTE,
|
194
|
-
GUMBO_TAG_OL,
|
195
|
-
GUMBO_TAG_UL,
|
196
|
-
GUMBO_TAG_LI,
|
197
|
-
GUMBO_TAG_DL,
|
198
|
-
GUMBO_TAG_DT,
|
199
|
-
GUMBO_TAG_DD,
|
200
|
-
GUMBO_TAG_FIGURE,
|
201
|
-
GUMBO_TAG_FIGCAPTION,
|
202
|
-
GUMBO_TAG_MAIN,
|
203
|
-
GUMBO_TAG_DIV,
|
204
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
205
|
-
GUMBO_TAG_A,
|
206
|
-
GUMBO_TAG_EM,
|
207
|
-
GUMBO_TAG_STRONG,
|
208
|
-
GUMBO_TAG_SMALL,
|
209
|
-
GUMBO_TAG_S,
|
210
|
-
GUMBO_TAG_CITE,
|
211
|
-
GUMBO_TAG_Q,
|
212
|
-
GUMBO_TAG_DFN,
|
213
|
-
GUMBO_TAG_ABBR,
|
214
|
-
GUMBO_TAG_DATA,
|
215
|
-
GUMBO_TAG_TIME,
|
216
|
-
GUMBO_TAG_CODE,
|
217
|
-
GUMBO_TAG_VAR,
|
218
|
-
GUMBO_TAG_SAMP,
|
219
|
-
GUMBO_TAG_KBD,
|
220
|
-
GUMBO_TAG_SUB,
|
221
|
-
GUMBO_TAG_SUP,
|
222
|
-
GUMBO_TAG_I,
|
223
|
-
GUMBO_TAG_B,
|
224
|
-
GUMBO_TAG_U,
|
225
|
-
GUMBO_TAG_MARK,
|
226
|
-
GUMBO_TAG_RUBY,
|
227
|
-
GUMBO_TAG_RT,
|
228
|
-
GUMBO_TAG_RP,
|
229
|
-
GUMBO_TAG_BDI,
|
230
|
-
GUMBO_TAG_BDO,
|
231
|
-
GUMBO_TAG_SPAN,
|
232
|
-
GUMBO_TAG_BR,
|
233
|
-
GUMBO_TAG_WBR,
|
234
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
|
235
|
-
GUMBO_TAG_INS,
|
236
|
-
GUMBO_TAG_DEL,
|
237
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
|
238
|
-
GUMBO_TAG_IMAGE,
|
239
|
-
GUMBO_TAG_IMG,
|
240
|
-
GUMBO_TAG_IFRAME,
|
241
|
-
GUMBO_TAG_EMBED,
|
242
|
-
GUMBO_TAG_OBJECT,
|
243
|
-
GUMBO_TAG_PARAM,
|
244
|
-
GUMBO_TAG_VIDEO,
|
245
|
-
GUMBO_TAG_AUDIO,
|
246
|
-
GUMBO_TAG_SOURCE,
|
247
|
-
GUMBO_TAG_TRACK,
|
248
|
-
GUMBO_TAG_CANVAS,
|
249
|
-
GUMBO_TAG_MAP,
|
250
|
-
GUMBO_TAG_AREA,
|
251
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
|
252
|
-
GUMBO_TAG_MATH,
|
253
|
-
GUMBO_TAG_MI,
|
254
|
-
GUMBO_TAG_MO,
|
255
|
-
GUMBO_TAG_MN,
|
256
|
-
GUMBO_TAG_MS,
|
257
|
-
GUMBO_TAG_MTEXT,
|
258
|
-
GUMBO_TAG_MGLYPH,
|
259
|
-
GUMBO_TAG_MALIGNMARK,
|
260
|
-
GUMBO_TAG_ANNOTATION_XML,
|
261
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
|
262
|
-
GUMBO_TAG_SVG,
|
263
|
-
GUMBO_TAG_FOREIGNOBJECT,
|
264
|
-
GUMBO_TAG_DESC,
|
265
|
-
// SVG title tags will have GUMBO_TAG_TITLE as with HTML.
|
266
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
|
267
|
-
GUMBO_TAG_TABLE,
|
268
|
-
GUMBO_TAG_CAPTION,
|
269
|
-
GUMBO_TAG_COLGROUP,
|
270
|
-
GUMBO_TAG_COL,
|
271
|
-
GUMBO_TAG_TBODY,
|
272
|
-
GUMBO_TAG_THEAD,
|
273
|
-
GUMBO_TAG_TFOOT,
|
274
|
-
GUMBO_TAG_TR,
|
275
|
-
GUMBO_TAG_TD,
|
276
|
-
GUMBO_TAG_TH,
|
277
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
|
278
|
-
GUMBO_TAG_FORM,
|
279
|
-
GUMBO_TAG_FIELDSET,
|
280
|
-
GUMBO_TAG_LEGEND,
|
281
|
-
GUMBO_TAG_LABEL,
|
282
|
-
GUMBO_TAG_INPUT,
|
283
|
-
GUMBO_TAG_BUTTON,
|
284
|
-
GUMBO_TAG_SELECT,
|
285
|
-
GUMBO_TAG_DATALIST,
|
286
|
-
GUMBO_TAG_OPTGROUP,
|
287
|
-
GUMBO_TAG_OPTION,
|
288
|
-
GUMBO_TAG_TEXTAREA,
|
289
|
-
GUMBO_TAG_KEYGEN,
|
290
|
-
GUMBO_TAG_OUTPUT,
|
291
|
-
GUMBO_TAG_PROGRESS,
|
292
|
-
GUMBO_TAG_METER,
|
293
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
294
|
-
GUMBO_TAG_DETAILS,
|
295
|
-
GUMBO_TAG_SUMMARY,
|
296
|
-
GUMBO_TAG_MENU,
|
297
|
-
GUMBO_TAG_MENUITEM,
|
298
|
-
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
299
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
300
|
-
GUMBO_TAG_APPLET,
|
301
|
-
GUMBO_TAG_ACRONYM,
|
302
|
-
GUMBO_TAG_BGSOUND,
|
303
|
-
GUMBO_TAG_DIR,
|
304
|
-
GUMBO_TAG_FRAME,
|
305
|
-
GUMBO_TAG_FRAMESET,
|
306
|
-
GUMBO_TAG_NOFRAMES,
|
307
|
-
GUMBO_TAG_ISINDEX,
|
308
|
-
GUMBO_TAG_LISTING,
|
309
|
-
GUMBO_TAG_XMP,
|
310
|
-
GUMBO_TAG_NEXTID,
|
311
|
-
GUMBO_TAG_NOEMBED,
|
312
|
-
GUMBO_TAG_PLAINTEXT,
|
313
|
-
GUMBO_TAG_RB,
|
314
|
-
GUMBO_TAG_STRIKE,
|
315
|
-
GUMBO_TAG_BASEFONT,
|
316
|
-
GUMBO_TAG_BIG,
|
317
|
-
GUMBO_TAG_BLINK,
|
318
|
-
GUMBO_TAG_CENTER,
|
319
|
-
GUMBO_TAG_FONT,
|
320
|
-
GUMBO_TAG_MARQUEE,
|
321
|
-
GUMBO_TAG_MULTICOL,
|
322
|
-
GUMBO_TAG_NOBR,
|
323
|
-
GUMBO_TAG_SPACER,
|
324
|
-
GUMBO_TAG_TT,
|
325
|
-
// Used for all tags that don't have special handling in HTML.
|
157
|
+
// Load all the tags from an external source, generated from tag.in.
|
158
|
+
#include "tag_enum.h"
|
159
|
+
// Used for all tags that don't have special handling in HTML. Add new tags
|
160
|
+
// to the end of tag.in so as to preserve backwards-compatibility.
|
326
161
|
GUMBO_TAG_UNKNOWN,
|
327
162
|
// A marker value to indicate the end of the enum, for iterating over it.
|
328
163
|
// Also used as the terminator for varargs functions that take tags.
|
@@ -364,9 +199,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
|
364
199
|
|
365
200
|
/**
|
366
201
|
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
367
|
-
* enum.
|
202
|
+
* enum. The `tag` version expects `tagname` to be NULL-terminated
|
368
203
|
*/
|
369
204
|
GumboTag gumbo_tag_enum(const char* tagname);
|
205
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
370
206
|
|
371
207
|
/**
|
372
208
|
* Attribute namespaces.
|
@@ -461,10 +297,16 @@ typedef enum {
|
|
461
297
|
GUMBO_NODE_TEXT,
|
462
298
|
/** CDATA node. v will be a GumboText. */
|
463
299
|
GUMBO_NODE_CDATA,
|
464
|
-
/** Comment node. v
|
300
|
+
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
465
301
|
GUMBO_NODE_COMMENT,
|
466
302
|
/** Text node, where all contents is whitespace. v will be a GumboText. */
|
467
|
-
GUMBO_NODE_WHITESPACE
|
303
|
+
GUMBO_NODE_WHITESPACE,
|
304
|
+
/** Template node. This is separate from GUMBO_NODE_ELEMENT because many
|
305
|
+
* client libraries will want to ignore the contents of template nodes, as
|
306
|
+
* the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
|
307
|
+
* here, while clients that want to include template contents should also
|
308
|
+
* check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
|
309
|
+
GUMBO_NODE_TEMPLATE
|
468
310
|
} GumboNodeType;
|
469
311
|
|
470
312
|
/**
|
@@ -473,7 +315,9 @@ typedef enum {
|
|
473
315
|
*/
|
474
316
|
typedef struct GumboInternalNode GumboNode;
|
475
317
|
|
476
|
-
/**
|
318
|
+
/**
|
319
|
+
* http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
|
320
|
+
*/
|
477
321
|
typedef enum {
|
478
322
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
479
323
|
GUMBO_DOCTYPE_QUIRKS,
|
@@ -571,7 +415,6 @@ typedef enum {
|
|
571
415
|
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
572
416
|
} GumboParseFlags;
|
573
417
|
|
574
|
-
|
575
418
|
/**
|
576
419
|
* Information specific to document nodes.
|
577
420
|
*/
|
@@ -690,9 +533,9 @@ struct GumboInternalNode {
|
|
690
533
|
|
691
534
|
/** The actual node data. */
|
692
535
|
union {
|
693
|
-
GumboDocument document;
|
694
|
-
GumboElement element;
|
695
|
-
GumboText text;
|
536
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
537
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT.
|
538
|
+
GumboText text; // For everything else.
|
696
539
|
} v;
|
697
540
|
};
|
698
541
|
|
@@ -750,6 +593,29 @@ typedef struct GumboInternalOptions {
|
|
750
593
|
* Default: -1
|
751
594
|
*/
|
752
595
|
int max_errors;
|
596
|
+
|
597
|
+
/**
|
598
|
+
* The fragment context for parsing:
|
599
|
+
* https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
|
600
|
+
*
|
601
|
+
* If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
|
602
|
+
* the regular parsing algorithm. Otherwise, pass the tag enum for the
|
603
|
+
* intended parent of the parsed fragment. We use just the tag enum rather
|
604
|
+
* than a full node because that's enough to set all the parsing context we
|
605
|
+
* need, and it provides some additional flexibility for client code to act as
|
606
|
+
* if parsing a fragment even when a full HTML tree isn't available.
|
607
|
+
*
|
608
|
+
* Default: GUMBO_TAG_LAST
|
609
|
+
*/
|
610
|
+
GumboTag fragment_context;
|
611
|
+
|
612
|
+
/**
|
613
|
+
* The namespace for the fragment context. This lets client code
|
614
|
+
* differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
|
615
|
+
* HTML.
|
616
|
+
* Default: GUMBO_NAMESPACE_HTML
|
617
|
+
*/
|
618
|
+
GumboNamespaceEnum fragment_namespace;
|
753
619
|
} GumboOptions;
|
754
620
|
|
755
621
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
@@ -796,9 +662,7 @@ GumboOutput* gumbo_parse_with_options(
|
|
796
662
|
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
797
663
|
|
798
664
|
/** Release the memory used for the parse tree & parse errors. */
|
799
|
-
void gumbo_destroy_output(
|
800
|
-
const GumboOptions* options, GumboOutput* output);
|
801
|
-
|
665
|
+
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
|
802
666
|
|
803
667
|
#ifdef __cplusplus
|
804
668
|
}
|