nokogumbo 1.5.0 → 2.0.0.pre.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
data/gumbo-parser/src/error.h
CHANGED
@@ -1,26 +1,6 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// Error types, enums, and handling functions.
|
18
|
-
|
19
1
|
#ifndef GUMBO_ERROR_H_
|
20
2
|
#define GUMBO_ERROR_H_
|
21
|
-
|
22
|
-
#define _CRT_SECURE_NO_WARNINGS
|
23
|
-
#endif
|
3
|
+
|
24
4
|
#include <stdint.h>
|
25
5
|
|
26
6
|
#include "gumbo.h"
|
@@ -77,11 +57,12 @@ typedef enum {
|
|
77
57
|
GUMBO_ERR_DOCTYPE_END,
|
78
58
|
GUMBO_ERR_PARSER,
|
79
59
|
GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
|
60
|
+
GUMBO_ERR_SELF_CLOSING_END_TAG,
|
80
61
|
} GumboErrorType;
|
81
62
|
|
82
63
|
// Additional data for duplicated attributes.
|
83
64
|
typedef struct GumboInternalDuplicateAttrError {
|
84
|
-
// The name of the attribute.
|
65
|
+
// The name of the attribute. Owned by this struct.
|
85
66
|
const char* name;
|
86
67
|
|
87
68
|
// The (0-based) index within the attributes vector of the original
|
@@ -93,7 +74,7 @@ typedef struct GumboInternalDuplicateAttrError {
|
|
93
74
|
} GumboDuplicateAttrError;
|
94
75
|
|
95
76
|
// A simplified representation of the tokenizer state, designed to be more
|
96
|
-
// useful to clients of this library than the internal representation.
|
77
|
+
// useful to clients of this library than the internal representation. This
|
97
78
|
// condenses the actual states used in the tokenizer state machine into a few
|
98
79
|
// values that will be familiar to users of HTML.
|
99
80
|
typedef enum {
|
@@ -129,20 +110,20 @@ typedef struct GumboInternalParserError {
|
|
129
110
|
// The type of input token that resulted in this error.
|
130
111
|
GumboTokenType input_type;
|
131
112
|
|
132
|
-
// The HTML tag of the input token.
|
113
|
+
// The HTML tag of the input token. GUMBO_TAG_UNKNOWN if this was not a tag token.
|
133
114
|
GumboTag input_tag;
|
134
115
|
|
135
116
|
// The insertion mode that the parser was in at the time.
|
136
117
|
GumboInsertionMode parser_state;
|
137
118
|
|
138
|
-
// The tag stack at the point of the error.
|
119
|
+
// The tag stack at the point of the error. Note that this is a GumboVector
|
139
120
|
// of GumboTag's *stored by value* - cast the void* to a GumboTag directly to
|
140
121
|
// get at the tag.
|
141
122
|
GumboVector /* GumboTag */ tag_stack;
|
142
123
|
} GumboParserError;
|
143
124
|
|
144
125
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
145
|
-
// the HTML.
|
126
|
+
// the HTML. This contains an enumerated type flag, a source position, and then
|
146
127
|
// a union of fields containing data specific to the error.
|
147
128
|
typedef struct GumboInternalError {
|
148
129
|
// The type of error.
|
@@ -163,7 +144,7 @@ typedef struct GumboInternalError {
|
|
163
144
|
// * GUMBO_ERR_UTF8_TRUNCATED
|
164
145
|
// * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
|
165
146
|
// * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
|
166
|
-
|
147
|
+
uint32_t codepoint;
|
167
148
|
|
168
149
|
// Tokenizer errors.
|
169
150
|
GumboTokenizerError tokenizer;
|
@@ -183,7 +164,7 @@ typedef struct GumboInternalError {
|
|
183
164
|
} GumboError;
|
184
165
|
|
185
166
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
186
|
-
// that clients can fill out the rest of its fields.
|
167
|
+
// that clients can fill out the rest of its fields. May return NULL if we're
|
187
168
|
// already over the max_errors field specified in GumboOptions.
|
188
169
|
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
|
189
170
|
|
@@ -194,32 +175,36 @@ void gumbo_init_errors(struct GumboInternalParser* errors);
|
|
194
175
|
void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
195
176
|
|
196
177
|
// Frees the memory used for a single GumboError.
|
197
|
-
void gumbo_error_destroy(
|
198
|
-
|
199
|
-
// Prints an error to a string.
|
200
|
-
// freshly-allocated buffer containing the error message text.
|
201
|
-
// responsible for
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
// Prints a caret diagnostic to a string.
|
208
|
-
// with a freshly-allocated buffer containing the error message text.
|
209
|
-
// caller is responsible for
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
178
|
+
void gumbo_error_destroy(GumboError* error);
|
179
|
+
|
180
|
+
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
181
|
+
// freshly-allocated buffer containing the error message text. The caller is
|
182
|
+
// responsible for freeing the buffer.
|
183
|
+
void gumbo_error_to_string (
|
184
|
+
const GumboError* error,
|
185
|
+
GumboStringBuffer* output
|
186
|
+
);
|
187
|
+
|
188
|
+
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
189
|
+
// with a freshly-allocated buffer containing the error message text. The
|
190
|
+
// caller is responsible for freeing the buffer.
|
191
|
+
void gumbo_caret_diagnostic_to_string (
|
192
|
+
const GumboError* error,
|
193
|
+
const char* source_text,
|
194
|
+
size_t source_length,
|
195
|
+
GumboStringBuffer* output
|
196
|
+
);
|
215
197
|
|
216
198
|
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
217
199
|
// of writing to a string.
|
218
|
-
void gumbo_print_caret_diagnostic(
|
219
|
-
|
200
|
+
void gumbo_print_caret_diagnostic (
|
201
|
+
const GumboError* error,
|
202
|
+
const char* source_text,
|
203
|
+
size_t source_length
|
204
|
+
);
|
220
205
|
|
221
206
|
#ifdef __cplusplus
|
222
207
|
}
|
223
208
|
#endif
|
224
209
|
|
225
|
-
#endif
|
210
|
+
#endif // GUMBO_ERROR_H_
|
@@ -0,0 +1,104 @@
|
|
1
|
+
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
+
/* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
|
3
|
+
/* Computed positions: -k'2,8' */
|
4
|
+
/* Filtered by: mk/gperf-filter.sed */
|
5
|
+
|
6
|
+
#include "replacement.h"
|
7
|
+
#include "macros.h"
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
#define TOTAL_KEYWORDS 11
|
11
|
+
#define MIN_WORD_LENGTH 5
|
12
|
+
#define MAX_WORD_LENGTH 13
|
13
|
+
#define MIN_HASH_VALUE 0
|
14
|
+
#define MAX_HASH_VALUE 10
|
15
|
+
/* maximum key range = 11, duplicates = 0 */
|
16
|
+
|
17
|
+
static inline unsigned int
|
18
|
+
hash (register const char *str, register size_t len)
|
19
|
+
{
|
20
|
+
static const unsigned char asso_values[] =
|
21
|
+
{
|
22
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
23
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
24
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
25
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
26
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
27
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
28
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
29
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
30
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
31
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 2,
|
32
|
+
11, 10, 11, 9, 7, 6, 11, 11, 1, 0,
|
33
|
+
11, 5, 11, 11, 4, 11, 11, 11, 11, 11,
|
34
|
+
11, 3, 11, 11, 11, 11, 11, 11, 11, 11,
|
35
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
36
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
37
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
38
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
39
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
40
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
41
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
42
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
43
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
44
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
45
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
46
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
47
|
+
11, 11, 11, 11, 11, 11
|
48
|
+
};
|
49
|
+
register unsigned int hval = 0;
|
50
|
+
|
51
|
+
switch (len)
|
52
|
+
{
|
53
|
+
default:
|
54
|
+
hval += asso_values[(unsigned char)str[7]];
|
55
|
+
/*FALLTHROUGH*/
|
56
|
+
case 7:
|
57
|
+
case 6:
|
58
|
+
case 5:
|
59
|
+
case 4:
|
60
|
+
case 3:
|
61
|
+
case 2:
|
62
|
+
hval += asso_values[(unsigned char)str[1]];
|
63
|
+
break;
|
64
|
+
}
|
65
|
+
return hval;
|
66
|
+
}
|
67
|
+
|
68
|
+
const ForeignAttrReplacement *
|
69
|
+
gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
|
70
|
+
{
|
71
|
+
static const unsigned char lengthtable[] =
|
72
|
+
{
|
73
|
+
5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
|
74
|
+
};
|
75
|
+
static const ForeignAttrReplacement wordlist[] =
|
76
|
+
{
|
77
|
+
{"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
|
78
|
+
{"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
|
79
|
+
{"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
|
80
|
+
{"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
|
81
|
+
{"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
|
82
|
+
{"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
|
83
|
+
{"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
|
84
|
+
{"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
|
85
|
+
{"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
|
86
|
+
{"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
|
87
|
+
{"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
|
88
|
+
};
|
89
|
+
|
90
|
+
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
|
91
|
+
{
|
92
|
+
register unsigned int key = hash (str, len);
|
93
|
+
|
94
|
+
if (key <= MAX_HASH_VALUE)
|
95
|
+
if (len == lengthtable[key])
|
96
|
+
{
|
97
|
+
register const char *s = wordlist[key].from;
|
98
|
+
|
99
|
+
if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
|
100
|
+
return &wordlist[key];
|
101
|
+
}
|
102
|
+
}
|
103
|
+
return 0;
|
104
|
+
}
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -1,51 +1,33 @@
|
|
1
|
-
// Copyright 2010 Google Inc.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License,
|
4
|
-
|
5
|
-
//
|
6
|
-
//
|
7
|
-
//
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
|
18
|
-
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
|
19
|
-
// kGumbo prefix).
|
1
|
+
// Copyright 2010 Google Inc.
|
2
|
+
// Copyright 2018 Craig Barnes.
|
3
|
+
// Licensed under the Apache License, version 2.0.
|
4
|
+
|
5
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
|
6
|
+
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
|
7
|
+
// static constants.
|
20
8
|
|
21
9
|
/**
|
22
10
|
* @file
|
23
11
|
* @mainpage Gumbo HTML Parser
|
24
12
|
*
|
25
|
-
* This provides a conformant, no-dependencies implementation of the
|
26
|
-
* parsing algorithm.
|
27
|
-
* encoding, run a preprocessing step to convert
|
28
|
-
* tree made of the structs in this file.
|
13
|
+
* This provides a conformant, no-dependencies implementation of the
|
14
|
+
* [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
|
15
|
+
* to parse a different encoding, run a preprocessing step to convert
|
16
|
+
* to UTF-8. It returns a parse tree made of the structs in this file.
|
29
17
|
*
|
30
18
|
* Example:
|
31
19
|
* @code
|
32
20
|
* GumboOutput* output = gumbo_parse(input);
|
33
21
|
* do_something_with_doctype(output->document);
|
34
22
|
* do_something_with_html_tree(output->root);
|
35
|
-
* gumbo_destroy_output(
|
23
|
+
* gumbo_destroy_output(output);
|
36
24
|
* @endcode
|
37
|
-
* HTML5 Spec:
|
38
25
|
*
|
39
|
-
*
|
26
|
+
* [HTML5]: https://html.spec.whatwg.org/multipage/
|
40
27
|
*/
|
41
28
|
|
42
|
-
#ifndef
|
43
|
-
#define
|
44
|
-
|
45
|
-
#ifdef _MSC_VER
|
46
|
-
#define _CRT_SECURE_NO_WARNINGS
|
47
|
-
#define fileno _fileno
|
48
|
-
#endif
|
29
|
+
#ifndef GUMBO_H
|
30
|
+
#define GUMBO_H
|
49
31
|
|
50
32
|
#include <stdbool.h>
|
51
33
|
#include <stddef.h>
|
@@ -55,73 +37,77 @@ extern "C" {
|
|
55
37
|
#endif
|
56
38
|
|
57
39
|
/**
|
58
|
-
* A struct representing a character position within the original text
|
59
|
-
* Line and column numbers are 1-based and offsets are 0-based,
|
60
|
-
* how most editors and command-line tools work.
|
61
|
-
* positions in terms of characters while offsets measure by bytes; this is
|
62
|
-
* because the offset field is often used to pull out a particular region of
|
63
|
-
* text (which in most languages that bind to C implies pointer arithmetic on a
|
64
|
-
* buffer of bytes), while the column field is often used to reference a
|
65
|
-
* particular column on a printable display, which nowadays is usually UTF-8.
|
40
|
+
* A struct representing a character position within the original text
|
41
|
+
* buffer. Line and column numbers are 1-based and offsets are 0-based,
|
42
|
+
* which matches how most editors and command-line tools work.
|
66
43
|
*/
|
67
44
|
typedef struct {
|
68
|
-
|
69
|
-
|
70
|
-
|
45
|
+
size_t line;
|
46
|
+
size_t column;
|
47
|
+
size_t offset;
|
71
48
|
} GumboSourcePosition;
|
72
49
|
|
73
50
|
/**
|
74
|
-
* A
|
75
|
-
* parser
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
*
|
81
|
-
* parser are represented by a char* and a length; the char* points into
|
82
|
-
* an existing data buffer owned by some other code (often the original input).
|
83
|
-
* GumboStringPieces are assumed (by convention) to be immutable, because they
|
84
|
-
* may share data. Use GumboStringBuffer if you need to construct a string.
|
85
|
-
* Clients should assume that it is not NUL-terminated, and should always use
|
86
|
-
* explicit lengths when manipulating them.
|
51
|
+
* A struct representing a string or part of a string. Strings within
|
52
|
+
* the parser are represented by a `char*` and a length; the `char*`
|
53
|
+
* points into an existing data buffer owned by some other code (often
|
54
|
+
* the original input). `GumboStringPiece`s are assumed (by convention)
|
55
|
+
* to be immutable, because they may share data. Clients should assume
|
56
|
+
* that it is not NUL-terminated and should always use explicit lengths
|
57
|
+
* when manipulating them.
|
87
58
|
*/
|
88
59
|
typedef struct {
|
89
|
-
/** A pointer to the beginning of the string.
|
60
|
+
/** A pointer to the beginning of the string. `NULL` if `length == 0`. */
|
90
61
|
const char* data;
|
91
62
|
|
92
|
-
/** The length of the string fragment, in bytes
|
63
|
+
/** The length of the string fragment, in bytes (may be zero). */
|
93
64
|
size_t length;
|
94
65
|
} GumboStringPiece;
|
95
66
|
|
67
|
+
#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
|
96
68
|
/** A constant to represent a 0-length null string. */
|
97
|
-
|
69
|
+
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
|
70
|
+
|
71
|
+
/**
|
72
|
+
* Compares two `GumboStringPiece`s, and returns `true` if they're
|
73
|
+
* equal or `false` otherwise.
|
74
|
+
*/
|
75
|
+
bool gumbo_string_equals (
|
76
|
+
const GumboStringPiece* str1,
|
77
|
+
const GumboStringPiece* str2
|
78
|
+
);
|
98
79
|
|
99
80
|
/**
|
100
|
-
* Compares two
|
101
|
-
* otherwise.
|
81
|
+
* Compares two `GumboStringPiece`s, ignoring case, and returns `true`
|
82
|
+
* if they're equal or `false` otherwise.
|
102
83
|
*/
|
103
|
-
bool
|
104
|
-
|
84
|
+
bool gumbo_string_equals_ignore_case (
|
85
|
+
const GumboStringPiece* str1,
|
86
|
+
const GumboStringPiece* str2
|
87
|
+
);
|
105
88
|
|
106
89
|
/**
|
107
|
-
*
|
108
|
-
*
|
90
|
+
* Check if the first `GumboStringPiece` is a prefix of the second, ignoring
|
91
|
+
* case.
|
109
92
|
*/
|
110
|
-
bool
|
111
|
-
|
93
|
+
bool gumbo_string_prefix_ignore_case (
|
94
|
+
const GumboStringPiece* prefix,
|
95
|
+
const GumboStringPiece* str
|
96
|
+
);
|
112
97
|
|
113
98
|
/**
|
114
|
-
* A simple vector implementation.
|
115
|
-
* length.
|
116
|
-
* appropriate type.
|
117
|
-
* array, with the size doubling to maintain
|
118
|
-
* removal function, as this isn't
|
119
|
-
*
|
120
|
-
* a for
|
99
|
+
* A simple vector implementation. This stores a pointer to a data array
|
100
|
+
* and a length. All elements are stored as `void*`; client code must
|
101
|
+
* cast to the appropriate type. Overflows upon addition result in
|
102
|
+
* reallocation of the data array, with the size doubling to maintain
|
103
|
+
* `O(1)` amortized cost. There is no removal function, as this isn't
|
104
|
+
* needed for any of the operations within this library. Iteration can
|
105
|
+
* be done through inspecting the structure directly in a `for` loop.
|
121
106
|
*/
|
122
107
|
typedef struct {
|
123
|
-
/**
|
124
|
-
* elements
|
108
|
+
/**
|
109
|
+
* Data elements. This points to a dynamically-allocated array of
|
110
|
+
* `capacity` elements, each a `void*` to the element itself.
|
125
111
|
*/
|
126
112
|
void** data;
|
127
113
|
|
@@ -132,82 +118,230 @@ typedef struct {
|
|
132
118
|
unsigned int capacity;
|
133
119
|
} GumboVector;
|
134
120
|
|
135
|
-
|
136
|
-
|
121
|
+
# define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
|
122
|
+
/** An empty (0-length, 0-capacity) `GumboVector`. */
|
123
|
+
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
|
137
124
|
|
138
125
|
/**
|
139
|
-
* Returns the first index at which an element appears in this vector
|
140
|
-
* by pointer equality), or
|
126
|
+
* Returns the first index at which an element appears in this vector
|
127
|
+
* (testing by pointer equality), or `-1` if it never does.
|
141
128
|
*/
|
142
129
|
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
143
130
|
|
144
131
|
/**
|
145
|
-
* An enum for all the tags defined in the HTML5 standard.
|
146
|
-
* the tag names themselves.
|
147
|
-
* the spec itself (or for tags with special
|
148
|
-
*
|
149
|
-
* name can be obtained
|
132
|
+
* An `enum` for all the tags defined in the HTML5 standard. These
|
133
|
+
* correspond to the tag names themselves. Enum constants exist only
|
134
|
+
* for tags that appear in the spec itself (or for tags with special
|
135
|
+
* handling in the SVG and MathML namespaces). Any other tags appear
|
136
|
+
* as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
|
137
|
+
* through `original_tag`.
|
150
138
|
*
|
151
|
-
* This is mostly for API convenience, so that clients of this library
|
152
|
-
* need to perform a strcasecmp to find the normalized tag
|
153
|
-
* efficiency benefits, by letting the parser work
|
154
|
-
* strings.
|
139
|
+
* This is mostly for API convenience, so that clients of this library
|
140
|
+
* don't need to perform a `strcasecmp` to find the normalized tag
|
141
|
+
* name. It also has efficiency benefits, by letting the parser work
|
142
|
+
* with enums instead of strings.
|
155
143
|
*/
|
156
144
|
typedef enum {
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
145
|
+
GUMBO_TAG_HTML,
|
146
|
+
GUMBO_TAG_HEAD,
|
147
|
+
GUMBO_TAG_TITLE,
|
148
|
+
GUMBO_TAG_BASE,
|
149
|
+
GUMBO_TAG_LINK,
|
150
|
+
GUMBO_TAG_META,
|
151
|
+
GUMBO_TAG_STYLE,
|
152
|
+
GUMBO_TAG_SCRIPT,
|
153
|
+
GUMBO_TAG_NOSCRIPT,
|
154
|
+
GUMBO_TAG_TEMPLATE,
|
155
|
+
GUMBO_TAG_BODY,
|
156
|
+
GUMBO_TAG_ARTICLE,
|
157
|
+
GUMBO_TAG_SECTION,
|
158
|
+
GUMBO_TAG_NAV,
|
159
|
+
GUMBO_TAG_ASIDE,
|
160
|
+
GUMBO_TAG_H1,
|
161
|
+
GUMBO_TAG_H2,
|
162
|
+
GUMBO_TAG_H3,
|
163
|
+
GUMBO_TAG_H4,
|
164
|
+
GUMBO_TAG_H5,
|
165
|
+
GUMBO_TAG_H6,
|
166
|
+
GUMBO_TAG_HGROUP,
|
167
|
+
GUMBO_TAG_HEADER,
|
168
|
+
GUMBO_TAG_FOOTER,
|
169
|
+
GUMBO_TAG_ADDRESS,
|
170
|
+
GUMBO_TAG_P,
|
171
|
+
GUMBO_TAG_HR,
|
172
|
+
GUMBO_TAG_PRE,
|
173
|
+
GUMBO_TAG_BLOCKQUOTE,
|
174
|
+
GUMBO_TAG_OL,
|
175
|
+
GUMBO_TAG_UL,
|
176
|
+
GUMBO_TAG_LI,
|
177
|
+
GUMBO_TAG_DL,
|
178
|
+
GUMBO_TAG_DT,
|
179
|
+
GUMBO_TAG_DD,
|
180
|
+
GUMBO_TAG_FIGURE,
|
181
|
+
GUMBO_TAG_FIGCAPTION,
|
182
|
+
GUMBO_TAG_MAIN,
|
183
|
+
GUMBO_TAG_DIV,
|
184
|
+
GUMBO_TAG_A,
|
185
|
+
GUMBO_TAG_EM,
|
186
|
+
GUMBO_TAG_STRONG,
|
187
|
+
GUMBO_TAG_SMALL,
|
188
|
+
GUMBO_TAG_S,
|
189
|
+
GUMBO_TAG_CITE,
|
190
|
+
GUMBO_TAG_Q,
|
191
|
+
GUMBO_TAG_DFN,
|
192
|
+
GUMBO_TAG_ABBR,
|
193
|
+
GUMBO_TAG_DATA,
|
194
|
+
GUMBO_TAG_TIME,
|
195
|
+
GUMBO_TAG_CODE,
|
196
|
+
GUMBO_TAG_VAR,
|
197
|
+
GUMBO_TAG_SAMP,
|
198
|
+
GUMBO_TAG_KBD,
|
199
|
+
GUMBO_TAG_SUB,
|
200
|
+
GUMBO_TAG_SUP,
|
201
|
+
GUMBO_TAG_I,
|
202
|
+
GUMBO_TAG_B,
|
203
|
+
GUMBO_TAG_U,
|
204
|
+
GUMBO_TAG_MARK,
|
205
|
+
GUMBO_TAG_RUBY,
|
206
|
+
GUMBO_TAG_RT,
|
207
|
+
GUMBO_TAG_RP,
|
208
|
+
GUMBO_TAG_BDI,
|
209
|
+
GUMBO_TAG_BDO,
|
210
|
+
GUMBO_TAG_SPAN,
|
211
|
+
GUMBO_TAG_BR,
|
212
|
+
GUMBO_TAG_WBR,
|
213
|
+
GUMBO_TAG_INS,
|
214
|
+
GUMBO_TAG_DEL,
|
215
|
+
GUMBO_TAG_IMAGE,
|
216
|
+
GUMBO_TAG_IMG,
|
217
|
+
GUMBO_TAG_IFRAME,
|
218
|
+
GUMBO_TAG_EMBED,
|
219
|
+
GUMBO_TAG_OBJECT,
|
220
|
+
GUMBO_TAG_PARAM,
|
221
|
+
GUMBO_TAG_VIDEO,
|
222
|
+
GUMBO_TAG_AUDIO,
|
223
|
+
GUMBO_TAG_SOURCE,
|
224
|
+
GUMBO_TAG_TRACK,
|
225
|
+
GUMBO_TAG_CANVAS,
|
226
|
+
GUMBO_TAG_MAP,
|
227
|
+
GUMBO_TAG_AREA,
|
228
|
+
GUMBO_TAG_MATH,
|
229
|
+
GUMBO_TAG_MI,
|
230
|
+
GUMBO_TAG_MO,
|
231
|
+
GUMBO_TAG_MN,
|
232
|
+
GUMBO_TAG_MS,
|
233
|
+
GUMBO_TAG_MTEXT,
|
234
|
+
GUMBO_TAG_MGLYPH,
|
235
|
+
GUMBO_TAG_MALIGNMARK,
|
236
|
+
GUMBO_TAG_ANNOTATION_XML,
|
237
|
+
GUMBO_TAG_SVG,
|
238
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
239
|
+
GUMBO_TAG_DESC,
|
240
|
+
GUMBO_TAG_TABLE,
|
241
|
+
GUMBO_TAG_CAPTION,
|
242
|
+
GUMBO_TAG_COLGROUP,
|
243
|
+
GUMBO_TAG_COL,
|
244
|
+
GUMBO_TAG_TBODY,
|
245
|
+
GUMBO_TAG_THEAD,
|
246
|
+
GUMBO_TAG_TFOOT,
|
247
|
+
GUMBO_TAG_TR,
|
248
|
+
GUMBO_TAG_TD,
|
249
|
+
GUMBO_TAG_TH,
|
250
|
+
GUMBO_TAG_FORM,
|
251
|
+
GUMBO_TAG_FIELDSET,
|
252
|
+
GUMBO_TAG_LEGEND,
|
253
|
+
GUMBO_TAG_LABEL,
|
254
|
+
GUMBO_TAG_INPUT,
|
255
|
+
GUMBO_TAG_BUTTON,
|
256
|
+
GUMBO_TAG_SELECT,
|
257
|
+
GUMBO_TAG_DATALIST,
|
258
|
+
GUMBO_TAG_OPTGROUP,
|
259
|
+
GUMBO_TAG_OPTION,
|
260
|
+
GUMBO_TAG_TEXTAREA,
|
261
|
+
GUMBO_TAG_KEYGEN,
|
262
|
+
GUMBO_TAG_OUTPUT,
|
263
|
+
GUMBO_TAG_PROGRESS,
|
264
|
+
GUMBO_TAG_METER,
|
265
|
+
GUMBO_TAG_DETAILS,
|
266
|
+
GUMBO_TAG_SUMMARY,
|
267
|
+
GUMBO_TAG_MENU,
|
268
|
+
GUMBO_TAG_MENUITEM,
|
269
|
+
GUMBO_TAG_APPLET,
|
270
|
+
GUMBO_TAG_ACRONYM,
|
271
|
+
GUMBO_TAG_BGSOUND,
|
272
|
+
GUMBO_TAG_DIR,
|
273
|
+
GUMBO_TAG_FRAME,
|
274
|
+
GUMBO_TAG_FRAMESET,
|
275
|
+
GUMBO_TAG_NOFRAMES,
|
276
|
+
GUMBO_TAG_LISTING,
|
277
|
+
GUMBO_TAG_XMP,
|
278
|
+
GUMBO_TAG_NEXTID,
|
279
|
+
GUMBO_TAG_NOEMBED,
|
280
|
+
GUMBO_TAG_PLAINTEXT,
|
281
|
+
GUMBO_TAG_RB,
|
282
|
+
GUMBO_TAG_STRIKE,
|
283
|
+
GUMBO_TAG_BASEFONT,
|
284
|
+
GUMBO_TAG_BIG,
|
285
|
+
GUMBO_TAG_BLINK,
|
286
|
+
GUMBO_TAG_CENTER,
|
287
|
+
GUMBO_TAG_FONT,
|
288
|
+
GUMBO_TAG_MARQUEE,
|
289
|
+
GUMBO_TAG_MULTICOL,
|
290
|
+
GUMBO_TAG_NOBR,
|
291
|
+
GUMBO_TAG_SPACER,
|
292
|
+
GUMBO_TAG_TT,
|
293
|
+
GUMBO_TAG_RTC,
|
294
|
+
GUMBO_TAG_DIALOG,
|
295
|
+
// Used for all tags that don't have special handling in HTML.
|
161
296
|
GUMBO_TAG_UNKNOWN,
|
162
297
|
// A marker value to indicate the end of the enum, for iterating over it.
|
163
|
-
// Also used as the terminator for varargs functions that take tags.
|
164
298
|
GUMBO_TAG_LAST,
|
165
299
|
} GumboTag;
|
166
300
|
|
167
301
|
/**
|
168
|
-
* Returns the normalized (
|
169
|
-
*
|
170
|
-
* library.
|
302
|
+
* Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
|
303
|
+
* return value is static data owned by the library.
|
171
304
|
*/
|
172
305
|
const char* gumbo_normalized_tagname(GumboTag tag);
|
173
306
|
|
174
307
|
/**
|
175
|
-
* Extracts the tag name from the original_text field of an element
|
176
|
-
* stripping off
|
177
|
-
* GumboStringPiece appropriately.
|
178
|
-
* shares a buffer with the original
|
179
|
-
* Behavior is undefined if a
|
180
|
-
*
|
181
|
-
*
|
182
|
-
*
|
308
|
+
* Extracts the tag name from the `original_text` field of an element
|
309
|
+
* or token by stripping off `</>` characters and attributes and
|
310
|
+
* adjusting the passed-in `GumboStringPiece` appropriately. The tag
|
311
|
+
* name is in the original case and shares a buffer with the original
|
312
|
+
* text, to simplify memory management. Behavior is undefined if a
|
313
|
+
* string piece that doesn't represent an HTML tag (`<tagname>` or
|
314
|
+
* `</tagname>`) is passed in. If the string piece is completely
|
315
|
+
* empty (`NULL` data pointer), then this function will exit
|
316
|
+
* successfully as a no-op.
|
183
317
|
*/
|
184
318
|
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
185
319
|
|
186
320
|
/**
|
187
|
-
* Fixes the case of SVG elements that are not all lowercase.
|
188
|
-
*
|
189
|
-
*
|
190
|
-
*
|
191
|
-
*
|
192
|
-
*
|
193
|
-
*
|
194
|
-
*
|
195
|
-
*
|
196
|
-
*
|
321
|
+
* Fixes the case of SVG elements that are not all lowercase. This is
|
322
|
+
* not done at parse time because there's no place to store a mutated
|
323
|
+
* tag name. `tag_name` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
|
324
|
+
* SVG tags without special handling), while `original_tag_name` is a
|
325
|
+
* pointer into the original buffer. Instead, we provide this helper
|
326
|
+
* function that clients can use to rename SVG tags as appropriate.
|
327
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or
|
328
|
+
* `NULL` if no normalization is called for. The return value is static
|
329
|
+
* data and owned by the library.
|
330
|
+
*
|
331
|
+
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
197
332
|
*/
|
198
333
|
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
199
334
|
|
200
335
|
/**
|
201
|
-
* Converts a tag name string (which may be in upper or mixed case) to a
|
202
|
-
* enum.
|
336
|
+
* Converts a tag name string (which may be in upper or mixed case) to a
|
337
|
+
* tag enum.
|
203
338
|
*/
|
204
|
-
GumboTag
|
205
|
-
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
339
|
+
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
|
206
340
|
|
207
341
|
/**
|
208
342
|
* Attribute namespaces.
|
209
|
-
* HTML includes special handling for XLink, XML, and XMLNS namespaces
|
210
|
-
* attributes.
|
343
|
+
* HTML includes special handling for XLink, XML, and XMLNS namespaces
|
344
|
+
* on attributes. Everything else goes in the generic "NONE" namespace.
|
211
345
|
*/
|
212
346
|
typedef enum {
|
213
347
|
GUMBO_ATTR_NAMESPACE_NONE,
|
@@ -217,46 +351,47 @@ typedef enum {
|
|
217
351
|
} GumboAttributeNamespaceEnum;
|
218
352
|
|
219
353
|
/**
|
220
|
-
* A struct representing a single attribute on
|
221
|
-
* name-value pair, but also includes information about source locations
|
222
|
-
* original source text.
|
354
|
+
* A struct representing a single attribute on an HTML tag. This is a
|
355
|
+
* name-value pair, but also includes information about source locations
|
356
|
+
* and original source text.
|
223
357
|
*/
|
224
358
|
typedef struct {
|
225
359
|
/**
|
226
|
-
* The namespace for the attribute.
|
227
|
-
* GUMBO_ATTR_NAMESPACE_NONE
|
228
|
-
* values, per:
|
229
|
-
*
|
360
|
+
* The namespace for the attribute. This will usually be
|
361
|
+
* `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
|
362
|
+
* take special values, per:
|
363
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
|
230
364
|
*/
|
231
365
|
GumboAttributeNamespaceEnum attr_namespace;
|
232
366
|
|
233
367
|
/**
|
234
|
-
* The name of the attribute.
|
235
|
-
* with case-normalization
|
368
|
+
* The name of the attribute. This is in a freshly-allocated buffer to
|
369
|
+
* deal with case-normalization and is null-terminated.
|
236
370
|
*/
|
237
371
|
const char* name;
|
238
372
|
|
239
373
|
/**
|
240
|
-
* The original text of the attribute name, as a pointer into the
|
241
|
-
* source buffer.
|
374
|
+
* The original text of the attribute name, as a pointer into the
|
375
|
+
* original source buffer.
|
242
376
|
*/
|
243
377
|
GumboStringPiece original_name;
|
244
378
|
|
245
379
|
/**
|
246
|
-
* The value of the attribute.
|
247
|
-
* with unescaping
|
248
|
-
* that surround the attribute.
|
249
|
-
*
|
380
|
+
* The value of the attribute. This is in a freshly-allocated buffer
|
381
|
+
* to deal with unescaping and is null-terminated. It does not include
|
382
|
+
* any quotes that surround the attribute. If the attribute has no
|
383
|
+
* value (for example, `checked` on a checkbox) this will be an empty
|
384
|
+
* string.
|
250
385
|
*/
|
251
386
|
const char* value;
|
252
387
|
|
253
388
|
/**
|
254
|
-
* The original text of the value of the attribute.
|
255
|
-
* original source buffer.
|
256
|
-
* attribute
|
257
|
-
* original_value.data[original_value.length - 1] to determine what
|
258
|
-
* characters were.
|
259
|
-
* string.
|
389
|
+
* The original text of the value of the attribute. This points into
|
390
|
+
* the original source buffer. It includes any quotes that surround
|
391
|
+
* the attribute and you can look at `original_value.data[0]` and
|
392
|
+
* `original_value.data[original_value.length - 1]` to determine what
|
393
|
+
* the quote characters were. If the attribute has no value this will
|
394
|
+
* be a 0-length string.
|
260
395
|
*/
|
261
396
|
GumboStringPiece original_value;
|
262
397
|
|
@@ -264,9 +399,9 @@ typedef struct {
|
|
264
399
|
GumboSourcePosition name_start;
|
265
400
|
|
266
401
|
/**
|
267
|
-
* The ending position of the attribute name.
|
402
|
+
* The ending position of the attribute name. This is not always derivable
|
268
403
|
* from the starting position of the value because of the possibility of
|
269
|
-
* whitespace around the
|
404
|
+
* whitespace around the `=` sign.
|
270
405
|
*/
|
271
406
|
GumboSourcePosition name_end;
|
272
407
|
|
@@ -278,34 +413,37 @@ typedef struct {
|
|
278
413
|
} GumboAttribute;
|
279
414
|
|
280
415
|
/**
|
281
|
-
* Given a vector of
|
282
|
-
* and return it, or NULL if no such attribute exists.
|
283
|
-
* case-insensitive match, as HTML is case-insensitive.
|
416
|
+
* Given a vector of `GumboAttribute`s, look up the one with the
|
417
|
+
* specified name and return it, or `NULL` if no such attribute exists.
|
418
|
+
* This uses a case-insensitive match, as HTML is case-insensitive.
|
284
419
|
*/
|
285
420
|
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
286
421
|
|
287
422
|
/**
|
288
|
-
* Enum denoting the type of node.
|
289
|
-
* union.
|
423
|
+
* Enum denoting the type of node. This determines the type of the
|
424
|
+
* `node.v` union.
|
290
425
|
*/
|
291
426
|
typedef enum {
|
292
|
-
/** Document node.
|
427
|
+
/** Document node. `v` will be a `GumboDocument`. */
|
293
428
|
GUMBO_NODE_DOCUMENT,
|
294
|
-
/** Element node.
|
429
|
+
/** Element node. `v` will be a `GumboElement`. */
|
295
430
|
GUMBO_NODE_ELEMENT,
|
296
|
-
/** Text node.
|
431
|
+
/** Text node. `v` will be a `GumboText`. */
|
297
432
|
GUMBO_NODE_TEXT,
|
298
|
-
/** CDATA node. v will be a GumboText
|
433
|
+
/** CDATA node. `v` will be a `GumboText`. */
|
299
434
|
GUMBO_NODE_CDATA,
|
300
|
-
/** Comment node.
|
435
|
+
/** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
|
301
436
|
GUMBO_NODE_COMMENT,
|
302
|
-
/** Text node, where all contents is whitespace.
|
437
|
+
/** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
|
303
438
|
GUMBO_NODE_WHITESPACE,
|
304
|
-
/**
|
305
|
-
*
|
306
|
-
*
|
307
|
-
*
|
308
|
-
*
|
439
|
+
/**
|
440
|
+
* Template node. This is separate from `GUMBO_NODE_ELEMENT` because
|
441
|
+
* many client libraries will want to ignore the contents of template
|
442
|
+
* nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
|
443
|
+
* do the right thing here, while clients that want to include template
|
444
|
+
* contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
|
445
|
+
* `GumboElement`.
|
446
|
+
*/
|
309
447
|
GUMBO_NODE_TEMPLATE
|
310
448
|
} GumboNodeType;
|
311
449
|
|
@@ -315,9 +453,7 @@ typedef enum {
|
|
315
453
|
*/
|
316
454
|
typedef struct GumboInternalNode GumboNode;
|
317
455
|
|
318
|
-
/**
|
319
|
-
* http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
|
320
|
-
*/
|
456
|
+
/** https://dom.spec.whatwg.org/#concept-document-quirks */
|
321
457
|
typedef enum {
|
322
458
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
323
459
|
GUMBO_DOCTYPE_QUIRKS,
|
@@ -326,10 +462,11 @@ typedef enum {
|
|
326
462
|
|
327
463
|
/**
|
328
464
|
* Namespaces.
|
329
|
-
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
330
|
-
* anything inside an
|
331
|
-
*
|
332
|
-
*
|
465
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
466
|
+
* Rather, anything inside an `<svg>` tag is in the SVG namespace,
|
467
|
+
* anything inside the `<math>` tag is in the MathML namespace, and
|
468
|
+
* anything else is inside the HTML namespace. No other namespaces are
|
469
|
+
* supported, so this can be an `enum`.
|
333
470
|
*/
|
334
471
|
typedef enum {
|
335
472
|
GUMBO_NAMESPACE_HTML,
|
@@ -339,66 +476,70 @@ typedef enum {
|
|
339
476
|
|
340
477
|
/**
|
341
478
|
* Parse flags.
|
342
|
-
* We track the reasons for parser insertion of nodes and store them in
|
343
|
-
* bitvector in the node itself.
|
344
|
-
* are implied by the HTML structure of the document, or flag
|
345
|
-
* may not be allowed by a style guide, or track the
|
346
|
-
* tricky HTML code.
|
479
|
+
* We track the reasons for parser insertion of nodes and store them in
|
480
|
+
* a bitvector in the node itself. This lets client code optimize out
|
481
|
+
* nodes that are implied by the HTML structure of the document, or flag
|
482
|
+
* constructs that may not be allowed by a style guide, or track the
|
483
|
+
* prevalence of incorrect or tricky HTML code.
|
347
484
|
*/
|
348
485
|
typedef enum {
|
349
486
|
/**
|
350
|
-
* A normal node
|
351
|
-
* been reparented.
|
487
|
+
* A normal node -- both start and end tags appear in the source,
|
488
|
+
* nothing has been reparented.
|
352
489
|
*/
|
353
490
|
GUMBO_INSERTION_NORMAL = 0,
|
354
491
|
|
355
492
|
/**
|
356
|
-
* A node inserted by the parser to fulfill some implicit insertion
|
357
|
-
* This is usually set in addition to some other flag giving a
|
358
|
-
* insertion reason; it's a generic catch-all term
|
359
|
-
* this node did not appear in the document
|
493
|
+
* A node inserted by the parser to fulfill some implicit insertion
|
494
|
+
* rule. This is usually set in addition to some other flag giving a
|
495
|
+
* more specific insertion reason; it's a generic catch-all term
|
496
|
+
* meaning "The start tag for this node did not appear in the document
|
497
|
+
* source".
|
360
498
|
*/
|
361
499
|
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
362
500
|
|
363
501
|
/**
|
364
|
-
* A flag indicating that the end tag for this node did not appear in
|
365
|
-
* document source.
|
366
|
-
* parser-inserted nodes with an explicit end tag
|
367
|
-
* has GUMBO_INSERTED_BY_PARSER set on the
|
368
|
-
* GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the
|
369
|
-
*
|
370
|
-
*
|
371
|
-
*
|
372
|
-
*
|
502
|
+
* A flag indicating that the end tag for this node did not appear in
|
503
|
+
* the document source. Note that in some cases, you can still have
|
504
|
+
* parser-inserted nodes with an explicit end tag. For example,
|
505
|
+
* `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
|
506
|
+
* node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
|
507
|
+
* `</html>` tag actually exists.
|
508
|
+
*
|
509
|
+
* This flag will be set only if the end tag is completely missing.
|
510
|
+
* In some cases, the end tag may be misplaced (e.g. a `</body>` tag
|
511
|
+
* with text afterwards), which will leave this flag unset and require
|
512
|
+
* clients to inspect the parse errors for that case.
|
373
513
|
*/
|
374
514
|
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
375
515
|
|
376
516
|
// Value 1 << 2 was for a flag that has since been removed.
|
377
517
|
|
378
518
|
/**
|
379
|
-
* A flag for nodes that are inserted because their presence is
|
380
|
-
* other tags,
|
519
|
+
* A flag for nodes that are inserted because their presence is
|
520
|
+
* implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
|
521
|
+
* `<tbody>`, etc.
|
381
522
|
*/
|
382
523
|
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
383
524
|
|
384
525
|
/**
|
385
|
-
* A flag for nodes that are converted from their end tag equivalents.
|
386
|
-
* example,
|
387
|
-
* create a
|
388
|
-
* as
|
526
|
+
* A flag for nodes that are converted from their end tag equivalents.
|
527
|
+
* For example, `</p>` when no paragraph is open implies that the
|
528
|
+
* parser should create a `<p>` tag and immediately close it, while
|
529
|
+
* `</br>` means the same thing as `<br>`.
|
389
530
|
*/
|
390
531
|
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
391
532
|
|
392
|
-
|
393
|
-
GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
|
533
|
+
// Value 1 << 5 was for a flag that has since been removed.
|
394
534
|
|
395
|
-
/** A flag for
|
535
|
+
/** A flag for `<image>` tags that are rewritten as `<img>`. */
|
396
536
|
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
397
537
|
|
398
538
|
/**
|
399
|
-
* A flag for nodes that are cloned as a result of the reconstruction
|
400
|
-
* active formatting elements.
|
401
|
-
* portion of the formatting run is a NORMAL node with an
|
539
|
+
* A flag for nodes that are cloned as a result of the reconstruction
|
540
|
+
* of active formatting elements. This is set only on the clone; the
|
541
|
+
* initial portion of the formatting run is a NORMAL node with an
|
542
|
+
* `IMPLICIT_END_TAG`.
|
402
543
|
*/
|
403
544
|
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
404
545
|
|
@@ -415,18 +556,19 @@ typedef enum {
|
|
415
556
|
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
416
557
|
} GumboParseFlags;
|
417
558
|
|
418
|
-
/**
|
419
|
-
* Information specific to document nodes.
|
420
|
-
*/
|
559
|
+
/** Information specific to document nodes. */
|
421
560
|
typedef struct {
|
422
561
|
/**
|
423
|
-
* An array of
|
424
|
-
* normally consist of the
|
425
|
-
* Pointers are owned.
|
562
|
+
* An array of `GumboNode`s, containing the children of this element.
|
563
|
+
* This will normally consist of the `<html>` element and any comment
|
564
|
+
* nodes found. Pointers are owned.
|
426
565
|
*/
|
427
566
|
GumboVector /* GumboNode* */ children;
|
428
567
|
|
429
|
-
|
568
|
+
/**
|
569
|
+
* `true` if there was an explicit doctype token, as opposed to it
|
570
|
+
* being omitted.
|
571
|
+
*/
|
430
572
|
bool has_doctype;
|
431
573
|
|
432
574
|
// Fields from the doctype token, copied verbatim.
|
@@ -435,65 +577,70 @@ typedef struct {
|
|
435
577
|
const char* system_identifier;
|
436
578
|
|
437
579
|
/**
|
438
|
-
* Whether or not the document is in QuirksMode, as determined by the
|
439
|
-
* in the GumboTokenDocType template.
|
580
|
+
* Whether or not the document is in QuirksMode, as determined by the
|
581
|
+
* values in the GumboTokenDocType template.
|
440
582
|
*/
|
441
583
|
GumboQuirksModeEnum doc_type_quirks_mode;
|
442
584
|
} GumboDocument;
|
443
585
|
|
444
586
|
/**
|
445
|
-
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
446
|
-
* This contains just a block of text and its position.
|
587
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
588
|
+
* elements. This contains just a block of text and its position.
|
447
589
|
*/
|
448
590
|
typedef struct {
|
449
591
|
/**
|
450
|
-
* The text of this node, after entities have been parsed and decoded.
|
451
|
-
* comment
|
592
|
+
* The text of this node, after entities have been parsed and decoded.
|
593
|
+
* For comment and cdata nodes, this does not include the comment
|
594
|
+
* delimiters.
|
452
595
|
*/
|
453
596
|
const char* text;
|
454
597
|
|
455
598
|
/**
|
456
|
-
* The original text of this node, as a pointer into the original
|
457
|
-
* comment/cdata nodes, this includes the comment
|
599
|
+
* The original text of this node, as a pointer into the original
|
600
|
+
* buffer. For comment/cdata nodes, this includes the comment
|
601
|
+
* delimiters.
|
458
602
|
*/
|
459
603
|
GumboStringPiece original_text;
|
460
604
|
|
461
605
|
/**
|
462
|
-
* The starting position of this node.
|
463
|
-
* original_text
|
606
|
+
* The starting position of this node. This corresponds to the
|
607
|
+
* position of `original_text`, before entities are decoded.
|
464
608
|
* */
|
465
609
|
GumboSourcePosition start_pos;
|
466
610
|
} GumboText;
|
467
611
|
|
468
612
|
/**
|
469
|
-
* The struct used to represent all HTML elements.
|
470
|
-
* about the tag, attributes, and child nodes.
|
613
|
+
* The struct used to represent all HTML elements. This contains
|
614
|
+
* information about the tag, attributes, and child nodes.
|
471
615
|
*/
|
472
616
|
typedef struct {
|
473
617
|
/**
|
474
|
-
* An array of
|
475
|
-
* are owned.
|
618
|
+
* An array of `GumboNode`s, containing the children of this element.
|
619
|
+
* Pointers are owned.
|
476
620
|
*/
|
477
621
|
GumboVector /* GumboNode* */ children;
|
478
622
|
|
479
623
|
/** The GumboTag enum for this element. */
|
480
624
|
GumboTag tag;
|
481
625
|
|
626
|
+
/** The name for this element. */
|
627
|
+
const char* name;
|
628
|
+
|
482
629
|
/** The GumboNamespaceEnum for this element. */
|
483
630
|
GumboNamespaceEnum tag_namespace;
|
484
631
|
|
485
632
|
/**
|
486
|
-
* A GumboStringPiece pointing to the original tag text for this
|
487
|
-
* pointing directly into the source buffer.
|
488
|
-
* algorithmically (for example,
|
489
|
-
* zero-length string.
|
633
|
+
* A `GumboStringPiece` pointing to the original tag text for this
|
634
|
+
* element, pointing directly into the source buffer. If the tag was
|
635
|
+
* inserted algorithmically (for example, `<head>` or `<tbody>`
|
636
|
+
* insertion), this will be a zero-length string.
|
490
637
|
*/
|
491
638
|
GumboStringPiece original_tag;
|
492
639
|
|
493
640
|
/**
|
494
|
-
* A GumboStringPiece pointing to the original end tag text for this
|
495
|
-
* If the end tag was inserted algorithmically, (for example,
|
496
|
-
* self-closing tag), this will be a zero-length string.
|
641
|
+
* A `GumboStringPiece` pointing to the original end tag text for this
|
642
|
+
* element. If the end tag was inserted algorithmically, (for example,
|
643
|
+
* closing a self-closing tag), this will be a zero-length string.
|
497
644
|
*/
|
498
645
|
GumboStringPiece original_end_tag;
|
499
646
|
|
@@ -504,30 +651,31 @@ typedef struct {
|
|
504
651
|
GumboSourcePosition end_pos;
|
505
652
|
|
506
653
|
/**
|
507
|
-
* An array of
|
508
|
-
* order that they were parsed.
|
654
|
+
* An array of `GumboAttribute`s, containing the attributes for this
|
655
|
+
* tag in the order that they were parsed. Pointers are owned.
|
509
656
|
*/
|
510
657
|
GumboVector /* GumboAttribute* */ attributes;
|
511
658
|
} GumboElement;
|
512
659
|
|
513
660
|
/**
|
514
|
-
* A supertype for GumboElement and GumboText
|
515
|
-
* generic type in lists of children and cast as necessary
|
661
|
+
* A supertype for `GumboElement` and `GumboText`, so that we can
|
662
|
+
* include one generic type in lists of children and cast as necessary
|
663
|
+
* to subtypes.
|
516
664
|
*/
|
517
665
|
struct GumboInternalNode {
|
518
666
|
/** The type of node that this is. */
|
519
667
|
GumboNodeType type;
|
520
668
|
|
521
|
-
/** Pointer back to parent node.
|
669
|
+
/** Pointer back to parent node. Not owned. */
|
522
670
|
GumboNode* parent;
|
523
671
|
|
524
672
|
/** The index within the parent's children vector of this node. */
|
525
|
-
|
673
|
+
unsigned int index_within_parent;
|
526
674
|
|
527
675
|
/**
|
528
|
-
* A bitvector of flags containing information about why this element
|
529
|
-
* inserted into the parse tree, including a variety of special
|
530
|
-
* situations.
|
676
|
+
* A bitvector of flags containing information about why this element
|
677
|
+
* was inserted into the parse tree, including a variety of special
|
678
|
+
* parse situations.
|
531
679
|
*/
|
532
680
|
GumboParseFlags parse_flags;
|
533
681
|
|
@@ -539,133 +687,187 @@ struct GumboInternalNode {
|
|
539
687
|
} v;
|
540
688
|
};
|
541
689
|
|
542
|
-
/**
|
543
|
-
* The type for an allocator function. Takes the 'userdata' member of the
|
544
|
-
* GumboParser struct as its first argument. Semantics should be the same as
|
545
|
-
* malloc, i.e. return a block of size_t bytes on success or NULL on failure.
|
546
|
-
* Allocating a block of 0 bytes behaves as per malloc.
|
547
|
-
*/
|
548
|
-
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
|
549
|
-
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
|
550
|
-
|
551
|
-
/**
|
552
|
-
* The type for a deallocator function. Takes the 'userdata' member of the
|
553
|
-
* GumboParser struct as its first argument.
|
554
|
-
*/
|
555
|
-
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
556
|
-
|
557
690
|
/**
|
558
691
|
* Input struct containing configuration options for the parser.
|
559
|
-
* These let you specify alternate memory managers, provide different
|
560
|
-
* handling, etc.
|
561
|
-
*
|
692
|
+
* These let you configure fragment parsing, set limits, provide different
|
693
|
+
* error handling, etc. Use `kGumboDefaultOptions` for sensible
|
694
|
+
* defaults and only set what you need.
|
562
695
|
*/
|
563
696
|
typedef struct GumboInternalOptions {
|
564
|
-
/** A memory allocator function. Default: malloc. */
|
565
|
-
GumboAllocatorFunction allocator;
|
566
|
-
|
567
|
-
/** A memory deallocator function. Default: free. */
|
568
|
-
GumboDeallocatorFunction deallocator;
|
569
|
-
|
570
697
|
/**
|
571
|
-
*
|
572
|
-
*
|
573
|
-
*/
|
574
|
-
void* userdata;
|
575
|
-
|
576
|
-
/**
|
577
|
-
* The tab-stop size, for computing positions in source code that uses tabs.
|
578
|
-
* Default: 8.
|
698
|
+
* The tab-stop size, for computing positions in HTML files that
|
699
|
+
* use tabs. Default: `8`.
|
579
700
|
*/
|
580
701
|
int tab_stop;
|
581
702
|
|
582
703
|
/**
|
583
704
|
* Whether or not to stop parsing when the first error is encountered.
|
584
|
-
* Default: false
|
705
|
+
* Default: `false`.
|
585
706
|
*/
|
586
707
|
bool stop_on_first_error;
|
587
708
|
|
588
709
|
/**
|
589
|
-
*
|
590
|
-
*
|
591
|
-
*
|
592
|
-
*
|
593
|
-
* Default:
|
710
|
+
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
711
|
+
* the parser will return early with a partial document and the returned
|
712
|
+
* `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TREE_TOO_DEEP`.
|
714
|
+
* Default: `400`.
|
715
|
+
*/
|
716
|
+
unsigned int max_tree_depth;
|
717
|
+
|
718
|
+
/**
|
719
|
+
* The maximum number of errors before the parser stops recording
|
720
|
+
* them. This is provided so that if the page is totally borked, we
|
721
|
+
* don't completely fill up the errors vector and exhaust memory with
|
722
|
+
* useless redundant errors. Set to `-1` to disable the limit.
|
723
|
+
* Default: `-1`.
|
594
724
|
*/
|
595
725
|
int max_errors;
|
596
726
|
|
597
727
|
/**
|
598
728
|
* The fragment context for parsing:
|
599
|
-
* https://html.spec.whatwg.org/multipage/
|
729
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
|
600
730
|
*
|
601
|
-
* If
|
602
|
-
* the regular parsing algorithm.
|
603
|
-
* intended parent of the parsed fragment.
|
604
|
-
*
|
605
|
-
*
|
606
|
-
* if parsing a fragment even when a full HTML tree isn't available.
|
731
|
+
* If `NULL` is passed here, it is assumed to be "no
|
732
|
+
* fragment", i.e. the regular parsing algorithm. Otherwise, pass the
|
733
|
+
* tag name for the intended parent of the parsed fragment. We use the
|
734
|
+
* tag name, namespace, and encoding attribute which are sufficient to
|
735
|
+
* set all of the parsing context needed for fragment parsing.
|
607
736
|
*
|
608
|
-
* Default:
|
737
|
+
* Default: `NULL`.
|
609
738
|
*/
|
610
|
-
|
739
|
+
const char* fragment_context;
|
611
740
|
|
612
741
|
/**
|
613
|
-
* The namespace for the fragment context.
|
614
|
-
* differentiate between, say, parsing a
|
615
|
-
* HTML.
|
616
|
-
*
|
742
|
+
* The namespace for the fragment context. This lets client code
|
743
|
+
* differentiate between, say, parsing a `<title>` tag in SVG vs.
|
744
|
+
* parsing it in HTML.
|
745
|
+
*
|
746
|
+
* Default: `GUMBO_NAMESPACE_HTML`.
|
617
747
|
*/
|
618
748
|
GumboNamespaceEnum fragment_namespace;
|
749
|
+
|
750
|
+
/**
|
751
|
+
* The value of the fragment context's `encoding` attribute, if any.
|
752
|
+
* Set to `NULL` for no `encoding` attribute.
|
753
|
+
*
|
754
|
+
* Default: `NULL`.
|
755
|
+
*/
|
756
|
+
const char* fragment_encoding;
|
757
|
+
|
758
|
+
/**
|
759
|
+
* Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
|
760
|
+
* be looked up using `gumbo_compute_quirks_mode()`.
|
761
|
+
*
|
762
|
+
* Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
|
763
|
+
*/
|
764
|
+
GumboQuirksModeEnum quirks_mode;
|
765
|
+
|
766
|
+
/**
|
767
|
+
* For fragment parsing. Set this to true if the context node has a form
|
768
|
+
* element as an ancestor.
|
769
|
+
*
|
770
|
+
* Default: `false`.
|
771
|
+
*/
|
772
|
+
bool fragment_context_has_form_ancestor;
|
619
773
|
} GumboOptions;
|
620
774
|
|
621
775
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
622
776
|
extern const GumboOptions kGumboDefaultOptions;
|
623
777
|
|
778
|
+
/**
|
779
|
+
* Status code indicating whether parsing finished successfully or
|
780
|
+
* was stopped mid-document due to exceptional circumstances.
|
781
|
+
*/
|
782
|
+
typedef enum {
|
783
|
+
/**
|
784
|
+
* Indicates that parsing completed successfully. The resulting tree
|
785
|
+
* will be a complete document.
|
786
|
+
*/
|
787
|
+
GUMBO_STATUS_OK,
|
788
|
+
|
789
|
+
/**
|
790
|
+
* Indicates that the maximum element nesting limit
|
791
|
+
* (`GumboOptions::max_tree_depth`) was reached during parsing. The
|
792
|
+
* resulting tree will be a partial document, with no further nodes
|
793
|
+
* created after the point where the limit was reached. The partial
|
794
|
+
* document may be useful for constructing an error message but
|
795
|
+
* typically shouldn't be used for other purposes.
|
796
|
+
*/
|
797
|
+
GUMBO_STATUS_TREE_TOO_DEEP,
|
798
|
+
|
799
|
+
// Currently unused
|
800
|
+
GUMBO_STATUS_OUT_OF_MEMORY,
|
801
|
+
} GumboOutputStatus;
|
802
|
+
|
803
|
+
|
624
804
|
/** The output struct containing the results of the parse. */
|
625
805
|
typedef struct GumboInternalOutput {
|
626
806
|
/**
|
627
|
-
* Pointer to the document node.
|
628
|
-
* that contains the entire document as its child.
|
807
|
+
* Pointer to the document node. This is a `GumboNode` of type
|
808
|
+
* `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
|
629
809
|
*/
|
630
810
|
GumboNode* document;
|
631
811
|
|
632
812
|
/**
|
633
|
-
* Pointer to the root node.
|
634
|
-
* document.
|
813
|
+
* Pointer to the root node. This is the `<html>` tag that forms the
|
814
|
+
* root of the document.
|
635
815
|
*/
|
636
816
|
GumboNode* root;
|
637
817
|
|
638
818
|
/**
|
639
819
|
* A list of errors that occurred during the parse.
|
640
820
|
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
641
|
-
* fleshed out and may change in the future.
|
642
|
-
* header isn't part of the public API.
|
821
|
+
* fleshed out and may change in the future. For this reason, the GumboError
|
822
|
+
* header isn't part of the public API. Contact us if you need errors
|
643
823
|
* reported so we can work out something appropriate for your use-case.
|
644
824
|
*/
|
645
825
|
GumboVector /* GumboError */ errors;
|
826
|
+
|
827
|
+
/**
|
828
|
+
* A status code indicating whether parsing finished successfully or was
|
829
|
+
* stopped mid-document due to exceptional circumstances.
|
830
|
+
*/
|
831
|
+
GumboOutputStatus status;
|
646
832
|
} GumboOutput;
|
647
833
|
|
648
834
|
/**
|
649
|
-
* Parses a buffer of
|
650
|
-
* live at least as long as the parse tree, as some fields
|
651
|
-
* point directly into the original buffer.
|
835
|
+
* Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
|
836
|
+
* buffer must live at least as long as the parse tree, as some fields
|
837
|
+
* (e.g. `original_text`) point directly into the original buffer.
|
652
838
|
*
|
653
839
|
* This doesn't support buffers longer than 4 gigabytes.
|
654
840
|
*/
|
655
841
|
GumboOutput* gumbo_parse(const char* buffer);
|
656
842
|
|
657
843
|
/**
|
658
|
-
* Extended version of gumbo_parse that takes an explicit options
|
659
|
-
* buffer, and length.
|
844
|
+
* Extended version of `gumbo_parse` that takes an explicit options
|
845
|
+
* structure, buffer, and length.
|
660
846
|
*/
|
661
|
-
GumboOutput* gumbo_parse_with_options(
|
662
|
-
|
847
|
+
GumboOutput* gumbo_parse_with_options (
|
848
|
+
const GumboOptions* options,
|
849
|
+
const char* buffer,
|
850
|
+
size_t buffer_length
|
851
|
+
);
|
852
|
+
|
853
|
+
/**
|
854
|
+
* Compute the quirks mode based on the name, public identifier, and system
|
855
|
+
* identifier. Any of these may be `NULL` to indicate a missing value.
|
856
|
+
*/
|
857
|
+
GumboQuirksModeEnum gumbo_compute_quirks_mode (
|
858
|
+
const char *name,
|
859
|
+
const char *pubid,
|
860
|
+
const char *sysid
|
861
|
+
);
|
862
|
+
|
863
|
+
/** Convert a `GumboOutputStatus` code into a readable description. */
|
864
|
+
const char* gumbo_status_to_string(GumboOutputStatus status);
|
663
865
|
|
664
|
-
/** Release the memory used for the parse tree
|
665
|
-
void gumbo_destroy_output(
|
866
|
+
/** Release the memory used for the parse tree and parse errors. */
|
867
|
+
void gumbo_destroy_output(GumboOutput* output);
|
666
868
|
|
667
869
|
#ifdef __cplusplus
|
668
870
|
}
|
669
871
|
#endif
|
670
872
|
|
671
|
-
#endif
|
873
|
+
#endif // GUMBO_H
|