nokogumbo 1.4.8 → 1.4.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/gumbo-parser/src/attribute.c +44 -0
- data/gumbo-parser/src/attribute.h +37 -0
- data/gumbo-parser/src/char_ref.c +23069 -0
- data/gumbo-parser/src/char_ref.h +60 -0
- data/gumbo-parser/src/char_ref.rl +2554 -0
- data/gumbo-parser/src/error.c +279 -0
- data/gumbo-parser/src/error.h +225 -0
- data/gumbo-parser/src/gumbo.h +671 -0
- data/gumbo-parser/src/insertion_mode.h +57 -0
- data/gumbo-parser/src/parser.c +4188 -0
- data/gumbo-parser/src/parser.h +57 -0
- data/gumbo-parser/src/string_buffer.c +110 -0
- data/gumbo-parser/src/string_buffer.h +84 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/string_piece.h +38 -0
- data/gumbo-parser/src/tag.c +95 -0
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +41 -0
- data/gumbo-parser/src/tokenizer.c +2897 -0
- data/gumbo-parser/src/tokenizer.h +123 -0
- data/gumbo-parser/src/tokenizer_states.h +103 -0
- data/gumbo-parser/src/utf8.c +270 -0
- data/gumbo-parser/src/utf8.h +132 -0
- data/gumbo-parser/src/util.c +58 -0
- data/gumbo-parser/src/util.h +60 -0
- data/gumbo-parser/src/vector.c +123 -0
- data/gumbo-parser/src/vector.h +67 -0
- data/gumbo-parser/visualc/include/strings.h +4 -0
- metadata +40 -8
@@ -0,0 +1,671 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
//
|
17
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
|
18
|
+
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
|
19
|
+
// kGumbo prefix).
|
20
|
+
|
21
|
+
/**
|
22
|
+
* @file
|
23
|
+
* @mainpage Gumbo HTML Parser
|
24
|
+
*
|
25
|
+
* This provides a conformant, no-dependencies implementation of the HTML5
|
26
|
+
* parsing algorithm. It supports only UTF8; if you need to parse a different
|
27
|
+
* encoding, run a preprocessing step to convert to UTF8. It returns a parse
|
28
|
+
* tree made of the structs in this file.
|
29
|
+
*
|
30
|
+
* Example:
|
31
|
+
* @code
|
32
|
+
* GumboOutput* output = gumbo_parse(input);
|
33
|
+
* do_something_with_doctype(output->document);
|
34
|
+
* do_something_with_html_tree(output->root);
|
35
|
+
* gumbo_destroy_output(&kGumboDefaultOptions, output);
|
36
|
+
* @endcode
|
37
|
+
* HTML5 Spec:
|
38
|
+
*
|
39
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
|
40
|
+
*/
|
41
|
+
|
42
|
+
#ifndef GUMBO_GUMBO_H_
|
43
|
+
#define GUMBO_GUMBO_H_
|
44
|
+
|
45
|
+
#ifdef _MSC_VER
|
46
|
+
#define _CRT_SECURE_NO_WARNINGS
|
47
|
+
#define fileno _fileno
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#include <stdbool.h>
|
51
|
+
#include <stddef.h>
|
52
|
+
|
53
|
+
#ifdef __cplusplus
|
54
|
+
extern "C" {
|
55
|
+
#endif
|
56
|
+
|
57
|
+
/**
|
58
|
+
* A struct representing a character position within the original text buffer.
|
59
|
+
* Line and column numbers are 1-based and offsets are 0-based, which matches
|
60
|
+
* how most editors and command-line tools work. Also, columns measure
|
61
|
+
* positions in terms of characters while offsets measure by bytes; this is
|
62
|
+
* because the offset field is often used to pull out a particular region of
|
63
|
+
* text (which in most languages that bind to C implies pointer arithmetic on a
|
64
|
+
* buffer of bytes), while the column field is often used to reference a
|
65
|
+
* particular column on a printable display, which nowadays is usually UTF-8.
|
66
|
+
*/
|
67
|
+
typedef struct {
|
68
|
+
unsigned int line;
|
69
|
+
unsigned int column;
|
70
|
+
unsigned int offset;
|
71
|
+
} GumboSourcePosition;
|
72
|
+
|
73
|
+
/**
|
74
|
+
* A SourcePosition used for elements that have no source position, i.e.
|
75
|
+
* parser-inserted elements.
|
76
|
+
*/
|
77
|
+
extern const GumboSourcePosition kGumboEmptySourcePosition;
|
78
|
+
|
79
|
+
/**
|
80
|
+
* A struct representing a string or part of a string. Strings within the
|
81
|
+
* parser are represented by a char* and a length; the char* points into
|
82
|
+
* an existing data buffer owned by some other code (often the original input).
|
83
|
+
* GumboStringPieces are assumed (by convention) to be immutable, because they
|
84
|
+
* may share data. Use GumboStringBuffer if you need to construct a string.
|
85
|
+
* Clients should assume that it is not NUL-terminated, and should always use
|
86
|
+
* explicit lengths when manipulating them.
|
87
|
+
*/
|
88
|
+
typedef struct {
|
89
|
+
/** A pointer to the beginning of the string. NULL iff length == 0. */
|
90
|
+
const char* data;
|
91
|
+
|
92
|
+
/** The length of the string fragment, in bytes. May be zero. */
|
93
|
+
size_t length;
|
94
|
+
} GumboStringPiece;
|
95
|
+
|
96
|
+
/** A constant to represent a 0-length null string. */
|
97
|
+
extern const GumboStringPiece kGumboEmptyString;
|
98
|
+
|
99
|
+
/**
|
100
|
+
* Compares two GumboStringPieces, and returns true if they're equal or false
|
101
|
+
* otherwise.
|
102
|
+
*/
|
103
|
+
bool gumbo_string_equals(
|
104
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
105
|
+
|
106
|
+
/**
|
107
|
+
* Compares two GumboStringPieces ignoring case, and returns true if they're
|
108
|
+
* equal or false otherwise.
|
109
|
+
*/
|
110
|
+
bool gumbo_string_equals_ignore_case(
|
111
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
112
|
+
|
113
|
+
/**
|
114
|
+
* A simple vector implementation. This stores a pointer to a data array and a
|
115
|
+
* length. All elements are stored as void*; client code must cast to the
|
116
|
+
* appropriate type. Overflows upon addition result in reallocation of the data
|
117
|
+
* array, with the size doubling to maintain O(1) amortized cost. There is no
|
118
|
+
* removal function, as this isn't needed for any of the operations within this
|
119
|
+
* library. Iteration can be done through inspecting the structure directly in
|
120
|
+
* a for-loop.
|
121
|
+
*/
|
122
|
+
typedef struct {
|
123
|
+
/** Data elements. This points to a dynamically-allocated array of capacity
|
124
|
+
* elements, each a void* to the element itself.
|
125
|
+
*/
|
126
|
+
void** data;
|
127
|
+
|
128
|
+
/** Number of elements currently in the vector. */
|
129
|
+
unsigned int length;
|
130
|
+
|
131
|
+
/** Current array capacity. */
|
132
|
+
unsigned int capacity;
|
133
|
+
} GumboVector;
|
134
|
+
|
135
|
+
/** An empty (0-length, 0-capacity) GumboVector. */
|
136
|
+
extern const GumboVector kGumboEmptyVector;
|
137
|
+
|
138
|
+
/**
|
139
|
+
* Returns the first index at which an element appears in this vector (testing
|
140
|
+
* by pointer equality), or -1 if it never does.
|
141
|
+
*/
|
142
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
143
|
+
|
144
|
+
/**
|
145
|
+
* An enum for all the tags defined in the HTML5 standard. These correspond to
|
146
|
+
* the tag names themselves. Enum constants exist only for tags which appear in
|
147
|
+
* the spec itself (or for tags with special handling in the SVG and MathML
|
148
|
+
* namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
|
149
|
+
* name can be obtained through original_tag.
|
150
|
+
*
|
151
|
+
* This is mostly for API convenience, so that clients of this library don't
|
152
|
+
* need to perform a strcasecmp to find the normalized tag name. It also has
|
153
|
+
* efficiency benefits, by letting the parser work with enums instead of
|
154
|
+
* strings.
|
155
|
+
*/
|
156
|
+
typedef enum {
|
157
|
+
// Load all the tags from an external source, generated from tag.in.
|
158
|
+
#include "tag_enum.h"
|
159
|
+
// Used for all tags that don't have special handling in HTML. Add new tags
|
160
|
+
// to the end of tag.in so as to preserve backwards-compatibility.
|
161
|
+
GUMBO_TAG_UNKNOWN,
|
162
|
+
// A marker value to indicate the end of the enum, for iterating over it.
|
163
|
+
// Also used as the terminator for varargs functions that take tags.
|
164
|
+
GUMBO_TAG_LAST,
|
165
|
+
} GumboTag;
|
166
|
+
|
167
|
+
/**
|
168
|
+
* Returns the normalized (usually all-lowercased, except for foreign content)
|
169
|
+
* tag name for a GumboTag enum. Return value is static data owned by the
|
170
|
+
* library.
|
171
|
+
*/
|
172
|
+
const char* gumbo_normalized_tagname(GumboTag tag);
|
173
|
+
|
174
|
+
/**
|
175
|
+
* Extracts the tag name from the original_text field of an element or token by
|
176
|
+
* stripping off </> characters and attributes and adjusting the passed-in
|
177
|
+
* GumboStringPiece appropriately. The tag name is in the original case and
|
178
|
+
* shares a buffer with the original text, to simplify memory management.
|
179
|
+
* Behavior is undefined if a string-piece that doesn't represent an HTML tag
|
180
|
+
* (<tagname> or </tagname>) is passed in. If the string piece is completely
|
181
|
+
* empty (NULL data pointer), then this function will exit successfully as a
|
182
|
+
* no-op.
|
183
|
+
*/
|
184
|
+
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
185
|
+
|
186
|
+
/**
|
187
|
+
* Fixes the case of SVG elements that are not all lowercase.
|
188
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
|
189
|
+
* This is not done at parse time because there's no place to store a mutated
|
190
|
+
* tag name. The tag field is an enum (which will be GUMBO_TAG_UNKNOWN for most SVG tags
|
191
|
+
* without special handling), while original_tag is a pointer into the
|
192
|
+
* original buffer. Instead, we provide this helper function that clients can
|
193
|
+
* use to rename SVG tags as appropriate.
|
194
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or NULL if
|
195
|
+
* no normalization is called for. The return value is static data and owned by
|
196
|
+
* the library.
|
197
|
+
*/
|
198
|
+
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
199
|
+
|
200
|
+
/**
|
201
|
+
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
202
|
+
* enum. The `tag` version expects `tagname` to be NUL-terminated; the `tagn` version takes an explicit length.
|
203
|
+
*/
|
204
|
+
GumboTag gumbo_tag_enum(const char* tagname);
|
205
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
206
|
+
|
207
|
+
/**
|
208
|
+
* Attribute namespaces.
|
209
|
+
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
|
210
|
+
* attributes. Everything else goes in the generic "NONE" namespace.
|
211
|
+
*/
|
212
|
+
typedef enum {
|
213
|
+
GUMBO_ATTR_NAMESPACE_NONE,
|
214
|
+
GUMBO_ATTR_NAMESPACE_XLINK,
|
215
|
+
GUMBO_ATTR_NAMESPACE_XML,
|
216
|
+
GUMBO_ATTR_NAMESPACE_XMLNS,
|
217
|
+
} GumboAttributeNamespaceEnum;
|
218
|
+
|
219
|
+
/**
|
220
|
+
* A struct representing a single attribute on an HTML tag. This is a
|
221
|
+
* name-value pair, but also includes information about source locations and
|
222
|
+
* original source text.
|
223
|
+
*/
|
224
|
+
typedef struct {
|
225
|
+
/**
|
226
|
+
* The namespace for the attribute. This will usually be
|
227
|
+
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
|
228
|
+
* values, per:
|
229
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
|
230
|
+
*/
|
231
|
+
GumboAttributeNamespaceEnum attr_namespace;
|
232
|
+
|
233
|
+
/**
|
234
|
+
* The name of the attribute. This is in a freshly-allocated buffer to deal
|
235
|
+
* with case-normalization, and is null-terminated.
|
236
|
+
*/
|
237
|
+
const char* name;
|
238
|
+
|
239
|
+
/**
|
240
|
+
* The original text of the attribute name, as a pointer into the original
|
241
|
+
* source buffer.
|
242
|
+
*/
|
243
|
+
GumboStringPiece original_name;
|
244
|
+
|
245
|
+
/**
|
246
|
+
* The value of the attribute. This is in a freshly-allocated buffer to deal
|
247
|
+
* with unescaping, and is null-terminated. It does not include any quotes
|
248
|
+
* that surround the attribute. If the attribute has no value (for example,
|
249
|
+
* 'selected' on a checkbox), this will be an empty string.
|
250
|
+
*/
|
251
|
+
const char* value;
|
252
|
+
|
253
|
+
/**
|
254
|
+
* The original text of the value of the attribute. This points into the
|
255
|
+
* original source buffer. It includes any quotes that surround the
|
256
|
+
* attribute, and you can look at original_value.data[0] and
|
257
|
+
* original_value.data[original_value.length - 1] to determine what the quote
|
258
|
+
* characters were. If the attribute has no value, this will be a 0-length
|
259
|
+
* string.
|
260
|
+
*/
|
261
|
+
GumboStringPiece original_value;
|
262
|
+
|
263
|
+
/** The starting position of the attribute name. */
|
264
|
+
GumboSourcePosition name_start;
|
265
|
+
|
266
|
+
/**
|
267
|
+
* The ending position of the attribute name. This is not always derivable
|
268
|
+
* from the starting position of the value because of the possibility of
|
269
|
+
* whitespace around the = sign.
|
270
|
+
*/
|
271
|
+
GumboSourcePosition name_end;
|
272
|
+
|
273
|
+
/** The starting position of the attribute value. */
|
274
|
+
GumboSourcePosition value_start;
|
275
|
+
|
276
|
+
/** The ending position of the attribute value. */
|
277
|
+
GumboSourcePosition value_end;
|
278
|
+
} GumboAttribute;
|
279
|
+
|
280
|
+
/**
|
281
|
+
* Given a vector of GumboAttributes, look up the one with the specified name
|
282
|
+
* and return it, or NULL if no such attribute exists. This uses a
|
283
|
+
* case-insensitive match, as HTML is case-insensitive.
|
284
|
+
*/
|
285
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
286
|
+
|
287
|
+
/**
|
288
|
+
* Enum denoting the type of node. This determines the type of the node.v
|
289
|
+
* union.
|
290
|
+
*/
|
291
|
+
typedef enum {
|
292
|
+
/** Document node. v will be a GumboDocument. */
|
293
|
+
GUMBO_NODE_DOCUMENT,
|
294
|
+
/** Element node. v will be a GumboElement. */
|
295
|
+
GUMBO_NODE_ELEMENT,
|
296
|
+
/** Text node. v will be a GumboText. */
|
297
|
+
GUMBO_NODE_TEXT,
|
298
|
+
/** CDATA node. v will be a GumboText. */
|
299
|
+
GUMBO_NODE_CDATA,
|
300
|
+
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
301
|
+
GUMBO_NODE_COMMENT,
|
302
|
+
/** Text node, where all contents is whitespace. v will be a GumboText. */
|
303
|
+
GUMBO_NODE_WHITESPACE,
|
304
|
+
/** Template node. This is separate from GUMBO_NODE_ELEMENT because many
|
305
|
+
* client libraries will want to ignore the contents of template nodes, as
|
306
|
+
* the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
|
307
|
+
* here, while clients that want to include template contents should also
|
308
|
+
* check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
|
309
|
+
GUMBO_NODE_TEMPLATE
|
310
|
+
} GumboNodeType;
|
311
|
+
|
312
|
+
/**
|
313
|
+
* Forward declaration of GumboNode so it can be used recursively in
|
314
|
+
* GumboNode.parent.
|
315
|
+
*/
|
316
|
+
typedef struct GumboInternalNode GumboNode;
|
317
|
+
|
318
|
+
/**
|
319
|
+
* http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
|
320
|
+
*/
|
321
|
+
typedef enum {
|
322
|
+
GUMBO_DOCTYPE_NO_QUIRKS,
|
323
|
+
GUMBO_DOCTYPE_QUIRKS,
|
324
|
+
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
325
|
+
} GumboQuirksModeEnum;
|
326
|
+
|
327
|
+
/**
|
328
|
+
* Namespaces.
|
329
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
|
330
|
+
* anything inside an <svg> tag is in the SVG namespace, anything inside the
|
331
|
+
* <math> tag is in the MathML namespace, and anything else is inside the HTML
|
332
|
+
* namespace. No other namespaces are supported, so this can be an enum only.
|
333
|
+
*/
|
334
|
+
typedef enum {
|
335
|
+
GUMBO_NAMESPACE_HTML,
|
336
|
+
GUMBO_NAMESPACE_SVG,
|
337
|
+
GUMBO_NAMESPACE_MATHML
|
338
|
+
} GumboNamespaceEnum;
|
339
|
+
|
340
|
+
/**
|
341
|
+
* Parse flags.
|
342
|
+
* We track the reasons for parser insertion of nodes and store them in a
|
343
|
+
* bitvector in the node itself. This lets client code optimize out nodes that
|
344
|
+
* are implied by the HTML structure of the document, or flag constructs that
|
345
|
+
* may not be allowed by a style guide, or track the prevalence of incorrect or
|
346
|
+
* tricky HTML code.
|
347
|
+
*/
|
348
|
+
typedef enum {
|
349
|
+
/**
|
350
|
+
* A normal node - both start and end tags appear in the source, nothing has
|
351
|
+
* been reparented.
|
352
|
+
*/
|
353
|
+
GUMBO_INSERTION_NORMAL = 0,
|
354
|
+
|
355
|
+
/**
|
356
|
+
* A node inserted by the parser to fulfill some implicit insertion rule.
|
357
|
+
* This is usually set in addition to some other flag giving a more specific
|
358
|
+
* insertion reason; it's a generic catch-all term meaning "The start tag for
|
359
|
+
* this node did not appear in the document source".
|
360
|
+
*/
|
361
|
+
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
362
|
+
|
363
|
+
/**
|
364
|
+
* A flag indicating that the end tag for this node did not appear in the
|
365
|
+
* document source. Note that in some cases, you can still have
|
366
|
+
* parser-inserted nodes with an explicit end tag: for example, "Text</html>"
|
367
|
+
* has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
|
368
|
+
* GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
|
369
|
+
* exists. This flag will be set only if the end tag is completely missing;
|
370
|
+
* in some cases, the end tag may be misplaced (eg. a </body> tag with text
|
371
|
+
* afterwards), which will leave this flag unset and require clients to
|
372
|
+
* inspect the parse errors for that case.
|
373
|
+
*/
|
374
|
+
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
375
|
+
|
376
|
+
// Value 1 << 2 was for a flag that has since been removed.
|
377
|
+
|
378
|
+
/**
|
379
|
+
* A flag for nodes that are inserted because their presence is implied by
|
380
|
+
* other tags, eg. <html>, <head>, <body>, <tbody>, etc.
|
381
|
+
*/
|
382
|
+
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
383
|
+
|
384
|
+
/**
|
385
|
+
* A flag for nodes that are converted from their end tag equivalents. For
|
386
|
+
* example, </p> when no paragraph is open implies that the parser should
|
387
|
+
* create a <p> tag and immediately close it, while </br> means the same thing
|
388
|
+
* as <br>.
|
389
|
+
*/
|
390
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
391
|
+
|
392
|
+
/** A flag for nodes that are converted from the parse of an <isindex> tag. */
|
393
|
+
GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
|
394
|
+
|
395
|
+
/** A flag for <image> tags that are rewritten as <img>. */
|
396
|
+
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
397
|
+
|
398
|
+
/**
|
399
|
+
* A flag for nodes that are cloned as a result of the reconstruction of
|
400
|
+
* active formatting elements. This is set only on the clone; the initial
|
401
|
+
* portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
|
402
|
+
*/
|
403
|
+
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
404
|
+
|
405
|
+
/** A flag for nodes that are cloned by the adoption agency algorithm. */
|
406
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
|
407
|
+
|
408
|
+
/** A flag for nodes that are moved by the adoption agency algorithm. */
|
409
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
|
410
|
+
|
411
|
+
/**
|
412
|
+
* A flag for nodes that have been foster-parented out of a table (or
|
413
|
+
* should've been foster-parented, if verbatim mode is set).
|
414
|
+
*/
|
415
|
+
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
416
|
+
} GumboParseFlags;
|
417
|
+
|
418
|
+
/**
|
419
|
+
* Information specific to document nodes.
|
420
|
+
*/
|
421
|
+
typedef struct {
|
422
|
+
/**
|
423
|
+
* An array of GumboNodes, containing the children of this document. This will
|
424
|
+
* normally consist of the <html> element and any comment nodes found.
|
425
|
+
* Pointers are owned.
|
426
|
+
*/
|
427
|
+
GumboVector /* GumboNode* */ children;
|
428
|
+
|
429
|
+
// True if there was an explicit doctype token as opposed to it being omitted.
|
430
|
+
bool has_doctype;
|
431
|
+
|
432
|
+
// Fields from the doctype token, copied verbatim.
|
433
|
+
const char* name;
|
434
|
+
const char* public_identifier;
|
435
|
+
const char* system_identifier;
|
436
|
+
|
437
|
+
/**
|
438
|
+
* Whether or not the document is in QuirksMode, as determined by the values
|
439
|
+
* in the GumboTokenDocType template.
|
440
|
+
*/
|
441
|
+
GumboQuirksModeEnum doc_type_quirks_mode;
|
442
|
+
} GumboDocument;
|
443
|
+
|
444
|
+
/**
|
445
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
|
446
|
+
* This contains just a block of text and its position.
|
447
|
+
*/
|
448
|
+
typedef struct {
|
449
|
+
/**
|
450
|
+
* The text of this node, after entities have been parsed and decoded. For
|
451
|
+
* comment/cdata nodes, this does not include the comment delimiters.
|
452
|
+
*/
|
453
|
+
const char* text;
|
454
|
+
|
455
|
+
/**
|
456
|
+
* The original text of this node, as a pointer into the original buffer. For
|
457
|
+
* comment/cdata nodes, this includes the comment delimiters.
|
458
|
+
*/
|
459
|
+
GumboStringPiece original_text;
|
460
|
+
|
461
|
+
/**
|
462
|
+
* The starting position of this node. This corresponds to the position of
|
463
|
+
* original_text, before entities are decoded.
|
464
|
+
* */
|
465
|
+
GumboSourcePosition start_pos;
|
466
|
+
} GumboText;
|
467
|
+
|
468
|
+
/**
|
469
|
+
* The struct used to represent all HTML elements. This contains information
|
470
|
+
* about the tag, attributes, and child nodes.
|
471
|
+
*/
|
472
|
+
typedef struct {
|
473
|
+
/**
|
474
|
+
* An array of GumboNodes, containing the children of this element. Pointers
|
475
|
+
* are owned.
|
476
|
+
*/
|
477
|
+
GumboVector /* GumboNode* */ children;
|
478
|
+
|
479
|
+
/** The GumboTag enum for this element. */
|
480
|
+
GumboTag tag;
|
481
|
+
|
482
|
+
/** The GumboNamespaceEnum for this element. */
|
483
|
+
GumboNamespaceEnum tag_namespace;
|
484
|
+
|
485
|
+
/**
|
486
|
+
* A GumboStringPiece pointing to the original tag text for this element,
|
487
|
+
* pointing directly into the source buffer. If the tag was inserted
|
488
|
+
* algorithmically (for example, <head> or <tbody> insertion), this will be a
|
489
|
+
* zero-length string.
|
490
|
+
*/
|
491
|
+
GumboStringPiece original_tag;
|
492
|
+
|
493
|
+
/**
|
494
|
+
* A GumboStringPiece pointing to the original end tag text for this element.
|
495
|
+
* If the end tag was inserted algorithmically, (for example, closing a
|
496
|
+
* self-closing tag), this will be a zero-length string.
|
497
|
+
*/
|
498
|
+
GumboStringPiece original_end_tag;
|
499
|
+
|
500
|
+
/** The source position for the start of the start tag. */
|
501
|
+
GumboSourcePosition start_pos;
|
502
|
+
|
503
|
+
/** The source position for the start of the end tag. */
|
504
|
+
GumboSourcePosition end_pos;
|
505
|
+
|
506
|
+
/**
|
507
|
+
* An array of GumboAttributes, containing the attributes for this tag in the
|
508
|
+
* order that they were parsed. Pointers are owned.
|
509
|
+
*/
|
510
|
+
GumboVector /* GumboAttribute* */ attributes;
|
511
|
+
} GumboElement;
|
512
|
+
|
513
|
+
/**
|
514
|
+
* A supertype for GumboElement and GumboText, so that we can include one
|
515
|
+
* generic type in lists of children and cast as necessary to subtypes.
|
516
|
+
*/
|
517
|
+
struct GumboInternalNode {
|
518
|
+
/** The type of node that this is. */
|
519
|
+
GumboNodeType type;
|
520
|
+
|
521
|
+
/** Pointer back to parent node. Not owned. */
|
522
|
+
GumboNode* parent;
|
523
|
+
|
524
|
+
/** The index within the parent's children vector of this node. */
|
525
|
+
size_t index_within_parent;
|
526
|
+
|
527
|
+
/**
|
528
|
+
* A bitvector of flags containing information about why this element was
|
529
|
+
* inserted into the parse tree, including a variety of special parse
|
530
|
+
* situations.
|
531
|
+
*/
|
532
|
+
GumboParseFlags parse_flags;
|
533
|
+
|
534
|
+
/** The actual node data. */
|
535
|
+
union {
|
536
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
537
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
|
538
|
+
GumboText text; // For everything else.
|
539
|
+
} v;
|
540
|
+
};
|
541
|
+
|
542
|
+
/**
|
543
|
+
* The type for an allocator function. Takes the 'userdata' member of the
|
544
|
+
* GumboParser struct as its first argument. Semantics should be the same as
|
545
|
+
* malloc, i.e. return a block of `size` bytes on success or NULL on failure.
|
546
|
+
* Allocating a block of 0 bytes behaves as per malloc.
|
547
|
+
*/
|
548
|
+
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
|
549
|
+
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
|
550
|
+
|
551
|
+
/**
|
552
|
+
* The type for a deallocator function. Takes the 'userdata' member of the
|
553
|
+
* GumboParser struct as its first argument.
|
554
|
+
*/
|
555
|
+
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
556
|
+
|
557
|
+
/**
|
558
|
+
* Input struct containing configuration options for the parser.
|
559
|
+
* These let you specify alternate memory managers, provide different error
|
560
|
+
* handling, etc.
|
561
|
+
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
|
562
|
+
*/
|
563
|
+
typedef struct GumboInternalOptions {
|
564
|
+
/** A memory allocator function. Default: malloc. */
|
565
|
+
GumboAllocatorFunction allocator;
|
566
|
+
|
567
|
+
/** A memory deallocator function. Default: free. */
|
568
|
+
GumboDeallocatorFunction deallocator;
|
569
|
+
|
570
|
+
/**
|
571
|
+
* An opaque object that's passed in as the first argument to all callbacks
|
572
|
+
* used by this library. Default: NULL.
|
573
|
+
*/
|
574
|
+
void* userdata;
|
575
|
+
|
576
|
+
/**
|
577
|
+
* The tab-stop size, for computing positions in source code that uses tabs.
|
578
|
+
* Default: 8.
|
579
|
+
*/
|
580
|
+
int tab_stop;
|
581
|
+
|
582
|
+
/**
|
583
|
+
* Whether or not to stop parsing when the first error is encountered.
|
584
|
+
* Default: false.
|
585
|
+
*/
|
586
|
+
bool stop_on_first_error;
|
587
|
+
|
588
|
+
/**
|
589
|
+
* The maximum number of errors before the parser stops recording them. This
|
590
|
+
* is provided so that if the page is totally borked, we don't completely fill
|
591
|
+
* up the errors vector and exhaust memory with useless redundant errors. Set
|
592
|
+
* to -1 to disable the limit.
|
593
|
+
* Default: -1
|
594
|
+
*/
|
595
|
+
int max_errors;
|
596
|
+
|
597
|
+
/**
|
598
|
+
* The fragment context for parsing:
|
599
|
+
* https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
|
600
|
+
*
|
601
|
+
* If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
|
602
|
+
* the regular parsing algorithm. Otherwise, pass the tag enum for the
|
603
|
+
* intended parent of the parsed fragment. We use just the tag enum rather
|
604
|
+
* than a full node because that's enough to set all the parsing context we
|
605
|
+
* need, and it provides some additional flexibility for client code to act as
|
606
|
+
* if parsing a fragment even when a full HTML tree isn't available.
|
607
|
+
*
|
608
|
+
* Default: GUMBO_TAG_LAST
|
609
|
+
*/
|
610
|
+
GumboTag fragment_context;
|
611
|
+
|
612
|
+
/**
|
613
|
+
* The namespace for the fragment context. This lets client code
|
614
|
+
* differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
|
615
|
+
* HTML.
|
616
|
+
* Default: GUMBO_NAMESPACE_HTML
|
617
|
+
*/
|
618
|
+
GumboNamespaceEnum fragment_namespace;
|
619
|
+
} GumboOptions;
|
620
|
+
|
621
|
+
/** Default options struct; use this with gumbo_parse_with_options. */
|
622
|
+
extern const GumboOptions kGumboDefaultOptions;
|
623
|
+
|
624
|
+
/** The output struct containing the results of the parse. */
|
625
|
+
typedef struct GumboInternalOutput {
|
626
|
+
/**
|
627
|
+
* Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
|
628
|
+
* that contains the entire document as its child.
|
629
|
+
*/
|
630
|
+
GumboNode* document;
|
631
|
+
|
632
|
+
/**
|
633
|
+
* Pointer to the root node. This is the <html> tag that forms the root of the
|
634
|
+
* document.
|
635
|
+
*/
|
636
|
+
GumboNode* root;
|
637
|
+
|
638
|
+
/**
|
639
|
+
* A list of errors that occurred during the parse.
|
640
|
+
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
641
|
+
* fleshed out and may change in the future. For this reason, the GumboError
|
642
|
+
* header isn't part of the public API. Contact us if you need errors
|
643
|
+
* reported so we can work out something appropriate for your use-case.
|
644
|
+
*/
|
645
|
+
GumboVector /* GumboError */ errors;
|
646
|
+
} GumboOutput;
|
647
|
+
|
648
|
+
/**
|
649
|
+
* Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
|
650
|
+
* live at least as long as the parse tree, as some fields (eg. original_text)
|
651
|
+
* point directly into the original buffer.
|
652
|
+
*
|
653
|
+
* This doesn't support buffers longer than 4 gigabytes.
|
654
|
+
*/
|
655
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
656
|
+
|
657
|
+
/**
|
658
|
+
* Extended version of gumbo_parse that takes an explicit options structure,
|
659
|
+
* buffer, and length.
|
660
|
+
*/
|
661
|
+
GumboOutput* gumbo_parse_with_options(
|
662
|
+
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
663
|
+
|
664
|
+
/** Release the memory used for the parse tree & parse errors. */
|
665
|
+
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
|
666
|
+
|
667
|
+
#ifdef __cplusplus
|
668
|
+
}
|
669
|
+
#endif
|
670
|
+
|
671
|
+
#endif // GUMBO_GUMBO_H_
|