nokogumbo 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/work/gumbo.h DELETED
@@ -1,800 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
- // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
- // kGumbo prefix).
20
-
21
- /**
22
- * @file
23
- * @mainpage Gumbo HTML Parser
24
- *
25
- * This provides a conformant, no-dependencies implementation of the HTML5
26
- * parsing algorithm. It supports only UTF8; if you need to parse a different
27
- * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
- * tree made of the structs in this file.
29
- *
30
- * Example:
31
- * @code
32
- * GumboOutput* output = gumbo_parse(input);
33
- * do_something_with_doctype(output->document);
34
- * do_something_with_html_tree(output->root);
35
- * gumbo_destroy_output(&kGumboDefaultOptions, output);
36
- * @endcode
37
- * HTML5 Spec:
38
- *
39
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
40
- */
41
-
42
- #ifndef GUMBO_GUMBO_H_
43
- #define GUMBO_GUMBO_H_
44
-
45
- #include <stdbool.h>
46
- #include <stddef.h>
47
-
48
- #ifdef __cplusplus
49
- extern "C" {
50
- #endif
51
-
52
- /**
53
- * A struct representing a character position within the original text buffer.
54
- * Line and column numbers are 1-based and offsets are 0-based, which matches
55
- * how most editors and command-line tools work. Also, columns measure
56
- * positions in terms of characters while offsets measure by bytes; this is
57
- * because the offset field is often used to pull out a particular region of
58
- * text (which in most languages that bind to C implies pointer arithmetic on a
59
- * buffer of bytes), while the column field is often used to reference a
60
- * particular column on a printable display, which nowadays is usually UTF-8.
61
- */
62
- typedef struct _GumboSourcePosition {
63
- unsigned int line;
64
- unsigned int column;
65
- unsigned int offset;
66
- } GumboSourcePosition;
67
-
68
- /**
69
- * A SourcePosition used for elements that have no source position, i.e.
70
- * parser-inserted elements.
71
- */
72
- extern const GumboSourcePosition kGumboEmptySourcePosition;
73
-
74
-
75
- /**
76
- * A struct representing a string or part of a string. Strings within the
77
- * parser are represented by a char* and a length; the char* points into
78
- * an existing data buffer owned by some other code (often the original input).
79
- * GumboStringPieces are assumed (by convention) to be immutable, because they
80
- * may share data. Use GumboStringBuffer if you need to construct a string.
81
- * Clients should assume that it is not NUL-terminated, and should always use
82
- * explicit lengths when manipulating them.
83
- */
84
- typedef struct _GumboStringPiece {
85
- /** A pointer to the beginning of the string. NULL iff length == 0. */
86
- const char* data;
87
-
88
- /** The length of the string fragment, in bytes. May be zero. */
89
- size_t length;
90
- } GumboStringPiece;
91
-
92
- /** A constant to represent a 0-length null string. */
93
- extern const GumboStringPiece kGumboEmptyString;
94
-
95
- /**
96
- * Compares two GumboStringPieces, and returns true if they're equal or false
97
- * otherwise.
98
- */
99
- bool gumbo_string_equals(
100
- const GumboStringPiece* str1, const GumboStringPiece* str2);
101
-
102
- /**
103
- * Compares two GumboStringPieces ignoring case, and returns true if they're
104
- * equal or false otherwise.
105
- */
106
- bool gumbo_string_equals_ignore_case(
107
- const GumboStringPiece* str1, const GumboStringPiece* str2);
108
-
109
-
110
- /**
111
- * A simple vector implementation. This stores a pointer to a data array and a
112
- * length. All elements are stored as void*; client code must cast to the
113
- * appropriate type. Overflows upon addition result in reallocation of the data
114
- * array, with the size doubling to maintain O(1) amortized cost. There is no
115
- * removal function, as this isn't needed for any of the operations within this
116
- * library. Iteration can be done through inspecting the structure directly in
117
- * a for-loop.
118
- */
119
- typedef struct _GumboVector {
120
- /** Data elements. This points to a dynamically-allocated array of capacity
121
- * elements, each a void* to the element itself.
122
- */
123
- void** data;
124
-
125
- /** Number of elements currently in the vector. */
126
- unsigned int length;
127
-
128
- /** Current array capacity. */
129
- unsigned int capacity;
130
- } GumboVector;
131
-
132
- /** An empty (0-length, 0-capacity) GumboVector. */
133
- extern const GumboVector kGumboEmptyVector;
134
-
135
- /**
136
- * Returns the first index at which an element appears in this vector (testing
137
- * by pointer equality), or -1 if it never does.
138
- */
139
- int gumbo_vector_index_of(GumboVector* vector, void* element);
140
-
141
-
142
- /**
143
- * An enum for all the tags defined in the HTML5 standard. These correspond to
144
- * the tag names themselves. Enum constants exist only for tags which appear in
145
- * the spec itself (or for tags with special handling in the SVG and MathML
146
- * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
147
- * name can be obtained through original_tag.
148
- *
149
- * This is mostly for API convenience, so that clients of this library don't
150
- * need to perform a strcasecmp to find the normalized tag name. It also has
151
- * efficiency benefits, by letting the parser work with enums instead of
152
- * strings.
153
- */
154
- typedef enum _GumboTag {
155
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
156
- GUMBO_TAG_HTML,
157
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
158
- GUMBO_TAG_HEAD,
159
- GUMBO_TAG_TITLE,
160
- GUMBO_TAG_BASE,
161
- GUMBO_TAG_LINK,
162
- GUMBO_TAG_META,
163
- GUMBO_TAG_STYLE,
164
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
165
- GUMBO_TAG_SCRIPT,
166
- GUMBO_TAG_NOSCRIPT,
167
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
168
- GUMBO_TAG_BODY,
169
- GUMBO_TAG_SECTION,
170
- GUMBO_TAG_NAV,
171
- GUMBO_TAG_ARTICLE,
172
- GUMBO_TAG_ASIDE,
173
- GUMBO_TAG_H1,
174
- GUMBO_TAG_H2,
175
- GUMBO_TAG_H3,
176
- GUMBO_TAG_H4,
177
- GUMBO_TAG_H5,
178
- GUMBO_TAG_H6,
179
- GUMBO_TAG_HGROUP,
180
- GUMBO_TAG_HEADER,
181
- GUMBO_TAG_FOOTER,
182
- GUMBO_TAG_ADDRESS,
183
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
184
- GUMBO_TAG_P,
185
- GUMBO_TAG_HR,
186
- GUMBO_TAG_PRE,
187
- GUMBO_TAG_BLOCKQUOTE,
188
- GUMBO_TAG_OL,
189
- GUMBO_TAG_UL,
190
- GUMBO_TAG_LI,
191
- GUMBO_TAG_DL,
192
- GUMBO_TAG_DT,
193
- GUMBO_TAG_DD,
194
- GUMBO_TAG_FIGURE,
195
- GUMBO_TAG_FIGCAPTION,
196
- GUMBO_TAG_DIV,
197
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
198
- GUMBO_TAG_A,
199
- GUMBO_TAG_EM,
200
- GUMBO_TAG_STRONG,
201
- GUMBO_TAG_SMALL,
202
- GUMBO_TAG_S,
203
- GUMBO_TAG_CITE,
204
- GUMBO_TAG_Q,
205
- GUMBO_TAG_DFN,
206
- GUMBO_TAG_ABBR,
207
- GUMBO_TAG_TIME,
208
- GUMBO_TAG_CODE,
209
- GUMBO_TAG_VAR,
210
- GUMBO_TAG_SAMP,
211
- GUMBO_TAG_KBD,
212
- GUMBO_TAG_SUB,
213
- GUMBO_TAG_SUP,
214
- GUMBO_TAG_I,
215
- GUMBO_TAG_B,
216
- GUMBO_TAG_MARK,
217
- GUMBO_TAG_RUBY,
218
- GUMBO_TAG_RT,
219
- GUMBO_TAG_RP,
220
- GUMBO_TAG_BDI,
221
- GUMBO_TAG_BDO,
222
- GUMBO_TAG_SPAN,
223
- GUMBO_TAG_BR,
224
- GUMBO_TAG_WBR,
225
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
226
- GUMBO_TAG_INS,
227
- GUMBO_TAG_DEL,
228
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
229
- GUMBO_TAG_IMAGE,
230
- GUMBO_TAG_IMG,
231
- GUMBO_TAG_IFRAME,
232
- GUMBO_TAG_EMBED,
233
- GUMBO_TAG_OBJECT,
234
- GUMBO_TAG_PARAM,
235
- GUMBO_TAG_VIDEO,
236
- GUMBO_TAG_AUDIO,
237
- GUMBO_TAG_SOURCE,
238
- GUMBO_TAG_TRACK,
239
- GUMBO_TAG_CANVAS,
240
- GUMBO_TAG_MAP,
241
- GUMBO_TAG_AREA,
242
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
243
- GUMBO_TAG_MATH,
244
- GUMBO_TAG_MI,
245
- GUMBO_TAG_MO,
246
- GUMBO_TAG_MN,
247
- GUMBO_TAG_MS,
248
- GUMBO_TAG_MTEXT,
249
- GUMBO_TAG_MGLYPH,
250
- GUMBO_TAG_MALIGNMARK,
251
- GUMBO_TAG_ANNOTATION_XML,
252
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
253
- GUMBO_TAG_SVG,
254
- GUMBO_TAG_FOREIGNOBJECT,
255
- GUMBO_TAG_DESC,
256
- // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
257
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
258
- GUMBO_TAG_TABLE,
259
- GUMBO_TAG_CAPTION,
260
- GUMBO_TAG_COLGROUP,
261
- GUMBO_TAG_COL,
262
- GUMBO_TAG_TBODY,
263
- GUMBO_TAG_THEAD,
264
- GUMBO_TAG_TFOOT,
265
- GUMBO_TAG_TR,
266
- GUMBO_TAG_TD,
267
- GUMBO_TAG_TH,
268
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
269
- GUMBO_TAG_FORM,
270
- GUMBO_TAG_FIELDSET,
271
- GUMBO_TAG_LEGEND,
272
- GUMBO_TAG_LABEL,
273
- GUMBO_TAG_INPUT,
274
- GUMBO_TAG_BUTTON,
275
- GUMBO_TAG_SELECT,
276
- GUMBO_TAG_DATALIST,
277
- GUMBO_TAG_OPTGROUP,
278
- GUMBO_TAG_OPTION,
279
- GUMBO_TAG_TEXTAREA,
280
- GUMBO_TAG_KEYGEN,
281
- GUMBO_TAG_OUTPUT,
282
- GUMBO_TAG_PROGRESS,
283
- GUMBO_TAG_METER,
284
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
285
- GUMBO_TAG_DETAILS,
286
- GUMBO_TAG_SUMMARY,
287
- GUMBO_TAG_COMMAND,
288
- GUMBO_TAG_MENU,
289
- // Non-conforming elements that nonetheless appear in the HTML5 spec.
290
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
291
- GUMBO_TAG_APPLET,
292
- GUMBO_TAG_ACRONYM,
293
- GUMBO_TAG_BGSOUND,
294
- GUMBO_TAG_DIR,
295
- GUMBO_TAG_FRAME,
296
- GUMBO_TAG_FRAMESET,
297
- GUMBO_TAG_NOFRAMES,
298
- GUMBO_TAG_ISINDEX,
299
- GUMBO_TAG_LISTING,
300
- GUMBO_TAG_XMP,
301
- GUMBO_TAG_NEXTID,
302
- GUMBO_TAG_NOEMBED,
303
- GUMBO_TAG_PLAINTEXT,
304
- GUMBO_TAG_RB,
305
- GUMBO_TAG_STRIKE,
306
- GUMBO_TAG_BASEFONT,
307
- GUMBO_TAG_BIG,
308
- GUMBO_TAG_BLINK,
309
- GUMBO_TAG_CENTER,
310
- GUMBO_TAG_FONT,
311
- GUMBO_TAG_MARQUEE,
312
- GUMBO_TAG_MULTICOL,
313
- GUMBO_TAG_NOBR,
314
- GUMBO_TAG_SPACER,
315
- GUMBO_TAG_TT,
316
- GUMBO_TAG_U,
317
- // Used for all tags that don't have special handling in HTML.
318
- GUMBO_TAG_UNKNOWN,
319
- // A marker value to indicate the end of the enum, for iterating over it.
320
- // Also used as the terminator for varargs functions that take tags.
321
- GUMBO_TAG_LAST,
322
- } GumboTag;
323
-
324
- /**
325
- * Returns the normalized (usually all-lowercased, except for foreign content)
326
- * tag name for a GumboTag enum. Return value is static data owned by the
327
- * library.
328
- */
329
- const char* gumbo_normalized_tagname(GumboTag tag);
330
-
331
- /**
332
- * Extracts the tag name from the original_text field of an element or token by
333
- * stripping off </> characters and attributes and adjusting the passed-in
334
- * GumboStringPiece appropriately. The tag name is in the original case and
335
- * shares a buffer with the original text, to simplify memory management.
336
- * Behavior is undefined if a string-piece that doesn't represent an HTML tag
337
- * (<tagname> or </tagname>) is passed in. If the string piece is completely
338
- * empty (NULL data pointer), then this function will exit successfully as a
339
- * no-op.
340
- */
341
- void gumbo_tag_from_original_text(GumboStringPiece* text);
342
-
343
- /**
344
- * Fixes the case of SVG elements that are not all lowercase.
345
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
346
- * This is not done at parse time because there's no place to store a mutated
347
- * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
348
- * without special handling), while original_tag_name is a pointer into the
349
- * original buffer. Instead, we provide this helper function that clients can
350
- * use to rename SVG tags as appropriate.
351
- * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
352
- * no normalization is called for. The return value is static data and owned by
353
- * the library.
354
- */
355
- const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
356
-
357
- /**
358
- * Converts a tag name string (which may be in upper or mixed case) to a tag
359
- * enum.
360
- */
361
- GumboTag gumbo_tag_enum(const char* tagname);
362
-
363
- /**
364
- * Attribute namespaces.
365
- * HTML includes special handling for XLink, XML, and XMLNS namespaces on
366
- * attributes. Everything else goes in the generic "NONE" namespace.
367
- */
368
- typedef enum _GumboAttributeNamespaceEnum {
369
- GUMBO_ATTR_NAMESPACE_NONE,
370
- GUMBO_ATTR_NAMESPACE_XLINK,
371
- GUMBO_ATTR_NAMESPACE_XML,
372
- GUMBO_ATTR_NAMESPACE_XMLNS,
373
- } GumboAttributeNamespaceEnum;
374
-
375
- /**
376
- * A struct representing a single attribute on an HTML tag. This is a
377
- * name-value pair, but also includes information about source locations and
378
- * original source text.
379
- */
380
- typedef struct _GumboAttribute {
381
- /**
382
- * The namespace for the attribute. This will usually be
383
- * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
384
- * values, per:
385
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
386
- */
387
- GumboAttributeNamespaceEnum attr_namespace;
388
-
389
- /**
390
- * The name of the attribute. This is in a freshly-allocated buffer to deal
391
- * with case-normalization, and is null-terminated.
392
- */
393
- const char* name;
394
-
395
- /**
396
- * The original text of the attribute name, as a pointer into the original
397
- * source buffer.
398
- */
399
- GumboStringPiece original_name;
400
-
401
- /**
402
- * The value of the attribute. This is in a freshly-allocated buffer to deal
403
- * with unescaping, and is null-terminated. It does not include any quotes
404
- * that surround the attribute. If the attribute has no value (for example,
405
- * 'selected' on a checkbox), this will be an empty string.
406
- */
407
- const char* value;
408
-
409
- /**
410
- * The original text of the value of the attribute. This points into the
411
- * original source buffer. It includes any quotes that surround the
412
- * attribute, and you can look at original_value.data[0] and
413
- * original_value.data[original_value.length - 1] to determine what the quote
414
- * characters were. If the attribute has no value, this will be a 0-length
415
- * string.
416
- */
417
- GumboStringPiece original_value;
418
-
419
- /** The starting position of the attribute name. */
420
- GumboSourcePosition name_start;
421
-
422
- /**
423
- * The ending position of the attribute name. This is not always derivable
424
- * from the starting position of the value because of the possibility of
425
- * whitespace around the = sign.
426
- */
427
- GumboSourcePosition name_end;
428
-
429
- /** The starting position of the attribute value. */
430
- GumboSourcePosition value_start;
431
-
432
- /** The ending position of the attribute value. */
433
- GumboSourcePosition value_end;
434
- } GumboAttribute;
435
-
436
- /**
437
- * Given a vector of GumboAttributes, look up the one with the specified name
438
- * and return it, or NULL if no such attribute exists. This uses a
439
- * case-insensitive match, as HTML is case-insensitive.
440
- */
441
- GumboAttribute* gumbo_get_attribute(
442
- const struct _GumboVector* attrs, const char* name);
443
-
444
- /**
445
- * Enum denoting the type of node. This determines the type of the node.v
446
- * union.
447
- */
448
- typedef enum _GumboNodeType {
449
- /** Document node. v will be a GumboDocument. */
450
- GUMBO_NODE_DOCUMENT,
451
- /** Element node. v will be a GumboElement. */
452
- GUMBO_NODE_ELEMENT,
453
- /** Text node. v will be a GumboText. */
454
- GUMBO_NODE_TEXT,
455
- /** CDATA node. v will be a GumboText. */
456
- GUMBO_NODE_CDATA,
457
- /** Comment node. v will be a GumboText, excluding comment delimiters. */
458
- GUMBO_NODE_COMMENT,
459
- /** Text node, where all content is whitespace. v will be a GumboText. */
460
- GUMBO_NODE_WHITESPACE
461
- } GumboNodeType;
462
-
463
- /**
464
- * Forward declaration of GumboNode so it can be used recursively in
465
- * GumboNode.parent.
466
- */
467
- typedef struct _GumboNode GumboNode;
468
-
469
- /** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
470
- typedef enum _GumboQuirksModeEnum {
471
- GUMBO_DOCTYPE_NO_QUIRKS,
472
- GUMBO_DOCTYPE_QUIRKS,
473
- GUMBO_DOCTYPE_LIMITED_QUIRKS
474
- } GumboQuirksModeEnum;
475
-
476
- /**
477
- * Namespaces.
478
- * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
479
- * anything inside an <svg> tag is in the SVG namespace, anything inside the
480
- * <math> tag is in the MathML namespace, and anything else is inside the HTML
481
- * namespace. No other namespaces are supported, so this can be an enum only.
482
- */
483
- typedef enum _GumboNamespaceEnum {
484
- GUMBO_NAMESPACE_HTML,
485
- GUMBO_NAMESPACE_SVG,
486
- GUMBO_NAMESPACE_MATHML
487
- } GumboNamespaceEnum;
488
-
489
- /**
490
- * Parse flags.
491
- * We track the reasons for parser insertion of nodes and store them in a
492
- * bitvector in the node itself. This lets client code optimize out nodes that
493
- * are implied by the HTML structure of the document, or flag constructs that
494
- * may not be allowed by a style guide, or track the prevalence of incorrect or
495
- * tricky HTML code.
496
- */
497
- typedef enum _GumboParseFlags {
498
- /**
499
- * A normal node - both start and end tags appear in the source, nothing has
500
- * been reparented.
501
- */
502
- GUMBO_INSERTION_NORMAL = 0,
503
-
504
- /**
505
- * A node inserted by the parser to fulfill some implicit insertion rule.
506
- * This is usually set in addition to some other flag giving a more specific
507
- * insertion reason; it's a generic catch-all term meaning "The start tag for
508
- * this node did not appear in the document source".
509
- */
510
- GUMBO_INSERTION_BY_PARSER = 1 << 0,
511
-
512
- /**
513
- * A flag indicating that the end tag for this node did not appear in the
514
- * document source. Note that in some cases, you can still have
515
- * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
516
- * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
516
- * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
518
- * exists. This flag will be set only if the end tag is completely missing;
519
- * in some cases, the end tag may be misplaced (eg. a </body> tag with text
520
- * afterwards), which will leave this flag unset and require clients to
521
- * inspect the parse errors for that case.
522
- */
523
- GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
524
-
525
- // Value 1 << 2 was for a flag that has since been removed.
526
-
527
- /**
528
- * A flag for nodes that are inserted because their presence is implied by
529
- * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
530
- */
531
- GUMBO_INSERTION_IMPLIED = 1 << 3,
532
-
533
- /**
534
- * A flag for nodes that are converted from their end tag equivalents. For
535
- * example, </p> when no paragraph is open implies that the parser should
536
- * create a <p> tag and immediately close it, while </br> means the same thing
537
- * as <br>.
538
- */
539
- GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
540
-
541
- /** A flag for nodes that are converted from the parse of an <isindex> tag. */
542
- GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
543
-
544
- /** A flag for <image> tags that are rewritten as <img>. */
545
- GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
546
-
547
- /**
548
- * A flag for nodes that are cloned as a result of the reconstruction of
549
- * active formatting elements. This is set only on the clone; the initial
550
- * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
551
- */
552
- GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
553
-
554
- /** A flag for nodes that are cloned by the adoption agency algorithm. */
555
- GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
556
-
557
- /** A flag for nodes that are moved by the adoption agency algorithm. */
558
- GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
559
-
560
- /**
561
- * A flag for nodes that have been foster-parented out of a table (or
562
- * should've been foster-parented, if verbatim mode is set).
563
- */
564
- GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
565
- } GumboParseFlags;
566
-
567
-
568
- /**
569
- * Information specific to document nodes.
570
- */
571
- typedef struct _GumboDocument {
572
- /**
573
- * An array of GumboNodes, containing the children of this element. This will
574
- * normally consist of the <html> element and any comment nodes found.
575
- * Pointers are owned.
576
- */
577
- GumboVector /* GumboNode* */ children;
578
-
579
- // True if there was an explicit doctype token as opposed to it being omitted.
580
- bool has_doctype;
581
-
582
- // Fields from the doctype token, copied verbatim.
583
- const char* name;
584
- const char* public_identifier;
585
- const char* system_identifier;
586
-
587
- /**
588
- * Whether or not the document is in QuirksMode, as determined by the values
589
- * in the GumboTokenDocType template.
590
- */
591
- GumboQuirksModeEnum doc_type_quirks_mode;
592
- } GumboDocument;
593
-
594
- /**
595
- * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
596
- * This contains just a block of text and its position.
597
- */
598
- typedef struct _GumboText {
599
- /**
600
- * The text of this node, after entities have been parsed and decoded. For
601
- * comment/cdata nodes, this does not include the comment delimiters.
602
- */
603
- const char* text;
604
-
605
- /**
606
- * The original text of this node, as a pointer into the original buffer. For
607
- * comment/cdata nodes, this includes the comment delimiters.
608
- */
609
- GumboStringPiece original_text;
610
-
611
- /**
612
- * The starting position of this node. This corresponds to the position of
613
- * original_text, before entities are decoded.
614
- * */
615
- GumboSourcePosition start_pos;
616
- } GumboText;
617
-
618
- /**
619
- * The struct used to represent all HTML elements. This contains information
620
- * about the tag, attributes, and child nodes.
621
- */
622
- typedef struct _GumboElement {
623
- /**
624
- * An array of GumboNodes, containing the children of this element. Pointers
625
- * are owned.
626
- */
627
- GumboVector /* GumboNode* */ children;
628
-
629
- /** The GumboTag enum for this element. */
630
- GumboTag tag;
631
-
632
- /** The GumboNamespaceEnum for this element. */
633
- GumboNamespaceEnum tag_namespace;
634
-
635
- /**
636
- * A GumboStringPiece pointing to the original tag text for this element,
637
- * pointing directly into the source buffer. If the tag was inserted
638
- * algorithmically (for example, <head> or <tbody> insertion), this will be a
639
- * zero-length string.
640
- */
641
- GumboStringPiece original_tag;
642
-
643
- /**
644
- * A GumboStringPiece pointing to the original end tag text for this element.
645
- * If the end tag was inserted algorithmically, (for example, closing a
646
- * self-closing tag), this will be a zero-length string.
647
- */
648
- GumboStringPiece original_end_tag;
649
-
650
- /** The source position for the start of the start tag. */
651
- GumboSourcePosition start_pos;
652
-
653
- /** The source position for the start of the end tag. */
654
- GumboSourcePosition end_pos;
655
-
656
- /**
657
- * An array of GumboAttributes, containing the attributes for this tag in the
658
- * order that they were parsed. Pointers are owned.
659
- */
660
- GumboVector /* GumboAttribute* */ attributes;
661
- } GumboElement;
662
-
663
- /**
664
- * A supertype for GumboDocument, GumboElement and GumboText, so that we can include one
665
- * generic type in lists of children and cast as necessary to subtypes.
666
- */
667
- struct _GumboNode {
668
- /** The type of node that this is. */
669
- GumboNodeType type;
670
-
671
- /** Pointer back to parent node. Not owned. */
672
- GumboNode* parent;
673
-
674
- /** The index within the parent's children vector of this node. */
675
- size_t index_within_parent;
676
-
677
- /**
678
- * A bitvector of flags containing information about why this element was
679
- * inserted into the parse tree, including a variety of special parse
680
- * situations.
681
- */
682
- GumboParseFlags parse_flags;
683
-
684
- /** The actual node data. */
685
- union {
686
- GumboDocument document; // For GUMBO_NODE_DOCUMENT.
687
- GumboElement element; // For GUMBO_NODE_ELEMENT.
688
- GumboText text; // For everything else.
689
- } v;
690
- };
691
-
692
- /**
693
- * The type for an allocator function. Takes the 'userdata' member of the
694
- * GumboParser struct as its first argument. Semantics should be the same as
695
- * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
696
- * Allocating a block of 0 bytes behaves as per malloc.
697
- */
698
- // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
699
- typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
700
-
701
- /**
702
- * The type for a deallocator function. Takes the 'userdata' member of the
703
- * GumboParser struct as its first argument.
704
- */
705
- typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
706
-
707
- /**
708
- * Input struct containing configuration options for the parser.
709
- * These let you specify alternate memory managers, provide different error
710
- * handling, etc.
711
- * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
712
- */
713
- typedef struct _GumboOptions {
714
- /** A memory allocator function. Default: malloc. */
715
- GumboAllocatorFunction allocator;
716
-
717
- /** A memory deallocator function. Default: free. */
718
- GumboDeallocatorFunction deallocator;
719
-
720
- /**
721
- * An opaque object that's passed in as the first argument to all callbacks
722
- * used by this library. Default: NULL.
723
- */
724
- void* userdata;
725
-
726
- /**
727
- * The tab-stop size, for computing positions in source code that uses tabs.
728
- * Default: 8.
729
- */
730
- int tab_stop;
731
-
732
- /**
733
- * Whether or not to stop parsing when the first error is encountered.
734
- * Default: false.
735
- */
736
- bool stop_on_first_error;
737
-
738
- /**
739
- * The maximum number of errors before the parser stops recording them. This
740
- * is provided so that if the page is totally borked, we don't completely fill
741
- * up the errors vector and exhaust memory with useless redundant errors. Set
742
- * to -1 to disable the limit.
743
- * Default: -1
744
- */
745
- int max_errors;
746
- } GumboOptions;
747
-
748
- /** Default options struct; use this with gumbo_parse_with_options. */
749
- extern const GumboOptions kGumboDefaultOptions;
750
-
751
- /** The output struct containing the results of the parse. */
752
- typedef struct _GumboOutput {
753
- /**
754
- * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
755
- * that contains the entire document as its child.
756
- */
757
- GumboNode* document;
758
-
759
- /**
760
- * Pointer to the root node. This is the <html> tag that forms the root of the
761
- * document.
762
- */
763
- GumboNode* root;
764
-
765
- /**
766
- * A list of errors that occurred during the parse.
767
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
768
- * fleshed out and may change in the future. For this reason, the GumboError
769
- * header isn't part of the public API. Contact us if you need errors
770
- * reported so we can work out something appropriate for your use-case.
771
- */
772
- GumboVector /* GumboError */ errors;
773
- } GumboOutput;
774
-
775
- /**
776
- * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
777
- * live at least as long as the parse tree, as some fields (eg. original_text)
778
- * point directly into the original buffer.
779
- *
780
- * This doesn't support buffers longer than 4 gigabytes.
781
- */
782
- struct _GumboOutput* gumbo_parse(const char* buffer);
783
-
784
- /**
785
- * Extended version of gumbo_parse that takes an explicit options structure,
786
- * buffer, and length.
787
- */
788
- struct _GumboOutput* gumbo_parse_with_options(
789
- const GumboOptions* options, const char* buffer, size_t buffer_length);
790
-
791
- /** Release the memory used for the parse tree & parse errors. */
792
- void gumbo_destroy_output(
793
- const struct _GumboOptions* options, GumboOutput* output);
794
-
795
-
796
- #ifdef __cplusplus
797
- }
798
- #endif
799
-
800
- #endif // GUMBO_GUMBO_H_