nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,671 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
+ // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
+ // kGumbo prefix).
20
+
21
+ /**
22
+ * @file
23
+ * @mainpage Gumbo HTML Parser
24
+ *
25
+ * This provides a conformant, no-dependencies implementation of the HTML5
26
+ * parsing algorithm. It supports only UTF8; if you need to parse a different
27
+ * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
+ * tree made of the structs in this file.
29
+ *
30
+ * Example:
31
+ * @code
32
+ * GumboOutput* output = gumbo_parse(input);
33
+ * do_something_with_doctype(output->document);
34
+ * do_something_with_html_tree(output->root);
35
+ * gumbo_destroy_output(&kGumboDefaultOptions, output);
36
+ * @endcode
37
+ * HTML5 Spec:
38
+ *
39
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
40
+ */
41
+
42
+ #ifndef GUMBO_GUMBO_H_
43
+ #define GUMBO_GUMBO_H_
44
+
45
+ #ifdef _MSC_VER
46
+ #define _CRT_SECURE_NO_WARNINGS
47
+ #define fileno _fileno
48
+ #endif
49
+
50
+ #include <stdbool.h>
51
+ #include <stddef.h>
52
+
53
+ #ifdef __cplusplus
54
+ extern "C" {
55
+ #endif
56
+
57
+ /**
58
+ * A struct representing a character position within the original text buffer.
59
+ * Line and column numbers are 1-based and offsets are 0-based, which matches
60
+ * how most editors and command-line tools work. Also, columns measure
61
+ * positions in terms of characters while offsets measure by bytes; this is
62
+ * because the offset field is often used to pull out a particular region of
63
+ * text (which in most languages that bind to C implies pointer arithmetic on a
64
+ * buffer of bytes), while the column field is often used to reference a
65
+ * particular column on a printable display, which nowadays is usually UTF-8.
66
+ */
67
+ typedef struct {
68
+ unsigned int line;
69
+ unsigned int column;
70
+ unsigned int offset;
71
+ } GumboSourcePosition;
72
+
73
+ /**
74
+ * A SourcePosition used for elements that have no source position, i.e.
75
+ * parser-inserted elements.
76
+ */
77
+ extern const GumboSourcePosition kGumboEmptySourcePosition;
78
+
79
+ /**
80
+ * A struct representing a string or part of a string. Strings within the
81
+ * parser are represented by a char* and a length; the char* points into
82
+ * an existing data buffer owned by some other code (often the original input).
83
+ * GumboStringPieces are assumed (by convention) to be immutable, because they
84
+ * may share data. Use GumboStringBuffer if you need to construct a string.
85
+ * Clients should assume that it is not NUL-terminated, and should always use
86
+ * explicit lengths when manipulating them.
87
+ */
88
+ typedef struct {
89
+ /** A pointer to the beginning of the string. NULL iff length == 0. */
90
+ const char* data;
91
+
92
+ /** The length of the string fragment, in bytes. May be zero. */
93
+ size_t length;
94
+ } GumboStringPiece;
95
+
96
+ /** A constant to represent a 0-length null string. */
97
+ extern const GumboStringPiece kGumboEmptyString;
98
+
99
+ /**
100
+ * Compares two GumboStringPieces, and returns true if they're equal or false
101
+ * otherwise.
102
+ */
103
+ bool gumbo_string_equals(
104
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
105
+
106
+ /**
107
+ * Compares two GumboStringPieces ignoring case, and returns true if they're
108
+ * equal or false otherwise.
109
+ */
110
+ bool gumbo_string_equals_ignore_case(
111
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
112
+
113
+ /**
114
+ * A simple vector implementation. This stores a pointer to a data array and a
115
+ * length. All elements are stored as void*; client code must cast to the
116
+ * appropriate type. Overflows upon addition result in reallocation of the data
117
+ * array, with the size doubling to maintain O(1) amortized cost. There is no
118
+ * removal function, as this isn't needed for any of the operations within this
119
+ * library. Iteration can be done through inspecting the structure directly in
120
+ * a for-loop.
121
+ */
122
+ typedef struct {
123
+ /** Data elements. This points to a dynamically-allocated array of capacity
124
+ * elements, each a void* to the element itself.
125
+ */
126
+ void** data;
127
+
128
+ /** Number of elements currently in the vector. */
129
+ unsigned int length;
130
+
131
+ /** Current array capacity. */
132
+ unsigned int capacity;
133
+ } GumboVector;
134
+
135
+ /** An empty (0-length, 0-capacity) GumboVector. */
136
+ extern const GumboVector kGumboEmptyVector;
137
+
138
+ /**
139
+ * Returns the first index at which an element appears in this vector (testing
140
+ * by pointer equality), or -1 if it never does.
141
+ */
142
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
143
+
144
+ /**
145
+ * An enum for all the tags defined in the HTML5 standard. These correspond to
146
+ * the tag names themselves. Enum constants exist only for tags which appear in
147
+ * the spec itself (or for tags with special handling in the SVG and MathML
148
+ * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
149
+ * name can be obtained through original_tag.
150
+ *
151
+ * This is mostly for API convenience, so that clients of this library don't
152
+ * need to perform a strcasecmp to find the normalized tag name. It also has
153
+ * efficiency benefits, by letting the parser work with enums instead of
154
+ * strings.
155
+ */
156
+ typedef enum {
157
+ // Load all the tags from an external source, generated from tag.in.
158
+ #include "tag_enum.h"
159
+ // Used for all tags that don't have special handling in HTML. Add new tags
160
+ // to the end of tag.in so as to preserve backwards-compatibility.
161
+ GUMBO_TAG_UNKNOWN,
162
+ // A marker value to indicate the end of the enum, for iterating over it.
163
+ // Also used as the terminator for varargs functions that take tags.
164
+ GUMBO_TAG_LAST,
165
+ } GumboTag;
166
+
167
+ /**
168
+ * Returns the normalized (usually all-lowercased, except for foreign content)
169
+ * tag name for a GumboTag enum. Return value is static data owned by the
170
+ * library.
171
+ */
172
+ const char* gumbo_normalized_tagname(GumboTag tag);
173
+
174
+ /**
175
+ * Extracts the tag name from the original_text field of an element or token by
176
+ * stripping off </> characters and attributes and adjusting the passed-in
177
+ * GumboStringPiece appropriately. The tag name is in the original case and
178
+ * shares a buffer with the original text, to simplify memory management.
179
+ * Behavior is undefined if a string-piece that doesn't represent an HTML tag
180
+ * (<tagname> or </tagname>) is passed in. If the string piece is completely
181
+ * empty (NULL data pointer), then this function will exit successfully as a
182
+ * no-op.
183
+ */
184
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
185
+
186
+ /**
187
+ * Fixes the case of SVG elements that are not all lowercase.
188
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
189
+ * This is not done at parse time because there's no place to store a mutated
190
+ * tag name. tag is an enum (which will be GUMBO_TAG_UNKNOWN for most SVG tags
191
+ * without special handling), while original_tag is a pointer into the
192
+ * original buffer. Instead, we provide this helper function that clients can
193
+ * use to rename SVG tags as appropriate.
194
+ * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
195
+ * no normalization is called for. The return value is static data and owned by
196
+ * the library.
197
+ */
198
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
199
+
200
+ /**
201
+ * Converts a tag name string (which may be in upper or mixed case) to a tag
202
+ * enum. gumbo_tag_enum expects `tagname` to be NUL-terminated.
203
+ */
204
+ GumboTag gumbo_tag_enum(const char* tagname);
205
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
206
+
207
+ /**
208
+ * Attribute namespaces.
209
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces on
210
+ * attributes. Everything else goes in the generic "NONE" namespace.
211
+ */
212
+ typedef enum {
213
+ GUMBO_ATTR_NAMESPACE_NONE,
214
+ GUMBO_ATTR_NAMESPACE_XLINK,
215
+ GUMBO_ATTR_NAMESPACE_XML,
216
+ GUMBO_ATTR_NAMESPACE_XMLNS,
217
+ } GumboAttributeNamespaceEnum;
218
+
219
+ /**
220
+ * A struct representing a single attribute on an HTML tag. This is a
221
+ * name-value pair, but also includes information about source locations and
222
+ * original source text.
223
+ */
224
+ typedef struct {
225
+ /**
226
+ * The namespace for the attribute. This will usually be
227
+ * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
228
+ * values, per:
229
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
230
+ */
231
+ GumboAttributeNamespaceEnum attr_namespace;
232
+
233
+ /**
234
+ * The name of the attribute. This is in a freshly-allocated buffer to deal
235
+ * with case-normalization, and is null-terminated.
236
+ */
237
+ const char* name;
238
+
239
+ /**
240
+ * The original text of the attribute name, as a pointer into the original
241
+ * source buffer.
242
+ */
243
+ GumboStringPiece original_name;
244
+
245
+ /**
246
+ * The value of the attribute. This is in a freshly-allocated buffer to deal
247
+ * with unescaping, and is null-terminated. It does not include any quotes
248
+ * that surround the attribute. If the attribute has no value (for example,
249
+ * 'selected' on a checkbox), this will be an empty string.
250
+ */
251
+ const char* value;
252
+
253
+ /**
254
+ * The original text of the value of the attribute. This points into the
255
+ * original source buffer. It includes any quotes that surround the
256
+ * attribute, and you can look at original_value.data[0] and
257
+ * original_value.data[original_value.length - 1] to determine what the quote
258
+ * characters were. If the attribute has no value, this will be a 0-length
259
+ * string.
260
+ */
261
+ GumboStringPiece original_value;
262
+
263
+ /** The starting position of the attribute name. */
264
+ GumboSourcePosition name_start;
265
+
266
+ /**
267
+ * The ending position of the attribute name. This is not always derivable
268
+ * from the starting position of the value because of the possibility of
269
+ * whitespace around the = sign.
270
+ */
271
+ GumboSourcePosition name_end;
272
+
273
+ /** The starting position of the attribute value. */
274
+ GumboSourcePosition value_start;
275
+
276
+ /** The ending position of the attribute value. */
277
+ GumboSourcePosition value_end;
278
+ } GumboAttribute;
279
+
280
+ /**
281
+ * Given a vector of GumboAttributes, look up the one with the specified name
282
+ * and return it, or NULL if no such attribute exists. This uses a
283
+ * case-insensitive match, as HTML is case-insensitive.
284
+ */
285
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
286
+
287
+ /**
288
+ * Enum denoting the type of node. This determines the type of the node.v
289
+ * union.
290
+ */
291
+ typedef enum {
292
+ /** Document node. v will be a GumboDocument. */
293
+ GUMBO_NODE_DOCUMENT,
294
+ /** Element node. v will be a GumboElement. */
295
+ GUMBO_NODE_ELEMENT,
296
+ /** Text node. v will be a GumboText. */
297
+ GUMBO_NODE_TEXT,
298
+ /** CDATA node. v will be a GumboText. */
299
+ GUMBO_NODE_CDATA,
300
+ /** Comment node. v will be a GumboText, excluding comment delimiters. */
301
+ GUMBO_NODE_COMMENT,
302
+ /** Text node, where all content is whitespace. v will be a GumboText. */
303
+ GUMBO_NODE_WHITESPACE,
304
+ /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
305
+ * client libraries will want to ignore the contents of template nodes, as
306
+ * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
307
+ * here, while clients that want to include template contents should also
308
+ * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
309
+ GUMBO_NODE_TEMPLATE
310
+ } GumboNodeType;
311
+
312
+ /**
313
+ * Forward declaration of GumboNode so it can be used recursively in
314
+ * GumboNode.parent.
315
+ */
316
+ typedef struct GumboInternalNode GumboNode;
317
+
318
+ /**
319
+ * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
320
+ */
321
+ typedef enum {
322
+ GUMBO_DOCTYPE_NO_QUIRKS,
323
+ GUMBO_DOCTYPE_QUIRKS,
324
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
325
+ } GumboQuirksModeEnum;
326
+
327
+ /**
328
+ * Namespaces.
329
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
330
+ * anything inside an <svg> tag is in the SVG namespace, anything inside the
331
+ * <math> tag is in the MathML namespace, and anything else is inside the HTML
332
+ * namespace. No other namespaces are supported, so this can be an enum only.
333
+ */
334
+ typedef enum {
335
+ GUMBO_NAMESPACE_HTML,
336
+ GUMBO_NAMESPACE_SVG,
337
+ GUMBO_NAMESPACE_MATHML
338
+ } GumboNamespaceEnum;
339
+
340
+ /**
341
+ * Parse flags.
342
+ * We track the reasons for parser insertion of nodes and store them in a
343
+ * bitvector in the node itself. This lets client code optimize out nodes that
344
+ * are implied by the HTML structure of the document, or flag constructs that
345
+ * may not be allowed by a style guide, or track the prevalence of incorrect or
346
+ * tricky HTML code.
347
+ */
348
+ typedef enum {
349
+ /**
350
+ * A normal node - both start and end tags appear in the source, nothing has
351
+ * been reparented.
352
+ */
353
+ GUMBO_INSERTION_NORMAL = 0,
354
+
355
+ /**
356
+ * A node inserted by the parser to fulfill some implicit insertion rule.
357
+ * This is usually set in addition to some other flag giving a more specific
358
+ * insertion reason; it's a generic catch-all term meaning "The start tag for
359
+ * this node did not appear in the document source".
360
+ */
361
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
362
+
363
+ /**
364
+ * A flag indicating that the end tag for this node did not appear in the
365
+ * document source. Note that in some cases, you can still have
366
+ * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
367
+ * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
368
+ * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
369
+ * exists. This flag will be set only if the end tag is completely missing;
370
+ * in some cases, the end tag may be misplaced (eg. a </body> tag with text
371
+ * afterwards), which will leave this flag unset and require clients to
372
+ * inspect the parse errors for that case.
373
+ */
374
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
375
+
376
+ // Value 1 << 2 was for a flag that has since been removed.
377
+
378
+ /**
379
+ * A flag for nodes that are inserted because their presence is implied by
380
+ * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
381
+ */
382
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
383
+
384
+ /**
385
+ * A flag for nodes that are converted from their end tag equivalents. For
386
+ * example, </p> when no paragraph is open implies that the parser should
387
+ * create a <p> tag and immediately close it, while </br> means the same thing
388
+ * as <br>.
389
+ */
390
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
391
+
392
+ /** A flag for nodes that are converted from the parse of an <isindex> tag. */
393
+ GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
394
+
395
+ /** A flag for <image> tags that are rewritten as <img>. */
396
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
397
+
398
+ /**
399
+ * A flag for nodes that are cloned as a result of the reconstruction of
400
+ * active formatting elements. This is set only on the clone; the initial
401
+ * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
402
+ */
403
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
404
+
405
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
406
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
407
+
408
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
409
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
410
+
411
+ /**
412
+ * A flag for nodes that have been foster-parented out of a table (or
413
+ * should've been foster-parented, if verbatim mode is set).
414
+ */
415
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
416
+ } GumboParseFlags;
417
+
418
+ /**
419
+ * Information specific to document nodes.
420
+ */
421
+ typedef struct {
422
+ /**
423
+ * An array of GumboNodes, containing the children of this element. This will
424
+ * normally consist of the <html> element and any comment nodes found.
425
+ * Pointers are owned.
426
+ */
427
+ GumboVector /* GumboNode* */ children;
428
+
429
+ // True if there was an explicit doctype token as opposed to it being omitted.
430
+ bool has_doctype;
431
+
432
+ // Fields from the doctype token, copied verbatim.
433
+ const char* name;
434
+ const char* public_identifier;
435
+ const char* system_identifier;
436
+
437
+ /**
438
+ * Whether or not the document is in QuirksMode, as determined by the values
439
+ * in the GumboTokenDocType template.
440
+ */
441
+ GumboQuirksModeEnum doc_type_quirks_mode;
442
+ } GumboDocument;
443
+
444
+ /**
445
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
446
+ * This contains just a block of text and its position.
447
+ */
448
+ typedef struct {
449
+ /**
450
+ * The text of this node, after entities have been parsed and decoded. For
451
+ * comment/cdata nodes, this does not include the comment delimiters.
452
+ */
453
+ const char* text;
454
+
455
+ /**
456
+ * The original text of this node, as a pointer into the original buffer. For
457
+ * comment/cdata nodes, this includes the comment delimiters.
458
+ */
459
+ GumboStringPiece original_text;
460
+
461
+ /**
462
+ * The starting position of this node. This corresponds to the position of
463
+ * original_text, before entities are decoded.
464
+ * */
465
+ GumboSourcePosition start_pos;
466
+ } GumboText;
467
+
468
+ /**
469
+ * The struct used to represent all HTML elements. This contains information
470
+ * about the tag, attributes, and child nodes.
471
+ */
472
+ typedef struct {
473
+ /**
474
+ * An array of GumboNodes, containing the children of this element. Pointers
475
+ * are owned.
476
+ */
477
+ GumboVector /* GumboNode* */ children;
478
+
479
+ /** The GumboTag enum for this element. */
480
+ GumboTag tag;
481
+
482
+ /** The GumboNamespaceEnum for this element. */
483
+ GumboNamespaceEnum tag_namespace;
484
+
485
+ /**
486
+ * A GumboStringPiece pointing to the original tag text for this element,
487
+ * pointing directly into the source buffer. If the tag was inserted
488
+ * algorithmically (for example, <head> or <tbody> insertion), this will be a
489
+ * zero-length string.
490
+ */
491
+ GumboStringPiece original_tag;
492
+
493
+ /**
494
+ * A GumboStringPiece pointing to the original end tag text for this element.
495
+ * If the end tag was inserted algorithmically, (for example, closing a
496
+ * self-closing tag), this will be a zero-length string.
497
+ */
498
+ GumboStringPiece original_end_tag;
499
+
500
+ /** The source position for the start of the start tag. */
501
+ GumboSourcePosition start_pos;
502
+
503
+ /** The source position for the start of the end tag. */
504
+ GumboSourcePosition end_pos;
505
+
506
+ /**
507
+ * An array of GumboAttributes, containing the attributes for this tag in the
508
+ * order that they were parsed. Pointers are owned.
509
+ */
510
+ GumboVector /* GumboAttribute* */ attributes;
511
+ } GumboElement;
512
+
513
+ /**
514
+ * A supertype for GumboDocument, GumboElement, and GumboText, so we can use
515
+ * one generic type in lists of children and cast as necessary to subtypes.
516
+ */
517
+ struct GumboInternalNode {
518
+ /** The type of node that this is. */
519
+ GumboNodeType type;
520
+
521
+ /** Pointer back to parent node. Not owned. */
522
+ GumboNode* parent;
523
+
524
+ /** The index within the parent's children vector of this node. */
525
+ size_t index_within_parent;
526
+
527
+ /**
528
+ * A bitvector of flags containing information about why this element was
529
+ * inserted into the parse tree, including a variety of special parse
530
+ * situations.
531
+ */
532
+ GumboParseFlags parse_flags;
533
+
534
+ /** The actual node data. */
535
+ union {
536
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
537
+ GumboElement element; // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
538
+ GumboText text; // For everything else.
539
+ } v;
540
+ };
541
+
542
+ /**
543
+ * The type for an allocator function. Takes the 'userdata' member of the
544
+ * GumboParser struct as its first argument. Semantics should be the same as
545
+ * malloc, i.e. return a block of `size` bytes on success or NULL on failure.
546
+ * Allocating a block of 0 bytes behaves as per malloc.
547
+ */
548
+ // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
549
+ typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
550
+
551
+ /**
552
+ * The type for a deallocator function. Takes the 'userdata' member of the
553
+ * GumboParser struct as its first argument.
554
+ */
555
+ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
556
+
557
+ /**
558
+ * Input struct containing configuration options for the parser.
559
+ * These let you specify alternate memory managers, provide different error
560
+ * handling, etc.
561
+ * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
562
+ */
563
+ typedef struct GumboInternalOptions {
564
+ /** A memory allocator function. Default: malloc. */
565
+ GumboAllocatorFunction allocator;
566
+
567
+ /** A memory deallocator function. Default: free. */
568
+ GumboDeallocatorFunction deallocator;
569
+
570
+ /**
571
+ * An opaque object that's passed in as the first argument to all callbacks
572
+ * used by this library. Default: NULL.
573
+ */
574
+ void* userdata;
575
+
576
+ /**
577
+ * The tab-stop size, for computing positions in source code that uses tabs.
578
+ * Default: 8.
579
+ */
580
+ int tab_stop;
581
+
582
+ /**
583
+ * Whether or not to stop parsing when the first error is encountered.
584
+ * Default: false.
585
+ */
586
+ bool stop_on_first_error;
587
+
588
+ /**
589
+ * The maximum number of errors before the parser stops recording them. This
590
+ * is provided so that if the page is totally borked, we don't completely fill
591
+ * up the errors vector and exhaust memory with useless redundant errors. Set
592
+ * to -1 to disable the limit.
593
+ * Default: -1
594
+ */
595
+ int max_errors;
596
+
597
+ /**
598
+ * The fragment context for parsing:
599
+ * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
600
+ *
601
+ * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
602
+ * the regular parsing algorithm. Otherwise, pass the tag enum for the
603
+ * intended parent of the parsed fragment. We use just the tag enum rather
604
+ * than a full node because that's enough to set all the parsing context we
605
+ * need, and it provides some additional flexibility for client code to act as
606
+ * if parsing a fragment even when a full HTML tree isn't available.
607
+ *
608
+ * Default: GUMBO_TAG_LAST
609
+ */
610
+ GumboTag fragment_context;
611
+
612
+ /**
613
+ * The namespace for the fragment context. This lets client code
614
+ * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
615
+ * HTML.
616
+ * Default: GUMBO_NAMESPACE_HTML
617
+ */
618
+ GumboNamespaceEnum fragment_namespace;
619
+ } GumboOptions;
620
+
621
+ /** Default options struct; use this with gumbo_parse_with_options. */
622
+ extern const GumboOptions kGumboDefaultOptions;
623
+
624
+ /** The output struct containing the results of the parse. */
625
+ typedef struct GumboInternalOutput {
626
+ /**
627
+ * Pointer to the document node. This is a GumboNode of type GUMBO_NODE_DOCUMENT
628
+ * that contains the entire document as its child.
629
+ */
630
+ GumboNode* document;
631
+
632
+ /**
633
+ * Pointer to the root node. This is the <html> tag that forms the root of the
634
+ * document.
635
+ */
636
+ GumboNode* root;
637
+
638
+ /**
639
+ * A list of errors that occurred during the parse.
640
+ * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
641
+ * fleshed out and may change in the future. For this reason, the GumboError
642
+ * header isn't part of the public API. Contact us if you need errors
643
+ * reported so we can work out something appropriate for your use-case.
644
+ */
645
+ GumboVector /* GumboError */ errors;
646
+ } GumboOutput;
647
+
648
+ /**
649
+ * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
650
+ * live at least as long as the parse tree, as some fields (eg. original_text)
651
+ * point directly into the original buffer.
652
+ *
653
+ * This doesn't support buffers longer than 4 gigabytes.
654
+ */
655
+ GumboOutput* gumbo_parse(const char* buffer);
656
+
657
+ /**
658
+ * Extended version of gumbo_parse that takes an explicit options structure,
659
+ * buffer, and length.
660
+ */
661
+ GumboOutput* gumbo_parse_with_options(
662
+ const GumboOptions* options, const char* buffer, size_t buffer_length);
663
+
664
+ /** Release the memory used for the parse tree & parse errors. */
665
+ void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
666
+
667
+ #ifdef __cplusplus
668
+ }
669
+ #endif
670
+
671
+ #endif // GUMBO_GUMBO_H_