ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,807 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
+ // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
+ // kGumbo prefix).
20
+
21
+ /**
22
+ * @file
23
+ * @mainpage Gumbo HTML Parser
24
+ *
25
+ * This provides a conformant, no-dependencies implementation of the HTML5
26
+ * parsing algorithm. It supports only UTF8; if you need to parse a different
27
+ * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
+ * tree made of the structs in this file.
29
+ *
30
+ * Example:
31
+ * @code
32
+ * GumboOutput* output = gumbo_parse(input);
33
+ * do_something_with_doctype(output->document);
34
+ * do_something_with_html_tree(output->root);
35
+ * gumbo_destroy_output(&kGumboDefaultOptions, output);
36
+ * @endcode
37
+ * HTML5 Spec:
38
+ *
39
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
40
+ */
41
+
42
+ #ifndef GUMBO_GUMBO_H_
43
+ #define GUMBO_GUMBO_H_
44
+
45
+ #ifdef _MSC_VER
46
+ #define _CRT_SECURE_NO_WARNINGS
47
+ #define fileno _fileno
48
+ #endif
49
+
50
+ #include <stdbool.h>
51
+ #include <stddef.h>
52
+
53
+ #ifdef __cplusplus
54
+ extern "C" {
55
+ #endif
56
+
57
+ /**
58
+ * A struct representing a character position within the original text buffer.
59
+ * Line and column numbers are 1-based and offsets are 0-based, which matches
60
+ * how most editors and command-line tools work. Also, columns measure
61
+ * positions in terms of characters while offsets measure by bytes; this is
62
+ * because the offset field is often used to pull out a particular region of
63
+ * text (which in most languages that bind to C implies pointer arithmetic on a
64
+ * buffer of bytes), while the column field is often used to reference a
65
+ * particular column on a printable display, which nowadays is usually UTF-8.
66
+ */
67
+ typedef struct {
68
+ unsigned int line;
69
+ unsigned int column;
70
+ unsigned int offset;
71
+ } GumboSourcePosition;
72
+
73
+ /**
74
+ * A SourcePosition used for elements that have no source position, i.e.
75
+ * parser-inserted elements.
76
+ */
77
+ extern const GumboSourcePosition kGumboEmptySourcePosition;
78
+
79
+
80
+ /**
81
+ * A struct representing a string or part of a string. Strings within the
82
+ * parser are represented by a char* and a length; the char* points into
83
+ * an existing data buffer owned by some other code (often the original input).
84
+ * GumboStringPieces are assumed (by convention) to be immutable, because they
85
+ * may share data. Use GumboStringBuffer if you need to construct a string.
86
+ * Clients should assume that it is not NUL-terminated, and should always use
87
+ * explicit lengths when manipulating them.
88
+ */
89
+ typedef struct {
90
+ /** A pointer to the beginning of the string. NULL iff length == 0. */
91
+ const char* data;
92
+
93
+ /** The length of the string fragment, in bytes. May be zero. */
94
+ size_t length;
95
+ } GumboStringPiece;
96
+
97
+ /** A constant to represent a 0-length null string. */
98
+ extern const GumboStringPiece kGumboEmptyString;
99
+
100
+ /**
101
+ * Compares two GumboStringPieces, and returns true if they're equal or false
102
+ * otherwise.
103
+ */
104
+ bool gumbo_string_equals(
105
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
106
+
107
+ /**
108
+ * Compares two GumboStringPieces ignoring case, and returns true if they're
109
+ * equal or false otherwise.
110
+ */
111
+ bool gumbo_string_equals_ignore_case(
112
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
113
+
114
+
115
+ /**
116
+ * A simple vector implementation. This stores a pointer to a data array and a
117
+ * length. All elements are stored as void*; client code must cast to the
118
+ * appropriate type. Overflows upon addition result in reallocation of the data
119
+ * array, with the size doubling to maintain O(1) amortized cost. There is no
120
+ * removal function, as this isn't needed for any of the operations within this
121
+ * library. Iteration can be done through inspecting the structure directly in
122
+ * a for-loop.
123
+ */
124
+ typedef struct {
125
+ /** Data elements. This points to a dynamically-allocated array of capacity
126
+ * elements, each a void* to the element itself.
127
+ */
128
+ void** data;
129
+
130
+ /** Number of elements currently in the vector. */
131
+ unsigned int length;
132
+
133
+ /** Current array capacity. */
134
+ unsigned int capacity;
135
+ } GumboVector;
136
+
137
+ /** An empty (0-length, 0-capacity) GumboVector. */
138
+ extern const GumboVector kGumboEmptyVector;
139
+
140
+ /**
141
+ * Returns the first index at which an element appears in this vector (testing
142
+ * by pointer equality), or -1 if it never does.
143
+ */
144
+ int gumbo_vector_index_of(GumboVector* vector, void* element);
145
+
146
+
147
+ /**
148
+ * An enum for all the tags defined in the HTML5 standard. These correspond to
149
+ * the tag names themselves. Enum constants exist only for tags which appear in
150
+ * the spec itself (or for tags with special handling in the SVG and MathML
151
+ * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
152
+ * name can be obtained through original_tag.
153
+ *
154
+ * This is mostly for API convenience, so that clients of this library don't
155
+ * need to perform a strcasecmp to find the normalized tag name. It also has
156
+ * efficiency benefits, by letting the parser work with enums instead of
157
+ * strings.
158
+ */
159
+ typedef enum {
160
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
161
+ GUMBO_TAG_HTML,
162
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
163
+ GUMBO_TAG_HEAD,
164
+ GUMBO_TAG_TITLE,
165
+ GUMBO_TAG_BASE,
166
+ GUMBO_TAG_LINK,
167
+ GUMBO_TAG_META,
168
+ GUMBO_TAG_STYLE,
169
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
170
+ GUMBO_TAG_SCRIPT,
171
+ GUMBO_TAG_NOSCRIPT,
172
+ GUMBO_TAG_TEMPLATE,
173
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
174
+ GUMBO_TAG_BODY,
175
+ GUMBO_TAG_ARTICLE,
176
+ GUMBO_TAG_SECTION,
177
+ GUMBO_TAG_NAV,
178
+ GUMBO_TAG_ASIDE,
179
+ GUMBO_TAG_H1,
180
+ GUMBO_TAG_H2,
181
+ GUMBO_TAG_H3,
182
+ GUMBO_TAG_H4,
183
+ GUMBO_TAG_H5,
184
+ GUMBO_TAG_H6,
185
+ GUMBO_TAG_HGROUP,
186
+ GUMBO_TAG_HEADER,
187
+ GUMBO_TAG_FOOTER,
188
+ GUMBO_TAG_ADDRESS,
189
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
190
+ GUMBO_TAG_P,
191
+ GUMBO_TAG_HR,
192
+ GUMBO_TAG_PRE,
193
+ GUMBO_TAG_BLOCKQUOTE,
194
+ GUMBO_TAG_OL,
195
+ GUMBO_TAG_UL,
196
+ GUMBO_TAG_LI,
197
+ GUMBO_TAG_DL,
198
+ GUMBO_TAG_DT,
199
+ GUMBO_TAG_DD,
200
+ GUMBO_TAG_FIGURE,
201
+ GUMBO_TAG_FIGCAPTION,
202
+ GUMBO_TAG_MAIN,
203
+ GUMBO_TAG_DIV,
204
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
205
+ GUMBO_TAG_A,
206
+ GUMBO_TAG_EM,
207
+ GUMBO_TAG_STRONG,
208
+ GUMBO_TAG_SMALL,
209
+ GUMBO_TAG_S,
210
+ GUMBO_TAG_CITE,
211
+ GUMBO_TAG_Q,
212
+ GUMBO_TAG_DFN,
213
+ GUMBO_TAG_ABBR,
214
+ GUMBO_TAG_DATA,
215
+ GUMBO_TAG_TIME,
216
+ GUMBO_TAG_CODE,
217
+ GUMBO_TAG_VAR,
218
+ GUMBO_TAG_SAMP,
219
+ GUMBO_TAG_KBD,
220
+ GUMBO_TAG_SUB,
221
+ GUMBO_TAG_SUP,
222
+ GUMBO_TAG_I,
223
+ GUMBO_TAG_B,
224
+ GUMBO_TAG_U,
225
+ GUMBO_TAG_MARK,
226
+ GUMBO_TAG_RUBY,
227
+ GUMBO_TAG_RT,
228
+ GUMBO_TAG_RP,
229
+ GUMBO_TAG_BDI,
230
+ GUMBO_TAG_BDO,
231
+ GUMBO_TAG_SPAN,
232
+ GUMBO_TAG_BR,
233
+ GUMBO_TAG_WBR,
234
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
235
+ GUMBO_TAG_INS,
236
+ GUMBO_TAG_DEL,
237
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
238
+ GUMBO_TAG_IMAGE,
239
+ GUMBO_TAG_IMG,
240
+ GUMBO_TAG_IFRAME,
241
+ GUMBO_TAG_EMBED,
242
+ GUMBO_TAG_OBJECT,
243
+ GUMBO_TAG_PARAM,
244
+ GUMBO_TAG_VIDEO,
245
+ GUMBO_TAG_AUDIO,
246
+ GUMBO_TAG_SOURCE,
247
+ GUMBO_TAG_TRACK,
248
+ GUMBO_TAG_CANVAS,
249
+ GUMBO_TAG_MAP,
250
+ GUMBO_TAG_AREA,
251
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
252
+ GUMBO_TAG_MATH,
253
+ GUMBO_TAG_MI,
254
+ GUMBO_TAG_MO,
255
+ GUMBO_TAG_MN,
256
+ GUMBO_TAG_MS,
257
+ GUMBO_TAG_MTEXT,
258
+ GUMBO_TAG_MGLYPH,
259
+ GUMBO_TAG_MALIGNMARK,
260
+ GUMBO_TAG_ANNOTATION_XML,
261
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
262
+ GUMBO_TAG_SVG,
263
+ GUMBO_TAG_FOREIGNOBJECT,
264
+ GUMBO_TAG_DESC,
265
+ // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
266
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
267
+ GUMBO_TAG_TABLE,
268
+ GUMBO_TAG_CAPTION,
269
+ GUMBO_TAG_COLGROUP,
270
+ GUMBO_TAG_COL,
271
+ GUMBO_TAG_TBODY,
272
+ GUMBO_TAG_THEAD,
273
+ GUMBO_TAG_TFOOT,
274
+ GUMBO_TAG_TR,
275
+ GUMBO_TAG_TD,
276
+ GUMBO_TAG_TH,
277
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
278
+ GUMBO_TAG_FORM,
279
+ GUMBO_TAG_FIELDSET,
280
+ GUMBO_TAG_LEGEND,
281
+ GUMBO_TAG_LABEL,
282
+ GUMBO_TAG_INPUT,
283
+ GUMBO_TAG_BUTTON,
284
+ GUMBO_TAG_SELECT,
285
+ GUMBO_TAG_DATALIST,
286
+ GUMBO_TAG_OPTGROUP,
287
+ GUMBO_TAG_OPTION,
288
+ GUMBO_TAG_TEXTAREA,
289
+ GUMBO_TAG_KEYGEN,
290
+ GUMBO_TAG_OUTPUT,
291
+ GUMBO_TAG_PROGRESS,
292
+ GUMBO_TAG_METER,
293
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
294
+ GUMBO_TAG_DETAILS,
295
+ GUMBO_TAG_SUMMARY,
296
+ GUMBO_TAG_MENU,
297
+ GUMBO_TAG_MENUITEM,
298
+ // Non-conforming elements that nonetheless appear in the HTML5 spec.
299
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
300
+ GUMBO_TAG_APPLET,
301
+ GUMBO_TAG_ACRONYM,
302
+ GUMBO_TAG_BGSOUND,
303
+ GUMBO_TAG_DIR,
304
+ GUMBO_TAG_FRAME,
305
+ GUMBO_TAG_FRAMESET,
306
+ GUMBO_TAG_NOFRAMES,
307
+ GUMBO_TAG_ISINDEX,
308
+ GUMBO_TAG_LISTING,
309
+ GUMBO_TAG_XMP,
310
+ GUMBO_TAG_NEXTID,
311
+ GUMBO_TAG_NOEMBED,
312
+ GUMBO_TAG_PLAINTEXT,
313
+ GUMBO_TAG_RB,
314
+ GUMBO_TAG_STRIKE,
315
+ GUMBO_TAG_BASEFONT,
316
+ GUMBO_TAG_BIG,
317
+ GUMBO_TAG_BLINK,
318
+ GUMBO_TAG_CENTER,
319
+ GUMBO_TAG_FONT,
320
+ GUMBO_TAG_MARQUEE,
321
+ GUMBO_TAG_MULTICOL,
322
+ GUMBO_TAG_NOBR,
323
+ GUMBO_TAG_SPACER,
324
+ GUMBO_TAG_TT,
325
+ // Used for all tags that don't have special handling in HTML.
326
+ GUMBO_TAG_UNKNOWN,
327
+ // A marker value to indicate the end of the enum, for iterating over it.
328
+ // Also used as the terminator for varargs functions that take tags.
329
+ GUMBO_TAG_LAST,
330
+ } GumboTag;
331
+
332
+ /**
333
+ * Returns the normalized (usually all-lowercased, except for foreign content)
334
+ * tag name for a GumboTag enum. Return value is static data owned by the
335
+ * library.
336
+ */
337
+ const char* gumbo_normalized_tagname(GumboTag tag);
338
+
339
+ /**
340
+ * Extracts the tag name from the original_text field of an element or token by
341
+ * stripping off </> characters and attributes and adjusting the passed-in
342
+ * GumboStringPiece appropriately. The tag name is in the original case and
343
+ * shares a buffer with the original text, to simplify memory management.
344
+ * Behavior is undefined if a string-piece that doesn't represent an HTML tag
345
+ * (<tagname> or </tagname>) is passed in. If the string piece is completely
346
+ * empty (NULL data pointer), then this function will exit successfully as a
347
+ * no-op.
348
+ */
349
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
350
+
351
+ /**
352
+ * Fixes the case of SVG elements that are not all lowercase.
353
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
354
+ * This is not done at parse time because there's no place to store a mutated
355
+ * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
356
+ * without special handling), while original_tag_name is a pointer into the
357
+ * original buffer. Instead, we provide this helper function that clients can
358
+ * use to rename SVG tags as appropriate.
359
+ * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
360
+ * no normalization is called for. The return value is static data and owned by
361
+ * the library.
362
+ */
363
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
364
+
365
+ /**
366
+ * Converts a tag name string (which may be in upper or mixed case) to a tag
367
+ * enum.
368
+ */
369
+ GumboTag gumbo_tag_enum(const char* tagname);
370
+
371
+ /**
372
+ * Attribute namespaces.
373
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces on
374
+ * attributes. Everything else goes in the generic "NONE" namespace.
375
+ */
376
+ typedef enum {
377
+ GUMBO_ATTR_NAMESPACE_NONE,
378
+ GUMBO_ATTR_NAMESPACE_XLINK,
379
+ GUMBO_ATTR_NAMESPACE_XML,
380
+ GUMBO_ATTR_NAMESPACE_XMLNS,
381
+ } GumboAttributeNamespaceEnum;
382
+
383
+ /**
384
+ * A struct representing a single attribute on an HTML tag. This is a
385
+ * name-value pair, but also includes information about source locations and
386
+ * original source text.
387
+ */
388
+ typedef struct {
389
+ /**
390
+ * The namespace for the attribute. This will usually be
391
+ * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
392
+ * values, per:
393
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
394
+ */
395
+ GumboAttributeNamespaceEnum attr_namespace;
396
+
397
+ /**
398
+ * The name of the attribute. This is in a freshly-allocated buffer to deal
399
+ * with case-normalization, and is null-terminated.
400
+ */
401
+ const char* name;
402
+
403
+ /**
404
+ * The original text of the attribute name, as a pointer into the original
405
+ * source buffer.
406
+ */
407
+ GumboStringPiece original_name;
408
+
409
+ /**
410
+ * The value of the attribute. This is in a freshly-allocated buffer to deal
411
+ * with unescaping, and is null-terminated. It does not include any quotes
412
+ * that surround the attribute. If the attribute has no value (for example,
413
+ * 'selected' on a checkbox), this will be an empty string.
414
+ */
415
+ const char* value;
416
+
417
+ /**
418
+ * The original text of the value of the attribute. This points into the
419
+ * original source buffer. It includes any quotes that surround the
420
+ * attribute, and you can look at original_value.data[0] and
421
+ * original_value.data[original_value.length - 1] to determine what the quote
422
+ * characters were. If the attribute has no value, this will be a 0-length
423
+ * string.
424
+ */
425
+ GumboStringPiece original_value;
426
+
427
+ /** The starting position of the attribute name. */
428
+ GumboSourcePosition name_start;
429
+
430
+ /**
431
+ * The ending position of the attribute name. This is not always derivable
432
+ * from the starting position of the value because of the possibility of
433
+ * whitespace around the = sign.
434
+ */
435
+ GumboSourcePosition name_end;
436
+
437
+ /** The starting position of the attribute value. */
438
+ GumboSourcePosition value_start;
439
+
440
+ /** The ending position of the attribute value. */
441
+ GumboSourcePosition value_end;
442
+ } GumboAttribute;
443
+
444
+ /**
445
+ * Given a vector of GumboAttributes, look up the one with the specified name
446
+ * and return it, or NULL if no such attribute exists. This uses a
447
+ * case-insensitive match, as HTML is case-insensitive.
448
+ */
449
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
450
+
451
+ /**
452
+ * Enum denoting the type of node. This determines the type of the node.v
453
+ * union.
454
+ */
455
+ typedef enum {
456
+ /** Document node. v will be a GumboDocument. */
457
+ GUMBO_NODE_DOCUMENT,
458
+ /** Element node. v will be a GumboElement. */
459
+ GUMBO_NODE_ELEMENT,
460
+ /** Text node. v will be a GumboText. */
461
+ GUMBO_NODE_TEXT,
462
+ /** CDATA node. v will be a GumboText. */
463
+ GUMBO_NODE_CDATA,
464
+ /** Comment node. v will be a GumboText, excluding comment delimiters. */
465
+ GUMBO_NODE_COMMENT,
466
+ /** Text node, where all contents are whitespace. v will be a GumboText. */
467
+ GUMBO_NODE_WHITESPACE
468
+ } GumboNodeType;
469
+
470
+ /**
471
+ * Forward declaration of GumboNode so it can be used recursively in
472
+ * GumboNode.parent.
473
+ */
474
+ typedef struct GumboInternalNode GumboNode;
475
+
476
+ /** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
477
+ typedef enum {
478
+ GUMBO_DOCTYPE_NO_QUIRKS,
479
+ GUMBO_DOCTYPE_QUIRKS,
480
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
481
+ } GumboQuirksModeEnum;
482
+
483
+ /**
484
+ * Namespaces.
485
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
486
+ * anything inside an <svg> tag is in the SVG namespace, anything inside the
487
+ * <math> tag is in the MathML namespace, and anything else is inside the HTML
488
+ * namespace. No other namespaces are supported, so this can be an enum only.
489
+ */
490
+ typedef enum {
491
+ GUMBO_NAMESPACE_HTML,
492
+ GUMBO_NAMESPACE_SVG,
493
+ GUMBO_NAMESPACE_MATHML
494
+ } GumboNamespaceEnum;
495
+
496
+ /**
497
+ * Parse flags.
498
+ * We track the reasons for parser insertion of nodes and store them in a
499
+ * bitvector in the node itself. This lets client code optimize out nodes that
500
+ * are implied by the HTML structure of the document, or flag constructs that
501
+ * may not be allowed by a style guide, or track the prevalence of incorrect or
502
+ * tricky HTML code.
503
+ */
504
+ typedef enum {
505
+ /**
506
+ * A normal node - both start and end tags appear in the source, nothing has
507
+ * been reparented.
508
+ */
509
+ GUMBO_INSERTION_NORMAL = 0,
510
+
511
+ /**
512
+ * A node inserted by the parser to fulfill some implicit insertion rule.
513
+ * This is usually set in addition to some other flag giving a more specific
514
+ * insertion reason; it's a generic catch-all term meaning "The start tag for
515
+ * this node did not appear in the document source".
516
+ */
517
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
518
+
519
+ /**
520
+ * A flag indicating that the end tag for this node did not appear in the
521
+ * document source. Note that in some cases, you can still have
522
+ * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
523
+ * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
524
+ * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
525
+ * exists. This flag will be set only if the end tag is completely missing;
526
+ * in some cases, the end tag may be misplaced (eg. a </body> tag with text
527
+ * afterwards), which will leave this flag unset and require clients to
528
+ * inspect the parse errors for that case.
529
+ */
530
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
531
+
532
+ // Value 1 << 2 was for a flag that has since been removed.
533
+
534
+ /**
535
+ * A flag for nodes that are inserted because their presence is implied by
536
+ * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
537
+ */
538
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
539
+
540
+ /**
541
+ * A flag for nodes that are converted from their end tag equivalents. For
542
+ * example, </p> when no paragraph is open implies that the parser should
543
+ * create a <p> tag and immediately close it, while </br> means the same thing
544
+ * as <br>.
545
+ */
546
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
547
+
548
+ /** A flag for nodes that are converted from the parse of an <isindex> tag. */
549
+ GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
550
+
551
+ /** A flag for <image> tags that are rewritten as <img>. */
552
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
553
+
554
+ /**
555
+ * A flag for nodes that are cloned as a result of the reconstruction of
556
+ * active formatting elements. This is set only on the clone; the initial
557
+ * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
558
+ */
559
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
560
+
561
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
562
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
563
+
564
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
565
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
566
+
567
+ /**
568
+ * A flag for nodes that have been foster-parented out of a table (or
569
+ * should've been foster-parented, if verbatim mode is set).
570
+ */
571
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
572
+ } GumboParseFlags;
573
+
574
+
575
+ /**
576
+ * Information specific to document nodes.
577
+ */
578
+ typedef struct {
579
+ /**
580
+ * An array of GumboNodes, containing the children of this document. This will
581
+ * normally consist of the <html> element and any comment nodes found.
582
+ * Pointers are owned.
583
+ */
584
+ GumboVector /* GumboNode* */ children;
585
+
586
+ // True if there was an explicit doctype token as opposed to it being omitted.
587
+ bool has_doctype;
588
+
589
+ // Fields from the doctype token, copied verbatim.
590
+ const char* name;
591
+ const char* public_identifier;
592
+ const char* system_identifier;
593
+
594
+ /**
595
+ * Whether or not the document is in QuirksMode, as determined by the values
596
+ * in the GumboTokenDocType template.
597
+ */
598
+ GumboQuirksModeEnum doc_type_quirks_mode;
599
+ } GumboDocument;
600
+
601
+ /**
602
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
603
+ * This contains just a block of text and its position.
604
+ */
605
+ typedef struct {
606
+ /**
607
+ * The text of this node, after entities have been parsed and decoded. For
608
+ * comment/cdata nodes, this does not include the comment delimiters.
609
+ */
610
+ const char* text;
611
+
612
+ /**
613
+ * The original text of this node, as a pointer into the original buffer. For
614
+ * comment/cdata nodes, this includes the comment delimiters.
615
+ */
616
+ GumboStringPiece original_text;
617
+
618
+ /**
619
+ * The starting position of this node. This corresponds to the position of
620
+ * original_text, before entities are decoded.
621
+ * */
622
+ GumboSourcePosition start_pos;
623
+ } GumboText;
624
+
625
+ /**
626
+ * The struct used to represent all HTML elements. This contains information
627
+ * about the tag, attributes, and child nodes.
628
+ */
629
+ typedef struct {
630
+ /**
631
+ * An array of GumboNodes, containing the children of this element. Pointers
632
+ * are owned.
633
+ */
634
+ GumboVector /* GumboNode* */ children;
635
+
636
+ /** The GumboTag enum for this element. */
637
+ GumboTag tag;
638
+
639
+ /** The GumboNamespaceEnum for this element. */
640
+ GumboNamespaceEnum tag_namespace;
641
+
642
+ /**
643
+ * A GumboStringPiece pointing to the original tag text for this element,
644
+ * pointing directly into the source buffer. If the tag was inserted
645
+ * algorithmically (for example, <head> or <tbody> insertion), this will be a
646
+ * zero-length string.
647
+ */
648
+ GumboStringPiece original_tag;
649
+
650
+ /**
651
+ * A GumboStringPiece pointing to the original end tag text for this element.
652
+ * If the end tag was inserted algorithmically, (for example, closing a
653
+ * self-closing tag), this will be a zero-length string.
654
+ */
655
+ GumboStringPiece original_end_tag;
656
+
657
+ /** The source position for the start of the start tag. */
658
+ GumboSourcePosition start_pos;
659
+
660
+ /** The source position for the start of the end tag. */
661
+ GumboSourcePosition end_pos;
662
+
663
+ /**
664
+ * An array of GumboAttributes, containing the attributes for this tag in the
665
+ * order that they were parsed. Pointers are owned.
666
+ */
667
+ GumboVector /* GumboAttribute* */ attributes;
668
+ } GumboElement;
669
+
670
+ /**
671
+ * A supertype for GumboElement and GumboText, so that we can include one
672
+ * generic type in lists of children and cast as necessary to subtypes.
673
+ */
674
+ struct GumboInternalNode {
675
+ /** The type of node that this is. */
676
+ GumboNodeType type;
677
+
678
+ /** Pointer back to parent node. Not owned. */
679
+ GumboNode* parent;
680
+
681
+ /** The index within the parent's children vector of this node. */
682
+ size_t index_within_parent;
683
+
684
+ /**
685
+ * A bitvector of flags containing information about why this element was
686
+ * inserted into the parse tree, including a variety of special parse
687
+ * situations.
688
+ */
689
+ GumboParseFlags parse_flags;
690
+
691
+ /** The actual node data. */
692
+ union {
693
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
694
+ GumboElement element; // For GUMBO_NODE_ELEMENT.
695
+ GumboText text; // For everything else.
696
+ } v;
697
+ };
698
+
699
+ /**
700
+ * The type for an allocator function. Takes the 'userdata' member of the
701
+ * GumboOptions struct as its first argument. Semantics should be the same as
702
+ * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
703
+ * Allocating a block of 0 bytes behaves as per malloc.
704
+ */
705
+ // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
706
+ typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
707
+
708
+ /**
709
+ * The type for a deallocator function. Takes the 'userdata' member of the
710
+ * GumboOptions struct as its first argument.
711
+ */
712
+ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
713
+
714
+ /**
715
+ * Input struct containing configuration options for the parser.
716
+ * These let you specify alternate memory managers, provide different error
717
+ * handling, etc.
718
+ * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
719
+ */
720
+ typedef struct GumboInternalOptions {
721
+ /** A memory allocator function. Default: malloc. */
722
+ GumboAllocatorFunction allocator;
723
+
724
+ /** A memory deallocator function. Default: free. */
725
+ GumboDeallocatorFunction deallocator;
726
+
727
+ /**
728
+ * An opaque object that's passed in as the first argument to all callbacks
729
+ * used by this library. Default: NULL.
730
+ */
731
+ void* userdata;
732
+
733
+ /**
734
+ * The tab-stop size, for computing positions in source code that uses tabs.
735
+ * Default: 8.
736
+ */
737
+ int tab_stop;
738
+
739
+ /**
740
+ * Whether or not to stop parsing when the first error is encountered.
741
+ * Default: false.
742
+ */
743
+ bool stop_on_first_error;
744
+
745
+ /**
746
+ * The maximum number of errors before the parser stops recording them. This
747
+ * is provided so that if the page is totally borked, we don't completely fill
748
+ * up the errors vector and exhaust memory with useless redundant errors. Set
749
+ * to -1 to disable the limit.
750
+ * Default: -1
751
+ */
752
+ int max_errors;
753
+ } GumboOptions;
754
+
755
+ /** Default options struct; use this with gumbo_parse_with_options. */
756
+ extern const GumboOptions kGumboDefaultOptions;
757
+
758
+ /** The output struct containing the results of the parse. */
759
+ typedef struct GumboInternalOutput {
760
+ /**
761
+ * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
762
+ * that contains the entire document as its child.
763
+ */
764
+ GumboNode* document;
765
+
766
+ /**
767
+ * Pointer to the root node. This is the <html> tag that forms the root of the
768
+ * document.
769
+ */
770
+ GumboNode* root;
771
+
772
+ /**
773
+ * A list of errors that occurred during the parse.
774
+ * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
775
+ * fleshed out and may change in the future. For this reason, the GumboError
776
+ * header isn't part of the public API. Contact us if you need errors
777
+ * reported so we can work out something appropriate for your use-case.
778
+ */
779
+ GumboVector /* GumboError */ errors;
780
+ } GumboOutput;
781
+
782
+ /**
783
+ * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
784
+ * live at least as long as the parse tree, as some fields (eg. original_text)
785
+ * point directly into the original buffer.
786
+ *
787
+ * This doesn't support buffers longer than 4 gigabytes.
788
+ */
789
+ GumboOutput* gumbo_parse(const char* buffer);
790
+
791
+ /**
792
+ * Extended version of gumbo_parse that takes an explicit options structure,
793
+ * buffer, and length.
794
+ */
795
+ GumboOutput* gumbo_parse_with_options(
796
+ const GumboOptions* options, const char* buffer, size_t buffer_length);
797
+
798
+ /** Release the memory used for the parse tree & parse errors. */
799
+ void gumbo_destroy_output(
800
+ const GumboOptions* options, GumboOutput* output);
801
+
802
+
803
+ #ifdef __cplusplus
804
+ }
805
+ #endif
806
+
807
+ #endif // GUMBO_GUMBO_H_