ruby-gumbo 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,807 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
+ // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
+ // kGumbo prefix).
20
+
21
+ /**
22
+ * @file
23
+ * @mainpage Gumbo HTML Parser
24
+ *
25
+ * This provides a conformant, no-dependencies implementation of the HTML5
26
+ * parsing algorithm. It supports only UTF8; if you need to parse a different
27
+ * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
+ * tree made of the structs in this file.
29
+ *
30
+ * Example:
31
+ * @code
32
+ * GumboOutput* output = gumbo_parse(input);
33
+ * do_something_with_doctype(output->document);
34
+ * do_something_with_html_tree(output->root);
35
+ * gumbo_destroy_output(&kGumboDefaultOptions, output);
36
+ * @endcode
37
+ * HTML5 Spec:
38
+ *
39
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
40
+ */
41
+
42
+ #ifndef GUMBO_GUMBO_H_
43
+ #define GUMBO_GUMBO_H_
44
+
45
+ #ifdef _MSC_VER
46
+ #define _CRT_SECURE_NO_WARNINGS
47
+ #define fileno _fileno
48
+ #endif
49
+
50
+ #include <stdbool.h>
51
+ #include <stddef.h>
52
+
53
+ #ifdef __cplusplus
54
+ extern "C" {
55
+ #endif
56
+
57
+ /**
58
+ * A struct representing a character position within the original text buffer.
59
+ * Line and column numbers are 1-based and offsets are 0-based, which matches
60
+ * how most editors and command-line tools work. Also, columns measure
61
+ * positions in terms of characters while offsets measure by bytes; this is
62
+ * because the offset field is often used to pull out a particular region of
63
+ * text (which in most languages that bind to C implies pointer arithmetic on a
64
+ * buffer of bytes), while the column field is often used to reference a
65
+ * particular column on a printable display, which nowadays is usually UTF-8.
66
+ */
67
+ typedef struct {
68
+ unsigned int line;
69
+ unsigned int column;
70
+ unsigned int offset;
71
+ } GumboSourcePosition;
72
+
73
+ /**
74
+ * A SourcePosition used for elements that have no source position, i.e.
75
+ * parser-inserted elements.
76
+ */
77
+ extern const GumboSourcePosition kGumboEmptySourcePosition;
78
+
79
+
80
+ /**
81
+ * A struct representing a string or part of a string. Strings within the
82
+ * parser are represented by a char* and a length; the char* points into
83
+ * an existing data buffer owned by some other code (often the original input).
84
+ * GumboStringPieces are assumed (by convention) to be immutable, because they
85
+ * may share data. Use GumboStringBuffer if you need to construct a string.
86
+ * Clients should assume that it is not NUL-terminated, and should always use
87
+ * explicit lengths when manipulating them.
88
+ */
89
+ typedef struct {
90
+ /** A pointer to the beginning of the string. NULL iff length == 0. */
91
+ const char* data;
92
+
93
+ /** The length of the string fragment, in bytes. May be zero. */
94
+ size_t length;
95
+ } GumboStringPiece;
96
+
97
+ /** A constant to represent a 0-length null string. */
98
+ extern const GumboStringPiece kGumboEmptyString;
99
+
100
+ /**
101
+ * Compares two GumboStringPieces, and returns true if they're equal or false
102
+ * otherwise.
103
+ */
104
+ bool gumbo_string_equals(
105
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
106
+
107
+ /**
108
+ * Compares two GumboStringPieces ignoring case, and returns true if they're
109
+ * equal or false otherwise.
110
+ */
111
+ bool gumbo_string_equals_ignore_case(
112
+ const GumboStringPiece* str1, const GumboStringPiece* str2);
113
+
114
+
115
+ /**
116
+ * A simple vector implementation. This stores a pointer to a data array and a
117
+ * length. All elements are stored as void*; client code must cast to the
118
+ * appropriate type. Overflows upon addition result in reallocation of the data
119
+ * array, with the size doubling to maintain O(1) amortized cost. There is no
120
+ * removal function, as this isn't needed for any of the operations within this
121
+ * library. Iteration can be done through inspecting the structure directly in
122
+ * a for-loop.
123
+ */
124
+ typedef struct {
125
+ /** Data elements. This points to a dynamically-allocated array of capacity
126
+ * elements, each a void* to the element itself.
127
+ */
128
+ void** data;
129
+
130
+ /** Number of elements currently in the vector. */
131
+ unsigned int length;
132
+
133
+ /** Current array capacity. */
134
+ unsigned int capacity;
135
+ } GumboVector;
136
+
137
+ /** An empty (0-length, 0-capacity) GumboVector. */
138
+ extern const GumboVector kGumboEmptyVector;
139
+
140
+ /**
141
+ * Returns the first index at which an element appears in this vector (testing
142
+ * by pointer equality), or -1 if it never does.
143
+ */
144
+ int gumbo_vector_index_of(GumboVector* vector, void* element);
145
+
146
+
147
+ /**
148
+ * An enum for all the tags defined in the HTML5 standard. These correspond to
149
+ * the tag names themselves. Enum constants exist only for tags which appear in
150
+ * the spec itself (or for tags with special handling in the SVG and MathML
151
+ * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
152
+ * name can be obtained through original_tag.
153
+ *
154
+ * This is mostly for API convenience, so that clients of this library don't
155
+ * need to perform a strcasecmp to find the normalized tag name. It also has
156
+ * efficiency benefits, by letting the parser work with enums instead of
157
+ * strings.
158
+ */
159
+ typedef enum {
160
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
161
+ GUMBO_TAG_HTML,
162
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
163
+ GUMBO_TAG_HEAD,
164
+ GUMBO_TAG_TITLE,
165
+ GUMBO_TAG_BASE,
166
+ GUMBO_TAG_LINK,
167
+ GUMBO_TAG_META,
168
+ GUMBO_TAG_STYLE,
169
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
170
+ GUMBO_TAG_SCRIPT,
171
+ GUMBO_TAG_NOSCRIPT,
172
+ GUMBO_TAG_TEMPLATE,
173
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
174
+ GUMBO_TAG_BODY,
175
+ GUMBO_TAG_ARTICLE,
176
+ GUMBO_TAG_SECTION,
177
+ GUMBO_TAG_NAV,
178
+ GUMBO_TAG_ASIDE,
179
+ GUMBO_TAG_H1,
180
+ GUMBO_TAG_H2,
181
+ GUMBO_TAG_H3,
182
+ GUMBO_TAG_H4,
183
+ GUMBO_TAG_H5,
184
+ GUMBO_TAG_H6,
185
+ GUMBO_TAG_HGROUP,
186
+ GUMBO_TAG_HEADER,
187
+ GUMBO_TAG_FOOTER,
188
+ GUMBO_TAG_ADDRESS,
189
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
190
+ GUMBO_TAG_P,
191
+ GUMBO_TAG_HR,
192
+ GUMBO_TAG_PRE,
193
+ GUMBO_TAG_BLOCKQUOTE,
194
+ GUMBO_TAG_OL,
195
+ GUMBO_TAG_UL,
196
+ GUMBO_TAG_LI,
197
+ GUMBO_TAG_DL,
198
+ GUMBO_TAG_DT,
199
+ GUMBO_TAG_DD,
200
+ GUMBO_TAG_FIGURE,
201
+ GUMBO_TAG_FIGCAPTION,
202
+ GUMBO_TAG_MAIN,
203
+ GUMBO_TAG_DIV,
204
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
205
+ GUMBO_TAG_A,
206
+ GUMBO_TAG_EM,
207
+ GUMBO_TAG_STRONG,
208
+ GUMBO_TAG_SMALL,
209
+ GUMBO_TAG_S,
210
+ GUMBO_TAG_CITE,
211
+ GUMBO_TAG_Q,
212
+ GUMBO_TAG_DFN,
213
+ GUMBO_TAG_ABBR,
214
+ GUMBO_TAG_DATA,
215
+ GUMBO_TAG_TIME,
216
+ GUMBO_TAG_CODE,
217
+ GUMBO_TAG_VAR,
218
+ GUMBO_TAG_SAMP,
219
+ GUMBO_TAG_KBD,
220
+ GUMBO_TAG_SUB,
221
+ GUMBO_TAG_SUP,
222
+ GUMBO_TAG_I,
223
+ GUMBO_TAG_B,
224
+ GUMBO_TAG_U,
225
+ GUMBO_TAG_MARK,
226
+ GUMBO_TAG_RUBY,
227
+ GUMBO_TAG_RT,
228
+ GUMBO_TAG_RP,
229
+ GUMBO_TAG_BDI,
230
+ GUMBO_TAG_BDO,
231
+ GUMBO_TAG_SPAN,
232
+ GUMBO_TAG_BR,
233
+ GUMBO_TAG_WBR,
234
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
235
+ GUMBO_TAG_INS,
236
+ GUMBO_TAG_DEL,
237
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
238
+ GUMBO_TAG_IMAGE,
239
+ GUMBO_TAG_IMG,
240
+ GUMBO_TAG_IFRAME,
241
+ GUMBO_TAG_EMBED,
242
+ GUMBO_TAG_OBJECT,
243
+ GUMBO_TAG_PARAM,
244
+ GUMBO_TAG_VIDEO,
245
+ GUMBO_TAG_AUDIO,
246
+ GUMBO_TAG_SOURCE,
247
+ GUMBO_TAG_TRACK,
248
+ GUMBO_TAG_CANVAS,
249
+ GUMBO_TAG_MAP,
250
+ GUMBO_TAG_AREA,
251
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
252
+ GUMBO_TAG_MATH,
253
+ GUMBO_TAG_MI,
254
+ GUMBO_TAG_MO,
255
+ GUMBO_TAG_MN,
256
+ GUMBO_TAG_MS,
257
+ GUMBO_TAG_MTEXT,
258
+ GUMBO_TAG_MGLYPH,
259
+ GUMBO_TAG_MALIGNMARK,
260
+ GUMBO_TAG_ANNOTATION_XML,
261
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
262
+ GUMBO_TAG_SVG,
263
+ GUMBO_TAG_FOREIGNOBJECT,
264
+ GUMBO_TAG_DESC,
265
+ // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
266
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
267
+ GUMBO_TAG_TABLE,
268
+ GUMBO_TAG_CAPTION,
269
+ GUMBO_TAG_COLGROUP,
270
+ GUMBO_TAG_COL,
271
+ GUMBO_TAG_TBODY,
272
+ GUMBO_TAG_THEAD,
273
+ GUMBO_TAG_TFOOT,
274
+ GUMBO_TAG_TR,
275
+ GUMBO_TAG_TD,
276
+ GUMBO_TAG_TH,
277
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
278
+ GUMBO_TAG_FORM,
279
+ GUMBO_TAG_FIELDSET,
280
+ GUMBO_TAG_LEGEND,
281
+ GUMBO_TAG_LABEL,
282
+ GUMBO_TAG_INPUT,
283
+ GUMBO_TAG_BUTTON,
284
+ GUMBO_TAG_SELECT,
285
+ GUMBO_TAG_DATALIST,
286
+ GUMBO_TAG_OPTGROUP,
287
+ GUMBO_TAG_OPTION,
288
+ GUMBO_TAG_TEXTAREA,
289
+ GUMBO_TAG_KEYGEN,
290
+ GUMBO_TAG_OUTPUT,
291
+ GUMBO_TAG_PROGRESS,
292
+ GUMBO_TAG_METER,
293
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
294
+ GUMBO_TAG_DETAILS,
295
+ GUMBO_TAG_SUMMARY,
296
+ GUMBO_TAG_MENU,
297
+ GUMBO_TAG_MENUITEM,
298
+ // Non-conforming elements that nonetheless appear in the HTML5 spec.
299
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
300
+ GUMBO_TAG_APPLET,
301
+ GUMBO_TAG_ACRONYM,
302
+ GUMBO_TAG_BGSOUND,
303
+ GUMBO_TAG_DIR,
304
+ GUMBO_TAG_FRAME,
305
+ GUMBO_TAG_FRAMESET,
306
+ GUMBO_TAG_NOFRAMES,
307
+ GUMBO_TAG_ISINDEX,
308
+ GUMBO_TAG_LISTING,
309
+ GUMBO_TAG_XMP,
310
+ GUMBO_TAG_NEXTID,
311
+ GUMBO_TAG_NOEMBED,
312
+ GUMBO_TAG_PLAINTEXT,
313
+ GUMBO_TAG_RB,
314
+ GUMBO_TAG_STRIKE,
315
+ GUMBO_TAG_BASEFONT,
316
+ GUMBO_TAG_BIG,
317
+ GUMBO_TAG_BLINK,
318
+ GUMBO_TAG_CENTER,
319
+ GUMBO_TAG_FONT,
320
+ GUMBO_TAG_MARQUEE,
321
+ GUMBO_TAG_MULTICOL,
322
+ GUMBO_TAG_NOBR,
323
+ GUMBO_TAG_SPACER,
324
+ GUMBO_TAG_TT,
325
+ // Used for all tags that don't have special handling in HTML.
326
+ GUMBO_TAG_UNKNOWN,
327
+ // A marker value to indicate the end of the enum, for iterating over it.
328
+ // Also used as the terminator for varargs functions that take tags.
329
+ GUMBO_TAG_LAST,
330
+ } GumboTag;
331
+
332
+ /**
333
+ * Returns the normalized (usually all-lowercased, except for foreign content)
334
+ * tag name for a GumboTag enum. Return value is static data owned by the
335
+ * library.
336
+ */
337
+ const char* gumbo_normalized_tagname(GumboTag tag);
338
+
339
+ /**
340
+ * Extracts the tag name from the original_text field of an element or token by
341
+ * stripping off </> characters and attributes and adjusting the passed-in
342
+ * GumboStringPiece appropriately. The tag name is in the original case and
343
+ * shares a buffer with the original text, to simplify memory management.
344
+ * Behavior is undefined if a string-piece that doesn't represent an HTML tag
345
+ * (<tagname> or </tagname>) is passed in. If the string piece is completely
346
+ * empty (NULL data pointer), then this function will exit successfully as a
347
+ * no-op.
348
+ */
349
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
350
+
351
+ /**
352
+ * Fixes the case of SVG elements that are not all lowercase.
353
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
354
+ * This is not done at parse time because there's no place to store a mutated
355
+ * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
356
+ * without special handling), while original_tag_name is a pointer into the
357
+ * original buffer. Instead, we provide this helper function that clients can
358
+ * use to rename SVG tags as appropriate.
359
+ * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
360
+ * no normalization is called for. The return value is static data and owned by
361
+ * the library.
362
+ */
363
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
364
+
365
+ /**
366
+ * Converts a tag name string (which may be in upper or mixed case) to a tag
367
+ * enum.
368
+ */
369
+ GumboTag gumbo_tag_enum(const char* tagname);
370
+
371
+ /**
372
+ * Attribute namespaces.
373
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces on
374
+ * attributes. Everything else goes in the generic "NONE" namespace.
375
+ */
376
+ typedef enum {
377
+ GUMBO_ATTR_NAMESPACE_NONE,
378
+ GUMBO_ATTR_NAMESPACE_XLINK,
379
+ GUMBO_ATTR_NAMESPACE_XML,
380
+ GUMBO_ATTR_NAMESPACE_XMLNS,
381
+ } GumboAttributeNamespaceEnum;
382
+
383
+ /**
384
+ * A struct representing a single attribute on an HTML tag. This is a
385
+ * name-value pair, but also includes information about source locations and
386
+ * original source text.
387
+ */
388
+ typedef struct {
389
+ /**
390
+ * The namespace for the attribute. This will usually be
391
+ * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
392
+ * values, per:
393
+ * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
394
+ */
395
+ GumboAttributeNamespaceEnum attr_namespace;
396
+
397
+ /**
398
+ * The name of the attribute. This is in a freshly-allocated buffer to deal
399
+ * with case-normalization, and is null-terminated.
400
+ */
401
+ const char* name;
402
+
403
+ /**
404
+ * The original text of the attribute name, as a pointer into the original
405
+ * source buffer.
406
+ */
407
+ GumboStringPiece original_name;
408
+
409
+ /**
410
+ * The value of the attribute. This is in a freshly-allocated buffer to deal
411
+ * with unescaping, and is null-terminated. It does not include any quotes
412
+ * that surround the attribute. If the attribute has no value (for example,
413
+ * 'selected' on a checkbox), this will be an empty string.
414
+ */
415
+ const char* value;
416
+
417
+ /**
418
+ * The original text of the value of the attribute. This points into the
419
+ * original source buffer. It includes any quotes that surround the
420
+ * attribute, and you can look at original_value.data[0] and
421
+ * original_value.data[original_value.length - 1] to determine what the quote
422
+ * characters were. If the attribute has no value, this will be a 0-length
423
+ * string.
424
+ */
425
+ GumboStringPiece original_value;
426
+
427
+ /** The starting position of the attribute name. */
428
+ GumboSourcePosition name_start;
429
+
430
+ /**
431
+ * The ending position of the attribute name. This is not always derivable
432
+ * from the starting position of the value because of the possibility of
433
+ * whitespace around the = sign.
434
+ */
435
+ GumboSourcePosition name_end;
436
+
437
+ /** The starting position of the attribute value. */
438
+ GumboSourcePosition value_start;
439
+
440
+ /** The ending position of the attribute value. */
441
+ GumboSourcePosition value_end;
442
+ } GumboAttribute;
443
+
444
+ /**
445
+ * Given a vector of GumboAttributes, look up the one with the specified name
446
+ * and return it, or NULL if no such attribute exists. This uses a
447
+ * case-insensitive match, as HTML is case-insensitive.
448
+ */
449
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
450
+
451
+ /**
452
+ * Enum denoting the type of node. This determines the type of the node.v
453
+ * union.
454
+ */
455
+ typedef enum {
456
+ /** Document node. v will be a GumboDocument. */
457
+ GUMBO_NODE_DOCUMENT,
458
+ /** Element node. v will be a GumboElement. */
459
+ GUMBO_NODE_ELEMENT,
460
+ /** Text node. v will be a GumboText. */
461
+ GUMBO_NODE_TEXT,
462
+ /** CDATA node. v will be a GumboText. */
463
+ GUMBO_NODE_CDATA,
464
+ /** Comment node. v will be a GumboText, excluding comment delimiters. */
465
+ GUMBO_NODE_COMMENT,
466
+ /** Text node whose contents are all whitespace. v will be a GumboText. */
467
+ GUMBO_NODE_WHITESPACE
468
+ } GumboNodeType;
469
+
470
+ /**
471
+ * Forward declaration of GumboNode so it can be used recursively in
472
+ * GumboNode.parent.
473
+ */
474
+ typedef struct GumboInternalNode GumboNode;
475
+
476
+ /** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
477
+ typedef enum {
478
+ GUMBO_DOCTYPE_NO_QUIRKS,
479
+ GUMBO_DOCTYPE_QUIRKS,
480
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
481
+ } GumboQuirksModeEnum;
482
+
483
+ /**
484
+ * Namespaces.
485
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
486
+ * anything inside an <svg> tag is in the SVG namespace, anything inside the
487
+ * <math> tag is in the MathML namespace, and anything else is inside the HTML
488
+ * namespace. No other namespaces are supported, so this can be an enum only.
489
+ */
490
+ typedef enum {
491
+ GUMBO_NAMESPACE_HTML,
492
+ GUMBO_NAMESPACE_SVG,
493
+ GUMBO_NAMESPACE_MATHML
494
+ } GumboNamespaceEnum;
495
+
496
+ /**
497
+ * Parse flags.
498
+ * We track the reasons for parser insertion of nodes and store them in a
499
+ * bitvector in the node itself. This lets client code optimize out nodes that
500
+ * are implied by the HTML structure of the document, or flag constructs that
501
+ * may not be allowed by a style guide, or track the prevalence of incorrect or
502
+ * tricky HTML code.
503
+ */
504
+ typedef enum {
505
+ /**
506
+ * A normal node - both start and end tags appear in the source, nothing has
507
+ * been reparented.
508
+ */
509
+ GUMBO_INSERTION_NORMAL = 0,
510
+
511
+ /**
512
+ * A node inserted by the parser to fulfill some implicit insertion rule.
513
+ * This is usually set in addition to some other flag giving a more specific
514
+ * insertion reason; it's a generic catch-all term meaning "The start tag for
515
+ * this node did not appear in the document source".
516
+ */
517
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
518
+
519
+ /**
520
+ * A flag indicating that the end tag for this node did not appear in the
521
+ * document source. Note that in some cases, you can still have
522
+ * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
523
+ * has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
524
+ * GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
525
+ * exists. This flag will be set only if the end tag is completely missing;
526
+ * in some cases, the end tag may be misplaced (eg. a </body> tag with text
527
+ * afterwards), which will leave this flag unset and require clients to
528
+ * inspect the parse errors for that case.
529
+ */
530
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
531
+
532
+ // Value 1 << 2 was for a flag that has since been removed.
533
+
534
+ /**
535
+ * A flag for nodes that are inserted because their presence is implied by
536
+ * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
537
+ */
538
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
539
+
540
+ /**
541
+ * A flag for nodes that are converted from their end tag equivalents. For
542
+ * example, </p> when no paragraph is open implies that the parser should
543
+ * create a <p> tag and immediately close it, while </br> means the same thing
544
+ * as <br>.
545
+ */
546
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
547
+
548
+ /** A flag for nodes that are converted from the parse of an <isindex> tag. */
549
+ GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
550
+
551
+ /** A flag for <image> tags that are rewritten as <img>. */
552
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
553
+
554
+ /**
555
+ * A flag for nodes that are cloned as a result of the reconstruction of
556
+ * active formatting elements. This is set only on the clone; the initial
557
+ * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
558
+ */
559
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
560
+
561
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
562
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
563
+
564
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
565
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
566
+
567
+ /**
568
+ * A flag for nodes that have been foster-parented out of a table (or
569
+ * should've been foster-parented, if verbatim mode is set).
570
+ */
571
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
572
+ } GumboParseFlags;
573
+
574
+
575
+ /**
576
+ * Information specific to document nodes.
577
+ */
578
+ typedef struct {
579
+ /**
580
+ * An array of GumboNodes, containing the children of this document. This will
581
+ * normally consist of the <html> element and any comment nodes found.
582
+ * Pointers are owned.
583
+ */
584
+ GumboVector /* GumboNode* */ children;
585
+
586
+ // True if there was an explicit doctype token as opposed to it being omitted.
587
+ bool has_doctype;
588
+
589
+ // Fields from the doctype token, copied verbatim.
590
+ const char* name;
591
+ const char* public_identifier;
592
+ const char* system_identifier;
593
+
594
+ /**
595
+ * Whether or not the document is in QuirksMode, as determined by the values
596
+ * in the GumboTokenDocType template.
597
+ */
598
+ GumboQuirksModeEnum doc_type_quirks_mode;
599
+ } GumboDocument;
600
+
601
+ /**
602
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
603
+ * This contains just a block of text and its position.
604
+ */
605
+ typedef struct {
606
+ /**
607
+ * The text of this node, after entities have been parsed and decoded. For
608
+ * comment/cdata nodes, this does not include the comment delimiters.
609
+ */
610
+ const char* text;
611
+
612
+ /**
613
+ * The original text of this node, as a pointer into the original buffer. For
614
+ * comment/cdata nodes, this includes the comment delimiters.
615
+ */
616
+ GumboStringPiece original_text;
617
+
618
+ /**
619
+ * The starting position of this node. This corresponds to the position of
620
+ * original_text, before entities are decoded.
621
+ * */
622
+ GumboSourcePosition start_pos;
623
+ } GumboText;
624
+
625
+ /**
626
+ * The struct used to represent all HTML elements. This contains information
627
+ * about the tag, attributes, and child nodes.
628
+ */
629
+ typedef struct {
630
+ /**
631
+ * An array of GumboNodes, containing the children of this element. Pointers
632
+ * are owned.
633
+ */
634
+ GumboVector /* GumboNode* */ children;
635
+
636
+ /** The GumboTag enum for this element. */
637
+ GumboTag tag;
638
+
639
+ /** The GumboNamespaceEnum for this element. */
640
+ GumboNamespaceEnum tag_namespace;
641
+
642
+ /**
643
+ * A GumboStringPiece pointing to the original tag text for this element,
644
+ * pointing directly into the source buffer. If the tag was inserted
645
+ * algorithmically (for example, <head> or <tbody> insertion), this will be a
646
+ * zero-length string.
647
+ */
648
+ GumboStringPiece original_tag;
649
+
650
+ /**
651
+ * A GumboStringPiece pointing to the original end tag text for this element.
652
+ * If the end tag was inserted algorithmically, (for example, closing a
653
+ * self-closing tag), this will be a zero-length string.
654
+ */
655
+ GumboStringPiece original_end_tag;
656
+
657
+ /** The source position for the start of the start tag. */
658
+ GumboSourcePosition start_pos;
659
+
660
+ /** The source position for the start of the end tag. */
661
+ GumboSourcePosition end_pos;
662
+
663
+ /**
664
+ * An array of GumboAttributes, containing the attributes for this tag in the
665
+ * order that they were parsed. Pointers are owned.
666
+ */
667
+ GumboVector /* GumboAttribute* */ attributes;
668
+ } GumboElement;
669
+
670
+ /**
671
+ * A supertype for GumboElement and GumboText, so that we can include one
672
+ * generic type in lists of children and cast as necessary to subtypes.
673
+ */
674
+ struct GumboInternalNode {
675
+ /** The type of node that this is. */
676
+ GumboNodeType type;
677
+
678
+ /** Pointer back to parent node. Not owned. */
679
+ GumboNode* parent;
680
+
681
+ /** The index within the parent's children vector of this node. */
682
+ size_t index_within_parent;
683
+
684
+ /**
685
+ * A bitvector of flags containing information about why this element was
686
+ * inserted into the parse tree, including a variety of special parse
687
+ * situations.
688
+ */
689
+ GumboParseFlags parse_flags;
690
+
691
+ /** The actual node data. */
692
+ union {
693
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
694
+ GumboElement element; // For GUMBO_NODE_ELEMENT.
695
+ GumboText text; // For everything else.
696
+ } v;
697
+ };
698
+
699
+ /**
700
+ * The type for an allocator function. Takes the 'userdata' member of the
701
+ * GumboParser struct as its first argument. Semantics should be the same as
702
+ * malloc, i.e. return a block of 'size' bytes on success or NULL on failure.
703
+ * Allocating a block of 0 bytes behaves as per malloc.
704
+ */
705
+ // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
706
+ typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
707
+
708
+ /**
709
+ * The type for a deallocator function. Takes the 'userdata' member of the
710
+ * GumboParser struct as its first argument.
711
+ */
712
+ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
713
+
714
+ /**
715
+ * Input struct containing configuration options for the parser.
716
+ * These let you specify alternate memory managers, provide different error
717
+ * handling, etc.
718
+ * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
719
+ */
720
+ typedef struct GumboInternalOptions {
721
+ /** A memory allocator function. Default: malloc. */
722
+ GumboAllocatorFunction allocator;
723
+
724
+ /** A memory deallocator function. Default: free. */
725
+ GumboDeallocatorFunction deallocator;
726
+
727
+ /**
728
+ * An opaque object that's passed in as the first argument to all callbacks
729
+ * used by this library. Default: NULL.
730
+ */
731
+ void* userdata;
732
+
733
+ /**
734
+ * The tab-stop size, for computing positions in source code that uses tabs.
735
+ * Default: 8.
736
+ */
737
+ int tab_stop;
738
+
739
+ /**
740
+ * Whether or not to stop parsing when the first error is encountered.
741
+ * Default: false.
742
+ */
743
+ bool stop_on_first_error;
744
+
745
+ /**
746
+ * The maximum number of errors before the parser stops recording them. This
747
+ * is provided so that if the page is totally borked, we don't completely fill
748
+ * up the errors vector and exhaust memory with useless redundant errors. Set
749
+ * to -1 to disable the limit.
750
+ * Default: -1
751
+ */
752
+ int max_errors;
753
+ } GumboOptions;
754
+
755
+ /** Default options struct; use this with gumbo_parse_with_options. */
756
+ extern const GumboOptions kGumboDefaultOptions;
757
+
758
+ /** The output struct containing the results of the parse. */
759
+ typedef struct GumboInternalOutput {
760
+ /**
761
+ * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
762
+ * that contains the entire document as its child.
763
+ */
764
+ GumboNode* document;
765
+
766
+ /**
767
+ * Pointer to the root node. This is the <html> tag that forms the root of the
768
+ * document.
769
+ */
770
+ GumboNode* root;
771
+
772
+ /**
773
+ * A list of errors that occurred during the parse.
774
+ * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
775
+ * fleshed out and may change in the future. For this reason, the GumboError
776
+ * header isn't part of the public API. Contact us if you need errors
777
+ * reported so we can work out something appropriate for your use-case.
778
+ */
779
+ GumboVector /* GumboError */ errors;
780
+ } GumboOutput;
781
+
782
+ /**
783
+ * Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
784
+ * live at least as long as the parse tree, as some fields (eg. original_text)
785
+ * point directly into the original buffer.
786
+ *
787
+ * This doesn't support buffers longer than 4 gigabytes.
788
+ */
789
+ GumboOutput* gumbo_parse(const char* buffer);
790
+
791
+ /**
792
+ * Extended version of gumbo_parse that takes an explicit options structure,
793
+ * buffer, and length.
794
+ */
795
+ GumboOutput* gumbo_parse_with_options(
796
+ const GumboOptions* options, const char* buffer, size_t buffer_length);
797
+
798
+ /** Release the memory used for the parse tree & parse errors. */
799
+ void gumbo_destroy_output(
800
+ const GumboOptions* options, GumboOutput* output);
801
+
802
+
803
+ #ifdef __cplusplus
804
+ }
805
+ #endif
806
+
807
+ #endif // GUMBO_GUMBO_H_