nokogiri 1.11.7 → 1.12.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE-DEPENDENCIES.md +243 -22
  3. data/LICENSE.md +1 -1
  4. data/README.md +6 -5
  5. data/ext/nokogiri/depend +35 -34
  6. data/ext/nokogiri/extconf.rb +185 -103
  7. data/ext/nokogiri/gumbo.c +584 -0
  8. data/ext/nokogiri/{html_document.c → html4_document.c} +8 -8
  9. data/ext/nokogiri/{html_element_description.c → html4_element_description.c} +21 -19
  10. data/ext/nokogiri/{html_entity_lookup.c → html4_entity_lookup.c} +7 -7
  11. data/ext/nokogiri/{html_sax_parser_context.c → html4_sax_parser_context.c} +6 -5
  12. data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +4 -4
  13. data/ext/nokogiri/libxml2_backwards_compat.c +30 -30
  14. data/ext/nokogiri/nokogiri.c +51 -38
  15. data/ext/nokogiri/nokogiri.h +19 -9
  16. data/ext/nokogiri/xml_document.c +14 -14
  17. data/ext/nokogiri/xml_element_content.c +2 -0
  18. data/ext/nokogiri/xml_encoding_handler.c +11 -6
  19. data/ext/nokogiri/xml_namespace.c +4 -2
  20. data/ext/nokogiri/xml_node.c +107 -100
  21. data/ext/nokogiri/xml_node_set.c +20 -20
  22. data/ext/nokogiri/xml_reader.c +2 -0
  23. data/ext/nokogiri/xml_sax_parser.c +6 -6
  24. data/ext/nokogiri/xml_sax_parser_context.c +2 -0
  25. data/ext/nokogiri/xml_schema.c +2 -0
  26. data/ext/nokogiri/xml_xpath_context.c +67 -65
  27. data/ext/nokogiri/xslt_stylesheet.c +2 -1
  28. data/gumbo-parser/CHANGES.md +63 -0
  29. data/gumbo-parser/Makefile +101 -0
  30. data/gumbo-parser/THANKS +27 -0
  31. data/gumbo-parser/src/Makefile +34 -0
  32. data/gumbo-parser/src/README.md +41 -0
  33. data/gumbo-parser/src/ascii.c +75 -0
  34. data/gumbo-parser/src/ascii.h +115 -0
  35. data/gumbo-parser/src/attribute.c +42 -0
  36. data/gumbo-parser/src/attribute.h +17 -0
  37. data/gumbo-parser/src/char_ref.c +22225 -0
  38. data/gumbo-parser/src/char_ref.h +29 -0
  39. data/gumbo-parser/src/char_ref.rl +2154 -0
  40. data/gumbo-parser/src/error.c +626 -0
  41. data/gumbo-parser/src/error.h +148 -0
  42. data/gumbo-parser/src/foreign_attrs.c +104 -0
  43. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  44. data/gumbo-parser/src/gumbo.h +943 -0
  45. data/gumbo-parser/src/insertion_mode.h +33 -0
  46. data/gumbo-parser/src/macros.h +91 -0
  47. data/gumbo-parser/src/parser.c +4886 -0
  48. data/gumbo-parser/src/parser.h +41 -0
  49. data/gumbo-parser/src/replacement.h +33 -0
  50. data/gumbo-parser/src/string_buffer.c +103 -0
  51. data/gumbo-parser/src/string_buffer.h +68 -0
  52. data/gumbo-parser/src/string_piece.c +48 -0
  53. data/gumbo-parser/src/svg_attrs.c +174 -0
  54. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  55. data/gumbo-parser/src/svg_tags.c +137 -0
  56. data/gumbo-parser/src/svg_tags.gperf +55 -0
  57. data/gumbo-parser/src/tag.c +222 -0
  58. data/gumbo-parser/src/tag_lookup.c +382 -0
  59. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  60. data/gumbo-parser/src/tag_lookup.h +13 -0
  61. data/gumbo-parser/src/token_buffer.c +79 -0
  62. data/gumbo-parser/src/token_buffer.h +71 -0
  63. data/gumbo-parser/src/token_type.h +17 -0
  64. data/gumbo-parser/src/tokenizer.c +3463 -0
  65. data/gumbo-parser/src/tokenizer.h +112 -0
  66. data/gumbo-parser/src/tokenizer_states.h +339 -0
  67. data/gumbo-parser/src/utf8.c +245 -0
  68. data/gumbo-parser/src/utf8.h +164 -0
  69. data/gumbo-parser/src/util.c +68 -0
  70. data/gumbo-parser/src/util.h +30 -0
  71. data/gumbo-parser/src/vector.c +111 -0
  72. data/gumbo-parser/src/vector.h +45 -0
  73. data/lib/nokogiri/css/parser.rb +1 -1
  74. data/lib/nokogiri/css/parser.y +1 -1
  75. data/lib/nokogiri/css/syntax_error.rb +1 -1
  76. data/lib/nokogiri/css.rb +14 -14
  77. data/lib/nokogiri/extension.rb +7 -2
  78. data/lib/nokogiri/gumbo.rb +14 -0
  79. data/lib/nokogiri/html.rb +31 -27
  80. data/lib/nokogiri/{html → html4}/builder.rb +2 -2
  81. data/lib/nokogiri/{html → html4}/document.rb +4 -4
  82. data/lib/nokogiri/{html → html4}/document_fragment.rb +3 -3
  83. data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
  84. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
  85. data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
  86. data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
  87. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  88. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
  89. data/lib/nokogiri/html4.rb +40 -0
  90. data/lib/nokogiri/html5/document.rb +74 -0
  91. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  92. data/lib/nokogiri/html5/node.rb +93 -0
  93. data/lib/nokogiri/html5.rb +473 -0
  94. data/lib/nokogiri/version/constant.rb +1 -1
  95. data/lib/nokogiri/version/info.rb +11 -2
  96. data/lib/nokogiri/xml/builder.rb +38 -0
  97. data/lib/nokogiri/xml/document.rb +46 -0
  98. data/lib/nokogiri/xml/node/save_options.rb +1 -1
  99. data/lib/nokogiri/xml/node.rb +6 -5
  100. data/lib/nokogiri/xml/parse_options.rb +2 -0
  101. data/lib/nokogiri/xml/pp.rb +2 -2
  102. data/lib/nokogiri/xml/sax/document.rb +24 -30
  103. data/lib/nokogiri/xml/sax.rb +4 -4
  104. data/lib/nokogiri/xml/xpath.rb +2 -2
  105. data/lib/nokogiri/xml.rb +35 -36
  106. data/lib/nokogiri/xslt/stylesheet.rb +1 -1
  107. data/lib/nokogiri/xslt.rb +16 -16
  108. data/lib/nokogiri.rb +31 -29
  109. metadata +100 -58
  110. data/lib/nokogiri/html/sax/parser_context.rb +0 -17
@@ -0,0 +1,943 @@
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants
8
+
9
+ /**
10
+ * @file
11
+ * @mainpage Gumbo HTML Parser
12
+ *
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
17
+ *
18
+ * Example:
19
+ * @code
20
+ * GumboOutput* output = gumbo_parse(input);
21
+ * do_something_with_doctype(output->document);
22
+ * do_something_with_html_tree(output->root);
23
+ * gumbo_destroy_output(output);
24
+ * @endcode
25
+ *
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
27
+ */
28
+
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
31
+
32
+ #include <stdbool.h>
33
+ #include <stddef.h>
34
+
35
+ #ifdef __cplusplus
36
+ extern "C" {
37
+ #endif
38
+
39
+ /**
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
43
+ */
44
+ typedef struct {
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
48
+ } GumboSourcePosition;
49
+
50
+ /**
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
58
+ */
59
+ typedef struct {
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
61
+ const char* data;
62
+
63
+ /** The length of the string fragment, in bytes (may be zero). */
64
+ size_t length;
65
+ } GumboStringPiece;
66
+
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
68
+ /** A constant to represent a 0-length null string. */
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
70
+
71
+ /**
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
74
+ */
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
79
+
80
+ /**
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
83
+ */
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
88
+
89
+ /**
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
92
+ */
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
97
+
98
+ /**
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
106
+ */
107
+ typedef struct {
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
111
+ */
112
+ void** data;
113
+
114
+ /** Number of elements currently in the vector. */
115
+ unsigned int length;
116
+
117
+ /** Current array capacity. */
118
+ unsigned int capacity;
119
+ } GumboVector;
120
+
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
124
+
125
+ /**
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
128
+ */
129
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
130
+
131
+ /**
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
138
+ *
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
143
+ */
144
+ typedef enum {
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ // Used for all tags that don't have special handling in HTML.
296
+ GUMBO_TAG_UNKNOWN,
297
+ // A marker value to indicate the end of the enum, for iterating over it.
298
+ GUMBO_TAG_LAST,
299
+ } GumboTag;
300
+
301
+ /**
302
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
303
+ * return value is static data owned by the library.
304
+ */
305
+ const char* gumbo_normalized_tagname(GumboTag tag);
306
+
307
+ /**
308
+ * Extracts the tag name from the `original_text` field of an element
309
+ * or token by stripping off `</>` characters and attributes and
310
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
311
+ * name is in the original case and shares a buffer with the original
312
+ * text, to simplify memory management. Behavior is undefined if a
313
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
314
+ * `</tagname>`) is passed in. If the string piece is completely
315
+ * empty (`NULL` data pointer), then this function will exit
316
+ * successfully as a no-op.
317
+ */
318
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
319
+
320
+ /**
321
+ * Fixes the case of SVG elements that are not all lowercase. This is
322
+ * not done at parse time because there's no place to store a mutated
323
+ * tag name. `tag` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
324
+ * SVG tags without special handling), while `original_tag` is a
325
+ * pointer into the original buffer. Instead, we provide this helper
326
+ * function that clients can use to rename SVG tags as appropriate.
327
+ * Returns the case-normalized SVG tagname if a replacement is found, or
328
+ * `NULL` if no normalization is called for. The return value is static
329
+ * data and owned by the library.
330
+ *
331
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
332
+ */
333
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
334
+
335
+ /**
336
+ * Converts a tag name string (which may be in upper or mixed case) to a
337
+ * tag enum.
338
+ */
339
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
340
+
341
+ /**
342
+ * Attribute namespaces.
343
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
344
+ * on attributes. Everything else goes in the generic "NONE" namespace.
345
+ */
346
+ typedef enum {
347
+ GUMBO_ATTR_NAMESPACE_NONE,
348
+ GUMBO_ATTR_NAMESPACE_XLINK,
349
+ GUMBO_ATTR_NAMESPACE_XML,
350
+ GUMBO_ATTR_NAMESPACE_XMLNS,
351
+ } GumboAttributeNamespaceEnum;
352
+
353
+ /**
354
+ * A struct representing a single attribute on an HTML tag. This is a
355
+ * name-value pair, but also includes information about source locations
356
+ * and original source text.
357
+ */
358
+ typedef struct {
359
+ /**
360
+ * The namespace for the attribute. This will usually be
361
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
362
+ * take special values, per:
363
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
364
+ */
365
+ GumboAttributeNamespaceEnum attr_namespace;
366
+
367
+ /**
368
+ * The name of the attribute. This is in a freshly-allocated buffer to
369
+ * deal with case-normalization and is null-terminated.
370
+ */
371
+ const char* name;
372
+
373
+ /**
374
+ * The original text of the attribute name, as a pointer into the
375
+ * original source buffer.
376
+ */
377
+ GumboStringPiece original_name;
378
+
379
+ /**
380
+ * The value of the attribute. This is in a freshly-allocated buffer
381
+ * to deal with unescaping and is null-terminated. It does not include
382
+ * any quotes that surround the attribute. If the attribute has no
383
+ * value (for example, `selected` on a checkbox) this will be an empty
384
+ * string.
385
+ */
386
+ const char* value;
387
+
388
+ /**
389
+ * The original text of the value of the attribute. This points into
390
+ * the original source buffer. It includes any quotes that surround
391
+ * the attribute and you can look at `original_value.data[0]` and
392
+ * `original_value.data[original_value.length - 1]` to determine what
393
+ * the quote characters were. If the attribute has no value this will
394
+ * be a 0-length string.
395
+ */
396
+ GumboStringPiece original_value;
397
+
398
+ /** The starting position of the attribute name. */
399
+ GumboSourcePosition name_start;
400
+
401
+ /**
402
+ * The ending position of the attribute name. This is not always derivable
403
+ * from the starting position of the value because of the possibility of
404
+ * whitespace around the `=` sign.
405
+ */
406
+ GumboSourcePosition name_end;
407
+
408
+ /** The starting position of the attribute value. */
409
+ GumboSourcePosition value_start;
410
+
411
+ /** The ending position of the attribute value. */
412
+ GumboSourcePosition value_end;
413
+ } GumboAttribute;
414
+
415
+ /**
416
+ * Given a vector of `GumboAttribute`s, look up the one with the
417
+ * specified name and return it, or `NULL` if no such attribute exists.
418
+ * This uses a case-insensitive match, as HTML is case-insensitive.
419
+ */
420
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
421
+
422
+ /**
423
+ * Enum denoting the type of node. This determines the type of the
424
+ * `node.v` union.
425
+ */
426
+ typedef enum {
427
+ /** Document node. `v` will be a `GumboDocument`. */
428
+ GUMBO_NODE_DOCUMENT,
429
+ /** Element node. `v` will be a `GumboElement`. */
430
+ GUMBO_NODE_ELEMENT,
431
+ /** Text node. `v` will be a `GumboText`. */
432
+ GUMBO_NODE_TEXT,
433
+ /** CDATA node. `v` will be a `GumboText`. */
434
+ GUMBO_NODE_CDATA,
435
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
436
+ GUMBO_NODE_COMMENT,
437
+ /** Text node, where all content is whitespace. `v` will be a `GumboText`. */
438
+ GUMBO_NODE_WHITESPACE,
439
+ /**
440
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
441
+ * many client libraries will want to ignore the contents of template
442
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
443
+ * do the right thing here, while clients that want to include template
444
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
445
+ * `GumboElement`.
446
+ */
447
+ GUMBO_NODE_TEMPLATE
448
+ } GumboNodeType;
449
+
450
+ /**
451
+ * Forward declaration of GumboNode so it can be used recursively in
452
+ * GumboNode.parent.
453
+ */
454
+ typedef struct GumboInternalNode GumboNode;
455
+
456
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
457
+ typedef enum {
458
+ GUMBO_DOCTYPE_NO_QUIRKS,
459
+ GUMBO_DOCTYPE_QUIRKS,
460
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
461
+ } GumboQuirksModeEnum;
462
+
463
+ /**
464
+ * Namespaces.
465
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
466
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
467
+ * anything inside the `<math>` tag is in the MathML namespace, and
468
+ * anything else is inside the HTML namespace. No other namespaces are
469
+ * supported, so this can be an `enum`.
470
+ */
471
+ typedef enum {
472
+ GUMBO_NAMESPACE_HTML,
473
+ GUMBO_NAMESPACE_SVG,
474
+ GUMBO_NAMESPACE_MATHML
475
+ } GumboNamespaceEnum;
476
+
477
+ /**
478
+ * Parse flags.
479
+ * We track the reasons for parser insertion of nodes and store them in
480
+ * a bitvector in the node itself. This lets client code optimize out
481
+ * nodes that are implied by the HTML structure of the document, or flag
482
+ * constructs that may not be allowed by a style guide, or track the
483
+ * prevalence of incorrect or tricky HTML code.
484
+ */
485
+ typedef enum {
486
+ /**
487
+ * A normal node -- both start and end tags appear in the source,
488
+ * nothing has been reparented.
489
+ */
490
+ GUMBO_INSERTION_NORMAL = 0,
491
+
492
+ /**
493
+ * A node inserted by the parser to fulfill some implicit insertion
494
+ * rule. This is usually set in addition to some other flag giving a
495
+ * more specific insertion reason; it's a generic catch-all term
496
+ * meaning "The start tag for this node did not appear in the document
497
+ * source".
498
+ */
499
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
500
+
501
+ /**
502
+ * A flag indicating that the end tag for this node did not appear in
503
+ * the document source. Note that in some cases, you can still have
504
+ * parser-inserted nodes with an explicit end tag. For example,
505
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
506
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
507
+ * `</html>` tag actually exists.
508
+ *
509
+ * This flag will be set only if the end tag is completely missing.
510
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
511
+ * with text afterwards), which will leave this flag unset and require
512
+ * clients to inspect the parse errors for that case.
513
+ */
514
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
515
+
516
+ // Value 1 << 2 was for a flag that has since been removed.
517
+
518
+ /**
519
+ * A flag for nodes that are inserted because their presence is
520
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
521
+ * `<tbody>`, etc.
522
+ */
523
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
524
+
525
+ /**
526
+ * A flag for nodes that are converted from their end tag equivalents.
527
+ * For example, `</p>` when no paragraph is open implies that the
528
+ * parser should create a `<p>` tag and immediately close it, while
529
+ * `</br>` means the same thing as `<br>`.
530
+ */
531
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
532
+
533
+ // Value 1 << 5 was for a flag that has since been removed.
534
+
535
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
536
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
537
+
538
+ /**
539
+ * A flag for nodes that are cloned as a result of the reconstruction
540
+ * of active formatting elements. This is set only on the clone; the
541
+ * initial portion of the formatting run is a NORMAL node with an
542
+ * `IMPLICIT_END_TAG`.
543
+ */
544
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
545
+
546
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
547
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
548
+
549
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
550
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
551
+
552
+ /**
553
+ * A flag for nodes that have been foster-parented out of a table (or
554
+ * should've been foster-parented, if verbatim mode is set).
555
+ */
556
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
557
+ } GumboParseFlags;
558
+
559
+ /** Information specific to document nodes. */
560
+ typedef struct {
561
+ /**
562
+ * An array of `GumboNode`s, containing the children of this document.
563
+ * This will normally consist of the `<html>` element and any comment
564
+ * nodes found. Pointers are owned.
565
+ */
566
+ GumboVector /* GumboNode* */ children;
567
+
568
+ /**
569
+ * `true` if there was an explicit doctype token, as opposed to it
570
+ * being omitted.
571
+ */
572
+ bool has_doctype;
573
+
574
+ // Fields from the doctype token, copied verbatim.
575
+ const char* name;
576
+ const char* public_identifier;
577
+ const char* system_identifier;
578
+
579
+ /**
580
+ * Whether or not the document is in QuirksMode, as determined by the
581
+ * values in the GumboTokenDocType template.
582
+ */
583
+ GumboQuirksModeEnum doc_type_quirks_mode;
584
+ } GumboDocument;
585
+
586
+ /**
587
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
588
+ * elements. This contains just a block of text and its position.
589
+ */
590
+ typedef struct {
591
+ /**
592
+ * The text of this node, after entities have been parsed and decoded.
593
+ * For comment and cdata nodes, this does not include the comment
594
+ * delimiters.
595
+ */
596
+ const char* text;
597
+
598
+ /**
599
+ * The original text of this node, as a pointer into the original
600
+ * buffer. For comment/cdata nodes, this includes the comment
601
+ * delimiters.
602
+ */
603
+ GumboStringPiece original_text;
604
+
605
+ /**
606
+ * The starting position of this node. This corresponds to the
607
+ * position of `original_text`, before entities are decoded.
608
+ * */
609
+ GumboSourcePosition start_pos;
610
+ } GumboText;
611
+
612
+ /**
613
+ * The struct used to represent all HTML elements. This contains
614
+ * information about the tag, attributes, and child nodes.
615
+ */
616
+ typedef struct {
617
+ /**
618
+ * An array of `GumboNode`s, containing the children of this element.
619
+ * Pointers are owned.
620
+ */
621
+ GumboVector /* GumboNode* */ children;
622
+
623
+ /** The GumboTag enum for this element. */
624
+ GumboTag tag;
625
+
626
+ /** The name for this element. */
627
+ const char* name;
628
+
629
+ /** The GumboNamespaceEnum for this element. */
630
+ GumboNamespaceEnum tag_namespace;
631
+
632
+ /**
633
+ * A `GumboStringPiece` pointing to the original tag text for this
634
+ * element, pointing directly into the source buffer. If the tag was
635
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
636
+ * insertion), this will be a zero-length string.
637
+ */
638
+ GumboStringPiece original_tag;
639
+
640
+ /**
641
+ * A `GumboStringPiece` pointing to the original end tag text for this
642
+ * element. If the end tag was inserted algorithmically, (for example,
643
+ * closing a self-closing tag), this will be a zero-length string.
644
+ */
645
+ GumboStringPiece original_end_tag;
646
+
647
+ /** The source position for the start of the start tag. */
648
+ GumboSourcePosition start_pos;
649
+
650
+ /** The source position for the start of the end tag. */
651
+ GumboSourcePosition end_pos;
652
+
653
+ /**
654
+ * An array of `GumboAttribute`s, containing the attributes for this
655
+ * tag in the order that they were parsed. Pointers are owned.
656
+ */
657
+ GumboVector /* GumboAttribute* */ attributes;
658
+ } GumboElement;
659
+
660
+ /**
661
+ * A supertype for `GumboDocument`, `GumboElement` and `GumboText`, so that we can
662
+ * include one generic type in lists of children and cast as necessary
663
+ * to subtypes.
664
+ */
665
+ struct GumboInternalNode {
666
+ /** The type of node that this is. */
667
+ GumboNodeType type;
668
+
669
+ /** Pointer back to parent node. Not owned. */
670
+ GumboNode* parent;
671
+
672
+ /** The index within the parent's children vector of this node. */
673
+ unsigned int index_within_parent;
674
+
675
+ /**
676
+ * A bitvector of flags containing information about why this element
677
+ * was inserted into the parse tree, including a variety of special
678
+ * parse situations.
679
+ */
680
+ GumboParseFlags parse_flags;
681
+
682
+ /** The actual node data. */
683
+ union {
684
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
685
+ GumboElement element; // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
686
+ GumboText text; // For everything else.
687
+ } v;
688
+ };
689
+
690
+ /**
691
+ * Input struct containing configuration options for the parser.
692
+ * These let you set parsing limits, control error handling, supply
693
+ * fragment-parsing context, etc. Use `kGumboDefaultOptions` for sensible
694
+ * defaults and only set what you need.
695
+ */
696
+ typedef struct GumboInternalOptions {
697
+ /**
698
+ * The tab-stop size, for computing positions in HTML files that
699
+ * use tabs. Default: `8`.
700
+ */
701
+ int tab_stop;
702
+
703
+ /**
704
+ * Whether or not to stop parsing when the first error is encountered.
705
+ * Default: `false`.
706
+ */
707
+ bool stop_on_first_error;
708
+
709
+ /**
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
715
+ */
716
+ int max_attributes;
717
+
718
+ /**
719
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
720
+ * the parser will return early with a partial document and the returned
721
+ * `GumboOutput` will have its `status` field set to
722
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
723
+ * Default: `400`.
724
+ */
725
+ unsigned int max_tree_depth;
726
+
727
+ /**
728
+ * The maximum number of errors before the parser stops recording
729
+ * them. This is provided so that if the page is totally borked, we
730
+ * don't completely fill up the errors vector and exhaust memory with
731
+ * useless redundant errors. Set to `-1` to disable the limit.
732
+ * Default: `-1`.
733
+ */
734
+ int max_errors;
735
+
736
+ /**
737
+ * The fragment context for parsing:
738
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
739
+ *
740
+ * If `NULL` is passed here, it is assumed to be "no
741
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
742
+ * tag name for the intended parent of the parsed fragment. We use the
743
+ * tag name, namespace, and encoding attribute which are sufficient to
744
+ * set all of the parsing context needed for fragment parsing.
745
+ *
746
+ * Default: `NULL`.
747
+ */
748
+ const char* fragment_context;
749
+
750
+ /**
751
+ * The namespace for the fragment context. This lets client code
752
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
753
+ * parsing it in HTML.
754
+ *
755
+ * Default: `GUMBO_NAMESPACE_HTML`.
756
+ */
757
+ GumboNamespaceEnum fragment_namespace;
758
+
759
+ /**
760
+ * The value of the fragment context's `encoding` attribute, if any.
761
+ * Set to `NULL` for no `encoding` attribute.
762
+ *
763
+ * Default: `NULL`.
764
+ */
765
+ const char* fragment_encoding;
766
+
767
+ /**
768
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
769
+ * be looked up using `gumbo_compute_quirks_mode()`.
770
+ *
771
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
772
+ */
773
+ GumboQuirksModeEnum quirks_mode;
774
+
775
+ /**
776
+ * For fragment parsing. Set this to true if the context node has a form
777
+ * element as an ancestor.
778
+ *
779
+ * Default: `false`.
780
+ */
781
+ bool fragment_context_has_form_ancestor;
782
+ } GumboOptions;
783
+
784
+ /** Default options struct; use this with gumbo_parse_with_options. */
785
+ extern const GumboOptions kGumboDefaultOptions;
786
+
787
+ /**
788
+ * Status code indicating whether parsing finished successfully or
789
+ * was stopped mid-document due to exceptional circumstances.
790
+ */
791
+ typedef enum {
792
+ /**
793
+ * Indicates that parsing completed successfully. The resulting tree
794
+ * will be a complete document.
795
+ */
796
+ GUMBO_STATUS_OK,
797
+
798
+ /**
799
+ * Indicates that the maximum element nesting limit
800
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
801
+ * resulting tree will be a partial document, with no further nodes
802
+ * created after the point where the limit was reached. The partial
803
+ * document may be useful for constructing an error message but
804
+ * typically shouldn't be used for other purposes.
805
+ */
806
+ GUMBO_STATUS_TREE_TOO_DEEP,
807
+
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
818
+ // Currently unused
819
+ GUMBO_STATUS_OUT_OF_MEMORY,
820
+ } GumboOutputStatus;
821
+
822
+
823
+ /** The output struct containing the results of the parse. */
824
+ typedef struct GumboInternalOutput {
825
+ /**
826
+ * Pointer to the document node. This is a `GumboNode` of type
827
+ * `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
828
+ */
829
+ GumboNode* document;
830
+
831
+ /**
832
+ * Pointer to the root node. This is the `<html>` tag that forms the
833
+ * root of the document.
834
+ */
835
+ GumboNode* root;
836
+
837
+ /**
838
+ * A list of errors that occurred during the parse.
839
+ */
840
+ GumboVector /* GumboError */ errors;
841
+
842
+ /**
843
+ * True if the parser encountered an error.
844
+ *
845
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
846
+ * option was set to 0.
847
+ */
848
+ bool document_error;
849
+
850
+ /**
851
+ * A status code indicating whether parsing finished successfully or was
852
+ * stopped mid-document due to exceptional circumstances.
853
+ */
854
+ GumboOutputStatus status;
855
+ } GumboOutput;
856
+
857
+ /**
858
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
859
+ * buffer must live at least as long as the parse tree, as some fields
860
+ * (e.g. `original_text`) point directly into the original buffer.
861
+ *
862
+ * This doesn't support buffers longer than 4 gigabytes.
863
+ */
864
+ GumboOutput* gumbo_parse(const char* buffer);
865
+
866
+ /**
867
+ * Extended version of `gumbo_parse` that takes an explicit options
868
+ * structure, buffer, and length.
869
+ */
870
+ GumboOutput* gumbo_parse_with_options (
871
+ const GumboOptions* options,
872
+ const char* buffer,
873
+ size_t buffer_length
874
+ );
875
+
876
+ /**
877
+ * Compute the quirks mode based on the name, public identifier, and system
878
+ * identifier. Any of these may be `NULL` to indicate a missing value.
879
+ */
880
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
881
+ const char *name,
882
+ const char *pubid,
883
+ const char *sysid
884
+ );
885
+
886
+ /** Convert a `GumboOutputStatus` code into a readable description. */
887
+ const char* gumbo_status_to_string(GumboOutputStatus status);
888
+
889
+ /** Release the memory used for the parse tree and parse errors. */
890
+ void gumbo_destroy_output(GumboOutput* output);
891
+
892
+ /** Opaque GumboError type */
893
+ typedef struct GumboInternalError GumboError;
894
+
895
+ /**
896
+ * Returns the position of the error.
897
+ */
898
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
899
+
900
+ /**
901
+ * Returns a constant string representation of the error's code. This is owned
902
+ * by the library and should not be freed by the caller.
903
+ */
904
+ const char* gumbo_error_code(const GumboError* error);
905
+
906
+ /**
907
+ * Prints an error to a string. This stores a freshly-allocated buffer
908
+ * containing the error message text in output. The caller is responsible for
909
+ * freeing the buffer. The size of the error message is returned. The error
910
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
911
+ * returned size must be used.
912
+ */
913
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
914
+
915
+ /**
916
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
917
+ * buffer containing the error message text in output. The caller is responsible for
918
+ * freeing the buffer. The size of the error message is returned. The error
919
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
920
+ * returned size must be used.
921
+ */
922
+ size_t gumbo_caret_diagnostic_to_string (
923
+ const GumboError* error,
924
+ const char* source_text,
925
+ size_t source_length,
926
+ char** output
927
+ );
928
+
929
+ /**
930
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
931
+ * instead of writing to a string.
932
+ */
933
+ void gumbo_print_caret_diagnostic (
934
+ const GumboError* error,
935
+ const char* source_text,
936
+ size_t source_length
937
+ );
938
+
939
+ #ifdef __cplusplus
940
+ }
941
+ #endif
942
+
943
+ #endif // GUMBO_H