nokogiri 1.9.1 → 1.15.3

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +45 -0
  3. data/LICENSE-DEPENDENCIES.md +1636 -1024
  4. data/LICENSE.md +5 -28
  5. data/README.md +203 -89
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +33 -61
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +864 -418
  10. data/ext/nokogiri/gumbo.c +594 -0
  11. data/ext/nokogiri/html4_document.c +165 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +108 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +251 -105
  18. data/ext/nokogiri/nokogiri.h +215 -90
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +17 -17
  21. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  22. data/ext/nokogiri/xml_cdata.c +40 -31
  23. data/ext/nokogiri/xml_comment.c +20 -27
  24. data/ext/nokogiri/xml_document.c +401 -240
  25. data/ext/nokogiri/xml_document_fragment.c +13 -17
  26. data/ext/nokogiri/xml_dtd.c +64 -58
  27. data/ext/nokogiri/xml_element_content.c +63 -55
  28. data/ext/nokogiri/xml_element_decl.c +31 -31
  29. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  30. data/ext/nokogiri/xml_entity_decl.c +37 -35
  31. data/ext/nokogiri/xml_entity_reference.c +17 -19
  32. data/ext/nokogiri/xml_namespace.c +135 -61
  33. data/ext/nokogiri/xml_node.c +1346 -677
  34. data/ext/nokogiri/xml_node_set.c +246 -216
  35. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  36. data/ext/nokogiri/xml_reader.c +347 -212
  37. data/ext/nokogiri/xml_relax_ng.c +86 -77
  38. data/ext/nokogiri/xml_sax_parser.c +149 -124
  39. data/ext/nokogiri/xml_sax_parser_context.c +145 -103
  40. data/ext/nokogiri/xml_sax_push_parser.c +64 -36
  41. data/ext/nokogiri/xml_schema.c +138 -81
  42. data/ext/nokogiri/xml_syntax_error.c +42 -21
  43. data/ext/nokogiri/xml_text.c +36 -26
  44. data/ext/nokogiri/xml_xpath_context.c +366 -178
  45. data/ext/nokogiri/xslt_stylesheet.c +335 -189
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +111 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +630 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +103 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/insertion_mode.h +33 -0
  63. data/gumbo-parser/src/macros.h +91 -0
  64. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  65. data/gumbo-parser/src/parser.c +4891 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +223 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +66 -0
  88. data/gumbo-parser/src/util.h +34 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -8
  93. data/lib/nokogiri/css/parser.rb +397 -377
  94. data/lib/nokogiri/css/parser.y +250 -245
  95. data/lib/nokogiri/css/parser_extras.rb +54 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +3 -2
  99. data/lib/nokogiri/css/xpath_visitor.rb +224 -95
  100. data/lib/nokogiri/css.rb +56 -17
  101. data/lib/nokogiri/decorators/slop.rb +9 -7
  102. data/lib/nokogiri/encoding_handler.rb +57 -0
  103. data/lib/nokogiri/extension.rb +32 -0
  104. data/lib/nokogiri/gumbo.rb +15 -0
  105. data/lib/nokogiri/html.rb +38 -27
  106. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  107. data/lib/nokogiri/html4/document.rb +214 -0
  108. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  109. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  110. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  111. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  112. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  113. data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
  114. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  115. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  116. data/lib/nokogiri/html4.rb +47 -0
  117. data/lib/nokogiri/html5/document.rb +168 -0
  118. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  119. data/lib/nokogiri/html5/node.rb +103 -0
  120. data/lib/nokogiri/html5.rb +392 -0
  121. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  122. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  123. data/lib/nokogiri/syntax_error.rb +2 -0
  124. data/lib/nokogiri/version/constant.rb +6 -0
  125. data/lib/nokogiri/version/info.rb +223 -0
  126. data/lib/nokogiri/version.rb +3 -108
  127. data/lib/nokogiri/xml/attr.rb +55 -3
  128. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  129. data/lib/nokogiri/xml/builder.rb +98 -54
  130. data/lib/nokogiri/xml/cdata.rb +3 -1
  131. data/lib/nokogiri/xml/character_data.rb +2 -0
  132. data/lib/nokogiri/xml/document.rb +312 -126
  133. data/lib/nokogiri/xml/document_fragment.rb +93 -48
  134. data/lib/nokogiri/xml/dtd.rb +4 -2
  135. data/lib/nokogiri/xml/element_content.rb +12 -2
  136. data/lib/nokogiri/xml/element_decl.rb +6 -2
  137. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  138. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  139. data/lib/nokogiri/xml/namespace.rb +45 -0
  140. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  141. data/lib/nokogiri/xml/node.rb +1088 -418
  142. data/lib/nokogiri/xml/node_set.rb +173 -63
  143. data/lib/nokogiri/xml/notation.rb +13 -0
  144. data/lib/nokogiri/xml/parse_options.rb +145 -52
  145. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  146. data/lib/nokogiri/xml/pp/node.rb +42 -30
  147. data/lib/nokogiri/xml/pp.rb +4 -2
  148. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  149. data/lib/nokogiri/xml/reader.rb +21 -28
  150. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  151. data/lib/nokogiri/xml/sax/document.rb +45 -49
  152. data/lib/nokogiri/xml/sax/parser.rb +39 -36
  153. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  154. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  155. data/lib/nokogiri/xml/sax.rb +6 -4
  156. data/lib/nokogiri/xml/schema.rb +19 -9
  157. data/lib/nokogiri/xml/searchable.rb +120 -72
  158. data/lib/nokogiri/xml/syntax_error.rb +6 -4
  159. data/lib/nokogiri/xml/text.rb +2 -0
  160. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  161. data/lib/nokogiri/xml/xpath.rb +15 -4
  162. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  163. data/lib/nokogiri/xml.rb +38 -37
  164. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  165. data/lib/nokogiri/xslt.rb +101 -22
  166. data/lib/nokogiri.rb +59 -75
  167. data/lib/xsd/xmlparser/nokogiri.rb +29 -25
  168. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  169. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  170. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  175. data/ports/archives/libxml2-2.11.4.tar.xz +0 -0
  176. data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
  177. metadata +128 -265
  178. data/ext/nokogiri/html_document.c +0 -170
  179. data/ext/nokogiri/html_document.h +0 -10
  180. data/ext/nokogiri/html_element_description.c +0 -279
  181. data/ext/nokogiri/html_element_description.h +0 -10
  182. data/ext/nokogiri/html_entity_lookup.c +0 -32
  183. data/ext/nokogiri/html_entity_lookup.h +0 -8
  184. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  185. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  186. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  187. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  188. data/ext/nokogiri/xml_attr.h +0 -9
  189. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  190. data/ext/nokogiri/xml_cdata.h +0 -9
  191. data/ext/nokogiri/xml_comment.h +0 -9
  192. data/ext/nokogiri/xml_document.h +0 -23
  193. data/ext/nokogiri/xml_document_fragment.h +0 -10
  194. data/ext/nokogiri/xml_dtd.h +0 -10
  195. data/ext/nokogiri/xml_element_content.h +0 -10
  196. data/ext/nokogiri/xml_element_decl.h +0 -9
  197. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  198. data/ext/nokogiri/xml_entity_decl.h +0 -10
  199. data/ext/nokogiri/xml_entity_reference.h +0 -9
  200. data/ext/nokogiri/xml_io.c +0 -61
  201. data/ext/nokogiri/xml_io.h +0 -11
  202. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  203. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  204. data/ext/nokogiri/xml_namespace.h +0 -14
  205. data/ext/nokogiri/xml_node.h +0 -13
  206. data/ext/nokogiri/xml_node_set.h +0 -12
  207. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  208. data/ext/nokogiri/xml_reader.h +0 -10
  209. data/ext/nokogiri/xml_relax_ng.h +0 -9
  210. data/ext/nokogiri/xml_sax_parser.h +0 -39
  211. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  212. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  213. data/ext/nokogiri/xml_schema.h +0 -9
  214. data/ext/nokogiri/xml_syntax_error.h +0 -13
  215. data/ext/nokogiri/xml_text.h +0 -9
  216. data/ext/nokogiri/xml_xpath_context.h +0 -10
  217. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  218. data/lib/nokogiri/html/document.rb +0 -335
  219. data/lib/nokogiri/html/document_fragment.rb +0 -49
  220. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  221. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  222. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  223. data/patches/libxml2/0002-Fix-nullptr-deref-with-XPath-logic-ops.patch +0 -54
  224. data/patches/libxml2/0003-Fix-infinite-loop-in-LZMA-decompression.patch +0 -50
  225. data/ports/archives/libxml2-2.9.8.tar.gz +0 -0
  226. data/ports/archives/libxslt-1.1.32.tar.gz +0 -0
@@ -0,0 +1,944 @@
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants
8
+
9
+ /**
10
+ * @file
11
+ * @mainpage Gumbo HTML Parser
12
+ *
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
17
+ *
18
+ * Example:
19
+ * @code
20
+ * GumboOutput* output = gumbo_parse(input);
21
+ * do_something_with_doctype(output->document);
22
+ * do_something_with_html_tree(output->root);
23
+ * gumbo_destroy_output(output);
24
+ * @endcode
25
+ *
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
27
+ */
28
+
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
31
+
32
+ #include <stdbool.h>
33
+ #include <stddef.h>
34
+
35
+ #ifdef __cplusplus
36
+ extern "C" {
37
+ #endif
38
+
39
+ /**
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
43
+ */
44
+ typedef struct {
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
48
+ } GumboSourcePosition;
49
+
50
+ /**
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
58
+ */
59
+ typedef struct {
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
61
+ const char* data;
62
+
63
+ /** The length of the string fragment, in bytes (may be zero). */
64
+ size_t length;
65
+ } GumboStringPiece;
66
+
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
68
+ /** A constant to represent a 0-length null string. */
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
70
+
71
+ /**
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
74
+ */
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
79
+
80
+ /**
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
83
+ */
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
88
+
89
+ /**
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
92
+ */
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
97
+
98
+ /**
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
106
+ */
107
+ typedef struct {
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
111
+ */
112
+ void** data;
113
+
114
+ /** Number of elements currently in the vector. */
115
+ unsigned int length;
116
+
117
+ /** Current array capacity. */
118
+ unsigned int capacity;
119
+ } GumboVector;
120
+
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
124
+
125
+ /**
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
128
+ */
129
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
130
+
131
+ /**
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
138
+ *
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
143
+ */
144
+ typedef enum {
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ GUMBO_TAG_SEARCH,
296
+ // Used for all tags that don't have special handling in HTML.
297
+ GUMBO_TAG_UNKNOWN,
298
+ // A marker value to indicate the end of the enum, for iterating over it.
299
+ GUMBO_TAG_LAST,
300
+ } GumboTag;
301
+
302
+ /**
303
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
304
+ * return value is static data owned by the library.
305
+ */
306
+ const char* gumbo_normalized_tagname(GumboTag tag);
307
+
308
+ /**
309
+ * Extracts the tag name from the `original_text` field of an element
310
+ * or token by stripping off `</>` characters and attributes and
311
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
312
+ * name is in the original case and shares a buffer with the original
313
+ * text, to simplify memory management. Behavior is undefined if a
314
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
315
+ * `</tagname>`) is passed in. If the string piece is completely
316
+ * empty (`NULL` data pointer), then this function will exit
317
+ * successfully as a no-op.
318
+ */
319
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
320
+
321
+ /**
322
+ * Fixes the case of SVG elements that are not all lowercase. This is
323
+ * not done at parse time because there's no place to store a mutated
324
+ * tag name. `tag` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
325
+ * SVG tags without special handling), while `original_tag` is a
326
+ * pointer into the original buffer. Instead, we provide this helper
327
+ * function that clients can use to rename SVG tags as appropriate.
328
+ * Returns the case-normalized SVG tagname if a replacement is found, or
329
+ * `NULL` if no normalization is called for. The return value is static
330
+ * data and owned by the library.
331
+ *
332
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
333
+ */
334
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
335
+
336
+ /**
337
+ * Converts a tag name string (which may be in upper or mixed case) to a
338
+ * tag enum.
339
+ */
340
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
341
+
342
+ /**
343
+ * Attribute namespaces.
344
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
345
+ * on attributes. Everything else goes in the generic "NONE" namespace.
346
+ */
347
+ typedef enum {
348
+ GUMBO_ATTR_NAMESPACE_NONE,
349
+ GUMBO_ATTR_NAMESPACE_XLINK,
350
+ GUMBO_ATTR_NAMESPACE_XML,
351
+ GUMBO_ATTR_NAMESPACE_XMLNS,
352
+ } GumboAttributeNamespaceEnum;
353
+
354
+ /**
355
+ * A struct representing a single attribute on an HTML tag. This is a
356
+ * name-value pair, but also includes information about source locations
357
+ * and original source text.
358
+ */
359
+ typedef struct {
360
+ /**
361
+ * The namespace for the attribute. This will usually be
362
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
363
+ * take special values, per:
364
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
365
+ */
366
+ GumboAttributeNamespaceEnum attr_namespace;
367
+
368
+ /**
369
+ * The name of the attribute. This is in a freshly-allocated buffer to
370
+ * deal with case-normalization and is null-terminated.
371
+ */
372
+ const char* name;
373
+
374
+ /**
375
+ * The original text of the attribute name, as a pointer into the
376
+ * original source buffer.
377
+ */
378
+ GumboStringPiece original_name;
379
+
380
+ /**
381
+ * The value of the attribute. This is in a freshly-allocated buffer
382
+ * to deal with unescaping and is null-terminated. It does not include
383
+ * any quotes that surround the attribute. If the attribute has no
384
+ * value (for example, `checked` on a checkbox) this will be an empty
385
+ * string.
386
+ */
387
+ const char* value;
388
+
389
+ /**
390
+ * The original text of the value of the attribute. This points into
391
+ * the original source buffer. It includes any quotes that surround
392
+ * the attribute and you can look at `original_value.data[0]` and
393
+ * `original_value.data[original_value.length - 1]` to determine what
394
+ * the quote characters were. If the attribute has no value this will
395
+ * be a 0-length string.
396
+ */
397
+ GumboStringPiece original_value;
398
+
399
+ /** The starting position of the attribute name. */
400
+ GumboSourcePosition name_start;
401
+
402
+ /**
403
+ * The ending position of the attribute name. This is not always derivable
404
+ * from the starting position of the value because of the possibility of
405
+ * whitespace around the `=` sign.
406
+ */
407
+ GumboSourcePosition name_end;
408
+
409
+ /** The starting position of the attribute value. */
410
+ GumboSourcePosition value_start;
411
+
412
+ /** The ending position of the attribute value. */
413
+ GumboSourcePosition value_end;
414
+ } GumboAttribute;
415
+
416
+ /**
417
+ * Given a vector of `GumboAttribute`s, look up the one with the
418
+ * specified name and return it, or `NULL` if no such attribute exists.
419
+ * This uses a case-insensitive match, as HTML is case-insensitive.
420
+ */
421
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
422
+
423
+ /**
424
+ * Enum denoting the type of node. This determines the type of the
425
+ * `node.v` union.
426
+ */
427
+ typedef enum {
428
+ /** Document node. `v` will be a `GumboDocument`. */
429
+ GUMBO_NODE_DOCUMENT,
430
+ /** Element node. `v` will be a `GumboElement`. */
431
+ GUMBO_NODE_ELEMENT,
432
+ /** Text node. `v` will be a `GumboText`. */
433
+ GUMBO_NODE_TEXT,
434
+ /** CDATA node. `v` will be a `GumboText`. */
435
+ GUMBO_NODE_CDATA,
436
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
437
+ GUMBO_NODE_COMMENT,
438
+ /** Text node whose content is entirely whitespace. `v` will be a `GumboText`. */
439
+ GUMBO_NODE_WHITESPACE,
440
+ /**
441
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
442
+ * many client libraries will want to ignore the contents of template
443
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
444
+ * do the right thing here, while clients that want to include template
445
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
446
+ * `GumboElement`.
447
+ */
448
+ GUMBO_NODE_TEMPLATE
449
+ } GumboNodeType;
450
+
451
+ /**
452
+ * Forward declaration of GumboNode so it can be used recursively in
453
+ * GumboNode.parent.
454
+ */
455
+ typedef struct GumboInternalNode GumboNode;
456
+
457
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
458
+ typedef enum {
459
+ GUMBO_DOCTYPE_NO_QUIRKS,
460
+ GUMBO_DOCTYPE_QUIRKS,
461
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
462
+ } GumboQuirksModeEnum;
463
+
464
+ /**
465
+ * Namespaces.
466
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
467
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
468
+ * anything inside the `<math>` tag is in the MathML namespace, and
469
+ * anything else is inside the HTML namespace. No other namespaces are
470
+ * supported, so this can be an `enum`.
471
+ */
472
+ typedef enum {
473
+ GUMBO_NAMESPACE_HTML,
474
+ GUMBO_NAMESPACE_SVG,
475
+ GUMBO_NAMESPACE_MATHML
476
+ } GumboNamespaceEnum;
477
+
478
+ /**
479
+ * Parse flags.
480
+ * We track the reasons for parser insertion of nodes and store them in
481
+ * a bitvector in the node itself. This lets client code optimize out
482
+ * nodes that are implied by the HTML structure of the document, or flag
483
+ * constructs that may not be allowed by a style guide, or track the
484
+ * prevalence of incorrect or tricky HTML code.
485
+ */
486
+ typedef enum {
487
+ /**
488
+ * A normal node -- both start and end tags appear in the source,
489
+ * nothing has been reparented.
490
+ */
491
+ GUMBO_INSERTION_NORMAL = 0,
492
+
493
+ /**
494
+ * A node inserted by the parser to fulfill some implicit insertion
495
+ * rule. This is usually set in addition to some other flag giving a
496
+ * more specific insertion reason; it's a generic catch-all term
497
+ * meaning "The start tag for this node did not appear in the document
498
+ * source".
499
+ */
500
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
501
+
502
+ /**
503
+ * A flag indicating that the end tag for this node did not appear in
504
+ * the document source. Note that in some cases, you can still have
505
+ * parser-inserted nodes with an explicit end tag. For example,
506
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
507
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
508
+ * `</html>` tag actually exists.
509
+ *
510
+ * This flag will be set only if the end tag is completely missing.
511
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
512
+ * with text afterwards), which will leave this flag unset and require
513
+ * clients to inspect the parse errors for that case.
514
+ */
515
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
516
+
517
+ // Value 1 << 2 was for a flag that has since been removed.
518
+
519
+ /**
520
+ * A flag for nodes that are inserted because their presence is
521
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
522
+ * `<tbody>`, etc.
523
+ */
524
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
525
+
526
+ /**
527
+ * A flag for nodes that are converted from their end tag equivalents.
528
+ * For example, `</p>` when no paragraph is open implies that the
529
+ * parser should create a `<p>` tag and immediately close it, while
530
+ * `</br>` means the same thing as `<br>`.
531
+ */
532
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
533
+
534
+ // Value 1 << 5 was for a flag that has since been removed.
535
+
536
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
537
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
538
+
539
+ /**
540
+ * A flag for nodes that are cloned as a result of the reconstruction
541
+ * of active formatting elements. This is set only on the clone; the
542
+ * initial portion of the formatting run is a NORMAL node with an
543
+ * `IMPLICIT_END_TAG`.
544
+ */
545
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
546
+
547
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
548
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
549
+
550
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
551
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
552
+
553
+ /**
554
+ * A flag for nodes that have been foster-parented out of a table (or
555
+ * should've been foster-parented, if verbatim mode is set).
556
+ */
557
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
558
+ } GumboParseFlags;
559
+
560
+ /** Information specific to document nodes. */
561
+ typedef struct {
562
+ /**
563
+ * An array of `GumboNode`s, containing the children of this document.
564
+ * This will normally consist of the `<html>` element and any comment
565
+ * nodes found. Pointers are owned.
566
+ */
567
+ GumboVector /* GumboNode* */ children;
568
+
569
+ /**
570
+ * `true` if there was an explicit doctype token, as opposed to it
571
+ * being omitted.
572
+ */
573
+ bool has_doctype;
574
+
575
+ // Fields from the doctype token, copied verbatim.
576
+ const char* name;
577
+ const char* public_identifier;
578
+ const char* system_identifier;
579
+
580
+ /**
581
+ * Whether or not the document is in QuirksMode, as determined by the
582
+ * values in the GumboTokenDocType template.
583
+ */
584
+ GumboQuirksModeEnum doc_type_quirks_mode;
585
+ } GumboDocument;
586
+
587
+ /**
588
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
589
+ * elements. This contains just a block of text and its position.
590
+ */
591
+ typedef struct {
592
+ /**
593
+ * The text of this node, after entities have been parsed and decoded.
594
+ * For comment and cdata nodes, this does not include the comment
595
+ * delimiters.
596
+ */
597
+ const char* text;
598
+
599
+ /**
600
+ * The original text of this node, as a pointer into the original
601
+ * buffer. For comment/cdata nodes, this includes the comment
602
+ * delimiters.
603
+ */
604
+ GumboStringPiece original_text;
605
+
606
+ /**
607
+ * The starting position of this node. This corresponds to the
608
+ * position of `original_text`, before entities are decoded.
609
+ * */
610
+ GumboSourcePosition start_pos;
611
+ } GumboText;
612
+
613
+ /**
614
+ * The struct used to represent all HTML elements. This contains
615
+ * information about the tag, attributes, and child nodes.
616
+ */
617
+ typedef struct {
618
+ /**
619
+ * An array of `GumboNode`s, containing the children of this element.
620
+ * Pointers are owned.
621
+ */
622
+ GumboVector /* GumboNode* */ children;
623
+
624
+ /** The GumboTag enum for this element. */
625
+ GumboTag tag;
626
+
627
+ /** The name for this element. */
628
+ const char* name;
629
+
630
+ /** The GumboNamespaceEnum for this element. */
631
+ GumboNamespaceEnum tag_namespace;
632
+
633
+ /**
634
+ * A `GumboStringPiece` pointing to the original tag text for this
635
+ * element, pointing directly into the source buffer. If the tag was
636
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
637
+ * insertion), this will be a zero-length string.
638
+ */
639
+ GumboStringPiece original_tag;
640
+
641
+ /**
642
+ * A `GumboStringPiece` pointing to the original end tag text for this
643
+ * element. If the end tag was inserted algorithmically (for example,
644
+ * closing a self-closing tag), this will be a zero-length string.
645
+ */
646
+ GumboStringPiece original_end_tag;
647
+
648
+ /** The source position for the start of the start tag. */
649
+ GumboSourcePosition start_pos;
650
+
651
+ /** The source position for the start of the end tag. */
652
+ GumboSourcePosition end_pos;
653
+
654
+ /**
655
+ * An array of `GumboAttribute`s, containing the attributes for this
656
+ * tag in the order that they were parsed. Pointers are owned.
657
+ */
658
+ GumboVector /* GumboAttribute* */ attributes;
659
+ } GumboElement;
660
+
661
+ /**
662
+ * A supertype for `GumboDocument`, `GumboElement`, and `GumboText`, so that we can
663
+ * include one generic type in lists of children and cast as necessary
664
+ * to subtypes.
665
+ */
666
+ struct GumboInternalNode {
667
+ /** The type of node that this is. */
668
+ GumboNodeType type;
669
+
670
+ /** Pointer back to parent node. Not owned. */
671
+ GumboNode* parent;
672
+
673
+ /** The index within the parent's children vector of this node. */
674
+ unsigned int index_within_parent;
675
+
676
+ /**
677
+ * A bitvector of flags containing information about why this element
678
+ * was inserted into the parse tree, including a variety of special
679
+ * parse situations.
680
+ */
681
+ GumboParseFlags parse_flags;
682
+
683
+ /** The actual node data. */
684
+ union {
685
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
686
+ GumboElement element; // For GUMBO_NODE_ELEMENT.
687
+ GumboText text; // For everything else.
688
+ } v;
689
+ };
690
+
691
+ /**
692
+ * Input struct containing configuration options for the parser.
693
+ * These let you specify alternate memory managers, provide different
694
+ * error handling, etc. Use `kGumboDefaultOptions` for sensible
695
+ * defaults and only set what you need.
696
+ */
697
+ typedef struct GumboInternalOptions {
698
+ /**
699
+ * The tab-stop size, for computing positions in HTML files that
700
+ * use tabs. Default: `8`.
701
+ */
702
+ int tab_stop;
703
+
704
+ /**
705
+ * Whether or not to stop parsing when the first error is encountered.
706
+ * Default: `false`.
707
+ */
708
+ bool stop_on_first_error;
709
+
710
+ /**
711
+ * Maximum allowed number of attributes per element. If this limit is
712
+ * exceeded, the parser will return early with a partial document and
713
+ * the returned `GumboOutput` will have its `status` field set to
714
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
715
+ * Default: `400`.
716
+ */
717
+ int max_attributes;
718
+
719
+ /**
720
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
721
+ * the parser will return early with a partial document and the returned
722
+ * `GumboOutput` will have its `status` field set to
723
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
724
+ * Default: `400`.
725
+ */
726
+ unsigned int max_tree_depth;
727
+
728
+ /**
729
+ * The maximum number of errors before the parser stops recording
730
+ * them. This is provided so that if the page is totally borked, we
731
+ * don't completely fill up the errors vector and exhaust memory with
732
+ * useless redundant errors. Set to `-1` to disable the limit.
733
+ * Default: `-1`.
734
+ */
735
+ int max_errors;
736
+
737
+ /**
738
+ * The fragment context for parsing:
739
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
740
+ *
741
+ * If `NULL` is passed here, it is assumed to be "no
742
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
743
+ * tag name for the intended parent of the parsed fragment. We use the
744
+ * tag name, namespace, and encoding attribute which are sufficient to
745
+ * set all of the parsing context needed for fragment parsing.
746
+ *
747
+ * Default: `NULL`.
748
+ */
749
+ const char* fragment_context;
750
+
751
+ /**
752
+ * The namespace for the fragment context. This lets client code
753
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
754
+ * parsing it in HTML.
755
+ *
756
+ * Default: `GUMBO_NAMESPACE_HTML`.
757
+ */
758
+ GumboNamespaceEnum fragment_namespace;
759
+
760
+ /**
761
+ * The value of the fragment context's `encoding` attribute, if any.
762
+ * Set to `NULL` for no `encoding` attribute.
763
+ *
764
+ * Default: `NULL`.
765
+ */
766
+ const char* fragment_encoding;
767
+
768
+ /**
769
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
770
+ * be looked up using `gumbo_compute_quirks_mode()`.
771
+ *
772
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
773
+ */
774
+ GumboQuirksModeEnum quirks_mode;
775
+
776
+ /**
777
+ * For fragment parsing. Set this to true if the context node has a form
778
+ * element as an ancestor.
779
+ *
780
+ * Default: `false`.
781
+ */
782
+ bool fragment_context_has_form_ancestor;
783
+ } GumboOptions;
784
+
785
+ /** Default options struct; use this with gumbo_parse_with_options. */
786
+ extern const GumboOptions kGumboDefaultOptions;
787
+
788
+ /**
789
+ * Status code indicating whether parsing finished successfully or
790
+ * was stopped mid-document due to exceptional circumstances.
791
+ */
792
+ typedef enum {
793
+ /**
794
+ * Indicates that parsing completed successfully. The resulting tree
795
+ * will be a complete document.
796
+ */
797
+ GUMBO_STATUS_OK,
798
+
799
+ /**
800
+ * Indicates that the maximum element nesting limit
801
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
802
+ * resulting tree will be a partial document, with no further nodes
803
+ * created after the point where the limit was reached. The partial
804
+ * document may be useful for constructing an error message but
805
+ * typically shouldn't be used for other purposes.
806
+ */
807
+ GUMBO_STATUS_TREE_TOO_DEEP,
808
+
809
+ /**
810
+ * Indicates that the maximum number of attributes per element
811
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
812
+ * resulting tree will be a partial document, with no further nodes
813
+ * created after the point where the limit was reached. The partial
814
+ * document may be useful for constructing an error message but
815
+ * typically shouldn't be used for other purposes.
816
+ */
817
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
818
+
819
+ // Currently unused
820
+ GUMBO_STATUS_OUT_OF_MEMORY,
821
+ } GumboOutputStatus;
822
+
823
+
824
+ /** The output struct containing the results of the parse. */
825
+ typedef struct GumboInternalOutput {
826
+ /**
827
+ * Pointer to the document node. This is a `GumboNode` of type
828
+ * `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
829
+ */
830
+ GumboNode* document;
831
+
832
+ /**
833
+ * Pointer to the root node. This is the `<html>` tag that forms the
834
+ * root of the document.
835
+ */
836
+ GumboNode* root;
837
+
838
+ /**
839
+ * A list of errors that occurred during the parse.
840
+ */
841
+ GumboVector /* GumboError */ errors;
842
+
843
+ /**
844
+ * True if the parser encountered an error.
845
+ *
846
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
847
+ * option was set to 0.
848
+ */
849
+ bool document_error;
850
+
851
+ /**
852
+ * A status code indicating whether parsing finished successfully or was
853
+ * stopped mid-document due to exceptional circumstances.
854
+ */
855
+ GumboOutputStatus status;
856
+ } GumboOutput;
857
+
858
+ /**
859
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
860
+ * buffer must live at least as long as the parse tree, as some fields
861
+ * (e.g. `original_text`) point directly into the original buffer.
862
+ *
863
+ * This doesn't support buffers longer than 4 gigabytes.
864
+ */
865
+ GumboOutput* gumbo_parse(const char* buffer);
866
+
867
+ /**
868
+ * Extended version of `gumbo_parse` that takes an explicit options
869
+ * structure, buffer, and length.
870
+ */
871
+ GumboOutput* gumbo_parse_with_options (
872
+ const GumboOptions* options,
873
+ const char* buffer,
874
+ size_t buffer_length
875
+ );
876
+
877
+ /**
878
+ * Compute the quirks mode based on the name, public identifier, and system
879
+ * identifier. Any of these may be `NULL` to indicate a missing value.
880
+ */
881
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
882
+ const char *name,
883
+ const char *pubid,
884
+ const char *sysid
885
+ );
886
+
887
+ /** Convert a `GumboOutputStatus` code into a readable description. */
888
+ const char* gumbo_status_to_string(GumboOutputStatus status);
889
+
890
+ /** Release the memory used for the parse tree and parse errors. */
891
+ void gumbo_destroy_output(GumboOutput* output);
892
+
893
+ /** Opaque GumboError type */
894
+ typedef struct GumboInternalError GumboError;
895
+
896
+ /**
897
+ * Returns the position of the error.
898
+ */
899
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
900
+
901
+ /**
902
+ * Returns a constant string representation of the error's code. This is owned
903
+ * by the library and should not be freed by the caller.
904
+ */
905
+ const char* gumbo_error_code(const GumboError* error);
906
+
907
+ /**
908
+ * Prints an error to a string. This stores a freshly-allocated buffer
909
+ * containing the error message text in output. The caller is responsible for
910
+ * freeing the buffer. The size of the error message is returned. The error
911
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
912
+ * returned size must be used.
913
+ */
914
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
915
+
916
+ /**
917
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
918
+ * buffer containing the error message text in output. The caller is responsible for
919
+ * freeing the buffer. The size of the error message is returned. The error
920
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
921
+ * returned size must be used.
922
+ */
923
+ size_t gumbo_caret_diagnostic_to_string (
924
+ const GumboError* error,
925
+ const char* source_text,
926
+ size_t source_length,
927
+ char** output
928
+ );
929
+
930
+ /**
931
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
932
+ * instead of writing to a string.
933
+ */
934
+ void gumbo_print_caret_diagnostic (
935
+ const GumboError* error,
936
+ const char* source_text,
937
+ size_t source_length
938
+ );
939
+
940
+ #ifdef __cplusplus
941
+ }
942
+ #endif
943
+
944
+ #endif // GUMBO_H