Nokogiri_precompiled_aarch64_dedshit 1.14.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (263) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +44 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +287 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +41 -0
  8. data/ext/java/nokogiri/Html4Document.java +157 -0
  9. data/ext/java/nokogiri/Html4ElementDescription.java +133 -0
  10. data/ext/java/nokogiri/Html4EntityLookup.java +63 -0
  11. data/ext/java/nokogiri/Html4SaxParserContext.java +289 -0
  12. data/ext/java/nokogiri/Html4SaxPushParser.java +213 -0
  13. data/ext/java/nokogiri/NokogiriService.java +613 -0
  14. data/ext/java/nokogiri/XmlAttr.java +154 -0
  15. data/ext/java/nokogiri/XmlAttributeDecl.java +119 -0
  16. data/ext/java/nokogiri/XmlCdata.java +60 -0
  17. data/ext/java/nokogiri/XmlComment.java +77 -0
  18. data/ext/java/nokogiri/XmlDocument.java +705 -0
  19. data/ext/java/nokogiri/XmlDocumentFragment.java +163 -0
  20. data/ext/java/nokogiri/XmlDtd.java +516 -0
  21. data/ext/java/nokogiri/XmlElement.java +44 -0
  22. data/ext/java/nokogiri/XmlElementContent.java +412 -0
  23. data/ext/java/nokogiri/XmlElementDecl.java +148 -0
  24. data/ext/java/nokogiri/XmlEntityDecl.java +151 -0
  25. data/ext/java/nokogiri/XmlEntityReference.java +79 -0
  26. data/ext/java/nokogiri/XmlNamespace.java +193 -0
  27. data/ext/java/nokogiri/XmlNode.java +1938 -0
  28. data/ext/java/nokogiri/XmlNodeSet.java +463 -0
  29. data/ext/java/nokogiri/XmlProcessingInstruction.java +79 -0
  30. data/ext/java/nokogiri/XmlReader.java +615 -0
  31. data/ext/java/nokogiri/XmlRelaxng.java +133 -0
  32. data/ext/java/nokogiri/XmlSaxParserContext.java +329 -0
  33. data/ext/java/nokogiri/XmlSaxPushParser.java +288 -0
  34. data/ext/java/nokogiri/XmlSchema.java +423 -0
  35. data/ext/java/nokogiri/XmlSyntaxError.java +137 -0
  36. data/ext/java/nokogiri/XmlText.java +90 -0
  37. data/ext/java/nokogiri/XmlXpathContext.java +305 -0
  38. data/ext/java/nokogiri/XsltStylesheet.java +368 -0
  39. data/ext/java/nokogiri/internals/ClosedStreamException.java +13 -0
  40. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
  41. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +27 -0
  42. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +178 -0
  43. data/ext/java/nokogiri/internals/NokogiriDomParser.java +99 -0
  44. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +140 -0
  45. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +65 -0
  46. data/ext/java/nokogiri/internals/NokogiriHandler.java +339 -0
  47. data/ext/java/nokogiri/internals/NokogiriHelpers.java +817 -0
  48. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +228 -0
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +110 -0
  50. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +86 -0
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +107 -0
  52. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +62 -0
  53. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +165 -0
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +50 -0
  55. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +37 -0
  56. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +70 -0
  57. data/ext/java/nokogiri/internals/ParserContext.java +262 -0
  58. data/ext/java/nokogiri/internals/ReaderNode.java +564 -0
  59. data/ext/java/nokogiri/internals/SaveContextVisitor.java +865 -0
  60. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +50 -0
  61. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +174 -0
  62. data/ext/java/nokogiri/internals/XmlDeclHandler.java +11 -0
  63. data/ext/java/nokogiri/internals/XmlDomParserContext.java +265 -0
  64. data/ext/java/nokogiri/internals/XmlSaxParser.java +40 -0
  65. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +122 -0
  66. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +178 -0
  67. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +43 -0
  68. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +106 -0
  69. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +278 -0
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +664 -0
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +45 -0
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +45 -0
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +388 -0
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +308 -0
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +47 -0
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +51 -0
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +51 -0
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +50 -0
  79. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +660 -0
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +194 -0
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +77 -0
  82. data/ext/java/nokogiri/internals/c14n/Constants.java +45 -0
  83. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +325 -0
  84. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +106 -0
  85. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +86 -0
  86. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +181 -0
  87. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +87 -0
  88. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +452 -0
  89. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +52 -0
  90. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +190 -0
  91. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +540 -0
  92. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1712 -0
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +737 -0
  94. data/ext/nokogiri/depend +38 -0
  95. data/ext/nokogiri/extconf.rb +1086 -0
  96. data/ext/nokogiri/gumbo.c +594 -0
  97. data/ext/nokogiri/html4_document.c +167 -0
  98. data/ext/nokogiri/html4_element_description.c +294 -0
  99. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  100. data/ext/nokogiri/html4_sax_parser_context.c +116 -0
  101. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  102. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  103. data/ext/nokogiri/nokogiri.c +265 -0
  104. data/ext/nokogiri/nokogiri.h +235 -0
  105. data/ext/nokogiri/test_global_handlers.c +42 -0
  106. data/ext/nokogiri/xml_attr.c +103 -0
  107. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  108. data/ext/nokogiri/xml_cdata.c +57 -0
  109. data/ext/nokogiri/xml_comment.c +62 -0
  110. data/ext/nokogiri/xml_document.c +689 -0
  111. data/ext/nokogiri/xml_document_fragment.c +44 -0
  112. data/ext/nokogiri/xml_dtd.c +210 -0
  113. data/ext/nokogiri/xml_element_content.c +128 -0
  114. data/ext/nokogiri/xml_element_decl.c +69 -0
  115. data/ext/nokogiri/xml_encoding_handler.c +104 -0
  116. data/ext/nokogiri/xml_entity_decl.c +112 -0
  117. data/ext/nokogiri/xml_entity_reference.c +50 -0
  118. data/ext/nokogiri/xml_namespace.c +186 -0
  119. data/ext/nokogiri/xml_node.c +2426 -0
  120. data/ext/nokogiri/xml_node_set.c +496 -0
  121. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  122. data/ext/nokogiri/xml_reader.c +794 -0
  123. data/ext/nokogiri/xml_relax_ng.c +164 -0
  124. data/ext/nokogiri/xml_sax_parser.c +316 -0
  125. data/ext/nokogiri/xml_sax_parser_context.c +283 -0
  126. data/ext/nokogiri/xml_sax_push_parser.c +166 -0
  127. data/ext/nokogiri/xml_schema.c +260 -0
  128. data/ext/nokogiri/xml_syntax_error.c +85 -0
  129. data/ext/nokogiri/xml_text.c +48 -0
  130. data/ext/nokogiri/xml_xpath_context.c +415 -0
  131. data/ext/nokogiri/xslt_stylesheet.c +363 -0
  132. data/gumbo-parser/CHANGES.md +63 -0
  133. data/gumbo-parser/Makefile +111 -0
  134. data/gumbo-parser/THANKS +27 -0
  135. data/gumbo-parser/src/Makefile +34 -0
  136. data/gumbo-parser/src/README.md +41 -0
  137. data/gumbo-parser/src/ascii.c +75 -0
  138. data/gumbo-parser/src/ascii.h +115 -0
  139. data/gumbo-parser/src/attribute.c +42 -0
  140. data/gumbo-parser/src/attribute.h +17 -0
  141. data/gumbo-parser/src/char_ref.c +22225 -0
  142. data/gumbo-parser/src/char_ref.h +29 -0
  143. data/gumbo-parser/src/char_ref.rl +2154 -0
  144. data/gumbo-parser/src/error.c +626 -0
  145. data/gumbo-parser/src/error.h +148 -0
  146. data/gumbo-parser/src/foreign_attrs.c +104 -0
  147. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  148. data/gumbo-parser/src/insertion_mode.h +33 -0
  149. data/gumbo-parser/src/macros.h +91 -0
  150. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  151. data/gumbo-parser/src/parser.c +4878 -0
  152. data/gumbo-parser/src/parser.h +41 -0
  153. data/gumbo-parser/src/replacement.h +33 -0
  154. data/gumbo-parser/src/string_buffer.c +103 -0
  155. data/gumbo-parser/src/string_buffer.h +68 -0
  156. data/gumbo-parser/src/string_piece.c +48 -0
  157. data/gumbo-parser/src/svg_attrs.c +174 -0
  158. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  159. data/gumbo-parser/src/svg_tags.c +137 -0
  160. data/gumbo-parser/src/svg_tags.gperf +55 -0
  161. data/gumbo-parser/src/tag.c +223 -0
  162. data/gumbo-parser/src/tag_lookup.c +382 -0
  163. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  164. data/gumbo-parser/src/tag_lookup.h +13 -0
  165. data/gumbo-parser/src/token_buffer.c +79 -0
  166. data/gumbo-parser/src/token_buffer.h +71 -0
  167. data/gumbo-parser/src/token_type.h +17 -0
  168. data/gumbo-parser/src/tokenizer.c +3463 -0
  169. data/gumbo-parser/src/tokenizer.h +112 -0
  170. data/gumbo-parser/src/tokenizer_states.h +339 -0
  171. data/gumbo-parser/src/utf8.c +245 -0
  172. data/gumbo-parser/src/utf8.h +164 -0
  173. data/gumbo-parser/src/util.c +66 -0
  174. data/gumbo-parser/src/util.h +34 -0
  175. data/gumbo-parser/src/vector.c +111 -0
  176. data/gumbo-parser/src/vector.h +45 -0
  177. data/lib/nokogiri/class_resolver.rb +67 -0
  178. data/lib/nokogiri/css/node.rb +54 -0
  179. data/lib/nokogiri/css/parser.rb +770 -0
  180. data/lib/nokogiri/css/parser.y +277 -0
  181. data/lib/nokogiri/css/parser_extras.rb +96 -0
  182. data/lib/nokogiri/css/syntax_error.rb +9 -0
  183. data/lib/nokogiri/css/tokenizer.rb +155 -0
  184. data/lib/nokogiri/css/tokenizer.rex +56 -0
  185. data/lib/nokogiri/css/xpath_visitor.rb +359 -0
  186. data/lib/nokogiri/css.rb +66 -0
  187. data/lib/nokogiri/decorators/slop.rb +44 -0
  188. data/lib/nokogiri/encoding_handler.rb +57 -0
  189. data/lib/nokogiri/extension.rb +32 -0
  190. data/lib/nokogiri/gumbo.rb +15 -0
  191. data/lib/nokogiri/html.rb +48 -0
  192. data/lib/nokogiri/html4/builder.rb +37 -0
  193. data/lib/nokogiri/html4/document.rb +214 -0
  194. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  195. data/lib/nokogiri/html4/element_description.rb +25 -0
  196. data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
  197. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  198. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  199. data/lib/nokogiri/html4/sax/parser.rb +63 -0
  200. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  201. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  202. data/lib/nokogiri/html4.rb +47 -0
  203. data/lib/nokogiri/html5/document.rb +168 -0
  204. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  205. data/lib/nokogiri/html5/node.rb +98 -0
  206. data/lib/nokogiri/html5.rb +389 -0
  207. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  208. data/lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar +0 -0
  209. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  210. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  211. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  212. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  213. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar +0 -0
  214. data/lib/nokogiri/jruby/xalan/serializer/2.7.3/serializer-2.7.3.jar +0 -0
  215. data/lib/nokogiri/jruby/xalan/xalan/2.7.3/xalan-2.7.3.jar +0 -0
  216. data/lib/nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar +0 -0
  217. data/lib/nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar +0 -0
  218. data/lib/nokogiri/syntax_error.rb +6 -0
  219. data/lib/nokogiri/version/constant.rb +6 -0
  220. data/lib/nokogiri/version/info.rb +223 -0
  221. data/lib/nokogiri/version.rb +4 -0
  222. data/lib/nokogiri/xml/attr.rb +66 -0
  223. data/lib/nokogiri/xml/attribute_decl.rb +20 -0
  224. data/lib/nokogiri/xml/builder.rb +487 -0
  225. data/lib/nokogiri/xml/cdata.rb +13 -0
  226. data/lib/nokogiri/xml/character_data.rb +9 -0
  227. data/lib/nokogiri/xml/document.rb +471 -0
  228. data/lib/nokogiri/xml/document_fragment.rb +205 -0
  229. data/lib/nokogiri/xml/dtd.rb +34 -0
  230. data/lib/nokogiri/xml/element_content.rb +38 -0
  231. data/lib/nokogiri/xml/element_decl.rb +15 -0
  232. data/lib/nokogiri/xml/entity_decl.rb +21 -0
  233. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  234. data/lib/nokogiri/xml/namespace.rb +58 -0
  235. data/lib/nokogiri/xml/node/save_options.rb +68 -0
  236. data/lib/nokogiri/xml/node.rb +1563 -0
  237. data/lib/nokogiri/xml/node_set.rb +447 -0
  238. data/lib/nokogiri/xml/notation.rb +19 -0
  239. data/lib/nokogiri/xml/parse_options.rb +213 -0
  240. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  241. data/lib/nokogiri/xml/pp/node.rb +57 -0
  242. data/lib/nokogiri/xml/pp.rb +4 -0
  243. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  244. data/lib/nokogiri/xml/reader.rb +105 -0
  245. data/lib/nokogiri/xml/relax_ng.rb +38 -0
  246. data/lib/nokogiri/xml/sax/document.rb +167 -0
  247. data/lib/nokogiri/xml/sax/parser.rb +125 -0
  248. data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
  249. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  250. data/lib/nokogiri/xml/sax.rb +6 -0
  251. data/lib/nokogiri/xml/schema.rb +73 -0
  252. data/lib/nokogiri/xml/searchable.rb +270 -0
  253. data/lib/nokogiri/xml/syntax_error.rb +72 -0
  254. data/lib/nokogiri/xml/text.rb +11 -0
  255. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  256. data/lib/nokogiri/xml/xpath.rb +21 -0
  257. data/lib/nokogiri/xml/xpath_context.rb +16 -0
  258. data/lib/nokogiri/xml.rb +76 -0
  259. data/lib/nokogiri/xslt/stylesheet.rb +27 -0
  260. data/lib/nokogiri/xslt.rb +65 -0
  261. data/lib/nokogiri.rb +120 -0
  262. data/lib/xsd/xmlparser/nokogiri.rb +106 -0
  263. metadata +391 -0
@@ -0,0 +1,944 @@
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants
8
+
9
+ /**
10
+ * @file
11
+ * @mainpage Gumbo HTML Parser
12
+ *
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
17
+ *
18
+ * Example:
19
+ * @code
20
+ * GumboOutput* output = gumbo_parse(input);
21
+ * do_something_with_doctype(output->document);
22
+ * do_something_with_html_tree(output->root);
23
+ * gumbo_destroy_output(output);
24
+ * @endcode
25
+ *
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
27
+ */
28
+
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
31
+
32
+ #include <stdbool.h>
33
+ #include <stddef.h>
34
+
35
+ #ifdef __cplusplus
36
+ extern "C" {
37
+ #endif
38
+
39
+ /**
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
43
+ */
44
+ typedef struct {
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
48
+ } GumboSourcePosition;
49
+
50
+ /**
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
58
+ */
59
+ typedef struct {
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
61
+ const char* data;
62
+
63
+ /** The length of the string fragment, in bytes (may be zero). */
64
+ size_t length;
65
+ } GumboStringPiece;
66
+
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
68
+ /** A constant to represent a 0-length null string. */
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
70
+
71
+ /**
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
74
+ */
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
79
+
80
+ /**
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
83
+ */
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
88
+
89
+ /**
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
92
+ */
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
97
+
98
+ /**
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
106
+ */
107
+ typedef struct {
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
111
+ */
112
+ void** data;
113
+
114
+ /** Number of elements currently in the vector. */
115
+ unsigned int length;
116
+
117
+ /** Current array capacity. */
118
+ unsigned int capacity;
119
+ } GumboVector;
120
+
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
124
+
125
+ /**
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
128
+ */
129
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
130
+
131
+ /**
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
138
+ *
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
143
+ */
144
+ typedef enum {
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ GUMBO_TAG_SEARCH,
296
+ // Used for all tags that don't have special handling in HTML.
297
+ GUMBO_TAG_UNKNOWN,
298
+ // A marker value to indicate the end of the enum, for iterating over it.
299
+ GUMBO_TAG_LAST,
300
+ } GumboTag;
301
+
302
+ /**
303
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
304
+ * return value is static data owned by the library.
305
+ */
306
+ const char* gumbo_normalized_tagname(GumboTag tag);
307
+
308
+ /**
309
+ * Extracts the tag name from the `original_text` field of an element
310
+ * or token by stripping off `</>` characters and attributes and
311
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
312
+ * name is in the original case and shares a buffer with the original
313
+ * text, to simplify memory management. Behavior is undefined if a
314
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
315
+ * `</tagname>`) is passed in. If the string piece is completely
316
+ * empty (`NULL` data pointer), then this function will exit
317
+ * successfully as a no-op.
318
+ */
319
+ void gumbo_tag_from_original_text(GumboStringPiece* text);
320
+
321
+ /**
322
+ * Fixes the case of SVG elements that are not all lowercase. This is
323
+ * not done at parse time because there's no place to store a mutated
324
+ * tag name. `tag_name` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
325
+ * SVG tags without special handling), while `original_tag_name` is a
326
+ * pointer into the original buffer. Instead, we provide this helper
327
+ * function that clients can use to rename SVG tags as appropriate.
328
+ * Returns the case-normalized SVG tagname if a replacement is found, or
329
+ * `NULL` if no normalization is called for. The return value is static
330
+ * data and owned by the library.
331
+ *
332
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
333
+ */
334
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
335
+
336
+ /**
337
+ * Converts a tag name string (which may be in upper or mixed case) to a
338
+ * tag enum.
339
+ */
340
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
341
+
342
+ /**
343
+ * Attribute namespaces.
344
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
345
+ * on attributes. Everything else goes in the generic "NONE" namespace.
346
+ */
347
+ typedef enum {
348
+ GUMBO_ATTR_NAMESPACE_NONE,
349
+ GUMBO_ATTR_NAMESPACE_XLINK,
350
+ GUMBO_ATTR_NAMESPACE_XML,
351
+ GUMBO_ATTR_NAMESPACE_XMLNS,
352
+ } GumboAttributeNamespaceEnum;
353
+
354
+ /**
355
+ * A struct representing a single attribute on an HTML tag. This is a
356
+ * name-value pair, but also includes information about source locations
357
+ * and original source text.
358
+ */
359
+ typedef struct {
360
+ /**
361
+ * The namespace for the attribute. This will usually be
362
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
363
+ * take special values, per:
364
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
365
+ */
366
+ GumboAttributeNamespaceEnum attr_namespace;
367
+
368
+ /**
369
+ * The name of the attribute. This is in a freshly-allocated buffer to
370
+ * deal with case-normalization and is NUL-terminated.
371
+ */
372
+ const char* name;
373
+
374
+ /**
375
+ * The original text of the attribute name, as a pointer into the
376
+ * original source buffer.
377
+ */
378
+ GumboStringPiece original_name;
379
+
380
+ /**
381
+ * The value of the attribute. This is in a freshly-allocated buffer
382
+ * to deal with unescaping and is NUL-terminated. It does not include
383
+ * any quotes that surround the attribute. If the attribute has no
384
+ * value (for example, `checked` on a checkbox) this will be an empty
385
+ * string.
386
+ */
387
+ const char* value;
388
+
389
+ /**
390
+ * The original text of the value of the attribute. This points into
391
+ * the original source buffer. It includes any quotes that surround
392
+ * the attribute and you can look at `original_value.data[0]` and
393
+ * `original_value.data[original_value.length - 1]` to determine what
394
+ * the quote characters were. If the attribute has no value this will
395
+ * be a 0-length string.
396
+ */
397
+ GumboStringPiece original_value;
398
+
399
+ /** The starting position of the attribute name. */
400
+ GumboSourcePosition name_start;
401
+
402
+ /**
403
+ * The ending position of the attribute name. This is not always derivable
404
+ * from the starting position of the value because of the possibility of
405
+ * whitespace around the `=` sign.
406
+ */
407
+ GumboSourcePosition name_end;
408
+
409
+ /** The starting position of the attribute value. */
410
+ GumboSourcePosition value_start;
411
+
412
+ /** The ending position of the attribute value. */
413
+ GumboSourcePosition value_end;
414
+ } GumboAttribute;
415
+
416
+ /**
417
+ * Given a vector of `GumboAttribute`s, look up the one with the
418
+ * specified name and return it, or `NULL` if no such attribute exists.
419
+ * This uses a case-insensitive match, as HTML is case-insensitive.
420
+ */
421
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
422
+
423
+ /**
424
+ * Enum denoting the type of node. This determines the type of the
425
+ * `node.v` union.
426
+ */
427
+ typedef enum {
428
+ /** Document node. `v` will be a `GumboDocument`. */
429
+ GUMBO_NODE_DOCUMENT,
430
+ /** Element node. `v` will be a `GumboElement`. */
431
+ GUMBO_NODE_ELEMENT,
432
+ /** Text node. `v` will be a `GumboText`. */
433
+ GUMBO_NODE_TEXT,
434
+ /** CDATA node. `v` will be a `GumboText`. */
435
+ GUMBO_NODE_CDATA,
436
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
437
+ GUMBO_NODE_COMMENT,
438
+ /** Text node whose contents are all whitespace. `v` will be a `GumboText`. */
439
+ GUMBO_NODE_WHITESPACE,
440
+ /**
441
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
442
+ * many client libraries will want to ignore the contents of template
443
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
444
+ * do the right thing here, while clients that want to include template
445
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
446
+ * `GumboElement`.
447
+ */
448
+ GUMBO_NODE_TEMPLATE
449
+ } GumboNodeType;
450
+
451
+ /**
452
+ * Forward declaration of GumboNode so it can be used recursively in
453
+ * GumboNode.parent.
454
+ */
455
+ typedef struct GumboInternalNode GumboNode;
456
+
457
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
458
+ typedef enum {
459
+ GUMBO_DOCTYPE_NO_QUIRKS,
460
+ GUMBO_DOCTYPE_QUIRKS,
461
+ GUMBO_DOCTYPE_LIMITED_QUIRKS
462
+ } GumboQuirksModeEnum;
463
+
464
+ /**
465
+ * Namespaces.
466
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
467
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
468
+ * anything inside the `<math>` tag is in the MathML namespace, and
469
+ * anything else is inside the HTML namespace. No other namespaces are
470
+ * supported, so this can be an `enum`.
471
+ */
472
+ typedef enum {
473
+ GUMBO_NAMESPACE_HTML,
474
+ GUMBO_NAMESPACE_SVG,
475
+ GUMBO_NAMESPACE_MATHML
476
+ } GumboNamespaceEnum;
477
+
478
+ /**
479
+ * Parse flags.
480
+ * We track the reasons for parser insertion of nodes and store them in
481
+ * a bitvector in the node itself. This lets client code optimize out
482
+ * nodes that are implied by the HTML structure of the document, or flag
483
+ * constructs that may not be allowed by a style guide, or track the
484
+ * prevalence of incorrect or tricky HTML code.
485
+ */
486
+ typedef enum {
487
+ /**
488
+ * A normal node -- both start and end tags appear in the source,
489
+ * nothing has been reparented.
490
+ */
491
+ GUMBO_INSERTION_NORMAL = 0,
492
+
493
+ /**
494
+ * A node inserted by the parser to fulfill some implicit insertion
495
+ * rule. This is usually set in addition to some other flag giving a
496
+ * more specific insertion reason; it's a generic catch-all term
497
+ * meaning "The start tag for this node did not appear in the document
498
+ * source".
499
+ */
500
+ GUMBO_INSERTION_BY_PARSER = 1 << 0,
501
+
502
+ /**
503
+ * A flag indicating that the end tag for this node did not appear in
504
+ * the document source. Note that in some cases, you can still have
505
+ * parser-inserted nodes with an explicit end tag. For example,
506
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
507
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
508
+ * `</html>` tag actually exists.
509
+ *
510
+ * This flag will be set only if the end tag is completely missing.
511
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
512
+ * with text afterwards), which will leave this flag unset and require
513
+ * clients to inspect the parse errors for that case.
514
+ */
515
+ GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
516
+
517
+ // Value 1 << 2 was for a flag that has since been removed.
518
+
519
+ /**
520
+ * A flag for nodes that are inserted because their presence is
521
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
522
+ * `<tbody>`, etc.
523
+ */
524
+ GUMBO_INSERTION_IMPLIED = 1 << 3,
525
+
526
+ /**
527
+ * A flag for nodes that are converted from their end tag equivalents.
528
+ * For example, `</p>` when no paragraph is open implies that the
529
+ * parser should create a `<p>` tag and immediately close it, while
530
+ * `</br>` means the same thing as `<br>`.
531
+ */
532
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
533
+
534
+ // Value 1 << 5 was for a flag that has since been removed.
535
+
536
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
537
+ GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
538
+
539
+ /**
540
+ * A flag for nodes that are cloned as a result of the reconstruction
541
+ * of active formatting elements. This is set only on the clone; the
542
+ * initial portion of the formatting run is a NORMAL node with an
543
+ * `IMPLICIT_END_TAG`.
544
+ */
545
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
546
+
547
+ /** A flag for nodes that are cloned by the adoption agency algorithm. */
548
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
549
+
550
+ /** A flag for nodes that are moved by the adoption agency algorithm. */
551
+ GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
552
+
553
+ /**
554
+ * A flag for nodes that have been foster-parented out of a table (or
555
+ * should've been foster-parented, if verbatim mode is set).
556
+ */
557
+ GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
558
+ } GumboParseFlags;
559
+
560
+ /** Information specific to document nodes. */
561
+ typedef struct {
562
+ /**
563
+ * An array of `GumboNode`s, containing the children of this element.
564
+ * This will normally consist of the `<html>` element and any comment
565
+ * nodes found. Pointers are owned.
566
+ */
567
+ GumboVector /* GumboNode* */ children;
568
+
569
+ /**
570
+ * `true` if there was an explicit doctype token, as opposed to it
571
+ * being omitted.
572
+ */
573
+ bool has_doctype;
574
+
575
+ // Fields from the doctype token, copied verbatim.
576
+ const char* name;
577
+ const char* public_identifier;
578
+ const char* system_identifier;
579
+
580
+ /**
581
+ * Whether or not the document is in QuirksMode, as determined by the
582
+ * values in the GumboTokenDocType template.
583
+ */
584
+ GumboQuirksModeEnum doc_type_quirks_mode;
585
+ } GumboDocument;
586
+
587
+ /**
588
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
589
+ * elements. This contains just a block of text and its position.
590
+ */
591
+ typedef struct {
592
+ /**
593
+ * The text of this node, after entities have been parsed and decoded.
594
+ * For comment and cdata nodes, this does not include the comment
595
+ * delimiters.
596
+ */
597
+ const char* text;
598
+
599
+ /**
600
+ * The original text of this node, as a pointer into the original
601
+ * buffer. For comment/cdata nodes, this includes the comment
602
+ * delimiters.
603
+ */
604
+ GumboStringPiece original_text;
605
+
606
+ /**
607
+ * The starting position of this node. This corresponds to the
608
+ * position of `original_text`, before entities are decoded.
609
+ * */
610
+ GumboSourcePosition start_pos;
611
+ } GumboText;
612
+
613
+ /**
614
+ * The struct used to represent all HTML elements. This contains
615
+ * information about the tag, attributes, and child nodes.
616
+ */
617
+ typedef struct {
618
+ /**
619
+ * An array of `GumboNode`s, containing the children of this element.
620
+ * Pointers are owned.
621
+ */
622
+ GumboVector /* GumboNode* */ children;
623
+
624
+ /** The GumboTag enum for this element. */
625
+ GumboTag tag;
626
+
627
+ /** The name for this element. */
628
+ const char* name;
629
+
630
+ /** The GumboNamespaceEnum for this element. */
631
+ GumboNamespaceEnum tag_namespace;
632
+
633
+ /**
634
+ * A `GumboStringPiece` pointing to the original tag text for this
635
+ * element, pointing directly into the source buffer. If the tag was
636
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
637
+ * insertion), this will be a zero-length string.
638
+ */
639
+ GumboStringPiece original_tag;
640
+
641
+ /**
642
+ * A `GumboStringPiece` pointing to the original end tag text for this
643
+ * element. If the end tag was inserted algorithmically (for example,
644
+ * closing a self-closing tag), this will be a zero-length string.
645
+ */
646
+ GumboStringPiece original_end_tag;
647
+
648
+ /** The source position for the start of the start tag. */
649
+ GumboSourcePosition start_pos;
650
+
651
+ /** The source position for the start of the end tag. */
652
+ GumboSourcePosition end_pos;
653
+
654
+ /**
655
+ * An array of `GumboAttribute`s, containing the attributes for this
656
+ * tag in the order that they were parsed. Pointers are owned.
657
+ */
658
+ GumboVector /* GumboAttribute* */ attributes;
659
+ } GumboElement;
660
+
661
+ /**
662
+ * A supertype for `GumboDocument`, `GumboElement` and `GumboText`, so that we can
663
+ * include one generic type in lists of children and cast as necessary
664
+ * to subtypes.
665
+ */
666
+ struct GumboInternalNode {
667
+ /** The type of node that this is. */
668
+ GumboNodeType type;
669
+
670
+ /** Pointer back to parent node. Not owned. */
671
+ GumboNode* parent;
672
+
673
+ /** The index within the parent's children vector of this node. */
674
+ unsigned int index_within_parent;
675
+
676
+ /**
677
+ * A bitvector of flags containing information about why this element
678
+ * was inserted into the parse tree, including a variety of special
679
+ * parse situations.
680
+ */
681
+ GumboParseFlags parse_flags;
682
+
683
+ /** The actual node data. */
684
+ union {
685
+ GumboDocument document; // For GUMBO_NODE_DOCUMENT.
686
+ GumboElement element; // For GUMBO_NODE_ELEMENT.
687
+ GumboText text; // For everything else.
688
+ } v;
689
+ };
690
+
691
+ /**
692
+ * Input struct containing configuration options for the parser.
693
+ * These let you specify alternate memory managers, provide different
694
+ * error handling, etc. Use `kGumboDefaultOptions` for sensible
695
+ * defaults and only set what you need.
696
+ */
697
+ typedef struct GumboInternalOptions {
698
+ /**
699
+ * The tab-stop size, for computing positions in HTML files that
700
+ * use tabs. Default: `8`.
701
+ */
702
+ int tab_stop;
703
+
704
+ /**
705
+ * Whether or not to stop parsing when the first error is encountered.
706
+ * Default: `false`.
707
+ */
708
+ bool stop_on_first_error;
709
+
710
+ /**
711
+ * Maximum allowed number of attributes per element. If this limit is
712
+ * exceeded, the parser will return early with a partial document and
713
+ * the returned `GumboOutput` will have its `status` field set to
714
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
715
+ * Default: `400`.
716
+ */
717
+ int max_attributes;
718
+
719
+ /**
720
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
721
+ * the parser will return early with a partial document and the returned
722
+ * `GumboOutput` will have its `status` field set to
723
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
724
+ * Default: `400`.
725
+ */
726
+ unsigned int max_tree_depth;
727
+
728
+ /**
729
+ * The maximum number of errors before the parser stops recording
730
+ * them. This is provided so that if the page is totally borked, we
731
+ * don't completely fill up the errors vector and exhaust memory with
732
+ * useless redundant errors. Set to `-1` to disable the limit.
733
+ * Default: `-1`.
734
+ */
735
+ int max_errors;
736
+
737
+ /**
738
+ * The fragment context for parsing:
739
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
740
+ *
741
+ * If `NULL` is passed here, it is assumed to be "no
742
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
743
+ * tag name for the intended parent of the parsed fragment. We use the
744
+ * tag name, namespace, and encoding attribute which are sufficient to
745
+ * set all of the parsing context needed for fragment parsing.
746
+ *
747
+ * Default: `NULL`.
748
+ */
749
+ const char* fragment_context;
750
+
751
+ /**
752
+ * The namespace for the fragment context. This lets client code
753
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
754
+ * parsing it in HTML.
755
+ *
756
+ * Default: `GUMBO_NAMESPACE_HTML`.
757
+ */
758
+ GumboNamespaceEnum fragment_namespace;
759
+
760
+ /**
761
+ * The value of the fragment context's `encoding` attribute, if any.
762
+ * Set to `NULL` for no `encoding` attribute.
763
+ *
764
+ * Default: `NULL`.
765
+ */
766
+ const char* fragment_encoding;
767
+
768
+ /**
769
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
770
+ * be looked up using `gumbo_compute_quirks_mode()`.
771
+ *
772
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
773
+ */
774
+ GumboQuirksModeEnum quirks_mode;
775
+
776
+ /**
777
+ * For fragment parsing. Set this to true if the context node has a form
778
+ * element as an ancestor.
779
+ *
780
+ * Default: `false`.
781
+ */
782
+ bool fragment_context_has_form_ancestor;
783
+ } GumboOptions;
784
+
785
+ /** Default options struct; use this with gumbo_parse_with_options. */
786
+ extern const GumboOptions kGumboDefaultOptions;
787
+
788
+ /**
789
+ * Status code indicating whether parsing finished successfully or
790
+ * was stopped mid-document due to exceptional circumstances.
791
+ */
792
+ typedef enum {
793
+ /**
794
+ * Indicates that parsing completed successfully. The resulting tree
795
+ * will be a complete document.
796
+ */
797
+ GUMBO_STATUS_OK,
798
+
799
+ /**
800
+ * Indicates that the maximum element nesting limit
801
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
802
+ * resulting tree will be a partial document, with no further nodes
803
+ * created after the point where the limit was reached. The partial
804
+ * document may be useful for constructing an error message but
805
+ * typically shouldn't be used for other purposes.
806
+ */
807
+ GUMBO_STATUS_TREE_TOO_DEEP,
808
+
809
+ /**
810
+ * Indicates that the maximum number of attributes per element
811
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
812
+ * resulting tree will be a partial document, with no further nodes
813
+ * created after the point where the limit was reached. The partial
814
+ * document may be useful for constructing an error message but
815
+ * typically shouldn't be used for other purposes.
816
+ */
817
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
818
+
819
+ // Currently unused
820
+ GUMBO_STATUS_OUT_OF_MEMORY,
821
+ } GumboOutputStatus;
822
+
823
+
824
+ /** The output struct containing the results of the parse. */
825
+ typedef struct GumboInternalOutput {
826
+ /**
827
+ * Pointer to the document node. This is a `GumboNode` of type
828
+ * `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
829
+ */
830
+ GumboNode* document;
831
+
832
+ /**
833
+ * Pointer to the root node. This is the `<html>` tag that forms the
834
+ * root of the document.
835
+ */
836
+ GumboNode* root;
837
+
838
+ /**
839
+ * A list of errors that occurred during the parse.
840
+ */
841
+ GumboVector /* GumboError */ errors;
842
+
843
+ /**
844
+ * True if the parser encountered an error.
845
+ *
846
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
847
+ * option was set to 0.
848
+ */
849
+ bool document_error;
850
+
851
+ /**
852
+ * A status code indicating whether parsing finished successfully or was
853
+ * stopped mid-document due to exceptional circumstances.
854
+ */
855
+ GumboOutputStatus status;
856
+ } GumboOutput;
857
+
858
+ /**
859
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
860
+ * buffer must live at least as long as the parse tree, as some fields
861
+ * (e.g. `original_text`) point directly into the original buffer.
862
+ *
863
+ * This doesn't support buffers longer than 4 gigabytes.
864
+ */
865
+ GumboOutput* gumbo_parse(const char* buffer);
866
+
867
+ /**
868
+ * Extended version of `gumbo_parse` that takes an explicit options
869
+ * structure, buffer, and length.
870
+ */
871
+ GumboOutput* gumbo_parse_with_options (
872
+ const GumboOptions* options,
873
+ const char* buffer,
874
+ size_t buffer_length
875
+ );
876
+
877
+ /**
878
+ * Compute the quirks mode based on the name, public identifier, and system
879
+ * identifier. Any of these may be `NULL` to indicate a missing value.
880
+ */
881
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
882
+ const char *name,
883
+ const char *pubid,
884
+ const char *sysid
885
+ );
886
+
887
+ /** Convert a `GumboOutputStatus` code into a readable description. */
888
+ const char* gumbo_status_to_string(GumboOutputStatus status);
889
+
890
+ /** Release the memory used for the parse tree and parse errors. */
891
+ void gumbo_destroy_output(GumboOutput* output);
892
+
893
+ /** Opaque GumboError type */
894
+ typedef struct GumboInternalError GumboError;
895
+
896
+ /**
897
+ * Returns the position of the error.
898
+ */
899
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
900
+
901
+ /**
902
+ * Returns a constant string representation of the error's code. This is owned
903
+ * by the library and should not be freed by the caller.
904
+ */
905
+ const char* gumbo_error_code(const GumboError* error);
906
+
907
+ /**
908
+ * Prints an error to a string. This stores a freshly-allocated buffer
909
+ * containing the error message text in output. The caller is responsible for
910
+ * freeing the buffer. The size of the error message is returned. The error
911
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
912
+ * returned size must be used.
913
+ */
914
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
915
+
916
+ /**
917
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
918
+ * buffer containing the error message text in output. The caller is responsible for
919
+ * freeing the buffer. The size of the error message is returned. The error
920
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
921
+ * returned size must be used.
922
+ */
923
+ size_t gumbo_caret_diagnostic_to_string (
924
+ const GumboError* error,
925
+ const char* source_text,
926
+ size_t source_length,
927
+ char** output
928
+ );
929
+
930
+ /**
931
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
932
+ * instead of writing to a string.
933
+ */
934
+ void gumbo_print_caret_diagnostic (
935
+ const GumboError* error,
936
+ const char* source_text,
937
+ size_t source_length
938
+ );
939
+
940
+ #ifdef __cplusplus
941
+ }
942
+ #endif
943
+
944
+ #endif // GUMBO_H