Nokogiri_precompiled_aarch64_dedshit 1.14.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (263) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +44 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +287 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +41 -0
  8. data/ext/java/nokogiri/Html4Document.java +157 -0
  9. data/ext/java/nokogiri/Html4ElementDescription.java +133 -0
  10. data/ext/java/nokogiri/Html4EntityLookup.java +63 -0
  11. data/ext/java/nokogiri/Html4SaxParserContext.java +289 -0
  12. data/ext/java/nokogiri/Html4SaxPushParser.java +213 -0
  13. data/ext/java/nokogiri/NokogiriService.java +613 -0
  14. data/ext/java/nokogiri/XmlAttr.java +154 -0
  15. data/ext/java/nokogiri/XmlAttributeDecl.java +119 -0
  16. data/ext/java/nokogiri/XmlCdata.java +60 -0
  17. data/ext/java/nokogiri/XmlComment.java +77 -0
  18. data/ext/java/nokogiri/XmlDocument.java +705 -0
  19. data/ext/java/nokogiri/XmlDocumentFragment.java +163 -0
  20. data/ext/java/nokogiri/XmlDtd.java +516 -0
  21. data/ext/java/nokogiri/XmlElement.java +44 -0
  22. data/ext/java/nokogiri/XmlElementContent.java +412 -0
  23. data/ext/java/nokogiri/XmlElementDecl.java +148 -0
  24. data/ext/java/nokogiri/XmlEntityDecl.java +151 -0
  25. data/ext/java/nokogiri/XmlEntityReference.java +79 -0
  26. data/ext/java/nokogiri/XmlNamespace.java +193 -0
  27. data/ext/java/nokogiri/XmlNode.java +1938 -0
  28. data/ext/java/nokogiri/XmlNodeSet.java +463 -0
  29. data/ext/java/nokogiri/XmlProcessingInstruction.java +79 -0
  30. data/ext/java/nokogiri/XmlReader.java +615 -0
  31. data/ext/java/nokogiri/XmlRelaxng.java +133 -0
  32. data/ext/java/nokogiri/XmlSaxParserContext.java +329 -0
  33. data/ext/java/nokogiri/XmlSaxPushParser.java +288 -0
  34. data/ext/java/nokogiri/XmlSchema.java +423 -0
  35. data/ext/java/nokogiri/XmlSyntaxError.java +137 -0
  36. data/ext/java/nokogiri/XmlText.java +90 -0
  37. data/ext/java/nokogiri/XmlXpathContext.java +305 -0
  38. data/ext/java/nokogiri/XsltStylesheet.java +368 -0
  39. data/ext/java/nokogiri/internals/ClosedStreamException.java +13 -0
  40. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
  41. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +27 -0
  42. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +178 -0
  43. data/ext/java/nokogiri/internals/NokogiriDomParser.java +99 -0
  44. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +140 -0
  45. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +65 -0
  46. data/ext/java/nokogiri/internals/NokogiriHandler.java +339 -0
  47. data/ext/java/nokogiri/internals/NokogiriHelpers.java +817 -0
  48. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +228 -0
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +110 -0
  50. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +86 -0
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +107 -0
  52. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +62 -0
  53. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +165 -0
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +50 -0
  55. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +37 -0
  56. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +70 -0
  57. data/ext/java/nokogiri/internals/ParserContext.java +262 -0
  58. data/ext/java/nokogiri/internals/ReaderNode.java +564 -0
  59. data/ext/java/nokogiri/internals/SaveContextVisitor.java +865 -0
  60. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +50 -0
  61. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +174 -0
  62. data/ext/java/nokogiri/internals/XmlDeclHandler.java +11 -0
  63. data/ext/java/nokogiri/internals/XmlDomParserContext.java +265 -0
  64. data/ext/java/nokogiri/internals/XmlSaxParser.java +40 -0
  65. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +122 -0
  66. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +178 -0
  67. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +43 -0
  68. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +106 -0
  69. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +278 -0
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +664 -0
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +45 -0
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +45 -0
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +388 -0
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +308 -0
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +47 -0
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +51 -0
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +51 -0
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +50 -0
  79. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +660 -0
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +194 -0
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +77 -0
  82. data/ext/java/nokogiri/internals/c14n/Constants.java +45 -0
  83. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +325 -0
  84. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +106 -0
  85. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +86 -0
  86. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +181 -0
  87. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +87 -0
  88. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +452 -0
  89. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +52 -0
  90. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +190 -0
  91. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +540 -0
  92. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1712 -0
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +737 -0
  94. data/ext/nokogiri/depend +38 -0
  95. data/ext/nokogiri/extconf.rb +1086 -0
  96. data/ext/nokogiri/gumbo.c +594 -0
  97. data/ext/nokogiri/html4_document.c +167 -0
  98. data/ext/nokogiri/html4_element_description.c +294 -0
  99. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  100. data/ext/nokogiri/html4_sax_parser_context.c +116 -0
  101. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  102. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  103. data/ext/nokogiri/nokogiri.c +265 -0
  104. data/ext/nokogiri/nokogiri.h +235 -0
  105. data/ext/nokogiri/test_global_handlers.c +42 -0
  106. data/ext/nokogiri/xml_attr.c +103 -0
  107. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  108. data/ext/nokogiri/xml_cdata.c +57 -0
  109. data/ext/nokogiri/xml_comment.c +62 -0
  110. data/ext/nokogiri/xml_document.c +689 -0
  111. data/ext/nokogiri/xml_document_fragment.c +44 -0
  112. data/ext/nokogiri/xml_dtd.c +210 -0
  113. data/ext/nokogiri/xml_element_content.c +128 -0
  114. data/ext/nokogiri/xml_element_decl.c +69 -0
  115. data/ext/nokogiri/xml_encoding_handler.c +104 -0
  116. data/ext/nokogiri/xml_entity_decl.c +112 -0
  117. data/ext/nokogiri/xml_entity_reference.c +50 -0
  118. data/ext/nokogiri/xml_namespace.c +186 -0
  119. data/ext/nokogiri/xml_node.c +2426 -0
  120. data/ext/nokogiri/xml_node_set.c +496 -0
  121. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  122. data/ext/nokogiri/xml_reader.c +794 -0
  123. data/ext/nokogiri/xml_relax_ng.c +164 -0
  124. data/ext/nokogiri/xml_sax_parser.c +316 -0
  125. data/ext/nokogiri/xml_sax_parser_context.c +283 -0
  126. data/ext/nokogiri/xml_sax_push_parser.c +166 -0
  127. data/ext/nokogiri/xml_schema.c +260 -0
  128. data/ext/nokogiri/xml_syntax_error.c +85 -0
  129. data/ext/nokogiri/xml_text.c +48 -0
  130. data/ext/nokogiri/xml_xpath_context.c +415 -0
  131. data/ext/nokogiri/xslt_stylesheet.c +363 -0
  132. data/gumbo-parser/CHANGES.md +63 -0
  133. data/gumbo-parser/Makefile +111 -0
  134. data/gumbo-parser/THANKS +27 -0
  135. data/gumbo-parser/src/Makefile +34 -0
  136. data/gumbo-parser/src/README.md +41 -0
  137. data/gumbo-parser/src/ascii.c +75 -0
  138. data/gumbo-parser/src/ascii.h +115 -0
  139. data/gumbo-parser/src/attribute.c +42 -0
  140. data/gumbo-parser/src/attribute.h +17 -0
  141. data/gumbo-parser/src/char_ref.c +22225 -0
  142. data/gumbo-parser/src/char_ref.h +29 -0
  143. data/gumbo-parser/src/char_ref.rl +2154 -0
  144. data/gumbo-parser/src/error.c +626 -0
  145. data/gumbo-parser/src/error.h +148 -0
  146. data/gumbo-parser/src/foreign_attrs.c +104 -0
  147. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  148. data/gumbo-parser/src/insertion_mode.h +33 -0
  149. data/gumbo-parser/src/macros.h +91 -0
  150. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  151. data/gumbo-parser/src/parser.c +4878 -0
  152. data/gumbo-parser/src/parser.h +41 -0
  153. data/gumbo-parser/src/replacement.h +33 -0
  154. data/gumbo-parser/src/string_buffer.c +103 -0
  155. data/gumbo-parser/src/string_buffer.h +68 -0
  156. data/gumbo-parser/src/string_piece.c +48 -0
  157. data/gumbo-parser/src/svg_attrs.c +174 -0
  158. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  159. data/gumbo-parser/src/svg_tags.c +137 -0
  160. data/gumbo-parser/src/svg_tags.gperf +55 -0
  161. data/gumbo-parser/src/tag.c +223 -0
  162. data/gumbo-parser/src/tag_lookup.c +382 -0
  163. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  164. data/gumbo-parser/src/tag_lookup.h +13 -0
  165. data/gumbo-parser/src/token_buffer.c +79 -0
  166. data/gumbo-parser/src/token_buffer.h +71 -0
  167. data/gumbo-parser/src/token_type.h +17 -0
  168. data/gumbo-parser/src/tokenizer.c +3463 -0
  169. data/gumbo-parser/src/tokenizer.h +112 -0
  170. data/gumbo-parser/src/tokenizer_states.h +339 -0
  171. data/gumbo-parser/src/utf8.c +245 -0
  172. data/gumbo-parser/src/utf8.h +164 -0
  173. data/gumbo-parser/src/util.c +66 -0
  174. data/gumbo-parser/src/util.h +34 -0
  175. data/gumbo-parser/src/vector.c +111 -0
  176. data/gumbo-parser/src/vector.h +45 -0
  177. data/lib/nokogiri/class_resolver.rb +67 -0
  178. data/lib/nokogiri/css/node.rb +54 -0
  179. data/lib/nokogiri/css/parser.rb +770 -0
  180. data/lib/nokogiri/css/parser.y +277 -0
  181. data/lib/nokogiri/css/parser_extras.rb +96 -0
  182. data/lib/nokogiri/css/syntax_error.rb +9 -0
  183. data/lib/nokogiri/css/tokenizer.rb +155 -0
  184. data/lib/nokogiri/css/tokenizer.rex +56 -0
  185. data/lib/nokogiri/css/xpath_visitor.rb +359 -0
  186. data/lib/nokogiri/css.rb +66 -0
  187. data/lib/nokogiri/decorators/slop.rb +44 -0
  188. data/lib/nokogiri/encoding_handler.rb +57 -0
  189. data/lib/nokogiri/extension.rb +32 -0
  190. data/lib/nokogiri/gumbo.rb +15 -0
  191. data/lib/nokogiri/html.rb +48 -0
  192. data/lib/nokogiri/html4/builder.rb +37 -0
  193. data/lib/nokogiri/html4/document.rb +214 -0
  194. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  195. data/lib/nokogiri/html4/element_description.rb +25 -0
  196. data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
  197. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  198. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  199. data/lib/nokogiri/html4/sax/parser.rb +63 -0
  200. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  201. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  202. data/lib/nokogiri/html4.rb +47 -0
  203. data/lib/nokogiri/html5/document.rb +168 -0
  204. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  205. data/lib/nokogiri/html5/node.rb +98 -0
  206. data/lib/nokogiri/html5.rb +389 -0
  207. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  208. data/lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar +0 -0
  209. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  210. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  211. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  212. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  213. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar +0 -0
  214. data/lib/nokogiri/jruby/xalan/serializer/2.7.3/serializer-2.7.3.jar +0 -0
  215. data/lib/nokogiri/jruby/xalan/xalan/2.7.3/xalan-2.7.3.jar +0 -0
  216. data/lib/nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar +0 -0
  217. data/lib/nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar +0 -0
  218. data/lib/nokogiri/syntax_error.rb +6 -0
  219. data/lib/nokogiri/version/constant.rb +6 -0
  220. data/lib/nokogiri/version/info.rb +223 -0
  221. data/lib/nokogiri/version.rb +4 -0
  222. data/lib/nokogiri/xml/attr.rb +66 -0
  223. data/lib/nokogiri/xml/attribute_decl.rb +20 -0
  224. data/lib/nokogiri/xml/builder.rb +487 -0
  225. data/lib/nokogiri/xml/cdata.rb +13 -0
  226. data/lib/nokogiri/xml/character_data.rb +9 -0
  227. data/lib/nokogiri/xml/document.rb +471 -0
  228. data/lib/nokogiri/xml/document_fragment.rb +205 -0
  229. data/lib/nokogiri/xml/dtd.rb +34 -0
  230. data/lib/nokogiri/xml/element_content.rb +38 -0
  231. data/lib/nokogiri/xml/element_decl.rb +15 -0
  232. data/lib/nokogiri/xml/entity_decl.rb +21 -0
  233. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  234. data/lib/nokogiri/xml/namespace.rb +58 -0
  235. data/lib/nokogiri/xml/node/save_options.rb +68 -0
  236. data/lib/nokogiri/xml/node.rb +1563 -0
  237. data/lib/nokogiri/xml/node_set.rb +447 -0
  238. data/lib/nokogiri/xml/notation.rb +19 -0
  239. data/lib/nokogiri/xml/parse_options.rb +213 -0
  240. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  241. data/lib/nokogiri/xml/pp/node.rb +57 -0
  242. data/lib/nokogiri/xml/pp.rb +4 -0
  243. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  244. data/lib/nokogiri/xml/reader.rb +105 -0
  245. data/lib/nokogiri/xml/relax_ng.rb +38 -0
  246. data/lib/nokogiri/xml/sax/document.rb +167 -0
  247. data/lib/nokogiri/xml/sax/parser.rb +125 -0
  248. data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
  249. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  250. data/lib/nokogiri/xml/sax.rb +6 -0
  251. data/lib/nokogiri/xml/schema.rb +73 -0
  252. data/lib/nokogiri/xml/searchable.rb +270 -0
  253. data/lib/nokogiri/xml/syntax_error.rb +72 -0
  254. data/lib/nokogiri/xml/text.rb +11 -0
  255. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  256. data/lib/nokogiri/xml/xpath.rb +21 -0
  257. data/lib/nokogiri/xml/xpath_context.rb +16 -0
  258. data/lib/nokogiri/xml.rb +76 -0
  259. data/lib/nokogiri/xslt/stylesheet.rb +27 -0
  260. data/lib/nokogiri/xslt.rb +65 -0
  261. data/lib/nokogiri.rb +120 -0
  262. data/lib/xsd/xmlparser/nokogiri.rb +106 -0
  263. metadata +391 -0
@@ -0,0 +1,4878 @@
1
+ /*
2
+ Copyright 2017-2018 Craig Barnes.
3
+ Copyright 2010 Google Inc.
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ https://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+ */
17
+
18
+ #include <assert.h>
19
+ #include <stdarg.h>
20
+ #include <stdint.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+
24
+ #include "ascii.h"
25
+ #include "attribute.h"
26
+ #include "error.h"
27
+ #include "nokogiri_gumbo.h"
28
+ #include "insertion_mode.h"
29
+ #include "macros.h"
30
+ #include "parser.h"
31
+ #include "replacement.h"
32
+ #include "tokenizer.h"
33
+ #include "tokenizer_states.h"
34
+ #include "token_buffer.h"
35
+ #include "utf8.h"
36
+ #include "util.h"
37
+ #include "vector.h"
38
+
39
+ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
40
+ #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
41
+ #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
42
+ #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
43
+
44
+ #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 }
45
+ #define kGumboEmptySourcePosition (const GumboSourcePosition) \
46
+ GUMBO_EMPTY_SOURCE_POSITION_INIT
47
+
48
+ const GumboOptions kGumboDefaultOptions = {
49
+ .tab_stop = 8,
50
+ .stop_on_first_error = false,
51
+ .max_attributes = 400,
52
+ .max_tree_depth = 400,
53
+ .max_errors = -1,
54
+ .fragment_context = NULL,
55
+ .fragment_namespace = GUMBO_NAMESPACE_HTML,
56
+ .fragment_encoding = NULL,
57
+ .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS,
58
+ .fragment_context_has_form_ancestor = false,
59
+ };
60
+
61
+ #define STRING(s) {.data = s, .length = sizeof(s) - 1}
62
+ #define TERMINATOR {.data = NULL, .length = 0}
63
+
64
+ // The doctype arrays have an explicit terminator because we want to pass them
65
+ // to a helper function, and passing them as a pointer discards sizeof
66
+ // information. The SVG arrays are used only by one-off functions, and so loops
67
+ // over them use sizeof directly instead of a terminator.
68
+
69
+ static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
70
+ STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
71
+ STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
72
+ STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
73
+ STRING("-//IETF//DTD HTML 2.0 Level 1//"),
74
+ STRING("-//IETF//DTD HTML 2.0 Level 2//"),
75
+ STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
76
+ STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
77
+ STRING("-//IETF//DTD HTML 2.0 Strict//"),
78
+ STRING("-//IETF//DTD HTML 2.0//"),
79
+ STRING("-//IETF//DTD HTML 2.1E//"),
80
+ STRING("-//IETF//DTD HTML 3.0//"),
81
+ STRING("-//IETF//DTD HTML 3.2 Final//"),
82
+ STRING("-//IETF//DTD HTML 3.2//"),
83
+ STRING("-//IETF//DTD HTML 3//"),
84
+ STRING("-//IETF//DTD HTML Level 0//"),
85
+ STRING("-//IETF//DTD HTML Level 1//"),
86
+ STRING("-//IETF//DTD HTML Level 2//"),
87
+ STRING("-//IETF//DTD HTML Level 3//"),
88
+ STRING("-//IETF//DTD HTML Strict Level 0//"),
89
+ STRING("-//IETF//DTD HTML Strict Level 1//"),
90
+ STRING("-//IETF//DTD HTML Strict Level 2//"),
91
+ STRING("-//IETF//DTD HTML Strict Level 3//"),
92
+ STRING("-//IETF//DTD HTML Strict//"),
93
+ STRING("-//IETF//DTD HTML//"),
94
+ STRING("-//Metrius//DTD Metrius Presentational//"),
95
+ STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
96
+ STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
97
+ STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
98
+ STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
99
+ STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
100
+ STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
101
+ STRING("-//Netscape Comm. Corp.//DTD HTML//"),
102
+ STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
103
+ STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
104
+ STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
105
+ STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
106
+ STRING(
107
+ "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
108
+ "extensions to HTML 4.0//"),
109
+ STRING(
110
+ "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
111
+ "extensions to HTML 4.0//"),
112
+ STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
113
+ STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
114
+ STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
115
+ STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
116
+ STRING("-//W3C//DTD HTML 3 1995-03-24//"),
117
+ STRING("-//W3C//DTD HTML 3.2 Draft//"),
118
+ STRING("-//W3C//DTD HTML 3.2 Final//"),
119
+ STRING("-//W3C//DTD HTML 3.2//"),
120
+ STRING("-//W3C//DTD HTML 3.2S Draft//"),
121
+ STRING("-//W3C//DTD HTML 4.0 Frameset//"),
122
+ STRING("-//W3C//DTD HTML 4.0 Transitional//"),
123
+ STRING("-//W3C//DTD HTML Experimental 19960712//"),
124
+ STRING("-//W3C//DTD HTML Experimental 970421//"),
125
+ STRING("-//W3C//DTD W3 HTML//"),
126
+ STRING("-//W3O//DTD W3 HTML 3.0//"),
127
+ STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
128
+ STRING("-//WebTechs//DTD Mozilla HTML//"),
129
+ TERMINATOR
130
+ };
131
+
132
+ static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
133
+ STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
134
+ STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
135
+ STRING("HTML"),
136
+ TERMINATOR
137
+ };
138
+
139
+ static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
140
+ STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
141
+ TERMINATOR
142
+ };
143
+
144
+ static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
145
+ STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
146
+ STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
147
+ TERMINATOR
148
+ };
149
+
150
+ static const GumboStringPiece kSystemIdDependentPublicIdPrefixes[] = {
151
+ STRING("-//W3C//DTD HTML 4.01 Frameset//"),
152
+ STRING("-//W3C//DTD HTML 4.01 Transitional//"),
153
+ TERMINATOR
154
+ };
155
+
156
+ // Indexed by GumboNamespaceEnum; keep in sync with that.
157
+ static const char* kLegalXmlns[] = {
158
+ "http://www.w3.org/1999/xhtml",
159
+ "http://www.w3.org/2000/svg",
160
+ "http://www.w3.org/1998/Math/MathML"
161
+ };
162
+
163
+ // The "scope marker" for the list of active formatting elements. We use a
164
+ // pointer to this as a generic marker element, since the particular element
165
+ // scope doesn't matter.
166
+ static const GumboNode kActiveFormattingScopeMarker;
167
+
168
+ // The tag_is and tag_in function use true & false to denote start & end tags,
169
+ // but for readability, we define constants for them here.
170
+ static const bool kStartTag = true;
171
+ static const bool kEndTag = false;
172
+
173
+ // Because GumboStringPieces are immutable, we can't insert a character directly
174
+ // into a text node. Instead, we accumulate all pending characters here and
175
+ // flush them out to a text node whenever a new element is inserted.
176
+ //
177
+ // https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character
178
+ typedef struct _TextNodeBufferState {
179
+ // The accumulated text to be inserted into the current text node.
180
+ GumboStringBuffer _buffer;
181
+
182
+ // A pointer to the original text represented by this text node. Note that
183
+ // because of foster parenting and other strange DOM manipulations, this may
184
+ // include other non-text HTML tags in it; it is defined as the span of
185
+ // original text from the first character in this text node to the last
186
+ // character in this text node.
187
+ const char* _start_original_text;
188
+
189
+ // The source position of the start of this text node.
190
+ GumboSourcePosition _start_position;
191
+
192
+ // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
193
+ GumboNodeType _type;
194
+ } TextNodeBufferState;
195
+
196
+ typedef struct GumboInternalParserState {
197
+ // https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode
198
+ GumboInsertionMode _insertion_mode;
199
+
200
+ // Used for run_generic_parsing_algorithm, which needs to switch back to the
201
+ // original insertion mode at its conclusion.
202
+ GumboInsertionMode _original_insertion_mode;
203
+
204
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
205
+ GumboVector /*GumboNode*/ _open_elements;
206
+
207
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
208
+ GumboVector /*GumboNode*/ _active_formatting_elements;
209
+
210
+ // The stack of template insertion modes.
211
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
212
+ GumboVector /*InsertionMode*/ _template_insertion_modes;
213
+
214
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers
215
+ GumboNode* _head_element;
216
+ GumboNode* _form_element;
217
+
218
+ // The element used as fragment context when parsing in fragment mode
219
+ GumboNode* _fragment_ctx;
220
+
221
+ // The flag for when the spec says "Reprocess the current token in..."
222
+ bool _reprocess_current_token;
223
+
224
+ // The flag for "acknowledge the token's self-closing flag".
225
+ bool _self_closing_flag_acknowledged;
226
+
227
+ // The "frameset-ok" flag from the spec.
228
+ bool _frameset_ok;
229
+
230
+ // The flag for "If the next token is a LINE FEED, ignore that token...".
231
+ bool _ignore_next_linefeed;
232
+
233
+ // The flag for "whenever a node would be inserted into the current node, it
234
+ // must instead be foster parented". This is used for misnested table
235
+ // content, which needs to be handled according to "in body" rules yet foster
236
+ // parented outside of the table.
237
+ // It would perhaps be more explicit to have this as a parameter to
238
+ // handle_in_body and insert_element, but given how special-purpose this is
239
+ // and the number of call-sites that would need to take the extra parameter,
240
+ // it's easier just to have a state flag.
241
+ bool _foster_parent_insertions;
242
+
243
+ // The accumulated text node buffer state.
244
+ TextNodeBufferState _text_node;
245
+
246
+ // The accumulated character tokens in tables for error purposes.
247
+ GumboCharacterTokenBuffer _table_character_tokens;
248
+
249
+ // The current token.
250
+ GumboToken* _current_token;
251
+
252
+ // The way that the spec is written, the </body> and </html> tags are *always*
253
+ // implicit, because encountering one of those tokens merely switches the
254
+ // insertion mode out of "in body". So we have individual state flags for
255
+ // those end tags that are then inspected by pop_current_node when the <body>
256
+ // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
257
+ // flag appropriately.
258
+ bool _closed_body_tag;
259
+ bool _closed_html_tag;
260
+ } GumboParserState;
261
+
262
+ static bool token_has_attribute(const GumboToken* token, const char* name) {
263
+ assert(token->type == GUMBO_TOKEN_START_TAG);
264
+ return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
265
+ }
266
+
267
+ // Checks if the value of the specified attribute is a case-insensitive match
268
+ // for the specified string.
269
+ static bool attribute_matches (
270
+ const GumboVector* attributes,
271
+ const char* name,
272
+ const char* value
273
+ ) {
274
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
275
+ return attr ? gumbo_ascii_strcasecmp(value, attr->value) == 0 : false;
276
+ }
277
+
278
+ // Checks if the value of the specified attribute is a case-sensitive match
279
+ // for the specified string.
280
+ static bool attribute_matches_case_sensitive (
281
+ const GumboVector* attributes,
282
+ const char* name,
283
+ const char* value
284
+ ) {
285
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
286
+ return attr ? strcmp(value, attr->value) == 0 : false;
287
+ }
288
+
289
+ // Checks if the specified attribute vectors are identical.
290
+ static bool all_attributes_match (
291
+ const GumboVector* attr1,
292
+ const GumboVector* attr2
293
+ ) {
294
+ unsigned int num_unmatched_attr2_elements = attr2->length;
295
+ for (unsigned int i = 0; i < attr1->length; ++i) {
296
+ const GumboAttribute* attr = attr1->data[i];
297
+ if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
298
+ --num_unmatched_attr2_elements;
299
+ } else {
300
+ return false;
301
+ }
302
+ }
303
+ return num_unmatched_attr2_elements == 0;
304
+ }
305
+
306
+ static void set_frameset_not_ok(GumboParser* parser) {
307
+ gumbo_debug("Setting frameset_ok to false.\n");
308
+ parser->_parser_state->_frameset_ok = false;
309
+ }
310
+
311
+ static GumboNode* create_node(GumboNodeType type) {
312
+ GumboNode* node = gumbo_alloc(sizeof(GumboNode));
313
+ node->parent = NULL;
314
+ node->index_within_parent = -1;
315
+ node->type = type;
316
+ node->parse_flags = GUMBO_INSERTION_NORMAL;
317
+ return node;
318
+ }
319
+
320
+ static GumboNode* new_document_node() {
321
+ GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT);
322
+ document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
323
+ gumbo_vector_init(1, &document_node->v.document.children);
324
+
325
+ // Must be initialized explicitly, as there's no guarantee that we'll see a
326
+ // doc type token.
327
+ GumboDocument* document = &document_node->v.document;
328
+ document->has_doctype = false;
329
+ document->name = NULL;
330
+ document->public_identifier = NULL;
331
+ document->system_identifier = NULL;
332
+ document->doc_type_quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
333
+ return document_node;
334
+ }
335
+
336
+ static void output_init(GumboParser* parser) {
337
+ GumboOutput* output = gumbo_alloc(sizeof(GumboOutput));
338
+ output->root = NULL;
339
+ output->document = new_document_node();
340
+ output->document_error = false;
341
+ output->status = GUMBO_STATUS_OK;
342
+ parser->_output = output;
343
+ gumbo_init_errors(parser);
344
+ }
345
+
346
+ static void parser_state_init(GumboParser* parser) {
347
+ GumboParserState* parser_state = gumbo_alloc(sizeof(GumboParserState));
348
+ parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
349
+ parser_state->_reprocess_current_token = false;
350
+ parser_state->_frameset_ok = true;
351
+ parser_state->_ignore_next_linefeed = false;
352
+ parser_state->_foster_parent_insertions = false;
353
+ parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
354
+ gumbo_string_buffer_init(&parser_state->_text_node._buffer);
355
+ gumbo_character_token_buffer_init(&parser_state->_table_character_tokens);
356
+ gumbo_vector_init(10, &parser_state->_open_elements);
357
+ gumbo_vector_init(5, &parser_state->_active_formatting_elements);
358
+ gumbo_vector_init(5, &parser_state->_template_insertion_modes);
359
+ parser_state->_head_element = NULL;
360
+ parser_state->_form_element = NULL;
361
+ parser_state->_fragment_ctx = NULL;
362
+ parser_state->_current_token = NULL;
363
+ parser_state->_closed_body_tag = false;
364
+ parser_state->_closed_html_tag = false;
365
+ parser->_parser_state = parser_state;
366
+ }
367
+
368
+ typedef void (*TreeTraversalCallback)(GumboNode* node);
369
+
370
+ static void tree_traverse(GumboNode* node, TreeTraversalCallback callback) {
371
+ GumboNode* current_node = node;
372
+ unsigned int offset = 0;
373
+
374
+ tailcall:
375
+ switch (current_node->type) {
376
+ case GUMBO_NODE_DOCUMENT:
377
+ case GUMBO_NODE_TEMPLATE:
378
+ case GUMBO_NODE_ELEMENT: {
379
+ GumboVector* children = (current_node->type == GUMBO_NODE_DOCUMENT)
380
+ ? &current_node->v.document.children
381
+ : &current_node->v.element.children
382
+ ;
383
+ if (offset >= children->length) {
384
+ assert(offset == children->length);
385
+ break;
386
+ } else {
387
+ current_node = children->data[offset];
388
+ offset = 0;
389
+ goto tailcall;
390
+ }
391
+ }
392
+ case GUMBO_NODE_TEXT:
393
+ case GUMBO_NODE_CDATA:
394
+ case GUMBO_NODE_COMMENT:
395
+ case GUMBO_NODE_WHITESPACE:
396
+ assert(offset == 0);
397
+ break;
398
+ }
399
+
400
+ offset = current_node->index_within_parent + 1;
401
+ GumboNode* next_node = current_node->parent;
402
+ callback(current_node);
403
+ if (current_node == node) {
404
+ return;
405
+ }
406
+ current_node = next_node;
407
+ goto tailcall;
408
+ }
409
+
410
+ static void destroy_node_callback(GumboNode* node) {
411
+ switch (node->type) {
412
+ case GUMBO_NODE_DOCUMENT: {
413
+ GumboDocument* doc = &node->v.document;
414
+ gumbo_free((void*) doc->children.data);
415
+ gumbo_free((void*) doc->name);
416
+ gumbo_free((void*) doc->public_identifier);
417
+ gumbo_free((void*) doc->system_identifier);
418
+ } break;
419
+ case GUMBO_NODE_TEMPLATE:
420
+ case GUMBO_NODE_ELEMENT:
421
+ for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) {
422
+ gumbo_destroy_attribute(node->v.element.attributes.data[i]);
423
+ }
424
+ gumbo_free(node->v.element.attributes.data);
425
+ gumbo_free(node->v.element.children.data);
426
+ if (node->v.element.tag == GUMBO_TAG_UNKNOWN)
427
+ gumbo_free((void *)node->v.element.name);
428
+ break;
429
+ case GUMBO_NODE_TEXT:
430
+ case GUMBO_NODE_CDATA:
431
+ case GUMBO_NODE_COMMENT:
432
+ case GUMBO_NODE_WHITESPACE:
433
+ gumbo_free((void*) node->v.text.text);
434
+ break;
435
+ }
436
+ gumbo_free(node);
437
+ }
438
+
439
+ static void destroy_node(GumboNode* node) {
440
+ tree_traverse(node, &destroy_node_callback);
441
+ }
442
+
443
+ static void destroy_fragment_ctx_element(GumboNode* ctx);
444
+
445
+ static void parser_state_destroy(GumboParser* parser) {
446
+ GumboParserState* state = parser->_parser_state;
447
+ if (state->_fragment_ctx) {
448
+ destroy_fragment_ctx_element(state->_fragment_ctx);
449
+ }
450
+ gumbo_vector_destroy(&state->_active_formatting_elements);
451
+ gumbo_vector_destroy(&state->_open_elements);
452
+ gumbo_vector_destroy(&state->_template_insertion_modes);
453
+ gumbo_string_buffer_destroy(&state->_text_node._buffer);
454
+ gumbo_character_token_buffer_destroy(&state->_table_character_tokens);
455
+ gumbo_free(state);
456
+ }
457
+
458
+ static GumboNode* get_document_node(const GumboParser* parser) {
459
+ return parser->_output->document;
460
+ }
461
+
462
+ static bool is_fragment_parser(const GumboParser* parser) {
463
+ return !!parser->_parser_state->_fragment_ctx;
464
+ }
465
+
466
+ // Returns the node at the bottom of the stack of open elements, or NULL if no
467
+ // elements have been added yet.
468
+ static GumboNode* get_current_node(const GumboParser* parser) {
469
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
470
+ if (open_elements->length == 0) {
471
+ assert(!parser->_output->root);
472
+ return NULL;
473
+ }
474
+ assert(open_elements->length > 0);
475
+ assert(open_elements->data != NULL);
476
+ return open_elements->data[open_elements->length - 1];
477
+ }
478
+
479
+ static GumboNode* get_adjusted_current_node(const GumboParser* parser) {
480
+ const GumboParserState* state = parser->_parser_state;
481
+ if (state->_open_elements.length == 1 && state->_fragment_ctx) {
482
+ return state->_fragment_ctx;
483
+ }
484
+ return get_current_node(parser);
485
+ }
486
+
487
+ // Returns true if the given needle is in the given array of literal
488
+ // GumboStringPieces. If exact_match is true, this requires that they match
489
+ // exactly; otherwise, this performs a prefix match to check if any of the
490
+ // elements in haystack start with needle. This always performs a
491
+ // case-insensitive match.
492
+ static bool is_in_static_list (
493
+ const GumboStringPiece* needle,
494
+ const GumboStringPiece* haystack,
495
+ bool exact_match
496
+ ) {
497
+ if (needle->length == 0)
498
+ return false;
499
+ if (exact_match) {
500
+ for (size_t i = 0; haystack[i].data; ++i) {
501
+ if (gumbo_string_equals_ignore_case(needle, &haystack[i]))
502
+ return true;
503
+ }
504
+ } else {
505
+ for (size_t i = 0; haystack[i].data; ++i) {
506
+ if (gumbo_string_prefix_ignore_case(&haystack[i], needle))
507
+ return true;
508
+ }
509
+ }
510
+ return false;
511
+ }
512
+
513
+ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
514
+ parser->_parser_state->_insertion_mode = mode;
515
+ }
516
+
517
+ static void push_template_insertion_mode (
518
+ GumboParser* parser,
519
+ GumboInsertionMode mode
520
+ ) {
521
+ gumbo_vector_add (
522
+ (void*) mode,
523
+ &parser->_parser_state->_template_insertion_modes
524
+ );
525
+ }
526
+
527
+ static void pop_template_insertion_mode(GumboParser* parser) {
528
+ gumbo_vector_pop(&parser->_parser_state->_template_insertion_modes);
529
+ }
530
+
531
+ // Returns the current template insertion mode. If the stack of template
532
+ // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
533
+ static GumboInsertionMode get_current_template_insertion_mode (
534
+ const GumboParser* parser
535
+ ) {
536
+ GumboVector* modes = &parser->_parser_state->_template_insertion_modes;
537
+ if (modes->length == 0) {
538
+ return GUMBO_INSERTION_MODE_INITIAL;
539
+ }
540
+ return (GumboInsertionMode)(intptr_t) modes->data[(modes->length - 1)];
541
+ }
542
+
543
+ // Returns true if the specified token is either a start or end tag
544
+ // (specified by is_start) with one of the tag types in the TagSet.
545
+ static bool tag_in (
546
+ const GumboToken* token,
547
+ bool is_start,
548
+ const TagSet* tags
549
+ ) {
550
+ GumboTag token_tag;
551
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
552
+ token_tag = token->v.start_tag.tag;
553
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
554
+ token_tag = token->v.end_tag.tag;
555
+ } else {
556
+ return false;
557
+ }
558
+ return (*tags)[(unsigned) token_tag] != 0u;
559
+ }
560
+
561
+ // Like tag_in, but for the single-tag case.
562
+ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
563
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
564
+ return token->v.start_tag.tag == tag;
565
+ }
566
+ if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
567
+ return token->v.end_tag.tag == tag;
568
+ }
569
+ return false;
570
+ }
571
+
572
+ static inline bool tagset_includes (
573
+ const TagSet* tagset,
574
+ GumboNamespaceEnum ns,
575
+ GumboTag tag
576
+ ) {
577
+ return ((*tagset)[(unsigned) tag] & (1u << (unsigned) ns)) != 0u;
578
+ }
579
+
580
+ // Like tag_in, but checks for the tag of a node, rather than a token.
581
+ static bool node_tag_in_set(const GumboNode* node, const TagSet* tags) {
582
+ assert(node != NULL);
583
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
584
+ return false;
585
+ }
586
+ return tagset_includes (
587
+ tags,
588
+ node->v.element.tag_namespace,
589
+ node->v.element.tag
590
+ );
591
+ }
592
+
593
+ static bool node_qualified_tagname_is (
594
+ const GumboNode* node,
595
+ GumboNamespaceEnum ns,
596
+ GumboTag tag,
597
+ const char *name
598
+ ) {
599
+ assert(node);
600
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
601
+ assert(node->v.element.name);
602
+ assert(tag != GUMBO_TAG_UNKNOWN || name);
603
+ GumboTag element_tag = node->v.element.tag;
604
+ const char *element_name = node->v.element.name;
605
+ assert(element_tag != GUMBO_TAG_UNKNOWN || element_name);
606
+ if (node->v.element.tag_namespace != ns || element_tag != tag)
607
+ return false;
608
+ if (tag != GUMBO_TAG_UNKNOWN)
609
+ return true;
610
+ return !gumbo_ascii_strcasecmp(element_name, name);
611
+ }
612
+
613
+ static bool node_html_tagname_is (
614
+ const GumboNode* node,
615
+ GumboTag tag,
616
+ const char *name
617
+ ) {
618
+ return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name);
619
+ }
620
+
621
+ static bool node_tagname_is (
622
+ const GumboNode* node,
623
+ GumboTag tag,
624
+ const char *name
625
+ ) {
626
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
627
+ return node_qualified_tagname_is(node, node->v.element.tag_namespace, tag, name);
628
+ }
629
+
630
+ // Like node_tag_in, but for the single-tag case.
631
+ static bool node_qualified_tag_is (
632
+ const GumboNode* node,
633
+ GumboNamespaceEnum ns,
634
+ GumboTag tag
635
+ ) {
636
+ assert(node);
637
+ assert(tag != GUMBO_TAG_UNKNOWN);
638
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
639
+ return
640
+ node->v.element.tag == tag
641
+ && node->v.element.tag_namespace == ns;
642
+ }
643
+
644
+ // Like node_tag_in, but for the single-tag case in the HTML namespace
645
+ static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
646
+ return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
647
+ }
648
+
649
+ // https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
650
+ // This is a helper function that returns the appropriate insertion mode instead
651
+ // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
652
+ // indicate that there is no appropriate insertion mode, and the loop should
653
+ // continue.
654
+ static GumboInsertionMode get_appropriate_insertion_mode (
655
+ const GumboParser* parser,
656
+ int index
657
+ ) {
658
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
659
+ const GumboNode* node = open_elements->data[index];
660
+ const bool is_last = index == 0;
661
+
662
+ if (is_last && is_fragment_parser(parser)) {
663
+ node = parser->_parser_state->_fragment_ctx;
664
+ }
665
+
666
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667
+ if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) {
668
+ return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
669
+ }
670
+
671
+ switch (node->v.element.tag) {
672
+ case GUMBO_TAG_SELECT: {
673
+ if (is_last) {
674
+ return GUMBO_INSERTION_MODE_IN_SELECT;
675
+ }
676
+ for (int i = index; i > 0; --i) {
677
+ const GumboNode* ancestor = open_elements->data[i];
678
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
679
+ return GUMBO_INSERTION_MODE_IN_SELECT;
680
+ }
681
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
682
+ return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
683
+ }
684
+ }
685
+ return GUMBO_INSERTION_MODE_IN_SELECT;
686
+ }
687
+ case GUMBO_TAG_TD:
688
+ case GUMBO_TAG_TH:
689
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
690
+ break;
691
+ case GUMBO_TAG_TR:
692
+ return GUMBO_INSERTION_MODE_IN_ROW;
693
+ case GUMBO_TAG_TBODY:
694
+ case GUMBO_TAG_THEAD:
695
+ case GUMBO_TAG_TFOOT:
696
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
697
+ case GUMBO_TAG_CAPTION:
698
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
699
+ case GUMBO_TAG_COLGROUP:
700
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
701
+ case GUMBO_TAG_TABLE:
702
+ return GUMBO_INSERTION_MODE_IN_TABLE;
703
+ case GUMBO_TAG_TEMPLATE:
704
+ return get_current_template_insertion_mode(parser);
705
+ case GUMBO_TAG_HEAD:
706
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
707
+ break;
708
+ case GUMBO_TAG_BODY:
709
+ return GUMBO_INSERTION_MODE_IN_BODY;
710
+ case GUMBO_TAG_FRAMESET:
711
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
712
+ case GUMBO_TAG_HTML:
713
+ return parser->_parser_state->_head_element
714
+ ? GUMBO_INSERTION_MODE_AFTER_HEAD
715
+ : GUMBO_INSERTION_MODE_BEFORE_HEAD;
716
+ default:
717
+ break;
718
+ }
719
+ return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
720
+ }
721
+
722
+ // This performs the actual "reset the insertion mode" loop.
723
+ static void reset_insertion_mode_appropriately(GumboParser* parser) {
724
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
725
+ for (int i = open_elements->length; --i >= 0;) {
726
+ GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
727
+ if (mode != GUMBO_INSERTION_MODE_INITIAL) {
728
+ set_insertion_mode(parser, mode);
729
+ return;
730
+ }
731
+ }
732
+ // Should never get here, because is_last will be set on the last iteration
733
+ // and will force GUMBO_INSERTION_MODE_IN_BODY.
734
+ assert(0);
735
+ }
736
+
737
+ static void parser_add_parse_error (
738
+ GumboParser* parser,
739
+ const GumboToken* token
740
+ ) {
741
+ gumbo_debug("Adding parse error.\n");
742
+ GumboError* error = gumbo_add_error(parser);
743
+ if (!error) {
744
+ return;
745
+ }
746
+ error->type = GUMBO_ERR_PARSER;
747
+ error->position = token->position;
748
+ error->original_text = token->original_text;
749
+ GumboParserError* extra_data = &error->v.parser;
750
+ extra_data->input_type = token->type;
751
+ extra_data->input_tag = GUMBO_TAG_UNKNOWN;
752
+ if (token->type == GUMBO_TOKEN_START_TAG) {
753
+ extra_data->input_tag = token->v.start_tag.tag;
754
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
755
+ extra_data->input_tag = token->v.end_tag.tag;
756
+ }
757
+ const GumboParserState* state = parser->_parser_state;
758
+ extra_data->parser_state = state->_insertion_mode;
759
+ gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack);
760
+ for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
761
+ const GumboNode* node = state->_open_elements.data[i];
762
+ assert (
763
+ node->type == GUMBO_NODE_ELEMENT
764
+ || node->type == GUMBO_NODE_TEMPLATE
765
+ );
766
+ gumbo_vector_add (
767
+ (void*) node->v.element.tag,
768
+ &extra_data->tag_stack
769
+ );
770
+ }
771
+ }
772
+
773
+ // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point
774
+ static bool is_mathml_integration_point(const GumboNode* node) {
775
+ static const TagSet mathml_integration_point_tags = {
776
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
777
+ TAG_MATHML(MS), TAG_MATHML(MTEXT)
778
+ };
779
+ return node_tag_in_set(node, &mathml_integration_point_tags);
780
+ }
781
+
782
+ // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point
783
+ static bool is_html_integration_point(const GumboNode* node) {
784
+ static const TagSet html_integration_point_svg_tags = {
785
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
786
+ };
787
+ if (node_tag_in_set(node, &html_integration_point_svg_tags)) {
788
+ return true;
789
+ }
790
+
791
+ const bool is_mathml_annotation_xml_element = node_qualified_tag_is (
792
+ node,
793
+ GUMBO_NAMESPACE_MATHML,
794
+ GUMBO_TAG_ANNOTATION_XML
795
+ );
796
+ const GumboVector* attributes = &node->v.element.attributes;
797
+ if (
798
+ is_mathml_annotation_xml_element
799
+ && (
800
+ attribute_matches(attributes, "encoding", "text/html")
801
+ || attribute_matches(attributes, "encoding", "application/xhtml+xml")
802
+ )
803
+ ) {
804
+ return true;
805
+ }
806
+
807
+ return false;
808
+ }
809
+
810
+ // This represents a place to insert a node, consisting of a target parent and a
811
+ // child index within that parent. If the node should be inserted at the end of
812
+ // the parent's child, index will be -1.
813
+ typedef struct {
814
+ GumboNode* target;
815
+ int index;
816
+ } InsertionLocation;
817
+
818
+ static InsertionLocation get_appropriate_insertion_location (
819
+ const GumboParser* parser,
820
+ GumboNode* override_target
821
+ ) {
822
+ InsertionLocation retval = {override_target, -1};
823
+ if (retval.target == NULL) {
824
+ // No override target; default to the current node, but special-case the
825
+ // root node since get_current_node() assumes the stack of open elements is
826
+ // non-empty.
827
+ retval.target = (parser->_output->root != NULL)
828
+ ? get_current_node(parser)
829
+ : get_document_node(parser)
830
+ ;
831
+ }
832
+ if (
833
+ !parser->_parser_state->_foster_parent_insertions
834
+ || !node_tag_in_set(retval.target, &(const TagSet) {
835
+ TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
836
+ })
837
+ ) {
838
+ return retval;
839
+ }
840
+
841
+ // Foster-parenting case.
842
+ int last_template_index = -1;
843
+ int last_table_index = -1;
844
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
845
+ for (unsigned int i = 0; i < open_elements->length; ++i) {
846
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
847
+ last_template_index = i;
848
+ }
849
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
850
+ last_table_index = i;
851
+ }
852
+ }
853
+ if (
854
+ last_template_index != -1
855
+ && (last_table_index == -1 || last_template_index > last_table_index)
856
+ ) {
857
+ retval.target = open_elements->data[last_template_index];
858
+ return retval;
859
+ }
860
+ if (last_table_index == -1) {
861
+ retval.target = open_elements->data[0];
862
+ return retval;
863
+ }
864
+ const GumboNode* last_table = open_elements->data[last_table_index];
865
+ if (last_table->parent != NULL) {
866
+ retval.target = last_table->parent;
867
+ retval.index = last_table->index_within_parent;
868
+ return retval;
869
+ }
870
+
871
+ retval.target = open_elements->data[last_table_index - 1];
872
+ return retval;
873
+ }
874
+
875
+ // Appends a node to the end of its parent, setting the "parent" and
876
+ // "index_within_parent" fields appropriately.
877
+ static void append_node(GumboNode* parent, GumboNode* node) {
878
+ assert(node->parent == NULL);
879
+ assert(node->index_within_parent == (unsigned int) -1);
880
+ GumboVector* children;
881
+ if (
882
+ parent->type == GUMBO_NODE_ELEMENT
883
+ || parent->type == GUMBO_NODE_TEMPLATE
884
+ ) {
885
+ children = &parent->v.element.children;
886
+ } else {
887
+ assert(parent->type == GUMBO_NODE_DOCUMENT);
888
+ children = &parent->v.document.children;
889
+ }
890
+ node->parent = parent;
891
+ node->index_within_parent = children->length;
892
+ gumbo_vector_add((void*) node, children);
893
+ assert(node->index_within_parent < children->length);
894
+ }
895
+
896
+ // Inserts a node at the specified InsertionLocation, updating the
897
+ // "parent" and "index_within_parent" fields of it and all its siblings.
898
+ // If the index of the location is -1, this calls append_node.
899
+ static void insert_node(GumboNode* node, InsertionLocation location) {
900
+ assert(node->parent == NULL);
901
+ assert(node->index_within_parent == (unsigned int) -1);
902
+ GumboNode* parent = location.target;
903
+ int index = location.index;
904
+ if (index != -1) {
905
+ GumboVector* children = NULL;
906
+ if (
907
+ parent->type == GUMBO_NODE_ELEMENT
908
+ || parent->type == GUMBO_NODE_TEMPLATE
909
+ ) {
910
+ children = &parent->v.element.children;
911
+ } else if (parent->type == GUMBO_NODE_DOCUMENT) {
912
+ children = &parent->v.document.children;
913
+ assert(children->length == 0);
914
+ } else {
915
+ assert(0);
916
+ }
917
+
918
+ assert(index >= 0);
919
+ assert((unsigned int) index < children->length);
920
+ node->parent = parent;
921
+ node->index_within_parent = index;
922
+ gumbo_vector_insert_at((void*) node, index, children);
923
+ assert(node->index_within_parent < children->length);
924
+ for (unsigned int i = index + 1; i < children->length; ++i) {
925
+ GumboNode* sibling = children->data[i];
926
+ sibling->index_within_parent = i;
927
+ assert(sibling->index_within_parent < children->length);
928
+ }
929
+ } else {
930
+ append_node(parent, node);
931
+ }
932
+ }
933
+
934
+ static void maybe_flush_text_node_buffer(GumboParser* parser) {
935
+ GumboParserState* state = parser->_parser_state;
936
+ TextNodeBufferState* buffer_state = &state->_text_node;
937
+ if (buffer_state->_buffer.length == 0) {
938
+ return;
939
+ }
940
+
941
+ assert (
942
+ buffer_state->_type == GUMBO_NODE_WHITESPACE
943
+ || buffer_state->_type == GUMBO_NODE_TEXT
944
+ || buffer_state->_type == GUMBO_NODE_CDATA
945
+ );
946
+ GumboNode* text_node = create_node(buffer_state->_type);
947
+ GumboText* text_node_data = &text_node->v.text;
948
+ text_node_data->text = gumbo_string_buffer_to_string(&buffer_state->_buffer);
949
+ text_node_data->original_text.data = buffer_state->_start_original_text;
950
+ text_node_data->original_text.length =
951
+ state->_current_token->original_text.data -
952
+ buffer_state->_start_original_text;
953
+ text_node_data->start_pos = buffer_state->_start_position;
954
+
955
+ gumbo_debug (
956
+ "Flushing text node buffer of %.*s.\n",
957
+ (int) buffer_state->_buffer.length,
958
+ buffer_state->_buffer.data
959
+ );
960
+
961
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
962
+ if (location.target->type == GUMBO_NODE_DOCUMENT) {
963
+ // The DOM does not allow Document nodes to have Text children, so per the
964
+ // spec, they are dropped on the floor.
965
+ destroy_node(text_node);
966
+ } else {
967
+ insert_node(text_node, location);
968
+ }
969
+
970
+ gumbo_string_buffer_clear(&buffer_state->_buffer);
971
+ buffer_state->_type = GUMBO_NODE_WHITESPACE;
972
+ assert(buffer_state->_buffer.length == 0);
973
+ }
974
+
975
+ static void record_end_of_element (
976
+ const GumboToken* current_token,
977
+ GumboElement* element
978
+ ) {
979
+ element->end_pos = current_token->position;
980
+ element->original_end_tag =
981
+ (current_token->type == GUMBO_TOKEN_END_TAG)
982
+ ? current_token->original_text
983
+ : kGumboEmptyString;
984
+ }
985
+
986
+ static GumboNode* pop_current_node(GumboParser* parser) {
987
+ GumboParserState* state = parser->_parser_state;
988
+ maybe_flush_text_node_buffer(parser);
989
+ if (state->_open_elements.length > 0) {
990
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
991
+ gumbo_debug (
992
+ "Popping %s node.\n",
993
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)
994
+ );
995
+ }
996
+ GumboNode* current_node = gumbo_vector_pop(&state->_open_elements);
997
+ if (!current_node) {
998
+ assert(state->_open_elements.length == 0);
999
+ return NULL;
1000
+ }
1001
+ assert (
1002
+ current_node->type == GUMBO_NODE_ELEMENT
1003
+ || current_node->type == GUMBO_NODE_TEMPLATE
1004
+ );
1005
+ bool is_closed_body_or_html_tag =
1006
+ (
1007
+ node_html_tag_is(current_node, GUMBO_TAG_BODY)
1008
+ && state->_closed_body_tag
1009
+ ) || (
1010
+ node_html_tag_is(current_node, GUMBO_TAG_HTML)
1011
+ && state->_closed_html_tag
1012
+ )
1013
+ ;
1014
+ if (
1015
+ (
1016
+ state->_current_token->type != GUMBO_TOKEN_END_TAG
1017
+ || !node_qualified_tagname_is (
1018
+ current_node,
1019
+ GUMBO_NAMESPACE_HTML,
1020
+ state->_current_token->v.end_tag.tag,
1021
+ state->_current_token->v.end_tag.name
1022
+ )
1023
+ )
1024
+ && !is_closed_body_or_html_tag
1025
+ ) {
1026
+ current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1027
+ }
1028
+ if (!is_closed_body_or_html_tag) {
1029
+ record_end_of_element(state->_current_token, &current_node->v.element);
1030
+ }
1031
+ return current_node;
1032
+ }
1033
+
1034
+ static void append_comment_node (
1035
+ GumboParser* parser,
1036
+ GumboNode* node,
1037
+ const GumboToken* token
1038
+ ) {
1039
+ maybe_flush_text_node_buffer(parser);
1040
+ GumboNode* comment = create_node(GUMBO_NODE_COMMENT);
1041
+ comment->type = GUMBO_NODE_COMMENT;
1042
+ comment->parse_flags = GUMBO_INSERTION_NORMAL;
1043
+ comment->v.text.text = token->v.text;
1044
+ comment->v.text.original_text = token->original_text;
1045
+ comment->v.text.start_pos = token->position;
1046
+ append_node(node, comment);
1047
+ }
1048
+
1049
+ // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context
1050
+ static void clear_stack_to_table_row_context(GumboParser* parser) {
1051
+ static const TagSet tags = {TAG(HTML), TAG(TR), TAG(TEMPLATE)};
1052
+ while (!node_tag_in_set(get_current_node(parser), &tags)) {
1053
+ pop_current_node(parser);
1054
+ }
1055
+ }
1056
+
1057
+ // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context
1058
+ static void clear_stack_to_table_context(GumboParser* parser) {
1059
+ static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)};
1060
+ while (!node_tag_in_set(get_current_node(parser), &tags)) {
1061
+ pop_current_node(parser);
1062
+ }
1063
+ }
1064
+
1065
+ // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context
1066
+ static void clear_stack_to_table_body_context(GumboParser* parser) {
1067
+ static const TagSet tags = {
1068
+ TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE)
1069
+ };
1070
+ while (!node_tag_in_set(get_current_node(parser), &tags)) {
1071
+ pop_current_node(parser);
1072
+ }
1073
+ }
1074
+
1075
+ // Creates a parser-inserted element in the HTML namespace and returns it.
1076
+ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
1077
+ // XXX: This will fail for creating fragments with an element with tag
1078
+ // GUMBO_TAG_UNKNOWN
1079
+ assert(tag != GUMBO_TAG_UNKNOWN);
1080
+ GumboNode* node = create_node(GUMBO_NODE_ELEMENT);
1081
+ GumboElement* element = &node->v.element;
1082
+ gumbo_vector_init(1, &element->children);
1083
+ gumbo_vector_init(0, &element->attributes);
1084
+ element->tag = tag;
1085
+ element->name = gumbo_normalized_tagname(tag);
1086
+ element->tag_namespace = GUMBO_NAMESPACE_HTML;
1087
+ element->original_tag = kGumboEmptyString;
1088
+ element->original_end_tag = kGumboEmptyString;
1089
+ element->start_pos = (parser->_parser_state->_current_token)
1090
+ ? parser->_parser_state->_current_token->position
1091
+ : kGumboEmptySourcePosition
1092
+ ;
1093
+ element->end_pos = kGumboEmptySourcePosition;
1094
+ return node;
1095
+ }
1096
+
1097
+ // Constructs an element from the given start tag token.
1098
+ static GumboNode* create_element_from_token (
1099
+ GumboToken* token,
1100
+ GumboNamespaceEnum tag_namespace
1101
+ ) {
1102
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1103
+ GumboTokenStartTag* start_tag = &token->v.start_tag;
1104
+
1105
+ GumboNodeType type =
1106
+ (
1107
+ tag_namespace == GUMBO_NAMESPACE_HTML
1108
+ && start_tag->tag == GUMBO_TAG_TEMPLATE
1109
+ )
1110
+ ? GUMBO_NODE_TEMPLATE
1111
+ : GUMBO_NODE_ELEMENT
1112
+ ;
1113
+
1114
+ GumboNode* node = create_node(type);
1115
+ GumboElement* element = &node->v.element;
1116
+ gumbo_vector_init(1, &element->children);
1117
+ element->attributes = start_tag->attributes;
1118
+ element->tag = start_tag->tag;
1119
+ element->name = start_tag->name ? start_tag->name : gumbo_normalized_tagname(start_tag->tag);
1120
+ element->tag_namespace = tag_namespace;
1121
+
1122
+ assert(token->original_text.length >= 2);
1123
+ assert(token->original_text.data[0] == '<');
1124
+ assert(token->original_text.data[token->original_text.length - 1] == '>');
1125
+ element->original_tag = token->original_text;
1126
+ element->start_pos = token->position;
1127
+ element->original_end_tag = kGumboEmptyString;
1128
+ element->end_pos = kGumboEmptySourcePosition;
1129
+
1130
+ // The element takes ownership of the attributes and name from the token, so
1131
+ // any allocated-memory fields should be nulled out.
1132
+ start_tag->attributes = kGumboEmptyVector;
1133
+ start_tag->name = NULL;
1134
+ return node;
1135
+ }
1136
+
1137
+ // https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element
1138
+ static void insert_element (
1139
+ GumboParser* parser,
1140
+ GumboNode* node,
1141
+ bool is_reconstructing_formatting_elements
1142
+ ) {
1143
+ GumboParserState* state = parser->_parser_state;
1144
+ // NOTE(jdtang): The text node buffer must always be flushed before inserting
1145
+ // a node, otherwise we're handling nodes in a different order than the spec
1146
+ // mandated. However, one clause of the spec (character tokens in the body)
1147
+ // requires that we reconstruct the active formatting elements *before* adding
1148
+ // the character, and reconstructing the active formatting elements may itself
1149
+ // result in the insertion of new elements (which should be pushed onto the
1150
+ // stack of open elements before the buffer is flushed). We solve this (for
1151
+ // the time being, the spec has been rewritten for <template> and the new
1152
+ // version may be simpler here) with a boolean flag to this method.
1153
+ if (!is_reconstructing_formatting_elements) {
1154
+ maybe_flush_text_node_buffer(parser);
1155
+ }
1156
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1157
+ insert_node(node, location);
1158
+ gumbo_vector_add((void*) node, &state->_open_elements);
1159
+ }
1160
+
1161
+ // Convenience method that combines create_element_from_token and
1162
+ // insert_element, inserting the generated element directly into the current
1163
+ // node. Returns the node inserted.
1164
+ static GumboNode* insert_element_from_token (
1165
+ GumboParser* parser,
1166
+ GumboToken* token
1167
+ ) {
1168
+ GumboNode* element = create_element_from_token(token, GUMBO_NAMESPACE_HTML);
1169
+ insert_element(parser, element, false);
1170
+ gumbo_debug (
1171
+ "Inserting <%s> element (@%p) from token.\n",
1172
+ gumbo_normalized_tagname(element->v.element.tag),
1173
+ (void*)element
1174
+ );
1175
+ return element;
1176
+ }
1177
+
1178
+ // Convenience method that combines create_element and insert_element, inserting
1179
+ // a parser-generated element of a specific tag type. Returns the node
1180
+ // inserted.
1181
+ static GumboNode* insert_element_of_tag_type (
1182
+ GumboParser* parser,
1183
+ GumboTag tag,
1184
+ GumboParseFlags reason
1185
+ ) {
1186
+ GumboNode* element = create_element(parser, tag);
1187
+ element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1188
+ insert_element(parser, element, false);
1189
+ gumbo_debug (
1190
+ "Inserting %s element (@%p) from tag type.\n",
1191
+ gumbo_normalized_tagname(tag),
1192
+ (void*)element
1193
+ );
1194
+ return element;
1195
+ }
1196
+
1197
+ // Convenience method for creating foreign namespaced element. Returns the node
1198
+ // inserted.
1199
+ static GumboNode* insert_foreign_element (
1200
+ GumboParser* parser,
1201
+ GumboToken* token,
1202
+ GumboNamespaceEnum tag_namespace
1203
+ ) {
1204
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1205
+ GumboNode* element = create_element_from_token(token, tag_namespace);
1206
+ insert_element(parser, element, false);
1207
+ if (
1208
+ token_has_attribute(token, "xmlns")
1209
+ && !attribute_matches_case_sensitive (
1210
+ &token->v.start_tag.attributes,
1211
+ "xmlns",
1212
+ kLegalXmlns[tag_namespace]
1213
+ )
1214
+ ) {
1215
+ // TODO(jdtang): Since there're multiple possible error codes here, we
1216
+ // eventually need reason codes to differentiate them.
1217
+ parser_add_parse_error(parser, token);
1218
+ }
1219
+ if (
1220
+ token_has_attribute(token, "xmlns:xlink")
1221
+ && !attribute_matches_case_sensitive (
1222
+ &token->v.start_tag.attributes,
1223
+ "xmlns:xlink",
1224
+ "http://www.w3.org/1999/xlink"
1225
+ )
1226
+ ) {
1227
+ parser_add_parse_error(parser, token);
1228
+ }
1229
+ return element;
1230
+ }
1231
+
1232
+ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1233
+ assert (
1234
+ token->type == GUMBO_TOKEN_WHITESPACE
1235
+ || token->type == GUMBO_TOKEN_CHARACTER
1236
+ || token->type == GUMBO_TOKEN_NULL
1237
+ || token->type == GUMBO_TOKEN_CDATA
1238
+ );
1239
+ TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1240
+ if (buffer_state->_buffer.length == 0) {
1241
+ // Initialize position fields.
1242
+ buffer_state->_start_original_text = token->original_text.data;
1243
+ buffer_state->_start_position = token->position;
1244
+ }
1245
+ gumbo_string_buffer_append_codepoint (
1246
+ token->v.character,
1247
+ &buffer_state->_buffer
1248
+ );
1249
+ if (token->type == GUMBO_TOKEN_CHARACTER) {
1250
+ buffer_state->_type = GUMBO_NODE_TEXT;
1251
+ } else if (token->type == GUMBO_TOKEN_CDATA) {
1252
+ buffer_state->_type = GUMBO_NODE_CDATA;
1253
+ }
1254
+ gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1255
+ }
1256
+
1257
+ // https://html.spec.whatwg.org/multipage/parsing.html#generic-rcdata-element-parsing-algorithm
1258
+ static void run_generic_parsing_algorithm (
1259
+ GumboParser* parser,
1260
+ GumboToken* token,
1261
+ GumboTokenizerEnum lexer_state
1262
+ ) {
1263
+ insert_element_from_token(parser, token);
1264
+ gumbo_tokenizer_set_state(parser, lexer_state);
1265
+ GumboParserState* parser_state = parser->_parser_state;
1266
+ parser_state->_original_insertion_mode = parser_state->_insertion_mode;
1267
+ parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1268
+ }
1269
+
1270
+ static void acknowledge_self_closing_tag(GumboParser* parser) {
1271
+ parser->_parser_state->_self_closing_flag_acknowledged = true;
1272
+ }
1273
+
1274
+ // Returns true if there's an anchor tag in the list of active formatting
1275
+ // elements, and fills in its index if so.
1276
+ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1277
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1278
+ for (int i = elements->length; --i >= 0;) {
1279
+ GumboNode* node = elements->data[i];
1280
+ if (node == &kActiveFormattingScopeMarker) {
1281
+ return false;
1282
+ }
1283
+ if (node_html_tag_is(node, GUMBO_TAG_A)) {
1284
+ *anchor_index = i;
1285
+ return true;
1286
+ }
1287
+ }
1288
+ return false;
1289
+ }
1290
+
1291
+ // Counts the number of open formatting elements in the list of active
1292
+ // formatting elements (after the last active scope marker) that have a specific
1293
+ // tag. If this is > 0, then earliest_matching_index will be filled in with the
1294
+ // index of the first such element.
1295
+ static int count_formatting_elements_of_tag (
1296
+ GumboParser* parser,
1297
+ const GumboNode* desired_node,
1298
+ int* earliest_matching_index
1299
+ ) {
1300
+ const GumboElement* desired_element = &desired_node->v.element;
1301
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1302
+ int num_identical_elements = 0;
1303
+ for (int i = elements->length; --i >= 0;) {
1304
+ GumboNode* node = elements->data[i];
1305
+ if (node == &kActiveFormattingScopeMarker) {
1306
+ break;
1307
+ }
1308
+ assert(node->type == GUMBO_NODE_ELEMENT);
1309
+ if (
1310
+ node_qualified_tagname_is (
1311
+ node,
1312
+ desired_element->tag_namespace,
1313
+ desired_element->tag,
1314
+ desired_element->name
1315
+ )
1316
+ && all_attributes_match(&node->v.element.attributes, &desired_element->attributes)
1317
+ ) {
1318
+ num_identical_elements++;
1319
+ *earliest_matching_index = i;
1320
+ }
1321
+ }
1322
+ return num_identical_elements;
1323
+ }
1324
+
1325
+ // https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements
1326
+ static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1327
+ assert (
1328
+ node == &kActiveFormattingScopeMarker
1329
+ || node->type == GUMBO_NODE_ELEMENT
1330
+ );
1331
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1332
+ if (node == &kActiveFormattingScopeMarker) {
1333
+ gumbo_debug("Adding a scope marker.\n");
1334
+ } else {
1335
+ gumbo_debug("Adding a formatting element.\n");
1336
+ }
1337
+
1338
+ // Hunt for identical elements.
1339
+ int earliest_identical_element = elements->length;
1340
+ int num_identical_elements = count_formatting_elements_of_tag (
1341
+ parser,
1342
+ node,
1343
+ &earliest_identical_element
1344
+ );
1345
+
1346
+ // Noah's Ark clause: if there're at least 3, remove the earliest.
1347
+ if (num_identical_elements >= 3) {
1348
+ gumbo_debug (
1349
+ "Noah's ark clause: removing element at %d.\n",
1350
+ earliest_identical_element
1351
+ );
1352
+ gumbo_vector_remove_at(earliest_identical_element, elements);
1353
+ }
1354
+
1355
+ gumbo_vector_add((void*) node, elements);
1356
+ }
1357
+
1358
+ static bool is_open_element(const GumboParser* parser, const GumboNode* node) {
1359
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1360
+ for (unsigned int i = 0; i < open_elements->length; ++i) {
1361
+ if (open_elements->data[i] == node) {
1362
+ return true;
1363
+ }
1364
+ }
1365
+ return false;
1366
+ }
1367
+
1368
+ // Clones attributes, tags, etc. of a node, but does not copy the content. The
1369
+ // clone shares no structure with the original node: all owned strings and
1370
+ // values are fresh copies.
1371
+ static GumboNode* clone_node (
1372
+ GumboNode* node,
1373
+ GumboParseFlags reason
1374
+ ) {
1375
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1376
+ GumboNode* new_node = gumbo_alloc(sizeof(GumboNode));
1377
+ *new_node = *node;
1378
+ new_node->parent = NULL;
1379
+ new_node->index_within_parent = -1;
1380
+ // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1381
+ // have a separate end tag.
1382
+ new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1383
+ new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1384
+ GumboElement* element = &new_node->v.element;
1385
+ gumbo_vector_init(1, &element->children);
1386
+
1387
+ const GumboVector* old_attributes = &node->v.element.attributes;
1388
+ gumbo_vector_init(old_attributes->length, &element->attributes);
1389
+ for (unsigned int i = 0; i < old_attributes->length; ++i) {
1390
+ const GumboAttribute* old_attr = old_attributes->data[i];
1391
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
1392
+ *attr = *old_attr;
1393
+ attr->name = gumbo_strdup(old_attr->name);
1394
+ attr->value = gumbo_strdup(old_attr->value);
1395
+ gumbo_vector_add(attr, &element->attributes);
1396
+ }
1397
+ return new_node;
1398
+ }
1399
+
1400
+ // "Reconstruct active formatting elements" part of the spec.
1401
+ // This implementation is based on the html5lib translation from the
1402
+ // mess of GOTOs in the spec to reasonably structured programming.
1403
+ // https://github.com/html5lib/html5lib-python/blob/master/html5lib/treebuilders/base.py
1404
+ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1405
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1406
+ // Step 1
1407
+ if (elements->length == 0) {
1408
+ return;
1409
+ }
1410
+
1411
+ // Step 2 & 3
1412
+ unsigned int i = elements->length - 1;
1413
+ GumboNode* element = elements->data[i];
1414
+ if (
1415
+ element == &kActiveFormattingScopeMarker
1416
+ || is_open_element(parser, element)
1417
+ ) {
1418
+ return;
1419
+ }
1420
+
1421
+ // Step 6
1422
+ do {
1423
+ if (i == 0) {
1424
+ // Step 4
1425
+ i = -1; // Incremented to 0 below.
1426
+ break;
1427
+ }
1428
+ // Step 5
1429
+ element = elements->data[--i];
1430
+ } while (
1431
+ element != &kActiveFormattingScopeMarker
1432
+ && !is_open_element(parser, element)
1433
+ );
1434
+
1435
+ ++i;
1436
+ gumbo_debug (
1437
+ "Reconstructing elements from %u on %s parent.\n",
1438
+ i,
1439
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag)
1440
+ );
1441
+ for (; i < elements->length; ++i) {
1442
+ // Step 7 & 8.
1443
+ assert(elements->length > 0);
1444
+ assert(i < elements->length);
1445
+ element = elements->data[i];
1446
+ assert(element != &kActiveFormattingScopeMarker);
1447
+ GumboNode* clone = clone_node (
1448
+ element,
1449
+ GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT
1450
+ );
1451
+ // Step 9.
1452
+ InsertionLocation location =
1453
+ get_appropriate_insertion_location(parser, NULL);
1454
+ insert_node(clone, location);
1455
+ gumbo_vector_add (
1456
+ (void*) clone,
1457
+ &parser->_parser_state->_open_elements
1458
+ );
1459
+
1460
+ // Step 10.
1461
+ elements->data[i] = clone;
1462
+ gumbo_debug (
1463
+ "Reconstructed %s element at %u.\n",
1464
+ gumbo_normalized_tagname(clone->v.element.tag),
1465
+ i
1466
+ );
1467
+ }
1468
+ }
1469
+
1470
+ static void clear_active_formatting_elements(GumboParser* parser) {
1471
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1472
+ int num_elements_cleared = 0;
1473
+ const GumboNode* node;
1474
+ do {
1475
+ node = gumbo_vector_pop(elements);
1476
+ ++num_elements_cleared;
1477
+ } while (node && node != &kActiveFormattingScopeMarker);
1478
+ gumbo_debug (
1479
+ "Cleared %d elements from active formatting list.\n",
1480
+ num_elements_cleared
1481
+ );
1482
+ }
1483
+
1484
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-initial-insertion-mode
1485
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
1486
+ const char *name,
1487
+ const char *pubid_str,
1488
+ const char *sysid_str
1489
+ ) {
1490
+
1491
+ GumboStringPiece pubid = {
1492
+ .data = pubid_str,
1493
+ .length = pubid_str? strlen(pubid_str) : 0,
1494
+ };
1495
+ GumboStringPiece sysid = {
1496
+ .data = sysid_str,
1497
+ .length = sysid_str? strlen(sysid_str) : 0,
1498
+ };
1499
+ bool has_system_identifier = !!sysid_str;
1500
+ if (
1501
+ name == NULL
1502
+ || strcmp(name, "html")
1503
+ || is_in_static_list(&pubid, kQuirksModePublicIdPrefixes, false)
1504
+ || is_in_static_list(&pubid, kQuirksModePublicIdExactMatches, true)
1505
+ || is_in_static_list(&sysid, kQuirksModeSystemIdExactMatches, true)
1506
+ || (
1507
+ !has_system_identifier
1508
+ && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false)
1509
+ )
1510
+ ) {
1511
+ return GUMBO_DOCTYPE_QUIRKS;
1512
+ }
1513
+
1514
+ if (
1515
+ is_in_static_list(&pubid, kLimitedQuirksPublicIdPrefixes, false)
1516
+ || (
1517
+ has_system_identifier
1518
+ && is_in_static_list(&pubid, kSystemIdDependentPublicIdPrefixes, false)
1519
+ )
1520
+ ) {
1521
+ return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1522
+ }
1523
+
1524
+ return GUMBO_DOCTYPE_NO_QUIRKS;
1525
+ }
1526
+
1527
+ static GumboQuirksModeEnum compute_quirks_mode(const GumboTokenDocType* doctype) {
1528
+ if (doctype->force_quirks)
1529
+ return GUMBO_DOCTYPE_QUIRKS;
1530
+ return gumbo_compute_quirks_mode (
1531
+ doctype->name,
1532
+ doctype->has_public_identifier? doctype->public_identifier : NULL,
1533
+ doctype->has_system_identifier? doctype->system_identifier : NULL
1534
+ );
1535
+ }
1536
+
1537
+ // The following functions are all defined by the "has an element in __ scope"
1538
+ // sections of the HTML5 spec:
1539
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
1540
+ // The basic idea behind them is that they check for an element of the given
1541
+ // qualified name, contained within a scope formed by a set of other qualified
1542
+ // names. For example, "has an element in list scope" looks for an element of
1543
+ // the given qualified name within the nearest enclosing <ol> or <ul>, along
1544
+ // with a bunch of generic element types that serve to "firewall" their content
1545
+ // from the rest of the document. Note that because of the way the spec is
1546
+ // written,
1547
+ // all elements are expected to be in the HTML namespace
1548
+ static bool has_an_element_in_specific_scope (
1549
+ const GumboParser* parser,
1550
+ int expected_size,
1551
+ const GumboTag* expected,
1552
+ bool negate,
1553
+ const TagSet* tags
1554
+ ) {
1555
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1556
+ for (int i = open_elements->length; --i >= 0;) {
1557
+ const GumboNode* node = open_elements->data[i];
1558
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
1559
+ continue;
1560
+ }
1561
+
1562
+ GumboTag node_tag = node->v.element.tag;
1563
+ GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1564
+ for (int j = 0; j < expected_size; ++j) {
1565
+ if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML) {
1566
+ return true;
1567
+ }
1568
+ }
1569
+
1570
+ bool found = tagset_includes(tags, node_ns, node_tag);
1571
+ if (negate != found) {
1572
+ return false;
1573
+ }
1574
+ }
1575
+ return false;
1576
+ }
1577
+
1578
+ // Checks for the presence of an open element of the specified tag type.
1579
+ static bool has_open_element(const GumboParser* parser, GumboTag tag) {
1580
+ static const TagSet tags = {TAG(HTML)};
1581
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1582
+ }
1583
+
1584
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope
// TagSet initializer entries for the default scope boundary: HTML elements
// first, then MathML, then SVG.
#define DEFAULT_SCOPE_TAGS \
  TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), \
  TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), \
  TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), \
  TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), \
  TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
1604
+
1605
+ static const TagSet heading_tags = {
1606
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6)
1607
+ };
1608
+
1609
+ static const TagSet td_th_tags = {
1610
+ TAG(TD), TAG(TH)
1611
+ };
1612
+
1613
+ static const TagSet dd_dt_tags = {
1614
+ TAG(DD), TAG(DT)
1615
+ };
1616
+
1617
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-scope
1618
+ static bool has_an_element_in_scope(const GumboParser* parser, GumboTag tag) {
1619
+ static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1620
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1621
+ }
1622
+
1623
+ // Like "has an element in scope", but for the specific case of looking for a
1624
+ // unique target node, not for any node with a given tag name. This duplicates
1625
+ // much of the algorithm from has_an_element_in_specific_scope because the
1626
+ // predicate is different when checking for an exact node, and it's easier &
1627
+ // faster just to duplicate the code for this one case than to try and
1628
+ // parameterize it.
1629
+ static bool has_node_in_scope(const GumboParser* parser, const GumboNode* node) {
1630
+ static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1631
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
1632
+ for (int i = open_elements->length; --i >= 0;) {
1633
+ const GumboNode* current = open_elements->data[i];
1634
+ const GumboNodeType type = current->type;
1635
+ if (current == node) {
1636
+ return true;
1637
+ }
1638
+ if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
1639
+ continue;
1640
+ }
1641
+ if (node_tag_in_set(current, &tags)) {
1642
+ return false;
1643
+ }
1644
+ }
1645
+ assert(false);
1646
+ return false;
1647
+ }
1648
+
1649
+ // Like has_an_element_in_scope, but restricts the expected qualified name to a
1650
+ // range of possible qualified names instead of just a single one.
1651
+ static bool has_an_element_in_scope_with_tagname (
1652
+ const GumboParser* parser,
1653
+ int len,
1654
+ const GumboTag expected[]
1655
+ ) {
1656
+ static const TagSet tags = {DEFAULT_SCOPE_TAGS};
1657
+ return has_an_element_in_specific_scope(parser, len, expected, false, &tags);
1658
+ }
1659
+
1660
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-list-item-scope
1661
+ static bool has_an_element_in_list_scope(const GumboParser* parser, GumboTag tag) {
1662
+ static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(OL), TAG(UL)};
1663
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1664
+ }
1665
+
1666
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-button-scope
1667
+ static bool has_an_element_in_button_scope(const GumboParser* parser, GumboTag tag) {
1668
+ static const TagSet tags = {DEFAULT_SCOPE_TAGS, TAG(BUTTON)};
1669
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1670
+ }
1671
+
1672
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-table-scope
1673
+ static bool has_an_element_in_table_scope(const GumboParser* parser, GumboTag tag) {
1674
+ static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)};
1675
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, &tags);
1676
+ }
1677
+
1678
+ // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope
1679
+ static bool has_an_element_in_select_scope(const GumboParser* parser, GumboTag tag) {
1680
+ static const TagSet tags = {TAG(OPTGROUP), TAG(OPTION)};
1681
+ return has_an_element_in_specific_scope(parser, 1, &tag, true, &tags);
1682
+ }
1683
+
1684
+ // https://html.spec.whatwg.org/multipage/parsing.html#generate-implied-end-tags
1685
+ // "exception" is the "element to exclude from the process" listed in the spec.
1686
+ // Pass GUMBO_TAG_LAST to not exclude any of them.
1687
+ static void generate_implied_end_tags (
1688
+ GumboParser* parser,
1689
+ GumboTag exception,
1690
+ const char* exception_name
1691
+ ) {
1692
+ static const TagSet tags = {
1693
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION),
1694
+ TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC)
1695
+ };
1696
+ while (
1697
+ node_tag_in_set(get_current_node(parser), &tags)
1698
+ && !node_html_tagname_is(get_current_node(parser), exception, exception_name)
1699
+ ) {
1700
+ pop_current_node(parser);
1701
+ }
1702
+ }
1703
+
1704
+ // This is the "generate all implied end tags thoroughly" clause of the spec.
1705
+ // https://html.spec.whatwg.org/multipage/parsing.html#closing-elements-that-have-implied-end-tags
1706
+ static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1707
+ static const TagSet tags = {
1708
+ TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
1709
+ TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
1710
+ TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
1711
+ };
1712
+ while (node_tag_in_set(get_current_node(parser), &tags)) {
1713
+ pop_current_node(parser);
1714
+ }
1715
+ }
1716
+
1717
+ // This factors out the clauses in the "in body" insertion mode checking "if
1718
+ // there is a node in the stack of open elements that is not" one of a list of
1719
+ // elements in which case it's a parse error.
1720
+ // This is used in "an end-of-file token", "an end tag whose tag name is
1721
+ // 'body'", and "an end tag whose tag name is 'html'".
1722
+ static bool stack_contains_nonclosable_element (
1723
+ GumboParser* parser
1724
+ ) {
1725
+ static const TagSet tags = {
1726
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P), TAG(RB),
1727
+ TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
1728
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML),
1729
+ };
1730
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1731
+ for (size_t i = 0; i < open_elements->length; ++i) {
1732
+ if (!node_tag_in_set(open_elements->data[i], &tags))
1733
+ return true;
1734
+ }
1735
+ return false;
1736
+ }
1737
+
1738
+ // This factors out the clauses relating to "act as if an end tag token with tag
1739
+ // name "table" had been seen. Returns true if there's a table element in table
1740
+ // scope which was successfully closed, false if not and the token should be
1741
+ // ignored. Does not add parse errors; callers should handle that.
1742
+ static bool close_table(GumboParser* parser) {
1743
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1744
+ return false;
1745
+ }
1746
+
1747
+ GumboNode* node = pop_current_node(parser);
1748
+ while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1749
+ node = pop_current_node(parser);
1750
+ }
1751
+ reset_insertion_mode_appropriately(parser);
1752
+ return true;
1753
+ }
1754
+
1755
+ // This factors out the clauses relating to "act as if an end tag token with tag
1756
+ // name `cell_tag` had been seen".
1757
+ static void close_table_cell (
1758
+ GumboParser* parser,
1759
+ const GumboToken* token,
1760
+ GumboTag cell_tag
1761
+ ) {
1762
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
1763
+ const GumboNode* node = get_current_node(parser);
1764
+ if (!node_html_tag_is(node, cell_tag))
1765
+ parser_add_parse_error(parser, token);
1766
+ do {
1767
+ node = pop_current_node(parser);
1768
+ } while (!node_html_tag_is(node, cell_tag));
1769
+
1770
+ clear_active_formatting_elements(parser);
1771
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1772
+ }
1773
+
1774
+ // https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
1775
+ // This holds the logic to determine whether we should close a <td> or a <th>.
1776
+ static void close_current_cell(GumboParser* parser, const GumboToken* token) {
1777
+ GumboTag cell_tag;
1778
+ if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1779
+ assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1780
+ cell_tag = GUMBO_TAG_TD;
1781
+ } else {
1782
+ assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1783
+ cell_tag = GUMBO_TAG_TH;
1784
+ }
1785
+ close_table_cell(parser, token, cell_tag);
1786
+ }
1787
+
1788
+ // This factors out the "act as if an end tag of tag name 'select' had been
1789
+ // seen" clause of the spec, since it's referenced in several places. It pops
1790
+ // all nodes from the stack until the current <select> has been closed, then
1791
+ // resets the insertion mode appropriately.
1792
+ static void close_current_select(GumboParser* parser) {
1793
+ GumboNode* node = pop_current_node(parser);
1794
+ while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1795
+ node = pop_current_node(parser);
1796
+ }
1797
+ reset_insertion_mode_appropriately(parser);
1798
+ }
1799
+
1800
+ // The list of nodes in the "special" category:
1801
+ // https://html.spec.whatwg.org/multipage/parsing.html#special
1802
+ static bool is_special_node(const GumboNode* node) {
1803
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1804
+ return node_tag_in_set(node, &(const TagSet) {
1805
+ TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1806
+ TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1807
+ TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1808
+ TAG(COLGROUP), TAG(DD), TAG(DETAILS), TAG(DIR),
1809
+ TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1810
+ TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1811
+ TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1812
+ TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1813
+ TAG(IMG), TAG(INPUT), TAG(LI), TAG(LINK), TAG(LISTING),
1814
+ TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1815
+ TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1816
+ TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1817
+ TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1818
+ TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1819
+ TAG(THEAD), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1820
+
1821
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1822
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1823
+
1824
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1825
+
1826
+ // This TagSet needs to include the "title" element in both the
1827
+ // HTML and SVG namespaces. Using both TAG(TITLE) and TAG_SVG(TITLE)
1828
+ // won't work, due to the simplistic way in which the TAG macros are
1829
+ // implemented, so we do it like this instead:
1830
+ [GUMBO_TAG_TITLE] =
1831
+ (1 << GUMBO_NAMESPACE_HTML) |
1832
+ (1 << GUMBO_NAMESPACE_SVG)
1833
+ }
1834
+ );
1835
+ }
1836
+
1837
+ // Implicitly closes currently open elements until it reaches an element with
1838
+ // the
1839
+ // specified qualified name. If the elements closed are in the set handled by
1840
+ // generate_implied_end_tags, this is normal operation and this function returns
1841
+ // true. Otherwise, a parse error is recorded and this function returns false.
1842
+ static void implicitly_close_tags (
1843
+ GumboParser* parser,
1844
+ GumboToken* token,
1845
+ GumboNamespaceEnum target_ns,
1846
+ GumboTag target
1847
+ ) {
1848
+ assert(target != GUMBO_TAG_UNKNOWN);
1849
+ generate_implied_end_tags(parser, target, NULL);
1850
+ if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1851
+ parser_add_parse_error(parser, token);
1852
+ while (
1853
+ !node_qualified_tag_is(get_current_node(parser), target_ns, target)
1854
+ ) {
1855
+ pop_current_node(parser);
1856
+ }
1857
+ }
1858
+ assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1859
+ pop_current_node(parser);
1860
+ }
1861
+
1862
+ // If the stack of open elements has a <p> tag in button scope, this acts as if
1863
+ // a </p> tag was encountered, implicitly closing tags. Returns false if a
1864
+ // parse error occurs. This is a convenience function because this particular
1865
+ // clause appears several times in the spec.
1866
+ static void maybe_implicitly_close_p_tag (
1867
+ GumboParser* parser,
1868
+ GumboToken* token
1869
+ ) {
1870
+ if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1871
+ implicitly_close_tags (
1872
+ parser,
1873
+ token,
1874
+ GUMBO_NAMESPACE_HTML,
1875
+ GUMBO_TAG_P
1876
+ );
1877
+ }
1878
+ }
1879
+
1880
+ // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1881
+ // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1882
+ static void maybe_implicitly_close_list_tag (
1883
+ GumboParser* parser,
1884
+ GumboToken* token,
1885
+ bool is_li
1886
+ ) {
1887
+ GumboParserState* state = parser->_parser_state;
1888
+ set_frameset_not_ok(parser);
1889
+ for (int i = state->_open_elements.length; --i >= 0;) {
1890
+ const GumboNode* node = state->_open_elements.data[i];
1891
+ bool is_list_tag = is_li
1892
+ ? node_html_tag_is(node, GUMBO_TAG_LI)
1893
+ : node_tag_in_set(node, &dd_dt_tags)
1894
+ ;
1895
+ if (is_list_tag) {
1896
+ implicitly_close_tags (
1897
+ parser,
1898
+ token,
1899
+ node->v.element.tag_namespace,
1900
+ node->v.element.tag
1901
+ );
1902
+ return;
1903
+ }
1904
+
1905
+ if (
1906
+ is_special_node(node)
1907
+ && !node_tag_in_set(node, &(const TagSet){TAG(ADDRESS), TAG(DIV), TAG(P)})
1908
+ ) {
1909
+ return;
1910
+ }
1911
+ }
1912
+ }
1913
+
1914
+ static void merge_attributes (
1915
+ GumboToken* token,
1916
+ GumboNode* node
1917
+ ) {
1918
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1919
+ assert(node->type == GUMBO_NODE_ELEMENT);
1920
+ const GumboVector* token_attr = &token->v.start_tag.attributes;
1921
+ GumboVector* node_attr = &node->v.element.attributes;
1922
+
1923
+ for (unsigned int i = 0; i < token_attr->length; ++i) {
1924
+ GumboAttribute* attr = token_attr->data[i];
1925
+ if (!gumbo_get_attribute(node_attr, attr->name)) {
1926
+ // Ownership of the attribute is transferred by this gumbo_vector_add,
1927
+ // so it has to be nulled out of the original token so it doesn't get
1928
+ // double-deleted.
1929
+ gumbo_vector_add(attr, node_attr);
1930
+ token_attr->data[i] = NULL;
1931
+ }
1932
+ }
1933
+ // When attributes are merged, it means the token has been ignored and merged
1934
+ // with another token, so we need to free its memory. The attributes that are
1935
+ // transferred need to be nulled-out in the vector above so that they aren't
1936
+ // double-deleted.
1937
+ gumbo_token_destroy(token);
1938
+
1939
+ #ifndef NDEBUG
1940
+ // Mark this sentinel so the assertion in the main loop knows it's been
1941
+ // destroyed.
1942
+ token->v.start_tag.attributes = kGumboEmptyVector;
1943
+ #endif
1944
+ }
1945
+
1946
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1947
+ const StringReplacement *replacement = gumbo_get_svg_tag_replacement (
1948
+ tag->data,
1949
+ tag->length
1950
+ );
1951
+ return replacement ? replacement->to : NULL;
1952
+ }
1953
+
1954
+ // https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
1955
+ // This destructively modifies any matching attributes on the token and sets the
1956
+ // namespace appropriately.
1957
+ static void adjust_foreign_attributes(GumboToken* token) {
1958
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1959
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1960
+ for (unsigned int i = 0, n = attributes->length; i < n; ++i) {
1961
+ GumboAttribute* attr = attributes->data[i];
1962
+ const ForeignAttrReplacement* entry = gumbo_get_foreign_attr_replacement (
1963
+ attr->name,
1964
+ strlen(attr->name)
1965
+ );
1966
+ if (!entry) {
1967
+ continue;
1968
+ }
1969
+ gumbo_free((void*) attr->name);
1970
+ attr->attr_namespace = entry->attr_namespace;
1971
+ attr->name = gumbo_strdup(entry->local_name);
1972
+ }
1973
+ }
1974
+
1975
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
1976
+ // This adjusts svg tags.
1977
+ static void adjust_svg_tag(GumboToken* token) {
1978
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1979
+ if (token->v.start_tag.tag == GUMBO_TAG_FOREIGNOBJECT) {
1980
+ assert(token->v.start_tag.name == NULL);
1981
+ token->v.start_tag.name = "foreignObject";
1982
+ } else if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
1983
+ assert(token->v.start_tag.name);
1984
+ const StringReplacement *replacement = gumbo_get_svg_tag_replacement(
1985
+ token->v.start_tag.name,
1986
+ strlen(token->v.start_tag.name)
1987
+ );
1988
+ if (replacement) {
1989
+ // This cast is safe because we allocated this memory and we'll free it.
1990
+ strcpy((char *)token->v.start_tag.name, replacement->to);
1991
+ }
1992
+ }
1993
+ }
1994
+
1995
+ // https://html.spec.whatwg.org/multipage/parsing.html#adjust-svg-attributes
1996
+ // This destructively modifies any matching attributes on the token.
1997
+ static void adjust_svg_attributes(GumboToken* token) {
1998
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1999
+ const GumboVector* attributes = &token->v.start_tag.attributes;
2000
+ for (unsigned int i = 0, n = attributes->length; i < n; i++) {
2001
+ GumboAttribute* attr = (GumboAttribute*) attributes->data[i];
2002
+ const StringReplacement* replacement = gumbo_get_svg_attr_replacement (
2003
+ attr->name,
2004
+ attr->original_name.length
2005
+ );
2006
+ if (!replacement) {
2007
+ continue;
2008
+ }
2009
+ gumbo_free((void*) attr->name);
2010
+ attr->name = gumbo_strdup(replacement->to);
2011
+ }
2012
+ }
2013
+
2014
+ // https://html.spec.whatwg.org/multipage/parsing.html#adjust-mathml-attributes
2015
+ // Note that this may destructively modify the token with the new attribute
2016
+ // value.
2017
+ static void adjust_mathml_attributes(GumboToken* token) {
2018
+ assert(token->type == GUMBO_TOKEN_START_TAG);
2019
+ GumboAttribute* attr = gumbo_get_attribute (
2020
+ &token->v.start_tag.attributes,
2021
+ "definitionurl"
2022
+ );
2023
+ if (!attr) {
2024
+ return;
2025
+ }
2026
+ gumbo_free((void*) attr->name);
2027
+ attr->name = gumbo_strdup("definitionURL");
2028
+ }
2029
+
2030
+ static void maybe_add_doctype_error (
2031
+ GumboParser* parser,
2032
+ const GumboToken* token
2033
+ ) {
2034
+ const GumboTokenDocType* doctype = &token->v.doc_type;
2035
+ if (
2036
+ strcmp(doctype->name, "html")
2037
+ || doctype->has_public_identifier
2038
+ || (doctype->has_system_identifier
2039
+ && strcmp(doctype->system_identifier, "about:legacy-compat"))
2040
+ ) {
2041
+ parser_add_parse_error(parser, token);
2042
+ }
2043
+ }
2044
+
2045
+ static void remove_from_parent(GumboNode* node) {
2046
+ if (!node->parent) {
2047
+ // The node may not have a parent if, for example, it is a newly-cloned copy
2048
+ // of an active formatting element. DOM manipulations continue with the
2049
+ // orphaned fragment of the DOM tree until it's appended/foster-parented to
2050
+ // the common ancestor at the end of the adoption agency algorithm.
2051
+ return;
2052
+ }
2053
+ assert(node->parent->type == GUMBO_NODE_ELEMENT);
2054
+ GumboVector* children = &node->parent->v.element.children;
2055
+ int index = gumbo_vector_index_of(children, node);
2056
+ assert(index != -1);
2057
+
2058
+ gumbo_vector_remove_at(index, children);
2059
+ node->parent = NULL;
2060
+ node->index_within_parent = -1;
2061
+ for (unsigned int i = index; i < children->length; ++i) {
2062
+ GumboNode* child = children->data[i];
2063
+ child->index_within_parent = i;
2064
+ }
2065
+ }
2066
+
2067
+ // This is here to clean up memory when the spec says "Ignore current token."
2068
+ static void ignore_token(GumboParser* parser) {
2069
+ GumboToken* token = parser->_parser_state->_current_token;
2070
+ // Ownership of the token's internal buffers are normally transferred to the
2071
+ // element, but if no element is emitted (as happens in non-verbatim-mode
2072
+ // when a token is ignored), we need to free it here to prevent a memory
2073
+ // leak.
2074
+ gumbo_token_destroy(token);
2075
+ #ifndef NDEBUG
2076
+ if (token->type == GUMBO_TOKEN_START_TAG) {
2077
+ // Mark this sentinel so the assertion in the main loop knows it's been
2078
+ // destroyed.
2079
+ token->v.start_tag.attributes = kGumboEmptyVector;
2080
+ token->v.start_tag.name = NULL;
2081
+ }
2082
+ #endif
2083
+ }
2084
+
2085
+ // The token is usually an end tag; however, the adoption agency algorithm may
2086
+ // invoke this for an 'a' or 'nobr' start tag.
2087
+ // Returns false if there was an error.
2088
+ static void in_body_any_other_end_tag(GumboParser* parser, GumboToken* token)
2089
+ {
2090
+ GumboParserState* state = parser->_parser_state;
2091
+ GumboTag tag;
2092
+ const char* tagname;
2093
+
2094
+ if (token->type == GUMBO_TOKEN_END_TAG) {
2095
+ tag = token->v.end_tag.tag;
2096
+ tagname = token->v.end_tag.name;
2097
+ } else {
2098
+ assert(token->type == GUMBO_TOKEN_START_TAG);
2099
+ tag = token->v.start_tag.tag;
2100
+ tagname = token->v.start_tag.name;
2101
+ }
2102
+
2103
+ assert(state->_open_elements.length > 0);
2104
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2105
+ // Walk up the stack of open elements until we find one that either:
2106
+ // a) Matches the tag name we saw
2107
+ // b) Is in the "special" category.
2108
+ // If we see a), implicitly close everything up to and including it. If we
2109
+ // see b), then record a parse error, don't close anything (except the
2110
+ // implied end tags) and ignore the end tag token.
2111
+ for (int i = state->_open_elements.length; --i >= 0;) {
2112
+ const GumboNode* node = state->_open_elements.data[i];
2113
+ if (node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, tagname)) {
2114
+ generate_implied_end_tags(parser, tag, tagname);
2115
+ // <!DOCTYPE><body><sarcasm><foo></sarcasm> is an example of an error.
2116
+ // foo is the "current node" but sarcasm is node.
2117
+ // XXX: Write a test for this.
2118
+ if (node != get_current_node(parser)) {
2119
+ parser_add_parse_error(parser, token);
2120
+ }
2121
+ while (node != pop_current_node(parser))
2122
+ ; // Pop everything.
2123
+ return;
2124
+ } else if (is_special_node(node)) {
2125
+ parser_add_parse_error(parser, token);
2126
+ ignore_token(parser);
2127
+ return;
2128
+ }
2129
+ }
2130
+ // <html> is in the special category, so we should never get here.
2131
+ assert(0 && "unreachable");
2132
+ }
2133
+
2134
+ // https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
2135
+ // Also described in the "in body" handling for end formatting tags.
2136
+ // Returns false if there was an error.
2137
+ static void adoption_agency_algorithm(GumboParser* parser, GumboToken* token)
2138
+ {
2139
+ GumboParserState* state = parser->_parser_state;
2140
+ gumbo_debug("Entering adoption agency algorithm.\n");
2141
+ // Step 1.
2142
+ GumboTag subject;
2143
+ if (token->type == GUMBO_TOKEN_START_TAG) {
2144
+ subject = token->v.start_tag.tag;
2145
+ } else {
2146
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2147
+ subject = token->v.end_tag.tag;
2148
+ }
2149
+ assert(subject != GUMBO_TAG_UNKNOWN);
2150
+
2151
+ // Step 2.
2152
+ GumboNode* current_node = get_current_node(parser);
2153
+ if (
2154
+ node_html_tag_is(current_node, subject)
2155
+ && -1 == gumbo_vector_index_of (
2156
+ &state->_active_formatting_elements,
2157
+ current_node
2158
+ )
2159
+ ) {
2160
+ pop_current_node(parser);
2161
+ return;
2162
+ }
2163
+
2164
+ // Steps 3-5 & 21:
2165
+ for (unsigned int i = 0; i < 8; ++i) {
2166
+ // Step 6.
2167
+ GumboNode* formatting_node = NULL;
2168
+ int formatting_node_in_open_elements = -1;
2169
+ for (int j = state->_active_formatting_elements.length; --j >= 0;) {
2170
+ GumboNode* current_node = state->_active_formatting_elements.data[j];
2171
+ if (current_node == &kActiveFormattingScopeMarker) {
2172
+ gumbo_debug("Broke on scope marker; aborting.\n");
2173
+ // Last scope marker; abort the algorithm and handle according to "any
2174
+ // other end tag" (below).
2175
+ break;
2176
+ }
2177
+ if (node_html_tag_is(current_node, subject)) {
2178
+ // Found it.
2179
+ formatting_node = current_node;
2180
+ formatting_node_in_open_elements = gumbo_vector_index_of (
2181
+ &state->_open_elements,
2182
+ formatting_node
2183
+ );
2184
+ gumbo_debug (
2185
+ "Formatting element of tag %s at %d.\n",
2186
+ gumbo_normalized_tagname(subject),
2187
+ formatting_node_in_open_elements
2188
+ );
2189
+ break;
2190
+ }
2191
+ }
2192
+ if (!formatting_node) {
2193
+ // No matching tag; not a parse error outright, but fall through to the
2194
+ // "any other end tag" clause (which may potentially add a parse error,
2195
+ // but not always).
2196
+ gumbo_debug("No active formatting elements; aborting.\n");
2197
+ in_body_any_other_end_tag(parser, token);
2198
+ return;
2199
+ }
2200
+
2201
+ // Step 7
2202
+ if (formatting_node_in_open_elements == -1) {
2203
+ gumbo_debug("Formatting node not on stack of open elements.\n");
2204
+ parser_add_parse_error(parser, token);
2205
+ gumbo_vector_remove (
2206
+ formatting_node,
2207
+ &state->_active_formatting_elements
2208
+ );
2209
+ return;
2210
+ }
2211
+
2212
+ // Step 8
2213
+ if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
2214
+ parser_add_parse_error(parser, token);
2215
+ gumbo_debug("Element not in scope.\n");
2216
+ return;
2217
+ }
2218
+
2219
+ // Step 9
2220
+ if (formatting_node != get_current_node(parser))
2221
+ parser_add_parse_error(parser, token); // But continue onwards.
2222
+ assert(formatting_node);
2223
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
2224
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
2225
+
2226
+ // Step 10
2227
+ GumboNode* furthest_block = NULL;
2228
+ for (
2229
+ unsigned int j = formatting_node_in_open_elements;
2230
+ j < state->_open_elements.length;
2231
+ ++j
2232
+ ) {
2233
+ assert(j > 0);
2234
+ GumboNode* current = state->_open_elements.data[j];
2235
+ if (is_special_node(current)) {
2236
+ furthest_block = current;
2237
+ break;
2238
+ }
2239
+ }
2240
+ // Step 11.
2241
+ if (!furthest_block) {
2242
+ while (pop_current_node(parser) != formatting_node)
2243
+ ;
2244
+ gumbo_vector_remove (
2245
+ formatting_node,
2246
+ &state->_active_formatting_elements
2247
+ );
2248
+ return;
2249
+ }
2250
+ assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
2251
+
2252
+ // Step 12.
2253
+ // Elements may be moved and reparented by this algorithm, so
2254
+ // common_ancestor is not necessarily the same as formatting_node->parent.
2255
+ GumboNode* common_ancestor = state->_open_elements.data [
2256
+ formatting_node_in_open_elements - 1
2257
+ ];
2258
+ gumbo_debug (
2259
+ "Common ancestor tag = %s, furthest block tag = %s.\n",
2260
+ gumbo_normalized_tagname(common_ancestor->v.element.tag),
2261
+ gumbo_normalized_tagname(furthest_block->v.element.tag)
2262
+ );
2263
+
2264
+ // Step 13.
2265
+ int bookmark = 1 + gumbo_vector_index_of (
2266
+ &state->_active_formatting_elements,
2267
+ formatting_node
2268
+ );
2269
+ gumbo_debug("Bookmark at %d.\n", bookmark);
2270
+ // Step 14.
2271
+ GumboNode* node = furthest_block;
2272
+ GumboNode* last_node = furthest_block;
2273
+ // Must be stored explicitly, in case node is removed from the stack of open
2274
+ // elements, to handle step 14.3.
2275
+ int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
2276
+ assert(saved_node_index > 0);
2277
+ // Step 14.1.
2278
+ for (int j = 0;;) {
2279
+ // Step 14.2.
2280
+ ++j;
2281
+ // Step 14.3.
2282
+ int node_index = gumbo_vector_index_of(&state->_open_elements, node);
2283
+ gumbo_debug (
2284
+ "Current index: %d, last index: %d.\n",
2285
+ node_index,
2286
+ saved_node_index
2287
+ );
2288
+ if (node_index == -1) {
2289
+ node_index = saved_node_index;
2290
+ }
2291
+ saved_node_index = --node_index;
2292
+ assert(node_index > 0);
2293
+ assert((unsigned int) node_index < state->_open_elements.capacity);
2294
+ node = state->_open_elements.data[node_index];
2295
+ assert(node->parent);
2296
+ // Step 14.4.
2297
+ if (node == formatting_node) {
2298
+ break;
2299
+ }
2300
+ int formatting_index = gumbo_vector_index_of (
2301
+ &state->_active_formatting_elements,
2302
+ node
2303
+ );
2304
+ // Step 14.5.
2305
+ if (j > 3 && formatting_index != -1) {
2306
+ gumbo_debug("Removing formatting element at %d.\n", formatting_index);
2307
+ gumbo_vector_remove_at (
2308
+ formatting_index,
2309
+ &state->_active_formatting_elements
2310
+ );
2311
+ // Removing the element shifts all indices over by one, so we may need
2312
+ // to move the bookmark.
2313
+ if (formatting_index < bookmark) {
2314
+ --bookmark;
2315
+ gumbo_debug("Moving bookmark to %d.\n", bookmark);
2316
+ }
2317
+ continue;
2318
+ }
2319
+ if (formatting_index == -1) {
2320
+ // Step 14.6.
2321
+ gumbo_vector_remove_at(node_index, &state->_open_elements);
2322
+ continue;
2323
+ }
2324
+ // Step 14.7.
2325
+ // "common ancestor as the intended parent" doesn't actually mean insert
2326
+ // it into the common ancestor; that happens below.
2327
+ node = clone_node(node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
2328
+ assert(formatting_index >= 0);
2329
+ state->_active_formatting_elements.data[formatting_index] = node;
2330
+ assert(node_index >= 0);
2331
+ state->_open_elements.data[node_index] = node;
2332
+ // Step 14.8.
2333
+ if (last_node == furthest_block) {
2334
+ bookmark = formatting_index + 1;
2335
+ gumbo_debug("Bookmark moved to %d.\n", bookmark);
2336
+ assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2337
+ }
2338
+ // Step 14.9.
2339
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2340
+ remove_from_parent(last_node);
2341
+ append_node(node, last_node);
2342
+ // Step 14.10.
2343
+ last_node = node;
2344
+ } // Step 14.11.
2345
+
2346
+ // Step 15.
2347
+ gumbo_debug (
2348
+ "Removing %s node from parent ",
2349
+ gumbo_normalized_tagname(last_node->v.element.tag)
2350
+ );
2351
+ remove_from_parent(last_node);
2352
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
2353
+ InsertionLocation location = get_appropriate_insertion_location (
2354
+ parser,
2355
+ common_ancestor
2356
+ );
2357
+ gumbo_debug (
2358
+ "and inserting it into %s.\n",
2359
+ gumbo_normalized_tagname(location.target->v.element.tag)
2360
+ );
2361
+ insert_node(last_node, location);
2362
+
2363
+ // Step 16.
2364
+ GumboNode* new_formatting_node = clone_node (
2365
+ formatting_node,
2366
+ GUMBO_INSERTION_ADOPTION_AGENCY_CLONED
2367
+ );
2368
+ formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2369
+
2370
+ // Step 17. Instead of appending nodes one-by-one, we swap the children
2371
+ // vector of furthest_block with the empty children of new_formatting_node,
2372
+ // reducing memory traffic and allocations. We still have to reset their
2373
+ // parent pointers, though.
2374
+ GumboVector temp = new_formatting_node->v.element.children;
2375
+ new_formatting_node->v.element.children = furthest_block->v.element.children;
2376
+ furthest_block->v.element.children = temp;
2377
+
2378
+ temp = new_formatting_node->v.element.children;
2379
+ for (unsigned int i = 0; i < temp.length; ++i) {
2380
+ GumboNode* child = temp.data[i];
2381
+ child->parent = new_formatting_node;
2382
+ }
2383
+
2384
+ // Step 18.
2385
+ append_node(furthest_block, new_formatting_node);
2386
+
2387
+ // Step 19.
2388
+ // If the formatting node was before the bookmark, it may shift over all
2389
+ // indices after it, so we need to explicitly find the index and possibly
2390
+ // adjust the bookmark.
2391
+ int formatting_node_index = gumbo_vector_index_of (
2392
+ &state->_active_formatting_elements,
2393
+ formatting_node
2394
+ );
2395
+ assert(formatting_node_index != -1);
2396
+ if (formatting_node_index < bookmark) {
2397
+ gumbo_debug (
2398
+ "Formatting node at %d is before bookmark at %d; decrementing.\n",
2399
+ formatting_node_index, bookmark
2400
+ );
2401
+ --bookmark;
2402
+ }
2403
+ gumbo_vector_remove_at (
2404
+ formatting_node_index,
2405
+ &state->_active_formatting_elements
2406
+ );
2407
+ assert(bookmark >= 0);
2408
+ assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2409
+ gumbo_vector_insert_at (
2410
+ new_formatting_node,
2411
+ bookmark,
2412
+ &state->_active_formatting_elements
2413
+ );
2414
+
2415
+ // Step 20.
2416
+ gumbo_vector_remove(formatting_node, &state->_open_elements);
2417
+ int insert_at = 1 + gumbo_vector_index_of (
2418
+ &state->_open_elements,
2419
+ furthest_block
2420
+ );
2421
+ assert(insert_at >= 0);
2422
+ assert((unsigned int) insert_at <= state->_open_elements.length);
2423
+ gumbo_vector_insert_at (
2424
+ new_formatting_node,
2425
+ insert_at,
2426
+ &state->_open_elements
2427
+ );
2428
+ } // Step 21.
2429
+ }
2430
+
2431
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-end
2432
+ static void finish_parsing(GumboParser* parser) {
2433
+ gumbo_debug("Finishing parsing");
2434
+ maybe_flush_text_node_buffer(parser);
2435
+ GumboParserState* state = parser->_parser_state;
2436
+ for (
2437
+ GumboNode* node = pop_current_node(parser);
2438
+ node;
2439
+ node = pop_current_node(parser)
2440
+ ) {
2441
+ if (
2442
+ (node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag)
2443
+ || (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)
2444
+ ) {
2445
+ continue;
2446
+ }
2447
+ node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2448
+ }
2449
+ while (pop_current_node(parser))
2450
+ ; // Pop them all.
2451
+ }
2452
+
2453
+ static void handle_initial(GumboParser* parser, GumboToken* token) {
2454
+ GumboDocument* document = &get_document_node(parser)->v.document;
2455
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2456
+ ignore_token(parser);
2457
+ return;
2458
+ }
2459
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2460
+ append_comment_node(parser, get_document_node(parser), token);
2461
+ return;
2462
+ }
2463
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2464
+ document->has_doctype = true;
2465
+ document->name = token->v.doc_type.name;
2466
+ document->public_identifier = token->v.doc_type.public_identifier;
2467
+ document->system_identifier = token->v.doc_type.system_identifier;
2468
+ document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2469
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2470
+ maybe_add_doctype_error(parser, token);
2471
+ return;
2472
+ }
2473
+ parser_add_parse_error(parser, token);
2474
+ document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2475
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2476
+ parser->_parser_state->_reprocess_current_token = true;
2477
+ }
2478
+
2479
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
2480
+ static void handle_before_html(GumboParser* parser, GumboToken* token) {
2481
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2482
+ parser_add_parse_error(parser, token);
2483
+ ignore_token(parser);
2484
+ return;
2485
+ }
2486
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2487
+ append_comment_node(parser, get_document_node(parser), token);
2488
+ return;
2489
+ }
2490
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2491
+ ignore_token(parser);
2492
+ return;
2493
+ }
2494
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2495
+ GumboNode* html_node = insert_element_from_token(parser, token);
2496
+ parser->_output->root = html_node;
2497
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2498
+ return;
2499
+ }
2500
+ if (
2501
+ token->type == GUMBO_TOKEN_END_TAG
2502
+ && !tag_in(token, false, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2503
+ ) {
2504
+ parser_add_parse_error(parser, token);
2505
+ ignore_token(parser);
2506
+ return;
2507
+ }
2508
+ GumboNode* html_node = insert_element_of_tag_type (
2509
+ parser,
2510
+ GUMBO_TAG_HTML,
2511
+ GUMBO_INSERTION_IMPLIED
2512
+ );
2513
+ assert(html_node);
2514
+ parser->_output->root = html_node;
2515
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2516
+ parser->_parser_state->_reprocess_current_token = true;
2517
+ }
2518
+
2519
+ // Forward declarations because of mutual dependencies.
2520
+ static void handle_token(GumboParser* parser, GumboToken* token);
2521
+ static void handle_in_body(GumboParser* parser, GumboToken* token);
2522
+ static void handle_in_template(GumboParser* parser, GumboToken* token);
2523
+
2524
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
2525
+ static void handle_before_head(GumboParser* parser, GumboToken* token) {
2526
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2527
+ ignore_token(parser);
2528
+ return;
2529
+ }
2530
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2531
+ append_comment_node(parser, get_current_node(parser), token);
2532
+ return;
2533
+ }
2534
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2535
+ parser_add_parse_error(parser, token);
2536
+ ignore_token(parser);
2537
+ return;
2538
+ }
2539
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2540
+ handle_in_body(parser, token);
2541
+ return;
2542
+ }
2543
+ if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2544
+ GumboNode* node = insert_element_from_token(parser, token);
2545
+ parser->_parser_state->_head_element = node;
2546
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2547
+ return;
2548
+ }
2549
+ if (
2550
+ token->type == GUMBO_TOKEN_END_TAG
2551
+ && !tag_in(token, kEndTag, &(const TagSet){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})
2552
+ ) {
2553
+ parser_add_parse_error(parser, token);
2554
+ ignore_token(parser);
2555
+ return;
2556
+ }
2557
+ GumboNode* node = insert_element_of_tag_type (
2558
+ parser,
2559
+ GUMBO_TAG_HEAD,
2560
+ GUMBO_INSERTION_IMPLIED
2561
+ );
2562
+ parser->_parser_state->_head_element = node;
2563
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2564
+ parser->_parser_state->_reprocess_current_token = true;
2565
+ }
2566
+
2567
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
2568
+ static void handle_in_head(GumboParser* parser, GumboToken* token) {
2569
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2570
+ insert_text_token(parser, token);
2571
+ return;
2572
+ }
2573
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2574
+ append_comment_node(parser, get_current_node(parser), token);
2575
+ return;
2576
+ }
2577
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2578
+ parser_add_parse_error(parser, token);
2579
+ ignore_token(parser);
2580
+ return;
2581
+ }
2582
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2583
+ return handle_in_body(parser, token);
2584
+ }
2585
+ if (
2586
+ tag_in(token, kStartTag, &(const TagSet) {
2587
+ TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK)
2588
+ })
2589
+ ) {
2590
+ insert_element_from_token(parser, token);
2591
+ pop_current_node(parser);
2592
+ acknowledge_self_closing_tag(parser);
2593
+ return;
2594
+ }
2595
+ if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2596
+ insert_element_from_token(parser, token);
2597
+ pop_current_node(parser);
2598
+ acknowledge_self_closing_tag(parser);
2599
+ // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2600
+ // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2601
+ // should specifically look for that string in the document and re-encode it
2602
+ // before passing to Gumbo.
2603
+ return;
2604
+ }
2605
+ if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2606
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2607
+ return;
2608
+ }
2609
+ if (
2610
+ tag_in(token, kStartTag, &(const TagSet){TAG(NOFRAMES), TAG(STYLE)})
2611
+ ) {
2612
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2613
+ return;
2614
+ }
2615
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2616
+ insert_element_from_token(parser, token);
2617
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2618
+ return;
2619
+ }
2620
+ if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2621
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT_DATA);
2622
+ return;
2623
+ }
2624
+ if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2625
+ GumboNode* head = pop_current_node(parser);
2626
+ UNUSED_IF_NDEBUG(head);
2627
+ assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2628
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2629
+ return;
2630
+ }
2631
+ if (
2632
+ tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2633
+ ) {
2634
+ pop_current_node(parser);
2635
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2636
+ parser->_parser_state->_reprocess_current_token = true;
2637
+ return;
2638
+ }
2639
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2640
+ insert_element_from_token(parser, token);
2641
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2642
+ set_frameset_not_ok(parser);
2643
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2644
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2645
+ return;
2646
+ }
2647
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2648
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2649
+ parser_add_parse_error(parser, token);
2650
+ ignore_token(parser);
2651
+ return;
2652
+ }
2653
+ generate_all_implied_end_tags_thoroughly(parser);
2654
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE))
2655
+ parser_add_parse_error(parser, token);
2656
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2657
+ ;
2658
+ clear_active_formatting_elements(parser);
2659
+ pop_template_insertion_mode(parser);
2660
+ reset_insertion_mode_appropriately(parser);
2661
+ return;
2662
+ }
2663
+ if (
2664
+ tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2665
+ || (token->type == GUMBO_TOKEN_END_TAG)
2666
+ ) {
2667
+ parser_add_parse_error(parser, token);
2668
+ ignore_token(parser);
2669
+ return;
2670
+ }
2671
+ pop_current_node(parser);
2672
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2673
+ parser->_parser_state->_reprocess_current_token = true;
2674
+ return;
2675
+ }
2676
+
2677
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inheadnoscript
2678
+ static void handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2679
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2680
+ parser_add_parse_error(parser, token);
2681
+ return;
2682
+ }
2683
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2684
+ handle_in_body(parser, token);
2685
+ return;
2686
+ }
2687
+ if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2688
+ const GumboNode* node = pop_current_node(parser);
2689
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2690
+ UNUSED_IF_NDEBUG(node);
2691
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2692
+ return;
2693
+ }
2694
+ if (
2695
+ token->type == GUMBO_TOKEN_WHITESPACE
2696
+ || token->type == GUMBO_TOKEN_COMMENT
2697
+ || tag_in (token, kStartTag, &(const TagSet) {
2698
+ TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2699
+ TAG(META), TAG(NOFRAMES), TAG(STYLE)
2700
+ })
2701
+ ) {
2702
+ handle_in_head(parser, token);
2703
+ return;
2704
+ }
2705
+ if (
2706
+ tag_in(token, kStartTag, &(const TagSet){TAG(HEAD), TAG(NOSCRIPT)})
2707
+ || (
2708
+ token->type == GUMBO_TOKEN_END_TAG
2709
+ && !tag_is(token, kEndTag, GUMBO_TAG_BR)
2710
+ )
2711
+ ) {
2712
+ parser_add_parse_error(parser, token);
2713
+ ignore_token(parser);
2714
+ return;
2715
+ }
2716
+ parser_add_parse_error(parser, token);
2717
+ const GumboNode* node = pop_current_node(parser);
2718
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2719
+ UNUSED_IF_NDEBUG(node);
2720
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2721
+ parser->_parser_state->_reprocess_current_token = true;
2722
+ }
2723
+
2724
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
2725
+ static void handle_after_head(GumboParser* parser, GumboToken* token) {
2726
+ GumboParserState* state = parser->_parser_state;
2727
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2728
+ insert_text_token(parser, token);
2729
+ return;
2730
+ }
2731
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2732
+ append_comment_node(parser, get_current_node(parser), token);
2733
+ return;
2734
+ }
2735
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2736
+ parser_add_parse_error(parser, token);
2737
+ ignore_token(parser);
2738
+ return;
2739
+ }
2740
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2741
+ handle_in_body(parser, token);
2742
+ return;
2743
+ }
2744
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2745
+ insert_element_from_token(parser, token);
2746
+ set_frameset_not_ok(parser);
2747
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2748
+ return;
2749
+ }
2750
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2751
+ insert_element_from_token(parser, token);
2752
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2753
+ return;
2754
+ }
2755
+ if (
2756
+ tag_in(token, kStartTag, &(const TagSet) {
2757
+ TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
2758
+ TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
2759
+ })
2760
+ ) {
2761
+ parser_add_parse_error(parser, token);
2762
+ assert(state->_head_element != NULL);
2763
+ // This must be flushed before we push the head element on, as there may be
2764
+ // pending character tokens that should be attached to the root.
2765
+ maybe_flush_text_node_buffer(parser);
2766
+ gumbo_vector_add(state->_head_element, &state->_open_elements);
2767
+ handle_in_head(parser, token);
2768
+ gumbo_vector_remove(state->_head_element, &state->_open_elements);
2769
+ return;
2770
+ }
2771
+ if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2772
+ handle_in_head(parser, token);
2773
+ return;
2774
+ }
2775
+ if (
2776
+ tag_is(token, kStartTag, GUMBO_TAG_HEAD)
2777
+ || (
2778
+ token->type == GUMBO_TOKEN_END_TAG
2779
+ && !tag_in(token, kEndTag, &(const TagSet){TAG(BODY), TAG(HTML), TAG(BR)})
2780
+ )
2781
+ ) {
2782
+ parser_add_parse_error(parser, token);
2783
+ ignore_token(parser);
2784
+ return;
2785
+ }
2786
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2787
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2788
+ state->_reprocess_current_token = true;
2789
+ }
2790
+
2791
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
2792
+ static void handle_in_body(GumboParser* parser, GumboToken* token) {
2793
+ GumboParserState* state = parser->_parser_state;
2794
+ assert(state->_open_elements.length > 0);
2795
+ if (token->type == GUMBO_TOKEN_NULL) {
2796
+ parser_add_parse_error(parser, token);
2797
+ ignore_token(parser);
2798
+ return;
2799
+ }
2800
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2801
+ reconstruct_active_formatting_elements(parser);
2802
+ insert_text_token(parser, token);
2803
+ return;
2804
+ }
2805
+ if (
2806
+ token->type == GUMBO_TOKEN_CHARACTER
2807
+ || token->type == GUMBO_TOKEN_CDATA
2808
+ ) {
2809
+ reconstruct_active_formatting_elements(parser);
2810
+ insert_text_token(parser, token);
2811
+ set_frameset_not_ok(parser);
2812
+ return;
2813
+ }
2814
+ if (token->type == GUMBO_TOKEN_COMMENT) {
2815
+ append_comment_node(parser, get_current_node(parser), token);
2816
+ return;
2817
+ }
2818
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2819
+ parser_add_parse_error(parser, token);
2820
+ ignore_token(parser);
2821
+ return;
2822
+ }
2823
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2824
+ parser_add_parse_error(parser, token);
2825
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2826
+ ignore_token(parser);
2827
+ return;
2828
+ }
2829
+ assert(parser->_output->root != NULL);
2830
+ assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2831
+ merge_attributes(token, parser->_output->root);
2832
+ return;
2833
+ }
2834
+ if (
2835
+ tag_in(token, kStartTag, &(const TagSet) {
2836
+ TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2837
+ TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE),
2838
+ TAG(TITLE)
2839
+ })
2840
+ || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
2841
+ ) {
2842
+ handle_in_head(parser, token);
2843
+ return;
2844
+ }
2845
+ if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2846
+ parser_add_parse_error(parser, token);
2847
+ if (
2848
+ state->_open_elements.length < 2
2849
+ || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)
2850
+ || has_open_element(parser, GUMBO_TAG_TEMPLATE)
2851
+ ) {
2852
+ ignore_token(parser);
2853
+ } else {
2854
+ set_frameset_not_ok(parser);
2855
+ merge_attributes(token, state->_open_elements.data[1]);
2856
+ }
2857
+ return;
2858
+ }
2859
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2860
+ parser_add_parse_error(parser, token);
2861
+ if (
2862
+ state->_open_elements.length < 2
2863
+ || !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)
2864
+ || !state->_frameset_ok
2865
+ ) {
2866
+ ignore_token(parser);
2867
+ return;
2868
+ }
2869
+ // Save the body node for later removal.
2870
+ GumboNode* body_node = state->_open_elements.data[1];
2871
+
2872
+ // Pop all nodes except root HTML element.
2873
+ GumboNode* node;
2874
+ do {
2875
+ node = pop_current_node(parser);
2876
+ } while (node != state->_open_elements.data[1]);
2877
+
2878
+ // Removing & destroying the body node is going to kill any nodes that have
2879
+ // been added to the list of active formatting elements, and so we should
2880
+ // clear it to prevent a use-after-free if the list of active formatting
2881
+ // elements is reconstructed afterwards. This may happen if whitespace
2882
+ // follows the </frameset>.
2883
+ clear_active_formatting_elements(parser);
2884
+
2885
+ // Remove the body node. We may want to factor this out into a generic
2886
+ // helper, but right now this is the only code that needs to do this.
2887
+ GumboVector* children = &parser->_output->root->v.element.children;
2888
+ for (unsigned int i = 0; i < children->length; ++i) {
2889
+ if (children->data[i] == body_node) {
2890
+ gumbo_vector_remove_at(i, children);
2891
+ break;
2892
+ }
2893
+ }
2894
+ destroy_node(body_node);
2895
+
2896
+ // Insert the <frameset>, and switch the insertion mode.
2897
+ insert_element_from_token(parser, token);
2898
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2899
+ return;
2900
+ }
2901
+ if (token->type == GUMBO_TOKEN_EOF) {
2902
+ if (get_current_template_insertion_mode(parser) !=
2903
+ GUMBO_INSERTION_MODE_INITIAL) {
2904
+ handle_in_template(parser, token);
2905
+ return;
2906
+ }
2907
+ if (stack_contains_nonclosable_element(parser))
2908
+ parser_add_parse_error(parser, token);
2909
+ return;
2910
+ }
2911
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
2912
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2913
+ parser_add_parse_error(parser, token);
2914
+ ignore_token(parser);
2915
+ return;
2916
+ }
2917
+ if (stack_contains_nonclosable_element(parser))
2918
+ parser_add_parse_error(parser, token);
2919
+ GumboNode* body = state->_open_elements.data[1];
2920
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2921
+ record_end_of_element(state->_current_token, &body->v.element);
2922
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2923
+ return;
2924
+ }
2925
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2926
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2927
+ parser_add_parse_error(parser, token);
2928
+ ignore_token(parser);
2929
+ return;
2930
+ }
2931
+ if (stack_contains_nonclosable_element(parser))
2932
+ parser_add_parse_error(parser, token);
2933
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2934
+ parser->_parser_state->_reprocess_current_token = true;
2935
+ return;
2936
+ }
2937
+ if (
2938
+ tag_in(token, kStartTag, &(const TagSet) {
2939
+ TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER),
2940
+ TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2941
+ TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2942
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
2943
+ TAG(SUMMARY), TAG(UL), TAG(SEARCH)
2944
+ })
2945
+ ) {
2946
+ maybe_implicitly_close_p_tag(parser, token);
2947
+ insert_element_from_token(parser, token);
2948
+ return;
2949
+ }
2950
+ if (tag_in(token, kStartTag, &heading_tags)) {
2951
+ maybe_implicitly_close_p_tag(parser, token);
2952
+ if (node_tag_in_set(get_current_node(parser), &heading_tags)) {
2953
+ parser_add_parse_error(parser, token);
2954
+ pop_current_node(parser);
2955
+ }
2956
+ insert_element_from_token(parser, token);
2957
+ return;
2958
+ }
2959
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(PRE), TAG(LISTING)})) {
2960
+ maybe_implicitly_close_p_tag(parser, token);
2961
+ insert_element_from_token(parser, token);
2962
+ state->_ignore_next_linefeed = true;
2963
+ set_frameset_not_ok(parser);
2964
+ return;
2965
+ }
2966
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2967
+ if (
2968
+ state->_form_element != NULL
2969
+ && !has_open_element(parser, GUMBO_TAG_TEMPLATE)
2970
+ ) {
2971
+ gumbo_debug("Ignoring nested form.\n");
2972
+ parser_add_parse_error(parser, token);
2973
+ ignore_token(parser);
2974
+ return;
2975
+ }
2976
+ maybe_implicitly_close_p_tag(parser, token);
2977
+ GumboNode* form_element = insert_element_from_token(parser, token);
2978
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2979
+ state->_form_element = form_element;
2980
+ }
2981
+ return;
2982
+ }
2983
+ if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2984
+ maybe_implicitly_close_list_tag(parser, token, true);
2985
+ maybe_implicitly_close_p_tag(parser, token);
2986
+ insert_element_from_token(parser, token);
2987
+ return;
2988
+ }
2989
+ if (tag_in(token, kStartTag, &dd_dt_tags)) {
2990
+ maybe_implicitly_close_list_tag(parser, token, false);
2991
+ maybe_implicitly_close_p_tag(parser, token);
2992
+ insert_element_from_token(parser, token);
2993
+ return;
2994
+ }
2995
+ if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2996
+ maybe_implicitly_close_p_tag(parser, token);
2997
+ insert_element_from_token(parser, token);
2998
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2999
+ return;
3000
+ }
3001
+ if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
3002
+ if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
3003
+ parser_add_parse_error(parser, token);
3004
+ // We don't want to use implicitly_close_tags here because it may add an
3005
+ // error and we've already added the only error the standard specifies.
3006
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3007
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_BUTTON))
3008
+ ;
3009
+ }
3010
+ reconstruct_active_formatting_elements(parser);
3011
+ insert_element_from_token(parser, token);
3012
+ set_frameset_not_ok(parser);
3013
+ return;
3014
+ }
3015
+ if (
3016
+ tag_in(token, kEndTag, &(const TagSet) {
3017
+ TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON),
3018
+ TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
3019
+ TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
3020
+ TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL),
3021
+ TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL), TAG(SEARCH)
3022
+ })
3023
+ ) {
3024
+ GumboTag tag = token->v.end_tag.tag;
3025
+ if (!has_an_element_in_scope(parser, tag)) {
3026
+ parser_add_parse_error(parser, token);
3027
+ ignore_token(parser);
3028
+ return;
3029
+ }
3030
+ return implicitly_close_tags (
3031
+ parser,
3032
+ token,
3033
+ GUMBO_NAMESPACE_HTML,
3034
+ token->v.end_tag.tag
3035
+ );
3036
+ }
3037
+ if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
3038
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3039
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
3040
+ parser_add_parse_error(parser, token);
3041
+ ignore_token(parser);
3042
+ return;
3043
+ }
3044
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3045
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM))
3046
+ parser_add_parse_error(parser, token);
3047
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
3048
+ ;
3049
+ return;
3050
+ } else {
3051
+ GumboNode* node = state->_form_element;
3052
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
3053
+ state->_form_element = NULL;
3054
+ if (!node || !has_node_in_scope(parser, node)) {
3055
+ gumbo_debug("Closing an unopened form.\n");
3056
+ parser_add_parse_error(parser, token);
3057
+ ignore_token(parser);
3058
+ return;
3059
+ }
3060
+ // Since we remove the form node without popping, we need to make sure
3061
+ // that we flush any text nodes at the end of the form.
3062
+ maybe_flush_text_node_buffer(parser);
3063
+ // This differs from implicitly_close_tags because we remove *only* the
3064
+ // <form> element; other nodes are left in scope.
3065
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3066
+ if (get_current_node(parser) != node)
3067
+ parser_add_parse_error(parser, token);
3068
+ else
3069
+ record_end_of_element(token, &node->v.element);
3070
+
3071
+ GumboVector* open_elements = &state->_open_elements;
3072
+ int index = gumbo_vector_index_of(open_elements, node);
3073
+ assert(index >= 0);
3074
+ gumbo_vector_remove_at(index, open_elements);
3075
+ return;
3076
+ }
3077
+ }
3078
+ if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
3079
+ if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
3080
+ parser_add_parse_error(parser, token);
3081
+ // reconstruct_active_formatting_elements(parser);
3082
+ insert_element_of_tag_type (
3083
+ parser,
3084
+ GUMBO_TAG_P,
3085
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3086
+ );
3087
+ }
3088
+ implicitly_close_tags (
3089
+ parser,
3090
+ token,
3091
+ GUMBO_NAMESPACE_HTML,
3092
+ GUMBO_TAG_P
3093
+ );
3094
+ return;
3095
+ }
3096
+ if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
3097
+ if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
3098
+ parser_add_parse_error(parser, token);
3099
+ ignore_token(parser);
3100
+ return;
3101
+ }
3102
+ implicitly_close_tags (
3103
+ parser,
3104
+ token,
3105
+ GUMBO_NAMESPACE_HTML,
3106
+ GUMBO_TAG_LI
3107
+ );
3108
+ return;
3109
+ }
3110
+ if (tag_in(token, kEndTag, &dd_dt_tags)) {
3111
+ GumboTag token_tag = token->v.end_tag.tag;
3112
+ if (!has_an_element_in_scope(parser, token_tag)) {
3113
+ parser_add_parse_error(parser, token);
3114
+ ignore_token(parser);
3115
+ return;
3116
+ }
3117
+ implicitly_close_tags (
3118
+ parser,
3119
+ token,
3120
+ GUMBO_NAMESPACE_HTML,
3121
+ token_tag
3122
+ );
3123
+ return;
3124
+ }
3125
+ if (tag_in(token, kEndTag, &heading_tags)) {
3126
+ if (
3127
+ !has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
3128
+ GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3129
+ GUMBO_TAG_H5, GUMBO_TAG_H6
3130
+ })
3131
+ ) {
3132
+ // No heading open; ignore the token entirely.
3133
+ parser_add_parse_error(parser, token);
3134
+ ignore_token(parser);
3135
+ return;
3136
+ }
3137
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3138
+ const GumboNode* current_node = get_current_node(parser);
3139
+ if (!node_html_tag_is(current_node, token->v.end_tag.tag)) {
3140
+ // There're children of the heading currently open; close them below and
3141
+ // record a parse error.
3142
+ // TODO(jdtang): Add a way to distinguish this error case from the one
3143
+ // above.
3144
+ parser_add_parse_error(parser, token);
3145
+ }
3146
+ do {
3147
+ current_node = pop_current_node(parser);
3148
+ } while (!node_tag_in_set(current_node, &heading_tags));
3149
+ return;
3150
+ }
3151
+ if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
3152
+ int last_a;
3153
+ int has_matching_a = find_last_anchor_index(parser, &last_a);
3154
+ if (has_matching_a) {
3155
+ assert(has_matching_a == 1);
3156
+ parser_add_parse_error(parser, token);
3157
+ (void)adoption_agency_algorithm(parser, token);
3158
+ // The adoption agency algorithm usually removes all instances of <a>
3159
+ // from the list of active formatting elements, but in case it doesn't,
3160
+ // we're supposed to do this. (The conditions where it might not are
3161
+ // listed in the spec.)
3162
+ if (find_last_anchor_index(parser, &last_a)) {
3163
+ void* last_element = gumbo_vector_remove_at (
3164
+ last_a,
3165
+ &state->_active_formatting_elements
3166
+ );
3167
+ gumbo_vector_remove(last_element, &state->_open_elements);
3168
+ }
3169
+ }
3170
+ reconstruct_active_formatting_elements(parser);
3171
+ add_formatting_element(parser, insert_element_from_token(parser, token));
3172
+ return;
3173
+ }
3174
+ if (
3175
+ tag_in(token, kStartTag, &(const TagSet) {
3176
+ TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I), TAG(S),
3177
+ TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)
3178
+ })
3179
+ ) {
3180
+ reconstruct_active_formatting_elements(parser);
3181
+ add_formatting_element(parser, insert_element_from_token(parser, token));
3182
+ return;
3183
+ }
3184
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
3185
+ reconstruct_active_formatting_elements(parser);
3186
+ if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
3187
+ parser_add_parse_error(parser, token);
3188
+ adoption_agency_algorithm(parser, token);
3189
+ reconstruct_active_formatting_elements(parser);
3190
+ }
3191
+ insert_element_from_token(parser, token);
3192
+ add_formatting_element(parser, get_current_node(parser));
3193
+ return;
3194
+ }
3195
+ if (
3196
+ tag_in(token, kEndTag, &(const TagSet) {
3197
+ TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
3198
+ TAG(NOBR), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG), TAG(TT),
3199
+ TAG(U)
3200
+ })
3201
+ ) {
3202
+ adoption_agency_algorithm(parser, token);
3203
+ return;
3204
+ }
3205
+ if (
3206
+ tag_in(token, kStartTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3207
+ ) {
3208
+ reconstruct_active_formatting_elements(parser);
3209
+ insert_element_from_token(parser, token);
3210
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3211
+ set_frameset_not_ok(parser);
3212
+ return;
3213
+ }
3214
+ if (
3215
+ tag_in(token, kEndTag, &(const TagSet){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})
3216
+ ) {
3217
+ GumboTag token_tag = token->v.end_tag.tag;
3218
+ if (!has_an_element_in_scope(parser, token_tag)) {
3219
+ parser_add_parse_error(parser, token);
3220
+ ignore_token(parser);
3221
+ return;
3222
+ }
3223
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
3224
+ clear_active_formatting_elements(parser);
3225
+ return;
3226
+ }
3227
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3228
+ if (
3229
+ get_document_node(parser)->v.document.doc_type_quirks_mode
3230
+ != GUMBO_DOCTYPE_QUIRKS
3231
+ ) {
3232
+ maybe_implicitly_close_p_tag(parser, token);
3233
+ }
3234
+ insert_element_from_token(parser, token);
3235
+ set_frameset_not_ok(parser);
3236
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3237
+ return;
3238
+ }
3239
+ if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
3240
+ parser_add_parse_error(parser, token);
3241
+ reconstruct_active_formatting_elements(parser);
3242
+ insert_element_of_tag_type (
3243
+ parser,
3244
+ GUMBO_TAG_BR,
3245
+ GUMBO_INSERTION_CONVERTED_FROM_END_TAG
3246
+ );
3247
+ pop_current_node(parser);
3248
+ acknowledge_self_closing_tag(parser);
3249
+ set_frameset_not_ok(parser);
3250
+ return;
3251
+ }
3252
+ if (
3253
+ tag_in(token, kStartTag, &(const TagSet) {
3254
+ TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN),
3255
+ TAG(WBR)
3256
+ })
3257
+ ) {
3258
+ bool is_image = tag_is(token, kStartTag, GUMBO_TAG_IMAGE);
3259
+ if (is_image) {
3260
+ parser_add_parse_error(parser, token);
3261
+ token->v.start_tag.tag = GUMBO_TAG_IMG;
3262
+ }
3263
+ reconstruct_active_formatting_elements(parser);
3264
+ GumboNode* node = insert_element_from_token(parser, token);
3265
+ if (is_image)
3266
+ node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
3267
+ pop_current_node(parser);
3268
+ acknowledge_self_closing_tag(parser);
3269
+ set_frameset_not_ok(parser);
3270
+ return;
3271
+ }
3272
+ if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
3273
+ reconstruct_active_formatting_elements(parser);
3274
+ GumboNode *input = insert_element_from_token(parser, token);
3275
+ pop_current_node(parser);
3276
+ acknowledge_self_closing_tag(parser);
3277
+ if (!attribute_matches(&input->v.element.attributes, "type", "hidden"))
3278
+ set_frameset_not_ok(parser);
3279
+ return;
3280
+ }
3281
+ if (
3282
+ tag_in(token, kStartTag, &(const TagSet){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})
3283
+ ) {
3284
+ insert_element_from_token(parser, token);
3285
+ pop_current_node(parser);
3286
+ acknowledge_self_closing_tag(parser);
3287
+ return;
3288
+ }
3289
+ if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
3290
+ maybe_implicitly_close_p_tag(parser, token);
3291
+ insert_element_from_token(parser, token);
3292
+ pop_current_node(parser);
3293
+ acknowledge_self_closing_tag(parser);
3294
+ set_frameset_not_ok(parser);
3295
+ return;
3296
+ }
3297
+ if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
3298
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
3299
+ parser->_parser_state->_ignore_next_linefeed = true;
3300
+ set_frameset_not_ok(parser);
3301
+ return;
3302
+ }
3303
+ if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
3304
+ maybe_implicitly_close_p_tag(parser, token);
3305
+ reconstruct_active_formatting_elements(parser);
3306
+ set_frameset_not_ok(parser);
3307
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3308
+ return;
3309
+ }
3310
+ if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
3311
+ set_frameset_not_ok(parser);
3312
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3313
+ return;
3314
+ }
3315
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
3316
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
3317
+ return;
3318
+ }
3319
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3320
+ reconstruct_active_formatting_elements(parser);
3321
+ insert_element_from_token(parser, token);
3322
+ set_frameset_not_ok(parser);
3323
+ GumboInsertionMode state = parser->_parser_state->_insertion_mode;
3324
+ if (
3325
+ state == GUMBO_INSERTION_MODE_IN_TABLE
3326
+ || state == GUMBO_INSERTION_MODE_IN_CAPTION
3327
+ || state == GUMBO_INSERTION_MODE_IN_TABLE_BODY
3328
+ || state == GUMBO_INSERTION_MODE_IN_ROW
3329
+ || state == GUMBO_INSERTION_MODE_IN_CELL
3330
+ ) {
3331
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
3332
+ } else {
3333
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
3334
+ }
3335
+ return;
3336
+ }
3337
+ if (
3338
+ tag_in(token, kStartTag, &(const TagSet){TAG(OPTGROUP), TAG(OPTION)})
3339
+ ) {
3340
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3341
+ pop_current_node(parser);
3342
+ }
3343
+ reconstruct_active_formatting_elements(parser);
3344
+ insert_element_from_token(parser, token);
3345
+ return;
3346
+ }
3347
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RB), TAG(RTC)})) {
3348
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3349
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3350
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY))
3351
+ parser_add_parse_error(parser, token);
3352
+ }
3353
+ insert_element_from_token(parser, token);
3354
+ return;
3355
+ }
3356
+ if (tag_in(token, kStartTag, &(const TagSet){TAG(RP), TAG(RT)})) {
3357
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
3358
+ generate_implied_end_tags(parser, GUMBO_TAG_RTC, NULL);
3359
+ GumboNode* current = get_current_node(parser);
3360
+ if (!node_html_tag_is(current, GUMBO_TAG_RUBY) &&
3361
+ !node_html_tag_is(current, GUMBO_TAG_RTC)) {
3362
+ parser_add_parse_error(parser, token);
3363
+ }
3364
+ }
3365
+ insert_element_from_token(parser, token);
3366
+ return;
3367
+ }
3368
+ if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
3369
+ reconstruct_active_formatting_elements(parser);
3370
+ adjust_mathml_attributes(token);
3371
+ adjust_foreign_attributes(token);
3372
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
3373
+ if (token->v.start_tag.is_self_closing) {
3374
+ pop_current_node(parser);
3375
+ acknowledge_self_closing_tag(parser);
3376
+ }
3377
+ return;
3378
+ }
3379
+ if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
3380
+ reconstruct_active_formatting_elements(parser);
3381
+ adjust_svg_attributes(token);
3382
+ adjust_foreign_attributes(token);
3383
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
3384
+ if (token->v.start_tag.is_self_closing) {
3385
+ pop_current_node(parser);
3386
+ acknowledge_self_closing_tag(parser);
3387
+ }
3388
+ return;
3389
+ }
3390
+ if (
3391
+ tag_in(token, kStartTag, &(const TagSet) {
3392
+ TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
3393
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3394
+ })
3395
+ ) {
3396
+ parser_add_parse_error(parser, token);
3397
+ ignore_token(parser);
3398
+ return;
3399
+ }
3400
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3401
+ reconstruct_active_formatting_elements(parser);
3402
+ insert_element_from_token(parser, token);
3403
+ return;
3404
+ }
3405
+ in_body_any_other_end_tag(parser, token);
3406
+ }
3407
+
3408
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incdata
3409
+ static void handle_text(GumboParser* parser, GumboToken* token) {
3410
+ if (
3411
+ token->type == GUMBO_TOKEN_CHARACTER
3412
+ || token->type == GUMBO_TOKEN_WHITESPACE
3413
+ ) {
3414
+ insert_text_token(parser, token);
3415
+ return;
3416
+ }
3417
+ // We provide only bare-bones script handling that doesn't involve any of
3418
+ // the parser-pause/already-started/script-nesting flags or re-entrant
3419
+ // invocations of the tokenizer. Because the intended usage of this library
3420
+ // is mostly for templating, refactoring, and static-analysis libraries, we
3421
+ // provide the script body as a text-node child of the <script> element.
3422
+ // This behavior doesn't support document.write of partial HTML elements,
3423
+ // but should be adequate for almost all other scripting support.
3424
+ if (token->type == GUMBO_TOKEN_EOF) {
3425
+ parser_add_parse_error(parser, token);
3426
+ parser->_parser_state->_reprocess_current_token = true;
3427
+ }
3428
+ pop_current_node(parser);
3429
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3430
+ }
3431
+
3432
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intable
3433
+ static void handle_in_table(GumboParser* parser, GumboToken* token) {
3434
+ GumboParserState* state = parser->_parser_state;
3435
+ if (
3436
+ (token->type == GUMBO_TOKEN_CHARACTER
3437
+ || token->type == GUMBO_TOKEN_WHITESPACE
3438
+ || token->type == GUMBO_TOKEN_NULL)
3439
+ && node_tag_in_set(get_current_node(parser), &(const TagSet) {
3440
+ TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR)
3441
+ })
3442
+ ) {
3443
+ // The "pending table character tokens" list described in the spec is
3444
+ // nothing more than the TextNodeBufferState. We accumulate text tokens as
3445
+ // normal, except that when we go to flush them in the handle_in_table_text,
3446
+ // we set _foster_parent_insertions if there're non-whitespace characters in
3447
+ // the buffer.
3448
+ assert(state->_text_node._buffer.length == 0);
3449
+ assert(state->_table_character_tokens.length == 0);
3450
+ state->_original_insertion_mode = state->_insertion_mode;
3451
+ state->_reprocess_current_token = true;
3452
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3453
+ return;
3454
+ }
3455
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3456
+ append_comment_node(parser, get_current_node(parser), token);
3457
+ return;
3458
+ }
3459
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3460
+ parser_add_parse_error(parser, token);
3461
+ ignore_token(parser);
3462
+ return;
3463
+ }
3464
+ if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3465
+ clear_stack_to_table_context(parser);
3466
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3467
+ insert_element_from_token(parser, token);
3468
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3469
+ return;
3470
+ }
3471
+ if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3472
+ clear_stack_to_table_context(parser);
3473
+ insert_element_from_token(parser, token);
3474
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3475
+ return;
3476
+ }
3477
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3478
+ clear_stack_to_table_context(parser);
3479
+ insert_element_of_tag_type (
3480
+ parser,
3481
+ GUMBO_TAG_COLGROUP,
3482
+ GUMBO_INSERTION_IMPLIED
3483
+ );
3484
+ state->_reprocess_current_token = true;
3485
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3486
+ return;
3487
+ }
3488
+ if (
3489
+ tag_in(token, kStartTag, &(const TagSet) {
3490
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)
3491
+ })
3492
+ ) {
3493
+ clear_stack_to_table_context(parser);
3494
+ insert_element_from_token(parser, token);
3495
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3496
+ return;
3497
+ }
3498
+ if (
3499
+ tag_in(token, kStartTag, &(const TagSet) {
3500
+ TAG(TD), TAG(TH), TAG(TR)
3501
+ })
3502
+ ) {
3503
+ clear_stack_to_table_context(parser);
3504
+ insert_element_of_tag_type (
3505
+ parser,
3506
+ GUMBO_TAG_TBODY,
3507
+ GUMBO_INSERTION_IMPLIED
3508
+ );
3509
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3510
+ state->_reprocess_current_token = true;
3511
+ return;
3512
+ }
3513
+ if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3514
+ parser_add_parse_error(parser, token);
3515
+ if (close_table(parser)) {
3516
+ state->_reprocess_current_token = true;
3517
+ } else {
3518
+ ignore_token(parser);
3519
+ }
3520
+ return;
3521
+ }
3522
+ if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3523
+ if (!close_table(parser)) {
3524
+ parser_add_parse_error(parser, token);
3525
+ return;
3526
+ }
3527
+ return;
3528
+ }
3529
+ if (
3530
+ tag_in(token, kEndTag, &(const TagSet) {
3531
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3532
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3533
+ })
3534
+ ) {
3535
+ parser_add_parse_error(parser, token);
3536
+ ignore_token(parser);
3537
+ return;
3538
+ }
3539
+ if (
3540
+ tag_in(token, kStartTag, &(const TagSet){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)})
3541
+ || (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))
3542
+ ) {
3543
+ handle_in_head(parser, token);
3544
+ return;
3545
+ }
3546
+ if (
3547
+ tag_is(token, kStartTag, GUMBO_TAG_INPUT)
3548
+ && attribute_matches(&token->v.start_tag.attributes, "type", "hidden")
3549
+ ) {
3550
+ parser_add_parse_error(parser, token);
3551
+ insert_element_from_token(parser, token);
3552
+ pop_current_node(parser);
3553
+ acknowledge_self_closing_tag(parser);
3554
+ return;
3555
+ }
3556
+ if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3557
+ parser_add_parse_error(parser, token);
3558
+ if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3559
+ ignore_token(parser);
3560
+ return;
3561
+ }
3562
+ state->_form_element = insert_element_from_token(parser, token);
3563
+ pop_current_node(parser);
3564
+ return;
3565
+ }
3566
+ if (token->type == GUMBO_TOKEN_EOF) {
3567
+ handle_in_body(parser, token);
3568
+ return;
3569
+ }
3570
+ // foster-parenting-start-tag or foster-parenting-end-tag error
3571
+ parser_add_parse_error(parser, token);
3572
+ state->_foster_parent_insertions = true;
3573
+ handle_in_body(parser, token);
3574
+ state->_foster_parent_insertions = false;
3575
+ }
3576
+
3577
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intabletext
3578
+ static void handle_in_table_text(GumboParser* parser, GumboToken* token) {
3579
+ if (token->type == GUMBO_TOKEN_NULL) {
3580
+ parser_add_parse_error(parser, token);
3581
+ ignore_token(parser);
3582
+ return;
3583
+ }
3584
+ GumboParserState* state = parser->_parser_state;
3585
+ // Non-whitespace tokens will cause parse errors later.
3586
+ // It's not entirely clear from the spec how this is supposed to work.
3587
+ // https://github.com/whatwg/html/issues/4046
3588
+ if (token->type == GUMBO_TOKEN_WHITESPACE
3589
+ || token->type == GUMBO_TOKEN_CHARACTER) {
3590
+ insert_text_token(parser, token);
3591
+ gumbo_character_token_buffer_append(token, &state->_table_character_tokens);
3592
+ return;
3593
+ }
3594
+
3595
+ GumboCharacterTokenBuffer* buffer = &state->_table_character_tokens;
3596
+ if (state->_text_node._type != GUMBO_NODE_WHITESPACE) {
3597
+ // Each character in buffer is an error. Unfortunately, that means we need
3598
+ // to emit a bunch of errors at the appropriate locations.
3599
+ for (size_t i = 0, n = buffer->length; i < n; ++i) {
3600
+ GumboToken tok;
3601
+ gumbo_character_token_buffer_get(buffer, i, &tok);
3602
+ // foster-parenting-character error
3603
+ parser_add_parse_error(parser, &tok);
3604
+ }
3605
+ state->_foster_parent_insertions = true;
3606
+ set_frameset_not_ok(parser);
3607
+ reconstruct_active_formatting_elements(parser);
3608
+ }
3609
+ maybe_flush_text_node_buffer(parser);
3610
+ gumbo_character_token_buffer_clear(buffer);
3611
+ state->_foster_parent_insertions = false;
3612
+ state->_reprocess_current_token = true;
3613
+ state->_insertion_mode = state->_original_insertion_mode;
3614
+ }
3615
+
3616
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incaption
3617
+ static void handle_in_caption(GumboParser* parser, GumboToken* token) {
3618
+ if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3619
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3620
+ parser_add_parse_error(parser, token);
3621
+ ignore_token(parser);
3622
+ return;
3623
+ }
3624
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3625
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
3626
+ parser_add_parse_error(parser, token);
3627
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3628
+ ;
3629
+ clear_active_formatting_elements(parser);
3630
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3631
+ return;
3632
+ }
3633
+ if (
3634
+ tag_in(token, kStartTag, &(const TagSet) {
3635
+ TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3636
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3637
+ })
3638
+ || (tag_is(token, kEndTag, GUMBO_TAG_TABLE))
3639
+ ) {
3640
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3641
+ parser_add_parse_error(parser, token);
3642
+ ignore_token(parser);
3643
+ return;
3644
+ }
3645
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
3646
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION))
3647
+ parser_add_parse_error(parser, token);
3648
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3649
+ ;
3650
+ clear_active_formatting_elements(parser);
3651
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3652
+ parser->_parser_state->_reprocess_current_token = true;
3653
+ return;
3654
+ }
3655
+ if (
3656
+ tag_in(token, kEndTag, &(const TagSet) {
3657
+ TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD),
3658
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3659
+ })
3660
+ ) {
3661
+ parser_add_parse_error(parser, token);
3662
+ ignore_token(parser);
3663
+ return;
3664
+ }
3665
+ handle_in_body(parser, token);
3666
+ }
3667
+
3668
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-incolgroup
3669
+ static void handle_in_column_group(GumboParser* parser, GumboToken* token) {
3670
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3671
+ insert_text_token(parser, token);
3672
+ return;
3673
+ }
3674
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3675
+ append_comment_node(parser, get_current_node(parser), token);
3676
+ return;
3677
+ }
3678
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3679
+ parser_add_parse_error(parser, token);
3680
+ ignore_token(parser);
3681
+ return;
3682
+ }
3683
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3684
+ handle_in_body(parser, token);
3685
+ return;
3686
+ }
3687
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3688
+ insert_element_from_token(parser, token);
3689
+ pop_current_node(parser);
3690
+ acknowledge_self_closing_tag(parser);
3691
+ return;
3692
+ }
3693
+ if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3694
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3695
+ parser_add_parse_error(parser, token);
3696
+ ignore_token(parser);
3697
+ return;
3698
+ }
3699
+ pop_current_node(parser);
3700
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3701
+ return;
3702
+ }
3703
+ if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3704
+ parser_add_parse_error(parser, token);
3705
+ ignore_token(parser);
3706
+ return;
3707
+ }
3708
+ if (
3709
+ tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)
3710
+ || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
3711
+ ) {
3712
+ handle_in_head(parser, token);
3713
+ return;
3714
+ }
3715
+ if (token->type == GUMBO_TOKEN_EOF) {
3716
+ handle_in_body(parser, token);
3717
+ return;
3718
+ }
3719
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3720
+ parser_add_parse_error(parser, token);
3721
+ ignore_token(parser);
3722
+ return;
3723
+ }
3724
+ pop_current_node(parser);
3725
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3726
+ parser->_parser_state->_reprocess_current_token = true;
3727
+ }
3728
+
3729
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intbody
3730
+ static void handle_in_table_body(GumboParser* parser, GumboToken* token) {
3731
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3732
+ clear_stack_to_table_body_context(parser);
3733
+ insert_element_from_token(parser, token);
3734
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3735
+ return;
3736
+ }
3737
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3738
+ parser_add_parse_error(parser, token);
3739
+ clear_stack_to_table_body_context(parser);
3740
+ insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3741
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3742
+ parser->_parser_state->_reprocess_current_token = true;
3743
+ return;
3744
+ }
3745
+ if (
3746
+ tag_in(token, kEndTag, &(const TagSet){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3747
+ ) {
3748
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3749
+ parser_add_parse_error(parser, token);
3750
+ ignore_token(parser);
3751
+ return;
3752
+ }
3753
+ clear_stack_to_table_body_context(parser);
3754
+ pop_current_node(parser);
3755
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3756
+ return;
3757
+ }
3758
+ if (
3759
+ tag_in(token, kStartTag, &(const TagSet) {
3760
+ TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3761
+ TAG(THEAD)
3762
+ })
3763
+ || tag_is(token, kEndTag, GUMBO_TAG_TABLE)
3764
+ ) {
3765
+ if (
3766
+ !(
3767
+ has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY)
3768
+ || has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD)
3769
+ || has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT)
3770
+ )
3771
+ ) {
3772
+ parser_add_parse_error(parser, token);
3773
+ ignore_token(parser);
3774
+ return;
3775
+ }
3776
+ clear_stack_to_table_body_context(parser);
3777
+ pop_current_node(parser);
3778
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3779
+ parser->_parser_state->_reprocess_current_token = true;
3780
+ return;
3781
+ }
3782
+ if (
3783
+ tag_in(token, kEndTag, &(const TagSet) {
3784
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML), TAG(TD),
3785
+ TAG(TH), TAG(TR)
3786
+ })
3787
+ ) {
3788
+ parser_add_parse_error(parser, token);
3789
+ ignore_token(parser);
3790
+ return;
3791
+ }
3792
+ handle_in_table(parser, token);
3793
+ }
3794
+
3795
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intr
3796
+ static void handle_in_row(GumboParser* parser, GumboToken* token) {
3797
+ if (tag_in(token, kStartTag, &td_th_tags)) {
3798
+ clear_stack_to_table_row_context(parser);
3799
+ insert_element_from_token(parser, token);
3800
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3801
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3802
+ return;
3803
+ }
3804
+ if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3805
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3806
+ parser_add_parse_error(parser, token);
3807
+ ignore_token(parser);
3808
+ return;
3809
+ }
3810
+ clear_stack_to_table_row_context(parser);
3811
+ pop_current_node(parser);
3812
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3813
+ return;
3814
+ }
3815
+ if (
3816
+ tag_in(token, kStartTag, &(const TagSet) {
3817
+ TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT),
3818
+ TAG(THEAD), TAG(TR)
3819
+ })
3820
+ || tag_is(token, kEndTag, GUMBO_TAG_TABLE)
3821
+ ) {
3822
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3823
+ parser_add_parse_error(parser, token);
3824
+ ignore_token(parser);
3825
+ return;
3826
+ }
3827
+ clear_stack_to_table_row_context(parser);
3828
+ pop_current_node(parser);
3829
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3830
+ parser->_parser_state->_reprocess_current_token = true;
3831
+ return;
3832
+ }
3833
+ if (
3834
+ tag_in(token, kEndTag, &(const TagSet) {TAG(TBODY), TAG(TFOOT), TAG(THEAD)})
3835
+ ) {
3836
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3837
+ parser_add_parse_error(parser, token);
3838
+ ignore_token(parser);
3839
+ return;
3840
+ }
3841
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3842
+ ignore_token(parser);
3843
+ return;
3844
+ }
3845
+ clear_stack_to_table_row_context(parser);
3846
+ pop_current_node(parser);
3847
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3848
+ parser->_parser_state->_reprocess_current_token = true;
3849
+ return;
3850
+ }
3851
+ if (
3852
+ tag_in(token, kEndTag, &(const TagSet) {
3853
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML),
3854
+ TAG(TD), TAG(TH)
3855
+ })
3856
+ ) {
3857
+ parser_add_parse_error(parser, token);
3858
+ ignore_token(parser);
3859
+ return;
3860
+ }
3861
+ handle_in_table(parser, token);
3862
+ }
3863
+
3864
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intd
3865
+ static void handle_in_cell(GumboParser* parser, GumboToken* token) {
3866
+ if (tag_in(token, kEndTag, &td_th_tags)) {
3867
+ GumboTag token_tag = token->v.end_tag.tag;
3868
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
3869
+ parser_add_parse_error(parser, token);
3870
+ ignore_token(parser);
3871
+ return;
3872
+ }
3873
+ close_table_cell(parser, token, token_tag);
3874
+ return;
3875
+ }
3876
+ if (
3877
+ tag_in(token, kStartTag, &(const TagSet) {
3878
+ TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(TBODY), TAG(TD),
3879
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
3880
+ })
3881
+ ) {
3882
+ gumbo_debug("Handling <td> in cell.\n");
3883
+ if (
3884
+ !has_an_element_in_table_scope(parser, GUMBO_TAG_TH)
3885
+ && !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)
3886
+ ) {
3887
+ gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3888
+ parser_add_parse_error(parser, token);
3889
+ ignore_token(parser);
3890
+ return;
3891
+ }
3892
+ parser->_parser_state->_reprocess_current_token = true;
3893
+ close_current_cell(parser, token);
3894
+ return;
3895
+ }
3896
+ if (
3897
+ tag_in(token, kEndTag, &(const TagSet) {
3898
+ TAG(BODY), TAG(CAPTION), TAG(COL), TAG(COLGROUP), TAG(HTML)
3899
+ })
3900
+ ) {
3901
+ parser_add_parse_error(parser, token);
3902
+ ignore_token(parser);
3903
+ return;
3904
+ }
3905
+ if (
3906
+ tag_in(token, kEndTag, &(const TagSet) {
3907
+ TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
3908
+ })
3909
+ ) {
3910
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
3911
+ parser_add_parse_error(parser, token);
3912
+ ignore_token(parser);
3913
+ return;
3914
+ }
3915
+ parser->_parser_state->_reprocess_current_token = true;
3916
+ close_current_cell(parser, token);
3917
+ return;
3918
+ }
3919
+ handle_in_body(parser, token);
3920
+ }
3921
+
3922
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselect
3923
+ static void handle_in_select(GumboParser* parser, GumboToken* token) {
3924
+ if (token->type == GUMBO_TOKEN_NULL) {
3925
+ parser_add_parse_error(parser, token);
3926
+ ignore_token(parser);
3927
+ return;
3928
+ }
3929
+ if (
3930
+ token->type == GUMBO_TOKEN_CHARACTER
3931
+ || token->type == GUMBO_TOKEN_WHITESPACE
3932
+ ) {
3933
+ insert_text_token(parser, token);
3934
+ return;
3935
+ }
3936
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3937
+ append_comment_node(parser, get_current_node(parser), token);
3938
+ return;
3939
+ }
3940
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
3941
+ parser_add_parse_error(parser, token);
3942
+ ignore_token(parser);
3943
+ return;
3944
+ }
3945
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3946
+ handle_in_body(parser, token);
3947
+ return;
3948
+ }
3949
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3950
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3951
+ pop_current_node(parser);
3952
+ }
3953
+ insert_element_from_token(parser, token);
3954
+ return;
3955
+ }
3956
+ if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3957
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3958
+ pop_current_node(parser);
3959
+ }
3960
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3961
+ pop_current_node(parser);
3962
+ }
3963
+ insert_element_from_token(parser, token);
3964
+ return;
3965
+ }
3966
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3967
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
3968
+ if (
3969
+ node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)
3970
+ && node_html_tag_is (
3971
+ open_elements->data[open_elements->length - 2],
3972
+ GUMBO_TAG_OPTGROUP
3973
+ )
3974
+ ) {
3975
+ pop_current_node(parser);
3976
+ }
3977
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3978
+ pop_current_node(parser);
3979
+ return;
3980
+ }
3981
+ parser_add_parse_error(parser, token);
3982
+ ignore_token(parser);
3983
+ return;
3984
+ }
3985
+ if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3986
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3987
+ pop_current_node(parser);
3988
+ return;
3989
+ }
3990
+ parser_add_parse_error(parser, token);
3991
+ ignore_token(parser);
3992
+ return;
3993
+ }
3994
+ if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3995
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3996
+ parser_add_parse_error(parser, token);
3997
+ ignore_token(parser);
3998
+ return;
3999
+ }
4000
+ close_current_select(parser);
4001
+ return;
4002
+ }
4003
+ if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
4004
+ parser_add_parse_error(parser, token);
4005
+ ignore_token(parser);
4006
+ if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
4007
+ close_current_select(parser);
4008
+ }
4009
+ return;
4010
+ }
4011
+ if (
4012
+ tag_in(token, kStartTag, &(const TagSet) {TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})
4013
+ ) {
4014
+ parser_add_parse_error(parser, token);
4015
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
4016
+ ignore_token(parser);
4017
+ } else {
4018
+ close_current_select(parser);
4019
+ parser->_parser_state->_reprocess_current_token = true;
4020
+ }
4021
+ return;
4022
+ }
4023
+ if (
4024
+ tag_in(token, kStartTag, &(const TagSet){TAG(SCRIPT), TAG(TEMPLATE)})
4025
+ || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
4026
+ ) {
4027
+ handle_in_head(parser, token);
4028
+ return;
4029
+ }
4030
+ if (token->type == GUMBO_TOKEN_EOF) {
4031
+ handle_in_body(parser, token);
4032
+ return;
4033
+ }
4034
+ parser_add_parse_error(parser, token);
4035
+ ignore_token(parser);
4036
+ }
4037
+
4038
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inselectintable
4039
+ static void handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
4040
+ static const TagSet tags = {
4041
+ TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
4042
+ TAG(TR), TAG(TD), TAG(TH)
4043
+ };
4044
+ if (tag_in(token, kStartTag, &tags)) {
4045
+ parser_add_parse_error(parser, token);
4046
+ close_current_select(parser);
4047
+ parser->_parser_state->_reprocess_current_token = true;
4048
+ return;
4049
+ }
4050
+ if (tag_in(token, kEndTag, &tags)) {
4051
+ parser_add_parse_error(parser, token);
4052
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag.tag)) {
4053
+ ignore_token(parser);
4054
+ return;
4055
+ }
4056
+ close_current_select(parser);
4057
+ parser->_parser_state->_reprocess_current_token = true;
4058
+ return;
4059
+ }
4060
+ handle_in_select(parser, token);
4061
+ }
4062
+
4063
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-intemplate
4064
+ static void handle_in_template(GumboParser* parser, GumboToken* token) {
4065
+ GumboParserState* state = parser->_parser_state;
4066
+ switch (token->type) {
4067
+ case GUMBO_TOKEN_WHITESPACE:
4068
+ case GUMBO_TOKEN_CHARACTER:
4069
+ case GUMBO_TOKEN_COMMENT:
4070
+ case GUMBO_TOKEN_NULL:
4071
+ case GUMBO_TOKEN_DOCTYPE:
4072
+ handle_in_body(parser, token);
4073
+ return;
4074
+ default:
4075
+ break;
4076
+ }
4077
+ if (
4078
+ tag_in(token, kStartTag, &(const TagSet) {
4079
+ TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(LINK), TAG(META),
4080
+ TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)
4081
+ })
4082
+ || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)
4083
+ ) {
4084
+ handle_in_head(parser, token);
4085
+ return;
4086
+ }
4087
+ if (
4088
+ tag_in(token, kStartTag, &(const TagSet) {
4089
+ TAG(CAPTION), TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD)
4090
+ })
4091
+ ) {
4092
+ pop_template_insertion_mode(parser);
4093
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
4094
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
4095
+ state->_reprocess_current_token = true;
4096
+ return;
4097
+ }
4098
+ if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
4099
+ pop_template_insertion_mode(parser);
4100
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
4101
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
4102
+ state->_reprocess_current_token = true;
4103
+ return;
4104
+ }
4105
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
4106
+ pop_template_insertion_mode(parser);
4107
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
4108
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
4109
+ state->_reprocess_current_token = true;
4110
+ return;
4111
+ }
4112
+ if (tag_in(token, kStartTag, &td_th_tags)) {
4113
+ pop_template_insertion_mode(parser);
4114
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4115
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
4116
+ state->_reprocess_current_token = true;
4117
+ return;
4118
+ }
4119
+ if (token->type == GUMBO_TOKEN_START_TAG) {
4120
+ pop_template_insertion_mode(parser);
4121
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4122
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4123
+ state->_reprocess_current_token = true;
4124
+ return;
4125
+ }
4126
+ if (token->type == GUMBO_TOKEN_END_TAG) {
4127
+ parser_add_parse_error(parser, token);
4128
+ ignore_token(parser);
4129
+ return;
4130
+ }
4131
+ if (token->type == GUMBO_TOKEN_EOF) {
4132
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
4133
+ // Stop parsing.
4134
+ return;
4135
+ }
4136
+ parser_add_parse_error(parser, token);
4137
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
4138
+ ;
4139
+ clear_active_formatting_elements(parser);
4140
+ pop_template_insertion_mode(parser);
4141
+ reset_insertion_mode_appropriately(parser);
4142
+ state->_reprocess_current_token = true;
4143
+ return;
4144
+ }
4145
+ assert(0 && "unreachable");
4146
+ }
4147
+
4148
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
4149
+ static void handle_after_body(GumboParser* parser, GumboToken* token) {
4150
+ if (
4151
+ token->type == GUMBO_TOKEN_WHITESPACE
4152
+ || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4153
+ ) {
4154
+ handle_in_body(parser, token);
4155
+ return;
4156
+ }
4157
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4158
+ GumboNode* html_node = parser->_output->root;
4159
+ assert(html_node != NULL);
4160
+ append_comment_node(parser, html_node, token);
4161
+ return;
4162
+ }
4163
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4164
+ parser_add_parse_error(parser, token);
4165
+ ignore_token(parser);
4166
+ return;
4167
+ }
4168
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4169
+ handle_in_body(parser, token);
4170
+ return;
4171
+ }
4172
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4173
+ /* fragment case: ignore the closing HTML token */
4174
+ if (is_fragment_parser(parser)) {
4175
+ parser_add_parse_error(parser, token);
4176
+ ignore_token(parser);
4177
+ return;
4178
+ }
4179
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
4180
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
4181
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
4182
+ record_end_of_element (
4183
+ parser->_parser_state->_current_token,
4184
+ &html->v.element
4185
+ );
4186
+ return;
4187
+ }
4188
+ if (token->type == GUMBO_TOKEN_EOF) {
4189
+ return;
4190
+ }
4191
+ parser_add_parse_error(parser, token);
4192
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4193
+ parser->_parser_state->_reprocess_current_token = true;
4194
+ }
4195
+
4196
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inframeset
4197
+ static void handle_in_frameset(GumboParser* parser, GumboToken* token) {
4198
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
4199
+ insert_text_token(parser, token);
4200
+ return;
4201
+ }
4202
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4203
+ append_comment_node(parser, get_current_node(parser), token);
4204
+ return;
4205
+ }
4206
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4207
+ parser_add_parse_error(parser, token);
4208
+ ignore_token(parser);
4209
+ return;
4210
+ }
4211
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4212
+ handle_in_body(parser, token);
4213
+ return;
4214
+ }
4215
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
4216
+ insert_element_from_token(parser, token);
4217
+ return;
4218
+ }
4219
+ if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
4220
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
4221
+ parser_add_parse_error(parser, token);
4222
+ ignore_token(parser);
4223
+ return;
4224
+ }
4225
+ pop_current_node(parser);
4226
+ if (
4227
+ !is_fragment_parser(parser)
4228
+ && !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)
4229
+ ) {
4230
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
4231
+ }
4232
+ return;
4233
+ }
4234
+ if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
4235
+ insert_element_from_token(parser, token);
4236
+ pop_current_node(parser);
4237
+ acknowledge_self_closing_tag(parser);
4238
+ return;
4239
+ }
4240
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4241
+ handle_in_head(parser, token);
4242
+ return;
4243
+ }
4244
+ if (token->type == GUMBO_TOKEN_EOF) {
4245
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML))
4246
+ parser_add_parse_error(parser, token);
4247
+ return;
4248
+ }
4249
+ parser_add_parse_error(parser, token);
4250
+ ignore_token(parser);
4251
+ }
4252
+
4253
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterframeset
4254
+ static void handle_after_frameset(GumboParser* parser, GumboToken* token) {
4255
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
4256
+ insert_text_token(parser, token);
4257
+ return;
4258
+ }
4259
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4260
+ append_comment_node(parser, get_current_node(parser), token);
4261
+ return;
4262
+ }
4263
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
4264
+ parser_add_parse_error(parser, token);
4265
+ ignore_token(parser);
4266
+ return;
4267
+ }
4268
+ if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
4269
+ handle_in_body(parser, token);
4270
+ return;
4271
+ }
4272
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4273
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
4274
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
4275
+ record_end_of_element (
4276
+ parser->_parser_state->_current_token,
4277
+ &html->v.element
4278
+ );
4279
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
4280
+ return;
4281
+ }
4282
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4283
+ return handle_in_head(parser, token);
4284
+ }
4285
+ if (token->type == GUMBO_TOKEN_EOF) {
4286
+ return;
4287
+ }
4288
+ parser_add_parse_error(parser, token);
4289
+ ignore_token(parser);
4290
+ }
4291
+
4292
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-body-insertion-mode
4293
+ static void handle_after_after_body(GumboParser* parser, GumboToken* token) {
4294
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4295
+ append_comment_node(parser, get_document_node(parser), token);
4296
+ return;
4297
+ }
4298
+ if (
4299
+ token->type == GUMBO_TOKEN_DOCTYPE
4300
+ || token->type == GUMBO_TOKEN_WHITESPACE
4301
+ || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4302
+ ) {
4303
+ handle_in_body(parser, token);
4304
+ return;
4305
+ }
4306
+ if (token->type == GUMBO_TOKEN_EOF) {
4307
+ return;
4308
+ }
4309
+ parser_add_parse_error(parser, token);
4310
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
4311
+ parser->_parser_state->_reprocess_current_token = true;
4312
+ }
4313
+
4314
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-after-after-frameset-insertion-mode
4315
+ static void handle_after_after_frameset (
4316
+ GumboParser* parser,
4317
+ GumboToken* token
4318
+ ) {
4319
+ if (token->type == GUMBO_TOKEN_COMMENT) {
4320
+ append_comment_node(parser, get_document_node(parser), token);
4321
+ return;
4322
+ }
4323
+ if (
4324
+ token->type == GUMBO_TOKEN_DOCTYPE
4325
+ || token->type == GUMBO_TOKEN_WHITESPACE
4326
+ || tag_is(token, kStartTag, GUMBO_TAG_HTML)
4327
+ ) {
4328
+ handle_in_body(parser, token);
4329
+ return;
4330
+ }
4331
+ if (token->type == GUMBO_TOKEN_EOF) {
4332
+ return;
4333
+ }
4334
+ if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
4335
+ handle_in_head(parser, token);
4336
+ return;
4337
+ }
4338
+ parser_add_parse_error(parser, token);
4339
+ ignore_token(parser);
4340
+ }
4341
+
4342
+ // Function pointers for each insertion mode.
4343
+ // Keep in sync with insertion_mode.h.
4344
+ typedef void (*TokenHandler)(GumboParser* parser, GumboToken* token);
4345
+ static const TokenHandler kTokenHandlers[] = {
4346
+ handle_initial,
4347
+ handle_before_html,
4348
+ handle_before_head,
4349
+ handle_in_head,
4350
+ handle_in_head_noscript,
4351
+ handle_after_head,
4352
+ handle_in_body,
4353
+ handle_text,
4354
+ handle_in_table,
4355
+ handle_in_table_text,
4356
+ handle_in_caption,
4357
+ handle_in_column_group,
4358
+ handle_in_table_body,
4359
+ handle_in_row,
4360
+ handle_in_cell,
4361
+ handle_in_select,
4362
+ handle_in_select_in_table,
4363
+ handle_in_template,
4364
+ handle_after_body,
4365
+ handle_in_frameset,
4366
+ handle_after_frameset,
4367
+ handle_after_after_body,
4368
+ handle_after_after_frameset
4369
+ };
4370
+
4371
+ static void handle_html_content(GumboParser* parser, GumboToken* token) {
4372
+ const GumboInsertionMode mode = parser->_parser_state->_insertion_mode;
4373
+ const TokenHandler handler = kTokenHandlers[mode];
4374
+ handler(parser, token);
4375
+ }
4376
+
4377
+ // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
4378
+ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
4379
+ gumbo_debug("Handling foreign content");
4380
+ switch (token->type) {
4381
+ case GUMBO_TOKEN_NULL:
4382
+ parser_add_parse_error(parser, token);
4383
+ token->v.character = kUtf8ReplacementChar;
4384
+ insert_text_token(parser, token);
4385
+ return;
4386
+ case GUMBO_TOKEN_WHITESPACE:
4387
+ insert_text_token(parser, token);
4388
+ return;
4389
+ case GUMBO_TOKEN_CDATA:
4390
+ case GUMBO_TOKEN_CHARACTER:
4391
+ insert_text_token(parser, token);
4392
+ set_frameset_not_ok(parser);
4393
+ return;
4394
+ case GUMBO_TOKEN_COMMENT:
4395
+ append_comment_node(parser, get_current_node(parser), token);
4396
+ return;
4397
+ case GUMBO_TOKEN_DOCTYPE:
4398
+ parser_add_parse_error(parser, token);
4399
+ ignore_token(parser);
4400
+ return;
4401
+ default:
4402
+ // Fall through to the if-statements below.
4403
+ break;
4404
+ }
4405
+ // Order matters for these clauses.
4406
+ if (
4407
+ tag_in(token, kStartTag, &(const TagSet) {
4408
+ TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR), TAG(CENTER),
4409
+ TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT), TAG(EM), TAG(EMBED),
4410
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6), TAG(HEAD),
4411
+ TAG(HR), TAG(I), TAG(IMG), TAG(LI), TAG(LISTING), TAG(MENU), TAG(META),
4412
+ TAG(NOBR), TAG(OL), TAG(P), TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL),
4413
+ TAG(SPAN), TAG(STRONG), TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE),
4414
+ TAG(TT), TAG(U), TAG(UL), TAG(VAR)
4415
+ })
4416
+ || (
4417
+ tag_is(token, kStartTag, GUMBO_TAG_FONT)
4418
+ && (
4419
+ token_has_attribute(token, "color")
4420
+ || token_has_attribute(token, "face")
4421
+ || token_has_attribute(token, "size")
4422
+ )
4423
+ )
4424
+ || tag_in(token, kEndTag, &(const TagSet) { TAG(BR), TAG(P) })
4425
+ ) {
4426
+ /* Parse error */
4427
+ parser_add_parse_error(parser, token);
4428
+
4429
+ while (
4430
+ !(
4431
+ is_mathml_integration_point(get_current_node(parser))
4432
+ || is_html_integration_point(get_current_node(parser))
4433
+ || get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
4434
+ )
4435
+ ) {
4436
+ pop_current_node(parser);
4437
+ }
4438
+ handle_html_content(parser, token);
4439
+ return;
4440
+ }
4441
+
4442
+ if (token->type == GUMBO_TOKEN_START_TAG) {
4443
+ const GumboNamespaceEnum current_namespace =
4444
+ get_adjusted_current_node(parser)->v.element.tag_namespace;
4445
+ if (current_namespace == GUMBO_NAMESPACE_MATHML) {
4446
+ adjust_mathml_attributes(token);
4447
+ }
4448
+ if (current_namespace == GUMBO_NAMESPACE_SVG) {
4449
+ adjust_svg_tag(token);
4450
+ adjust_svg_attributes(token);
4451
+ }
4452
+ adjust_foreign_attributes(token);
4453
+ insert_foreign_element(parser, token, current_namespace);
4454
+ if (token->v.start_tag.is_self_closing) {
4455
+ pop_current_node(parser);
4456
+ acknowledge_self_closing_tag(parser);
4457
+ }
4458
+ return;
4459
+ // </script> tags are handled like any other end tag, putting the script's
4460
+ // text into a text node child and closing the current node.
4461
+ }
4462
+ assert(token->type == GUMBO_TOKEN_END_TAG);
4463
+ GumboNode* node = get_current_node(parser);
4464
+ GumboTag tag = token->v.end_tag.tag;
4465
+ const char* name = token->v.end_tag.name;
4466
+ assert(node != NULL);
4467
+
4468
+ if (!node_tagname_is(node, tag, name))
4469
+ parser_add_parse_error(parser, token);
4470
+ int i = parser->_parser_state->_open_elements.length;
4471
+ for (--i; i > 0;) {
4472
+ // Here we move up the stack until we find an HTML element (in which
4473
+ // case we do nothing) or we find the element that we're about to
4474
+ // close (in which case we pop everything we've seen until that
4475
+ // point.)
4476
+ gumbo_debug("Foreign %s node at %d.\n", node->v.element.name, i);
4477
+ if (node_tagname_is(node, tag, name)) {
4478
+ gumbo_debug("Matches.\n");
4479
+ while (node != pop_current_node(parser)) {
4480
+ // Pop all the nodes below the current one. Node is guaranteed to
4481
+ // be an element on the stack of open elements (set below), so
4482
+ // this loop is guaranteed to terminate.
4483
+ }
4484
+ return;
4485
+ }
4486
+ --i;
4487
+ node = parser->_parser_state->_open_elements.data[i];
4488
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
4489
+ // The loop continues only in foreign namespaces.
4490
+ break;
4491
+ }
4492
+ }
4493
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
4494
+ if (i == 0)
4495
+ return;
4496
+ // We can't call handle_token directly because the current node is still in
4497
+ // a foriegn namespace, so it would re-enter this and result in infinite
4498
+ // recursion.
4499
+ handle_html_content(parser, token);
4500
+ }
4501
+
4502
+ // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
4503
+ static void handle_token(GumboParser* parser, GumboToken* token) {
4504
+ if (
4505
+ parser->_parser_state->_ignore_next_linefeed
4506
+ && token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n'
4507
+ ) {
4508
+ parser->_parser_state->_ignore_next_linefeed = false;
4509
+ ignore_token(parser);
4510
+ return;
4511
+ }
4512
+ // This needs to be reset both here and in the conditional above to catch both
4513
+ // the case where the next token is not whitespace (so we don't ignore
4514
+ // whitespace in the middle of <pre> tags) and where there are multiple
4515
+ // whitespace tokens (so we don't ignore the second one).
4516
+ parser->_parser_state->_ignore_next_linefeed = false;
4517
+
4518
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
4519
+ parser->_parser_state->_closed_body_tag = true;
4520
+ }
4521
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
4522
+ parser->_parser_state->_closed_html_tag = true;
4523
+ }
4524
+
4525
+ const GumboNode* current_node = get_adjusted_current_node(parser);
4526
+ assert (
4527
+ !current_node
4528
+ || current_node->type == GUMBO_NODE_ELEMENT
4529
+ || current_node->type == GUMBO_NODE_TEMPLATE
4530
+ );
4531
+ if (current_node)
4532
+ gumbo_debug("Current node: <%s>.\n", current_node->v.element.name);
4533
+ if (!current_node ||
4534
+ current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
4535
+ (is_mathml_integration_point(current_node) &&
4536
+ (token->type == GUMBO_TOKEN_CHARACTER ||
4537
+ token->type == GUMBO_TOKEN_WHITESPACE ||
4538
+ token->type == GUMBO_TOKEN_NULL ||
4539
+ (token->type == GUMBO_TOKEN_START_TAG &&
4540
+ !tag_in(token, kStartTag,
4541
+ &(const TagSet){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
4542
+ (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
4543
+ node_qualified_tag_is(
4544
+ current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
4545
+ tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
4546
+ (is_html_integration_point(current_node) &&
4547
+ (token->type == GUMBO_TOKEN_START_TAG ||
4548
+ token->type == GUMBO_TOKEN_CHARACTER ||
4549
+ token->type == GUMBO_TOKEN_NULL ||
4550
+ token->type == GUMBO_TOKEN_WHITESPACE)) ||
4551
+ token->type == GUMBO_TOKEN_EOF) {
4552
+ handle_html_content(parser, token);
4553
+ } else {
4554
+ handle_in_foreign_content(parser, token);
4555
+ }
4556
+ }
4557
+
4558
+ static GumboNode* create_fragment_ctx_element (
4559
+ const char* tag_name,
4560
+ GumboNamespaceEnum ns,
4561
+ const char* encoding
4562
+ ) {
4563
+ assert(tag_name);
4564
+ GumboTag tag = gumbo_tagn_enum(tag_name, strlen(tag_name));
4565
+ GumboNodeType type =
4566
+ ns == GUMBO_NAMESPACE_HTML && tag == GUMBO_TAG_TEMPLATE
4567
+ ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT;
4568
+ GumboNode* node = create_node(type);
4569
+ GumboElement* element = &node->v.element;
4570
+ element->children = kGumboEmptyVector;
4571
+ if (encoding) {
4572
+ gumbo_vector_init(1, &element->attributes);
4573
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
4574
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
4575
+ attr->name = "encoding"; // Do not free this!
4576
+ attr->original_name = kGumboEmptyString;
4577
+ attr->value = encoding; // Do not free this!
4578
+ attr->original_value = kGumboEmptyString;
4579
+ attr->name_start = kGumboEmptySourcePosition;
4580
+ gumbo_vector_add(attr, &element->attributes);
4581
+ } else {
4582
+ element->attributes = kGumboEmptyVector;
4583
+ }
4584
+ element->tag = tag;
4585
+ element->tag_namespace = ns;
4586
+ element->name = tag_name; // Do not free this!
4587
+ element->original_tag = kGumboEmptyString;
4588
+ element->original_end_tag = kGumboEmptyString;
4589
+ element->start_pos = kGumboEmptySourcePosition;
4590
+ element->end_pos = kGumboEmptySourcePosition;
4591
+ return node;
4592
+ }
4593
+
4594
+ static void destroy_fragment_ctx_element(GumboNode* ctx) {
4595
+ assert(ctx->type == GUMBO_NODE_ELEMENT || ctx->type == GUMBO_NODE_TEMPLATE);
4596
+ GumboElement* element = &ctx->v.element;
4597
+ element->name = NULL; // Do not free.
4598
+ if (element->attributes.length > 0) {
4599
+ assert(element->attributes.length == 1);
4600
+ GumboAttribute* attr = gumbo_vector_pop(&element->attributes);
4601
+ // Do not free attr->name or attr->value, just free the attr.
4602
+ gumbo_free(attr);
4603
+ }
4604
+ destroy_node(ctx);
4605
+ }
4606
+
4607
+ static void fragment_parser_init (
4608
+ GumboParser* parser,
4609
+ const GumboOptions* options
4610
+ ) {
4611
+ assert(options->fragment_context != NULL);
4612
+ const char* fragment_ctx = options->fragment_context;
4613
+ GumboNamespaceEnum fragment_namespace = options->fragment_namespace;
4614
+ const char* fragment_encoding = options->fragment_encoding;
4615
+ GumboQuirksModeEnum quirks = options->quirks_mode;
4616
+ bool ctx_has_form_ancestor = options->fragment_context_has_form_ancestor;
4617
+
4618
+ GumboNode* root;
4619
+ // 2.
4620
+ get_document_node(parser)->v.document.doc_type_quirks_mode = quirks;
4621
+
4622
+ // 3.
4623
+ parser->_parser_state->_fragment_ctx =
4624
+ create_fragment_ctx_element(fragment_ctx, fragment_namespace, fragment_encoding);
4625
+ GumboTag ctx_tag = parser->_parser_state->_fragment_ctx->v.element.tag;
4626
+
4627
+ // 4.
4628
+ if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
4629
+ // Non-HTML namespaces always start in the DATA state.
4630
+ switch (ctx_tag) {
4631
+ case GUMBO_TAG_TITLE:
4632
+ case GUMBO_TAG_TEXTAREA:
4633
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
4634
+ break;
4635
+
4636
+ case GUMBO_TAG_STYLE:
4637
+ case GUMBO_TAG_XMP:
4638
+ case GUMBO_TAG_IFRAME:
4639
+ case GUMBO_TAG_NOEMBED:
4640
+ case GUMBO_TAG_NOFRAMES:
4641
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4642
+ break;
4643
+
4644
+ case GUMBO_TAG_SCRIPT:
4645
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
4646
+ break;
4647
+
4648
+ case GUMBO_TAG_NOSCRIPT:
4649
+ /* scripting is disabled in Gumbo, so leave the tokenizer
4650
+ * in the default data state */
4651
+ break;
4652
+
4653
+ case GUMBO_TAG_PLAINTEXT:
4654
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
4655
+ break;
4656
+
4657
+ default:
4658
+ /* default data state */
4659
+ break;
4660
+ }
4661
+ }
4662
+
4663
+ // 5. 6. 7.
4664
+ root = insert_element_of_tag_type (
4665
+ parser,
4666
+ GUMBO_TAG_HTML,
4667
+ GUMBO_INSERTION_IMPLIED
4668
+ );
4669
+ parser->_output->root = root;
4670
+
4671
+ // 8.
4672
+ if (ctx_tag == GUMBO_TAG_TEMPLATE) {
4673
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
4674
+ }
4675
+
4676
+ // 10.
4677
+ reset_insertion_mode_appropriately(parser);
4678
+
4679
+ // 11.
4680
+ if (ctx_has_form_ancestor
4681
+ || (ctx_tag == GUMBO_TAG_FORM
4682
+ && fragment_namespace == GUMBO_NAMESPACE_HTML)) {
4683
+ static const GumboNode form_ancestor = {
4684
+ .type = GUMBO_NODE_ELEMENT,
4685
+ .parent = NULL,
4686
+ .index_within_parent = -1,
4687
+ .parse_flags = GUMBO_INSERTION_BY_PARSER,
4688
+ .v.element = {
4689
+ .children = GUMBO_EMPTY_VECTOR_INIT,
4690
+ .tag = GUMBO_TAG_FORM,
4691
+ .name = NULL,
4692
+ .tag_namespace = GUMBO_NAMESPACE_HTML,
4693
+ .original_tag = GUMBO_EMPTY_STRING_INIT,
4694
+ .original_end_tag = GUMBO_EMPTY_STRING_INIT,
4695
+ .start_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT,
4696
+ .end_pos = GUMBO_EMPTY_SOURCE_POSITION_INIT,
4697
+ .attributes = GUMBO_EMPTY_VECTOR_INIT,
4698
+ },
4699
+ };
4700
+ // This cast is okay because _form_element is only modified if it is
4701
+ // in the list of open elements. This will never be.
4702
+ parser->_parser_state->_form_element = (GumboNode *)&form_ancestor;
4703
+ }
4704
+ }
4705
+
4706
+ GumboOutput* gumbo_parse(const char* buffer) {
4707
+ return gumbo_parse_with_options (
4708
+ &kGumboDefaultOptions,
4709
+ buffer,
4710
+ strlen(buffer)
4711
+ );
4712
+ }
4713
+
4714
+ GumboOutput* gumbo_parse_with_options (
4715
+ const GumboOptions* options,
4716
+ const char* buffer,
4717
+ size_t length
4718
+ ) {
4719
+ GumboParser parser;
4720
+ parser._options = options;
4721
+ output_init(&parser);
4722
+ gumbo_tokenizer_state_init(&parser, buffer, length);
4723
+ parser_state_init(&parser);
4724
+
4725
+ if (options->fragment_context != NULL)
4726
+ fragment_parser_init(&parser, options);
4727
+
4728
+ GumboParserState* state = parser._parser_state;
4729
+ gumbo_debug (
4730
+ "Parsing %.*s.\n",
4731
+ (int) length,
4732
+ buffer
4733
+ );
4734
+
4735
+ // Sanity check so that infinite loops die with an assertion failure instead
4736
+ // of hanging the process before we ever get an error.
4737
+ uint_fast32_t loop_count = 0;
4738
+
4739
+ const unsigned int max_tree_depth = options->max_tree_depth;
4740
+ GumboToken token;
4741
+
4742
+ do {
4743
+ if (state->_reprocess_current_token) {
4744
+ state->_reprocess_current_token = false;
4745
+ } else {
4746
+ GumboNode* adjusted_current_node = get_adjusted_current_node(&parser);
4747
+ gumbo_tokenizer_set_is_adjusted_current_node_foreign (
4748
+ &parser,
4749
+ adjusted_current_node &&
4750
+ adjusted_current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML
4751
+ );
4752
+ gumbo_lex(&parser, &token);
4753
+ }
4754
+
4755
+ const char* token_type = "text";
4756
+ switch (token.type) {
4757
+ case GUMBO_TOKEN_DOCTYPE:
4758
+ token_type = "doctype";
4759
+ break;
4760
+ case GUMBO_TOKEN_START_TAG:
4761
+ if (token.v.start_tag.tag == GUMBO_TAG_UNKNOWN)
4762
+ token_type = token.v.start_tag.name;
4763
+ else
4764
+ token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4765
+ break;
4766
+ case GUMBO_TOKEN_END_TAG:
4767
+ token_type = gumbo_normalized_tagname(token.v.end_tag.tag);
4768
+ break;
4769
+ case GUMBO_TOKEN_COMMENT:
4770
+ token_type = "comment";
4771
+ break;
4772
+ default:
4773
+ break;
4774
+ }
4775
+ gumbo_debug (
4776
+ "Handling %s token @%lu:%lu in state %u.\n",
4777
+ (char*) token_type,
4778
+ (unsigned long)token.position.line,
4779
+ (unsigned long)token.position.column,
4780
+ state->_insertion_mode
4781
+ );
4782
+
4783
+ state->_current_token = &token;
4784
+ state->_self_closing_flag_acknowledged = false;
4785
+
4786
+ handle_token(&parser, &token);
4787
+
4788
+ // Check for memory leaks when ownership is transferred from start tag
4789
+ // tokens to nodes.
4790
+ assert (
4791
+ state->_reprocess_current_token
4792
+ || token.type != GUMBO_TOKEN_START_TAG
4793
+ || (token.v.start_tag.attributes.data == NULL
4794
+ && token.v.start_tag.name == NULL)
4795
+ );
4796
+
4797
+ if (!state->_reprocess_current_token) {
4798
+ // If we're done with the token, check for unacknowledged self-closing
4799
+ // flags on start tags.
4800
+ if (token.type == GUMBO_TOKEN_START_TAG &&
4801
+ token.v.start_tag.is_self_closing &&
4802
+ !state->_self_closing_flag_acknowledged) {
4803
+ GumboError* error = gumbo_add_error(&parser);
4804
+ if (error) {
4805
+ // This is essentially a tokenizer error that's only caught during
4806
+ // tree construction.
4807
+ error->type = GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS;
4808
+ error->original_text = token.original_text;
4809
+ error->position = token.position;
4810
+ }
4811
+ }
4812
+ // Make sure we free the end tag's name since it doesn't get transferred
4813
+ // to a token.
4814
+ if (token.type == GUMBO_TOKEN_END_TAG &&
4815
+ token.v.end_tag.tag == GUMBO_TAG_UNKNOWN)
4816
+ gumbo_free(token.v.end_tag.name);
4817
+ }
4818
+
4819
+ if (unlikely(state->_open_elements.length > max_tree_depth)) {
4820
+ parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP;
4821
+ gumbo_debug("Tree depth limit exceeded.\n");
4822
+ break;
4823
+ }
4824
+
4825
+ ++loop_count;
4826
+ assert(loop_count < 1000000000UL);
4827
+
4828
+ } while (
4829
+ (token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token)
4830
+ && !(options->stop_on_first_error && parser._output->document_error)
4831
+ );
4832
+
4833
+ finish_parsing(&parser);
4834
+ // For API uniformity reasons, if the doctype still has nulls, convert them to
4835
+ // empty strings.
4836
+ GumboDocument* doc_type = &parser._output->document->v.document;
4837
+ if (doc_type->name == NULL) {
4838
+ doc_type->name = gumbo_strdup("");
4839
+ }
4840
+ if (doc_type->public_identifier == NULL) {
4841
+ doc_type->public_identifier = gumbo_strdup("");
4842
+ }
4843
+ if (doc_type->system_identifier == NULL) {
4844
+ doc_type->system_identifier = gumbo_strdup("");
4845
+ }
4846
+
4847
+ parser_state_destroy(&parser);
4848
+ gumbo_tokenizer_state_destroy(&parser);
4849
+ return parser._output;
4850
+ }
4851
+
4852
+ const char* gumbo_status_to_string(GumboOutputStatus status) {
4853
+ switch (status) {
4854
+ case GUMBO_STATUS_OK:
4855
+ return "OK";
4856
+ case GUMBO_STATUS_OUT_OF_MEMORY:
4857
+ return "System allocator returned NULL during parsing";
4858
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
4859
+ return "Attributes per element limit exceeded";
4860
+ case GUMBO_STATUS_TREE_TOO_DEEP:
4861
+ return "Document tree depth limit exceeded";
4862
+ default:
4863
+ return "Unknown GumboOutputStatus value";
4864
+ }
4865
+ }
4866
+
4867
+ void gumbo_destroy_node(GumboNode* node) {
4868
+ destroy_node(node);
4869
+ }
4870
+
4871
+ void gumbo_destroy_output(GumboOutput* output) {
4872
+ destroy_node(output->document);
4873
+ for (unsigned int i = 0; i < output->errors.length; ++i) {
4874
+ gumbo_error_destroy(output->errors.data[i]);
4875
+ }
4876
+ gumbo_vector_destroy(&output->errors);
4877
+ gumbo_free(output);
4878
+ }