Nokogiri_precompiled_aarch64_dedshit 1.14.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (263) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +44 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +287 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +41 -0
  8. data/ext/java/nokogiri/Html4Document.java +157 -0
  9. data/ext/java/nokogiri/Html4ElementDescription.java +133 -0
  10. data/ext/java/nokogiri/Html4EntityLookup.java +63 -0
  11. data/ext/java/nokogiri/Html4SaxParserContext.java +289 -0
  12. data/ext/java/nokogiri/Html4SaxPushParser.java +213 -0
  13. data/ext/java/nokogiri/NokogiriService.java +613 -0
  14. data/ext/java/nokogiri/XmlAttr.java +154 -0
  15. data/ext/java/nokogiri/XmlAttributeDecl.java +119 -0
  16. data/ext/java/nokogiri/XmlCdata.java +60 -0
  17. data/ext/java/nokogiri/XmlComment.java +77 -0
  18. data/ext/java/nokogiri/XmlDocument.java +705 -0
  19. data/ext/java/nokogiri/XmlDocumentFragment.java +163 -0
  20. data/ext/java/nokogiri/XmlDtd.java +516 -0
  21. data/ext/java/nokogiri/XmlElement.java +44 -0
  22. data/ext/java/nokogiri/XmlElementContent.java +412 -0
  23. data/ext/java/nokogiri/XmlElementDecl.java +148 -0
  24. data/ext/java/nokogiri/XmlEntityDecl.java +151 -0
  25. data/ext/java/nokogiri/XmlEntityReference.java +79 -0
  26. data/ext/java/nokogiri/XmlNamespace.java +193 -0
  27. data/ext/java/nokogiri/XmlNode.java +1938 -0
  28. data/ext/java/nokogiri/XmlNodeSet.java +463 -0
  29. data/ext/java/nokogiri/XmlProcessingInstruction.java +79 -0
  30. data/ext/java/nokogiri/XmlReader.java +615 -0
  31. data/ext/java/nokogiri/XmlRelaxng.java +133 -0
  32. data/ext/java/nokogiri/XmlSaxParserContext.java +329 -0
  33. data/ext/java/nokogiri/XmlSaxPushParser.java +288 -0
  34. data/ext/java/nokogiri/XmlSchema.java +423 -0
  35. data/ext/java/nokogiri/XmlSyntaxError.java +137 -0
  36. data/ext/java/nokogiri/XmlText.java +90 -0
  37. data/ext/java/nokogiri/XmlXpathContext.java +305 -0
  38. data/ext/java/nokogiri/XsltStylesheet.java +368 -0
  39. data/ext/java/nokogiri/internals/ClosedStreamException.java +13 -0
  40. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
  41. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +27 -0
  42. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +178 -0
  43. data/ext/java/nokogiri/internals/NokogiriDomParser.java +99 -0
  44. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +140 -0
  45. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +65 -0
  46. data/ext/java/nokogiri/internals/NokogiriHandler.java +339 -0
  47. data/ext/java/nokogiri/internals/NokogiriHelpers.java +817 -0
  48. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +228 -0
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +110 -0
  50. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +86 -0
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +107 -0
  52. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +62 -0
  53. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +165 -0
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +50 -0
  55. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +37 -0
  56. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +70 -0
  57. data/ext/java/nokogiri/internals/ParserContext.java +262 -0
  58. data/ext/java/nokogiri/internals/ReaderNode.java +564 -0
  59. data/ext/java/nokogiri/internals/SaveContextVisitor.java +865 -0
  60. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +50 -0
  61. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +174 -0
  62. data/ext/java/nokogiri/internals/XmlDeclHandler.java +11 -0
  63. data/ext/java/nokogiri/internals/XmlDomParserContext.java +265 -0
  64. data/ext/java/nokogiri/internals/XmlSaxParser.java +40 -0
  65. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +122 -0
  66. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +178 -0
  67. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +43 -0
  68. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +106 -0
  69. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +278 -0
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +664 -0
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +45 -0
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +45 -0
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +388 -0
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +308 -0
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +47 -0
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +51 -0
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +51 -0
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +50 -0
  79. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +660 -0
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +194 -0
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +77 -0
  82. data/ext/java/nokogiri/internals/c14n/Constants.java +45 -0
  83. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +325 -0
  84. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +106 -0
  85. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +86 -0
  86. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +181 -0
  87. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +87 -0
  88. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +452 -0
  89. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +52 -0
  90. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +190 -0
  91. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +540 -0
  92. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1712 -0
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +737 -0
  94. data/ext/nokogiri/depend +38 -0
  95. data/ext/nokogiri/extconf.rb +1086 -0
  96. data/ext/nokogiri/gumbo.c +594 -0
  97. data/ext/nokogiri/html4_document.c +167 -0
  98. data/ext/nokogiri/html4_element_description.c +294 -0
  99. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  100. data/ext/nokogiri/html4_sax_parser_context.c +116 -0
  101. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  102. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  103. data/ext/nokogiri/nokogiri.c +265 -0
  104. data/ext/nokogiri/nokogiri.h +235 -0
  105. data/ext/nokogiri/test_global_handlers.c +42 -0
  106. data/ext/nokogiri/xml_attr.c +103 -0
  107. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  108. data/ext/nokogiri/xml_cdata.c +57 -0
  109. data/ext/nokogiri/xml_comment.c +62 -0
  110. data/ext/nokogiri/xml_document.c +689 -0
  111. data/ext/nokogiri/xml_document_fragment.c +44 -0
  112. data/ext/nokogiri/xml_dtd.c +210 -0
  113. data/ext/nokogiri/xml_element_content.c +128 -0
  114. data/ext/nokogiri/xml_element_decl.c +69 -0
  115. data/ext/nokogiri/xml_encoding_handler.c +104 -0
  116. data/ext/nokogiri/xml_entity_decl.c +112 -0
  117. data/ext/nokogiri/xml_entity_reference.c +50 -0
  118. data/ext/nokogiri/xml_namespace.c +186 -0
  119. data/ext/nokogiri/xml_node.c +2426 -0
  120. data/ext/nokogiri/xml_node_set.c +496 -0
  121. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  122. data/ext/nokogiri/xml_reader.c +794 -0
  123. data/ext/nokogiri/xml_relax_ng.c +164 -0
  124. data/ext/nokogiri/xml_sax_parser.c +316 -0
  125. data/ext/nokogiri/xml_sax_parser_context.c +283 -0
  126. data/ext/nokogiri/xml_sax_push_parser.c +166 -0
  127. data/ext/nokogiri/xml_schema.c +260 -0
  128. data/ext/nokogiri/xml_syntax_error.c +85 -0
  129. data/ext/nokogiri/xml_text.c +48 -0
  130. data/ext/nokogiri/xml_xpath_context.c +415 -0
  131. data/ext/nokogiri/xslt_stylesheet.c +363 -0
  132. data/gumbo-parser/CHANGES.md +63 -0
  133. data/gumbo-parser/Makefile +111 -0
  134. data/gumbo-parser/THANKS +27 -0
  135. data/gumbo-parser/src/Makefile +34 -0
  136. data/gumbo-parser/src/README.md +41 -0
  137. data/gumbo-parser/src/ascii.c +75 -0
  138. data/gumbo-parser/src/ascii.h +115 -0
  139. data/gumbo-parser/src/attribute.c +42 -0
  140. data/gumbo-parser/src/attribute.h +17 -0
  141. data/gumbo-parser/src/char_ref.c +22225 -0
  142. data/gumbo-parser/src/char_ref.h +29 -0
  143. data/gumbo-parser/src/char_ref.rl +2154 -0
  144. data/gumbo-parser/src/error.c +626 -0
  145. data/gumbo-parser/src/error.h +148 -0
  146. data/gumbo-parser/src/foreign_attrs.c +104 -0
  147. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  148. data/gumbo-parser/src/insertion_mode.h +33 -0
  149. data/gumbo-parser/src/macros.h +91 -0
  150. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  151. data/gumbo-parser/src/parser.c +4878 -0
  152. data/gumbo-parser/src/parser.h +41 -0
  153. data/gumbo-parser/src/replacement.h +33 -0
  154. data/gumbo-parser/src/string_buffer.c +103 -0
  155. data/gumbo-parser/src/string_buffer.h +68 -0
  156. data/gumbo-parser/src/string_piece.c +48 -0
  157. data/gumbo-parser/src/svg_attrs.c +174 -0
  158. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  159. data/gumbo-parser/src/svg_tags.c +137 -0
  160. data/gumbo-parser/src/svg_tags.gperf +55 -0
  161. data/gumbo-parser/src/tag.c +223 -0
  162. data/gumbo-parser/src/tag_lookup.c +382 -0
  163. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  164. data/gumbo-parser/src/tag_lookup.h +13 -0
  165. data/gumbo-parser/src/token_buffer.c +79 -0
  166. data/gumbo-parser/src/token_buffer.h +71 -0
  167. data/gumbo-parser/src/token_type.h +17 -0
  168. data/gumbo-parser/src/tokenizer.c +3463 -0
  169. data/gumbo-parser/src/tokenizer.h +112 -0
  170. data/gumbo-parser/src/tokenizer_states.h +339 -0
  171. data/gumbo-parser/src/utf8.c +245 -0
  172. data/gumbo-parser/src/utf8.h +164 -0
  173. data/gumbo-parser/src/util.c +66 -0
  174. data/gumbo-parser/src/util.h +34 -0
  175. data/gumbo-parser/src/vector.c +111 -0
  176. data/gumbo-parser/src/vector.h +45 -0
  177. data/lib/nokogiri/class_resolver.rb +67 -0
  178. data/lib/nokogiri/css/node.rb +54 -0
  179. data/lib/nokogiri/css/parser.rb +770 -0
  180. data/lib/nokogiri/css/parser.y +277 -0
  181. data/lib/nokogiri/css/parser_extras.rb +96 -0
  182. data/lib/nokogiri/css/syntax_error.rb +9 -0
  183. data/lib/nokogiri/css/tokenizer.rb +155 -0
  184. data/lib/nokogiri/css/tokenizer.rex +56 -0
  185. data/lib/nokogiri/css/xpath_visitor.rb +359 -0
  186. data/lib/nokogiri/css.rb +66 -0
  187. data/lib/nokogiri/decorators/slop.rb +44 -0
  188. data/lib/nokogiri/encoding_handler.rb +57 -0
  189. data/lib/nokogiri/extension.rb +32 -0
  190. data/lib/nokogiri/gumbo.rb +15 -0
  191. data/lib/nokogiri/html.rb +48 -0
  192. data/lib/nokogiri/html4/builder.rb +37 -0
  193. data/lib/nokogiri/html4/document.rb +214 -0
  194. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  195. data/lib/nokogiri/html4/element_description.rb +25 -0
  196. data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
  197. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  198. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  199. data/lib/nokogiri/html4/sax/parser.rb +63 -0
  200. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  201. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  202. data/lib/nokogiri/html4.rb +47 -0
  203. data/lib/nokogiri/html5/document.rb +168 -0
  204. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  205. data/lib/nokogiri/html5/node.rb +98 -0
  206. data/lib/nokogiri/html5.rb +389 -0
  207. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  208. data/lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar +0 -0
  209. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  210. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  211. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  212. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  213. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar +0 -0
  214. data/lib/nokogiri/jruby/xalan/serializer/2.7.3/serializer-2.7.3.jar +0 -0
  215. data/lib/nokogiri/jruby/xalan/xalan/2.7.3/xalan-2.7.3.jar +0 -0
  216. data/lib/nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar +0 -0
  217. data/lib/nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar +0 -0
  218. data/lib/nokogiri/syntax_error.rb +6 -0
  219. data/lib/nokogiri/version/constant.rb +6 -0
  220. data/lib/nokogiri/version/info.rb +223 -0
  221. data/lib/nokogiri/version.rb +4 -0
  222. data/lib/nokogiri/xml/attr.rb +66 -0
  223. data/lib/nokogiri/xml/attribute_decl.rb +20 -0
  224. data/lib/nokogiri/xml/builder.rb +487 -0
  225. data/lib/nokogiri/xml/cdata.rb +13 -0
  226. data/lib/nokogiri/xml/character_data.rb +9 -0
  227. data/lib/nokogiri/xml/document.rb +471 -0
  228. data/lib/nokogiri/xml/document_fragment.rb +205 -0
  229. data/lib/nokogiri/xml/dtd.rb +34 -0
  230. data/lib/nokogiri/xml/element_content.rb +38 -0
  231. data/lib/nokogiri/xml/element_decl.rb +15 -0
  232. data/lib/nokogiri/xml/entity_decl.rb +21 -0
  233. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  234. data/lib/nokogiri/xml/namespace.rb +58 -0
  235. data/lib/nokogiri/xml/node/save_options.rb +68 -0
  236. data/lib/nokogiri/xml/node.rb +1563 -0
  237. data/lib/nokogiri/xml/node_set.rb +447 -0
  238. data/lib/nokogiri/xml/notation.rb +19 -0
  239. data/lib/nokogiri/xml/parse_options.rb +213 -0
  240. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  241. data/lib/nokogiri/xml/pp/node.rb +57 -0
  242. data/lib/nokogiri/xml/pp.rb +4 -0
  243. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  244. data/lib/nokogiri/xml/reader.rb +105 -0
  245. data/lib/nokogiri/xml/relax_ng.rb +38 -0
  246. data/lib/nokogiri/xml/sax/document.rb +167 -0
  247. data/lib/nokogiri/xml/sax/parser.rb +125 -0
  248. data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
  249. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  250. data/lib/nokogiri/xml/sax.rb +6 -0
  251. data/lib/nokogiri/xml/schema.rb +73 -0
  252. data/lib/nokogiri/xml/searchable.rb +270 -0
  253. data/lib/nokogiri/xml/syntax_error.rb +72 -0
  254. data/lib/nokogiri/xml/text.rb +11 -0
  255. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  256. data/lib/nokogiri/xml/xpath.rb +21 -0
  257. data/lib/nokogiri/xml/xpath_context.rb +16 -0
  258. data/lib/nokogiri/xml.rb +76 -0
  259. data/lib/nokogiri/xslt/stylesheet.rb +27 -0
  260. data/lib/nokogiri/xslt.rb +65 -0
  261. data/lib/nokogiri.rb +120 -0
  262. data/lib/xsd/xmlparser/nokogiri.rb +106 -0
  263. metadata +391 -0
@@ -0,0 +1,3463 @@
1
+ /*
2
+ Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ */
18
+
19
+ /*
20
+ Coding conventions specific to this file:
21
+
22
+ 1. Functions that fill in a token should be named emit_*, and should be
23
+ followed immediately by a return from the tokenizer.
24
+ 2. Functions that shuffle data from temporaries to final API structures
25
+ should be named finish_*, and be called just before the tokenizer exits the
26
+ state that accumulates the temporary.
27
+ 3. All internal data structures should be kept in an initialized state from
28
+ tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ and reset, it should be deallocated and immediately reinitialized.
30
+ 4. Make sure there are appropriate break statements following each state.
31
+ 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ good idea, and should go at the entry point of each state when added.
33
+ 6. Statement order within states goes:
34
+ 1. Add parse errors, if appropriate.
35
+ 2. Call finish_* functions to build up tag state.
36
+ 3. Switch to new state. Set _reconsume flag if appropriate.
37
+ 4. Perform any other temporary buffer manipulation.
38
+ 5. Emit tokens.
39
+ 6. Return/break.
40
+ This order ensures that we can verify that every emit is followed by
41
+ a return, ensures that the correct state is recorded with any parse
42
+ errors, and prevents parse error position from being messed up by
43
+ possible mark/resets in temporary buffer manipulation.
44
+ */
45
+
46
+ #include <assert.h>
47
+ #include <string.h>
48
+ #include "tokenizer.h"
49
+ #include "ascii.h"
50
+ #include "attribute.h"
51
+ #include "char_ref.h"
52
+ #include "error.h"
53
+ #include "nokogiri_gumbo.h"
54
+ #include "parser.h"
55
+ #include "string_buffer.h"
56
+ #include "token_type.h"
57
+ #include "tokenizer_states.h"
58
+ #include "utf8.h"
59
+ #include "util.h"
60
+ #include "vector.h"
61
+
62
+ // Compared against _temporary_buffer to determine if we're in
63
+ // double-escaped script mode.
64
+ static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
65
+
66
// Result reported by each individual tokenizer state handler.
//
// Every emit_* function returns EMIT_TOKEN and is meant to be invoked as
//   return emit_foo(parser, ..., output);
// A handle_*_state function that does not end in an emit_* call returns
// CONTINUE instead, telling gumbo_lex to keep lexing.
typedef enum {
  EMIT_TOKEN = 0,  // A token has been filled in.
  CONTINUE = 1,    // No token yet; continue lexing.
} StateResult;
75
+
76
+ // This is a struct containing state necessary to build up a tag token,
77
+ // character by character.
78
+ typedef struct GumboInternalTagState {
79
+ // A buffer to accumulate characters for various GumboStringPiece fields.
80
+ GumboStringBuffer _buffer;
81
+
82
+ // A pointer to the start of the original text corresponding to the contents
83
+ // of the buffer.
84
+ const char* _original_text;
85
+
86
+ // The current tag enum, computed once the tag name state has finished so that
87
+ // the buffer can be re-used for building up attributes.
88
+ GumboTag _tag;
89
+
90
+ // The current tag name. It's set at the same time that _tag is set if _tag
91
+ // is set to GUMBO_TAG_UNKNOWN.
92
+ char *_name;
93
+
94
+ // The starting location of the text in the buffer.
95
+ GumboSourcePosition _start_pos;
96
+
97
+ // The current list of attributes. This is copied (and ownership of its data
98
+ // transferred) to the GumboStartTag token upon completion of the tag. New
99
+ // attributes are added as soon as their attribute name state is complete, and
100
+ // values are filled in by operating on _attributes.data[attributes.length-1].
101
+ GumboVector /* GumboAttribute */ _attributes;
102
+
103
+ // If true, the next attribute value to be finished should be dropped. This
104
+ // happens if a duplicate attribute name is encountered - we want to consume
105
+ // the attribute value, but shouldn't overwrite the existing value.
106
+ bool _drop_next_attr_value;
107
+
108
+ // The last start tag to have been emitted by the tokenizer. This is
109
+ // necessary to check for appropriate end tags.
110
+ GumboTag _last_start_tag;
111
+
112
+ // If true, then this is a start tag. If false, it's an end tag. This is
113
+ // necessary to generate the appropriate token type at tag-closing time.
114
+ bool _is_start_tag;
115
+
116
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
117
+ bool _is_self_closing;
118
+ } GumboTagState;
119
+
120
+ // This is the main tokenizer state struct, containing all state used by in
121
+ // tokenizing the input stream.
122
+ typedef struct GumboInternalTokenizerState {
123
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
124
+ GumboTokenizerEnum _state;
125
+
126
+ // A flag indicating whether the current input character needs to reconsumed
127
+ // in another state, or whether the next input character should be read for
128
+ // the next iteration of the state loop. This is set when the spec reads
129
+ // "Reconsume the current input character in..."
130
+ bool _reconsume_current_input;
131
+
132
+ // A flag indicating whether the adjusted current node is a foreign element.
133
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
134
+ // checked in the markup declaration state.
135
+ bool _is_adjusted_current_node_foreign;
136
+
137
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
138
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
139
+ bool _is_in_cdata;
140
+
141
+ // Certain states (notably character references) may emit two character tokens
142
+ // at once, but the contract for lex() fills in only one token at a time. The
143
+ // extra character is buffered here, and then this is checked on entry to
144
+ // lex(). If a character is stored here, it's immediately emitted and control
145
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
146
+ // stored.'
147
+ //
148
+ // Note that characters emitted through this mechanism will have their source
149
+ // position marked as the character under the mark, i.e. multiple characters
150
+ // may be emitted with the same position. This is desirable for character
151
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
152
+ // mechanism if the buffered characters must have their original positions in
153
+ // the document.
154
+ int _buffered_emit_char;
155
+
156
+ // A temporary buffer to accumulate characters, as described by the "temporary
157
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
158
+ // way: In situations where the spec calls for inserting characters into the
159
+ // temporary buffer that exactly match the input in order to emit them as
160
+ // character tokens, we don't actually do it.
161
+ // Instead, we mark the input and reset the input to it using set_mark() and
162
+ // emit_from_mark(). We do use the temporary buffer for other uses such as
163
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
164
+ GumboStringBuffer _temporary_buffer;
165
+
166
+ // The position to resume normal operation after we start emitting from the
167
+ // mark. NULL whenever we're not emitting from the mark.
168
+ const char* _resume_pos;
169
+
170
+ // The character reference state uses a return state to return to the state
171
+ // it was invoked from.
172
+ GumboTokenizerEnum _return_state;
173
+
174
+ // Numeric character reference.
175
+ uint32_t _character_reference_code;
176
+
177
+ // Pointer to the beginning of the current token in the original buffer; used
178
+ // to record the original text.
179
+ const char* _token_start;
180
+
181
+ // GumboSourcePosition recording the source location of the start of the
182
+ // current token.
183
+ GumboSourcePosition _token_start_pos;
184
+
185
+ // Current tag state.
186
+ GumboTagState _tag_state;
187
+
188
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
189
+ // not used for anything else in the doctype states), and then freshly
190
+ // allocate the strings in the doctype token, then copy it over on emit.
191
+ GumboTokenDocType _doc_type_state;
192
+
193
+ // The UTF8Iterator over the tokenizer input.
194
+ Utf8Iterator _input;
195
+ } GumboTokenizerState;
196
+
197
+ // Adds a parse error to the parser's error struct.
198
+ static void tokenizer_add_parse_error (
199
+ GumboParser* parser,
200
+ GumboErrorType type
201
+ ) {
202
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
203
+ GumboError* error = gumbo_add_error(parser);
204
+ if (!error) {
205
+ return;
206
+ }
207
+ const Utf8Iterator* input = &tokenizer->_input;
208
+ utf8iterator_get_position(input, &error->position);
209
+ error->original_text.data = utf8iterator_get_char_pointer(input);
210
+ error->original_text.length = utf8iterator_get_width(input);
211
+ error->type = type;
212
+ error->v.tokenizer.state = tokenizer->_state;
213
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
214
+ }
215
+
216
+ // Adds an error pointing at the start of the character reference.
217
+ static void tokenizer_add_char_ref_error (
218
+ struct GumboInternalParser* parser,
219
+ GumboErrorType type,
220
+ int codepoint
221
+ ) {
222
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
223
+ GumboError* error = gumbo_add_error(parser);
224
+ if (!error)
225
+ return;
226
+ Utf8Iterator* input = &tokenizer->_input;
227
+ error->type = type;
228
+ error->position = utf8iterator_get_mark_position(input);
229
+ const char* mark = utf8iterator_get_mark_pointer(input);
230
+ error->original_text.data = mark;
231
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
232
+ error->v.tokenizer.state = tokenizer->_state;
233
+ error->v.tokenizer.codepoint = codepoint;
234
+ }
235
+
236
+ // Adds an error pointing at the start of the token.
237
+ static void tokenizer_add_token_parse_error (
238
+ GumboParser* parser,
239
+ GumboErrorType type
240
+ ) {
241
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
242
+ GumboError* error = gumbo_add_error(parser);
243
+ if (!error)
244
+ return;
245
+ Utf8Iterator* input = &tokenizer->_input;
246
+ error->type = type;
247
+ error->position = tokenizer->_token_start_pos;
248
+ error->original_text.data = tokenizer->_token_start;
249
+ error->original_text.length =
250
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
251
+ error->v.tokenizer.state = tokenizer->_state;
252
+ error->v.tokenizer.codepoint = 0;
253
+ }
254
+
255
// Thin wrapper that forwards to gumbo_ascii_isalpha() and reports its result
// as a bool.
static bool is_alpha(int c) {
  const bool result = gumbo_ascii_isalpha(c);
  return result;
}
258
+
259
// Thin wrapper that forwards to gumbo_ascii_tolower() and returns its result.
static int ensure_lowercase(int c) {
  const int lowered = gumbo_ascii_tolower(c);
  return lowered;
}
262
+
263
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
264
+ if (is_in_cdata && c > 0) {
265
+ return GUMBO_TOKEN_CDATA;
266
+ }
267
+
268
+ switch (c) {
269
+ case '\t':
270
+ case '\n':
271
+ case '\r':
272
+ case '\f':
273
+ case ' ':
274
+ return GUMBO_TOKEN_WHITESPACE;
275
+ case 0:
276
+ gumbo_debug("Emitted null byte.\n");
277
+ return GUMBO_TOKEN_NULL;
278
+ case -1:
279
+ return GUMBO_TOKEN_EOF;
280
+ default:
281
+ return GUMBO_TOKEN_CHARACTER;
282
+ }
283
+ }
284
+
285
+ // Starts recording characters in the temporary buffer.
286
+ static void clear_temporary_buffer(GumboParser* parser) {
287
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
288
+ gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
289
+ }
290
+
291
+ // Appends a codepoint to the temporary buffer.
292
+ static void append_char_to_temporary_buffer (
293
+ GumboParser* parser,
294
+ int codepoint
295
+ ) {
296
+ gumbo_string_buffer_append_codepoint (
297
+ codepoint,
298
+ &parser->_tokenizer_state->_temporary_buffer
299
+ );
300
+ }
301
+
302
+ static void append_string_to_temporary_buffer (
303
+ GumboParser* parser,
304
+ const GumboStringPiece* str
305
+ ) {
306
+ gumbo_string_buffer_append_string (
307
+ str,
308
+ &parser->_tokenizer_state->_temporary_buffer
309
+ );
310
+ }
311
+
312
+
313
+ static bool temporary_buffer_is_empty(const GumboParser* parser) {
314
+ return parser->_tokenizer_state->_temporary_buffer.length == 0;
315
+ }
316
+
317
+ static void doc_type_state_init(GumboParser* parser) {
318
+ GumboTokenDocType* doc_type_state =
319
+ &parser->_tokenizer_state->_doc_type_state;
320
+ // We initialize these to NULL here so that we don't end up leaking memory if
321
+ // we never see a doctype token. When we do see a doctype token, we reset
322
+ // them to a freshly-allocated empty string so that we can present a uniform
323
+ // interface to client code and not make them check for null. Ownership is
324
+ // transferred to the doctype token when it's emitted.
325
+ doc_type_state->name = NULL;
326
+ doc_type_state->public_identifier = NULL;
327
+ doc_type_state->system_identifier = NULL;
328
+ doc_type_state->force_quirks = false;
329
+ doc_type_state->has_public_identifier = false;
330
+ doc_type_state->has_system_identifier = false;
331
+ }
332
+
333
+ // Sets the token original_text and position to the current iterator position.
334
+ // This is necessary because [CDATA[ sections may include text that is ignored
335
+ // by the tokenizer.
336
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
337
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
338
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
339
+ }
340
+
341
+ // Sets the tag buffer original text and start point to the current iterator
342
+ // position. This is necessary because attribute names & values may have
343
+ // whitespace preceeding them, and so we can't assume that the actual token
344
+ // starting point was the end of the last tag buffer usage.
345
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
346
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
347
+ GumboTagState* tag_state = &tokenizer->_tag_state;
348
+
349
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
350
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
351
+ }
352
+
353
+ // Moves the temporary buffer contents over to the specified output string,
354
+ // and clears the temporary buffer.
355
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
356
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
+ *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
358
+ clear_temporary_buffer(parser);
359
+ }
360
+
361
+ // Advances the iterator past the end of the token, and then fills in the
362
+ // relevant position fields. It's assumed that after every emit, the tokenizer
363
+ // will immediately return (letting the tree-construction stage read the filled
364
+ // in Token). Thus, it's safe to advance the input stream here, since it will
365
+ // bypass the advance at the bottom of the state machine loop.
366
+ //
367
+ // Since this advances the iterator and resets the current input, make sure to
368
+ // call it after you've recorded any other data you need for the token.
369
+ static void finish_token(GumboParser* parser, GumboToken* token) {
370
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
371
+ if (!tokenizer->_reconsume_current_input) {
372
+ utf8iterator_next(&tokenizer->_input);
373
+ }
374
+
375
+ token->position = tokenizer->_token_start_pos;
376
+ token->original_text.data = tokenizer->_token_start;
377
+ reset_token_start_point(tokenizer);
378
+ token->original_text.length =
379
+ tokenizer->_token_start - token->original_text.data;
380
+ if (token->original_text.length > 0 &&
381
+ token->original_text.data[token->original_text.length - 1] == '\r') {
382
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
383
+ // means that the next token may start one past a \r character. The pointer
384
+ // arithmetic above results in that \r being appended to the original text
385
+ // of the preceding token, so we have to adjust its length here to chop the
386
+ // \r off.
387
+ --token->original_text.length;
388
+ }
389
+ }
390
+
391
+ // Records the doctype public ID, assumed to be in the temporary buffer.
392
+ // Convenience method that also sets has_public_identifier to true.
393
+ static void finish_doctype_public_id(GumboParser* parser) {
394
+ GumboTokenDocType* doc_type_state =
395
+ &parser->_tokenizer_state->_doc_type_state;
396
+ gumbo_free((void*) doc_type_state->public_identifier);
397
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
398
+ doc_type_state->has_public_identifier = true;
399
+ }
400
+
401
+ // Records the doctype system ID, assumed to be in the temporary buffer.
402
+ // Convenience method that also sets has_system_identifier to true.
403
+ static void finish_doctype_system_id(GumboParser* parser) {
404
+ GumboTokenDocType* doc_type_state =
405
+ &parser->_tokenizer_state->_doc_type_state;
406
+ gumbo_free((void*) doc_type_state->system_identifier);
407
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
408
+ doc_type_state->has_system_identifier = true;
409
+ }
410
+
411
+ // Writes a single specified character to the output token.
412
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
413
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
414
+ output->v.character = c;
415
+ finish_token(parser, output);
416
+ return EMIT_TOKEN;
417
+ }
418
+
419
+ // Writes a replacement character token and records a parse error.
420
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
421
+ static StateResult emit_replacement_char(
422
+ GumboParser* parser, GumboToken* output) {
423
+ // In all cases, this is because of a null byte in the input stream.
424
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
425
+ emit_char(parser, kUtf8ReplacementChar, output);
426
+ return EMIT_TOKEN;
427
+ }
428
+
429
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
430
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
431
+ return emit_char(parser, -1, output);
432
+ }
433
+
434
+ // Writes out a doctype token, copying it from the tokenizer state.
435
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
436
+ output->type = GUMBO_TOKEN_DOCTYPE;
437
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
438
+ finish_token(parser, output);
439
+ doc_type_state_init(parser);
440
+ return EMIT_TOKEN;
441
+ }
442
+
443
+ // Debug-only function that explicitly sets the attribute vector data to NULL so
444
+ // it can be asserted on tag creation, verifying that there are no memory leaks.
445
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
446
+ UNUSED_IF_NDEBUG(tag_state);
447
+ tag_state->_name = NULL;
448
+ #ifndef NDEBUG
449
+ tag_state->_attributes = kGumboEmptyVector;
450
+ #endif
451
+ }
452
+
453
+ // Writes out the current tag as a start or end tag token.
454
+ // Always returns EMIT_TOKEN.
455
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
456
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
457
+ if (tag_state->_is_start_tag) {
458
+ output->type = GUMBO_TOKEN_START_TAG;
459
+ output->v.start_tag.tag = tag_state->_tag;
460
+ output->v.start_tag.name = tag_state->_name;
461
+ output->v.start_tag.attributes = tag_state->_attributes;
462
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
463
+ tag_state->_last_start_tag = tag_state->_tag;
464
+ mark_tag_state_as_empty(tag_state);
465
+ gumbo_debug(
466
+ "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
467
+ } else {
468
+ output->type = GUMBO_TOKEN_END_TAG;
469
+ output->v.end_tag.tag = tag_state->_tag;
470
+ output->v.end_tag.name = tag_state->_name;
471
+ if (tag_state->_is_self_closing)
472
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
473
+ if (tag_state->_attributes.length > 0)
474
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
475
+ // In end tags, ownership of the attributes vector is not transferred to the
476
+ // token, but it's still initialized as normal, so it must be manually
477
+ // deallocated. There may also be attributes to destroy, in certain broken
478
+ // cases like </div</th> (the "th" is an attribute there).
479
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
480
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
481
+ }
482
+ gumbo_free(tag_state->_attributes.data);
483
+ mark_tag_state_as_empty(tag_state);
484
+ gumbo_debug(
485
+ "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
486
+ }
487
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
488
+ finish_token(parser, output);
489
+ gumbo_debug (
490
+ "Original text = %.*s.\n",
491
+ (int) output->original_text.length,
492
+ output->original_text.data
493
+ );
494
+ assert(output->original_text.length >= 2);
495
+ assert(output->original_text.data[0] == '<');
496
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
497
+ return EMIT_TOKEN;
498
+ }
499
+
500
+ // In some states, we speculatively start a tag, but don't know whether it'll be
501
+ // emitted as tag token or as a series of character tokens until we finish it.
502
+ // We need to abandon the tag we'd started & free its memory in that case to
503
+ // avoid a memory leak.
504
+ static void abandon_current_tag(GumboParser* parser) {
505
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
506
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
507
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
508
+ }
509
+ gumbo_free(tag_state->_attributes.data);
510
+ mark_tag_state_as_empty(tag_state);
511
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
512
+ gumbo_debug("Abandoning current tag.\n");
513
+ }
514
+
515
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
516
+ // data, and then it's copied over and released to the 'text' field of the
517
+ // GumboToken union. Always returns EMIT_TOKEN.
518
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
519
+ output->type = GUMBO_TOKEN_COMMENT;
520
+ finish_temporary_buffer(parser, &output->v.text);
521
+ finish_token(parser, output);
522
+ return EMIT_TOKEN;
523
+ }
524
+
525
+ static void set_mark(GumboParser* parser) {
526
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
527
+ utf8iterator_mark(&tokenizer->_input);
528
+ }
529
+
530
+ // Checks to see we should be emitting characters from the mark, and fills the
531
+ // output token with the next output character if so.
532
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
533
+ // immediately return, CONTINUE if we should resume normal operation.
534
+ static StateResult maybe_emit_from_mark (
535
+ GumboParser* parser,
536
+ GumboToken* output
537
+ ) {
538
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
539
+ const char* pos = tokenizer->_resume_pos;
540
+
541
+ if (!pos)
542
+ return CONTINUE;
543
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
544
+ tokenizer->_resume_pos = NULL;
545
+ return CONTINUE;
546
+ }
547
+
548
+ // emit_char advances the input stream. _reconsume_current_input should
549
+ // *never* be set when emitting from the mark since those characters have
550
+ // already been advanced past.
551
+ assert(!tokenizer->_reconsume_current_input);
552
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
553
+ }
554
+
555
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
556
+ // including, the current code point. This resets the input iterator stream to
557
+ // the mark, sets up _resume_pos, and then emits the first character in it.
558
+ // Returns EMIT_TOKEN.
559
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
560
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
561
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
562
+ utf8iterator_reset(&tokenizer->_input);
563
+ // Now that we have reset the input, we need to advance through it.
564
+ tokenizer->_reconsume_current_input = false;
565
+ StateResult result = maybe_emit_from_mark(parser, output);
566
+ assert(result == EMIT_TOKEN);
567
+ return result;
568
+ }
569
+
570
+ // Appends a codepoint to the current tag buffer. If
571
+ // reinitilize_position_on_first is set, this also initializes the tag buffer
572
+ // start point; the only time you would *not* want to pass true for this
573
+ // parameter is if you want the original_text to include character (like an
574
+ // opening quote) that doesn't appear in the value.
575
+ static void append_char_to_tag_buffer (
576
+ GumboParser* parser,
577
+ int codepoint,
578
+ bool reinitilize_position_on_first
579
+ ) {
580
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
581
+ if (buffer->length == 0 && reinitilize_position_on_first) {
582
+ reset_tag_buffer_start_point(parser);
583
+ }
584
+ gumbo_string_buffer_append_codepoint(codepoint, buffer);
585
+ }
586
+
587
+ // Like above but append a string.
588
+ static void append_string_to_tag_buffer (
589
+ GumboParser* parser,
590
+ GumboStringPiece* str,
591
+ bool reinitilize_position_on_first
592
+ ) {
593
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
594
+ if (buffer->length == 0 && reinitilize_position_on_first) {
595
+ reset_tag_buffer_start_point(parser);
596
+ }
597
+ gumbo_string_buffer_append_string(str, buffer);
598
+ }
599
+
600
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
601
+ // and _start_pos field to point to the current position.
602
+ static void initialize_tag_buffer(GumboParser* parser) {
603
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
604
+ GumboTagState* tag_state = &tokenizer->_tag_state;
605
+
606
+ gumbo_string_buffer_init(&tag_state->_buffer);
607
+ reset_tag_buffer_start_point(parser);
608
+ }
609
+
610
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
611
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
612
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
613
+ switch (tokenizer->_return_state) {
614
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
615
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
616
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
617
+ return true;
618
+ default:
619
+ return false;
620
+ }
621
+ }
622
+
623
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
624
+ // For each code point in the temporary buffer, add to the current attribute
625
+ // value if the character reference was consumed as part of an attribute or
626
+ // emit the code point as a character token.
627
+ static StateResult flush_code_points_consumed_as_character_reference (
628
+ GumboParser* parser,
629
+ GumboToken* output
630
+ ) {
631
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
632
+ if (character_reference_part_of_attribute(parser)) {
633
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
634
+ assert(start);
635
+ GumboStringPiece str = {
636
+ .data = start,
637
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
638
+ };
639
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
640
+ append_string_to_tag_buffer(parser, &str, unquoted);
641
+ return CONTINUE;
642
+ }
643
+ return emit_from_mark(parser, output);
644
+ }
645
+
646
+ // After a character reference has been successfully constructed, the standard
647
+ // says to set the temporary buffer equal to the empty string, append the code
648
+ // point(s) associated with the reference and flush code points consumed as a
649
+ // character reference.
650
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
651
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
652
+ // That doesn't work for us because we use the temporary buffer in lock step
653
+ // with the input for position and that would fail if we inserted a different
654
+ // number of code points. So duplicate a bit of the above logic.
655
+ static StateResult flush_char_ref (
656
+ GumboParser* parser,
657
+ int first,
658
+ int second,
659
+ GumboToken* output
660
+ ) {
661
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
662
+ if (character_reference_part_of_attribute(parser)) {
663
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
664
+ append_char_to_tag_buffer(parser, first, unquoted);
665
+ if (second != kGumboNoChar)
666
+ append_char_to_tag_buffer(parser, second, unquoted);
667
+ return CONTINUE;
668
+ }
669
+ tokenizer->_buffered_emit_char = second;
670
+ return emit_char(parser, first, output);
671
+ }
672
+
673
+
674
+ // Initializes the tag_state to start a new tag, keeping track of the opening
675
+ // positions and original text. Takes a boolean indicating whether this is a
676
+ // start or end tag.
677
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
678
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
679
+ GumboTagState* tag_state = &tokenizer->_tag_state;
680
+ int c = utf8iterator_current(&tokenizer->_input);
681
+ assert(is_alpha(c));
682
+ c = ensure_lowercase(c);
683
+ assert(is_alpha(c));
684
+
685
+ initialize_tag_buffer(parser);
686
+
687
+ assert(tag_state->_name == NULL);
688
+ assert(tag_state->_attributes.data == NULL);
689
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
690
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
691
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
692
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
693
+ gumbo_vector_init(1, &tag_state->_attributes);
694
+ tag_state->_drop_next_attr_value = false;
695
+ tag_state->_is_start_tag = is_start_tag;
696
+ tag_state->_is_self_closing = false;
697
+ gumbo_debug("Starting new tag.\n");
698
+ }
699
+
700
+ // Fills in the specified char* with the contents of the tag buffer.
701
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
702
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
703
+ GumboTagState* tag_state = &tokenizer->_tag_state;
704
+ *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
705
+ }
706
+
707
+ // Fills in:
708
+ // * The original_text GumboStringPiece with the portion of the original
709
+ // buffer that corresponds to the tag buffer.
710
+ // * The start_pos GumboSourcePosition with the start position of the tag
711
+ // buffer.
712
+ // * The end_pos GumboSourcePosition with the current source position.
713
+ static void copy_over_original_tag_text (
714
+ GumboParser* parser,
715
+ GumboStringPiece* original_text,
716
+ GumboSourcePosition* start_pos,
717
+ GumboSourcePosition* end_pos
718
+ ) {
719
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
720
+ GumboTagState* tag_state = &tokenizer->_tag_state;
721
+
722
+ original_text->data = tag_state->_original_text;
723
+ original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
724
+ tag_state->_original_text;
725
+ if (
726
+ original_text->length
727
+ && original_text->data[original_text->length - 1] == '\r'
728
+ ) {
729
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
730
+ // appended to the end of original text even when it's really the first part
731
+ // of the next character. If we detect this situation, shrink the length of
732
+ // the original text by 1 to remove the carriage return.
733
+ --original_text->length;
734
+ }
735
+ *start_pos = tag_state->_start_pos;
736
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
737
+ }
738
+
739
+ // Releases and then re-initializes the tag buffer.
740
+ static void reinitialize_tag_buffer(GumboParser* parser) {
741
+ gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
742
+ initialize_tag_buffer(parser);
743
+ }
744
+
745
+ // Moves some data from the temporary buffer over the the tag-based fields in
746
+ // TagState.
747
+ static void finish_tag_name(GumboParser* parser) {
748
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
749
+ GumboTagState* tag_state = &tokenizer->_tag_state;
750
+
751
+ const char *data = tag_state->_buffer.data;
752
+ size_t length = tag_state->_buffer.length;
753
+ tag_state->_tag = gumbo_tagn_enum(data, length);
754
+ if (tag_state->_tag == GUMBO_TAG_UNKNOWN) {
755
+ char *name = gumbo_alloc(length + 1);
756
+ memcpy(name, data, length);
757
+ name[length] = 0;
758
+ tag_state->_name = name;
759
+ }
760
+ reinitialize_tag_buffer(parser);
761
+ }
762
+
763
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
764
+ static void add_duplicate_attr_error(GumboParser* parser) {
765
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
766
+ GumboError* error = gumbo_add_error(parser);
767
+ if (!error) {
768
+ return;
769
+ }
770
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
771
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
772
+ error->position = tag_state->_start_pos;
773
+ error->original_text.data = tag_state->_original_text;
774
+ error->original_text.length =
775
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
776
+ error->v.tokenizer.state = tokenizer->_state;
777
+ }
778
+
779
+ // Creates a new attribute in the current tag, copying the current tag buffer to
780
+ // the attribute's name. The attribute's value starts out as the empty string
781
+ // (following the "Boolean attributes" section of the spec) and is only
782
+ // overwritten on finish_attribute_value(). If the attribute has already been
783
+ // specified, the new attribute is dropped and a parse error is added
784
+ static void finish_attribute_name(GumboParser* parser) {
785
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
786
+ GumboTagState* tag_state = &tokenizer->_tag_state;
787
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
788
+
789
+ int max_attributes = parser->_options->max_attributes;
790
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
791
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
792
+ gumbo_debug("Attributes limit exceeded.\n");
793
+ reinitialize_tag_buffer(parser);
794
+ tag_state->_drop_next_attr_value = true;
795
+ return;
796
+ }
797
+
798
+ // May've been set by a previous attribute without a value; reset it here.
799
+ tag_state->_drop_next_attr_value = false;
800
+ assert(tag_state->_attributes.data);
801
+ assert(tag_state->_attributes.capacity);
802
+
803
+ for (unsigned int i = 0; i < attributes->length; ++i) {
804
+ GumboAttribute* attr = attributes->data[i];
805
+ if (
806
+ strlen(attr->name) == tag_state->_buffer.length
807
+ && 0 == memcmp (
808
+ attr->name,
809
+ tag_state->_buffer.data,
810
+ tag_state->_buffer.length
811
+ )
812
+ ) {
813
+ // Identical attribute; bail.
814
+ add_duplicate_attr_error(parser);
815
+ reinitialize_tag_buffer(parser);
816
+ tag_state->_drop_next_attr_value = true;
817
+ return;
818
+ }
819
+ }
820
+
821
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
822
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
823
+ copy_over_tag_buffer(parser, &attr->name);
824
+ copy_over_original_tag_text (
825
+ parser,
826
+ &attr->original_name,
827
+ &attr->name_start,
828
+ &attr->name_end
829
+ );
830
+ attr->value = gumbo_strdup("");
831
+ copy_over_original_tag_text (
832
+ parser,
833
+ &attr->original_value,
834
+ &attr->name_start,
835
+ &attr->name_end
836
+ );
837
+ gumbo_vector_add(attr, attributes);
838
+ reinitialize_tag_buffer(parser);
839
+ }
840
+
841
+ // Finishes an attribute value. This sets the value of the most recently added
842
+ // attribute to the current contents of the tag buffer.
843
+ static void finish_attribute_value(GumboParser* parser) {
844
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
845
+ if (tag_state->_drop_next_attr_value) {
846
+ // Duplicate attribute name detected in an earlier state, so we have to
847
+ // ignore the value.
848
+ tag_state->_drop_next_attr_value = false;
849
+ reinitialize_tag_buffer(parser);
850
+ return;
851
+ }
852
+
853
+ GumboAttribute* attr =
854
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
855
+ gumbo_free((void*) attr->value);
856
+ copy_over_tag_buffer(parser, &attr->value);
857
+ copy_over_original_tag_text(
858
+ parser, &attr->original_value, &attr->value_start, &attr->value_end);
859
+ reinitialize_tag_buffer(parser);
860
+ }
861
+
862
+ // Returns true if the current end tag matches the last start tag emitted.
863
+ static bool is_appropriate_end_tag(GumboParser* parser) {
864
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
865
+ assert(!tag_state->_is_start_tag);
866
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
867
+ tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
868
+ tag_state->_buffer.length);
869
+ }
870
+
871
+ void gumbo_tokenizer_state_init (
872
+ GumboParser* parser,
873
+ const char* text,
874
+ size_t text_length
875
+ ) {
876
+ GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
877
+ parser->_tokenizer_state = tokenizer;
878
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
879
+ tokenizer->_return_state = GUMBO_LEX_DATA;
880
+ tokenizer->_character_reference_code = 0;
881
+ tokenizer->_reconsume_current_input = false;
882
+ tokenizer->_is_adjusted_current_node_foreign = false;
883
+ tokenizer->_is_in_cdata = false;
884
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
885
+ tokenizer->_tag_state._name = NULL;
886
+
887
+ tokenizer->_buffered_emit_char = kGumboNoChar;
888
+ gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
889
+ tokenizer->_resume_pos = NULL;
890
+
891
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
892
+
893
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
894
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
895
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
896
+ doc_type_state_init(parser);
897
+ }
898
+
899
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
900
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
901
+ assert(tokenizer->_doc_type_state.name == NULL);
902
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
903
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
904
+ gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
905
+ assert(tokenizer->_tag_state._name == NULL);
906
+ assert(tokenizer->_tag_state._attributes.data == NULL);
907
+ gumbo_free(tokenizer);
908
+ }
909
+
910
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
911
+ parser->_tokenizer_state->_state = state;
912
+ }
913
+
914
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
915
+ GumboParser* parser,
916
+ bool is_foreign
917
+ ) {
918
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
919
+ gumbo_debug (
920
+ "Toggling is_current_node_foreign to %s.\n",
921
+ is_foreign ? "true" : "false"
922
+ );
923
+ }
924
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
925
+ }
926
+
927
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
928
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
929
+ tokenizer->_reconsume_current_input = true;
930
+ tokenizer->_state = state;
931
+ }
932
+
933
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
934
+ static StateResult handle_data_state (
935
+ GumboParser* parser,
936
+ GumboTokenizerState* tokenizer,
937
+ int c,
938
+ GumboToken* output
939
+ ) {
940
+ switch (c) {
941
+ case '&':
942
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
943
+ set_mark(parser);
944
+ tokenizer->_return_state = GUMBO_LEX_DATA;
945
+ return CONTINUE;
946
+ case '<':
947
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
948
+ set_mark(parser);
949
+ return CONTINUE;
950
+ case '\0':
951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
952
+ return emit_char(parser, c, output);
953
+ case -1:
954
+ return emit_eof(parser, output);
955
+ default:
956
+ return emit_char(parser, c, output);
957
+ }
958
+ }
959
+
960
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
961
+ static StateResult handle_rcdata_state (
962
+ GumboParser* parser,
963
+ GumboTokenizerState* tokenizer,
964
+ int c,
965
+ GumboToken* output
966
+ ) {
967
+ switch (c) {
968
+ case '&':
969
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
970
+ set_mark(parser);
971
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
972
+ return CONTINUE;
973
+ case '<':
974
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
975
+ set_mark(parser);
976
+ return CONTINUE;
977
+ case '\0':
978
+ return emit_replacement_char(parser, output);
979
+ case -1:
980
+ return emit_eof(parser, output);
981
+ default:
982
+ return emit_char(parser, c, output);
983
+ }
984
+ }
985
+
986
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
987
+ static StateResult handle_rawtext_state (
988
+ GumboParser* parser,
989
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
990
+ int c,
991
+ GumboToken* output
992
+ ) {
993
+ switch (c) {
994
+ case '<':
995
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
996
+ set_mark(parser);
997
+ return CONTINUE;
998
+ case '\0':
999
+ return emit_replacement_char(parser, output);
1000
+ case -1:
1001
+ return emit_eof(parser, output);
1002
+ default:
1003
+ return emit_char(parser, c, output);
1004
+ }
1005
+ }
1006
+
1007
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1008
+ static StateResult handle_script_data_state (
1009
+ GumboParser* parser,
1010
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1011
+ int c,
1012
+ GumboToken* output
1013
+ ) {
1014
+ switch (c) {
1015
+ case '<':
1016
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1017
+ set_mark(parser);
1018
+ return CONTINUE;
1019
+ case '\0':
1020
+ return emit_replacement_char(parser, output);
1021
+ case -1:
1022
+ return emit_eof(parser, output);
1023
+ default:
1024
+ return emit_char(parser, c, output);
1025
+ }
1026
+ }
1027
+
1028
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1029
+ static StateResult handle_plaintext_state (
1030
+ GumboParser* parser,
1031
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1032
+ int c,
1033
+ GumboToken* output
1034
+ ) {
1035
+ switch (c) {
1036
+ case '\0':
1037
+ return emit_replacement_char(parser, output);
1038
+ case -1:
1039
+ return emit_eof(parser, output);
1040
+ default:
1041
+ return emit_char(parser, c, output);
1042
+ }
1043
+ }
1044
+
1045
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1046
+ static StateResult handle_tag_open_state (
1047
+ GumboParser* parser,
1048
+ GumboTokenizerState* tokenizer,
1049
+ int c,
1050
+ GumboToken* output
1051
+ ) {
1052
+ switch (c) {
1053
+ case '!':
1054
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1055
+ clear_temporary_buffer(parser);
1056
+ return CONTINUE;
1057
+ case '/':
1058
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1059
+ return CONTINUE;
1060
+ case '?':
1061
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1062
+ clear_temporary_buffer(parser);
1063
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1064
+ return CONTINUE;
1065
+ case -1:
1066
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1067
+ // Switch to data to emit EOF.
1068
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1069
+ return emit_from_mark(parser, output);
1070
+ default:
1071
+ if (is_alpha(c)) {
1072
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1073
+ start_new_tag(parser, true);
1074
+ return CONTINUE;
1075
+ }
1076
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1077
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1078
+ return emit_from_mark(parser, output);
1079
+ }
1080
+ }
1081
+
1082
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1083
+ static StateResult handle_end_tag_open_state (
1084
+ GumboParser* parser,
1085
+ GumboTokenizerState* tokenizer,
1086
+ int c,
1087
+ GumboToken* output
1088
+ ) {
1089
+ switch (c) {
1090
+ case '>':
1091
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1092
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1093
+ return CONTINUE;
1094
+ case -1:
1095
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1096
+ // Similar to the tag open state except we need to emit '<' and '/'
1097
+ // before the EOF.
1098
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1099
+ return emit_from_mark(parser, output);
1100
+ default:
1101
+ if (is_alpha(c)) {
1102
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1103
+ start_new_tag(parser, false);
1104
+ } else {
1105
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1106
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1107
+ clear_temporary_buffer(parser);
1108
+ }
1109
+ return CONTINUE;
1110
+ }
1111
+ }
1112
+
1113
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1114
+ static StateResult handle_tag_name_state (
1115
+ GumboParser* parser,
1116
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1117
+ int c,
1118
+ GumboToken* output
1119
+ ) {
1120
+ switch (c) {
1121
+ case '\t':
1122
+ case '\n':
1123
+ case '\f':
1124
+ case ' ':
1125
+ finish_tag_name(parser);
1126
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1127
+ return CONTINUE;
1128
+ case '/':
1129
+ finish_tag_name(parser);
1130
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1131
+ return CONTINUE;
1132
+ case '>':
1133
+ finish_tag_name(parser);
1134
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1135
+ return emit_current_tag(parser, output);
1136
+ case '\0':
1137
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1138
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1139
+ return CONTINUE;
1140
+ case -1:
1141
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1142
+ abandon_current_tag(parser);
1143
+ return emit_eof(parser, output);
1144
+ default:
1145
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1146
+ return CONTINUE;
1147
+ }
1148
+ }
1149
+
1150
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1151
+ static StateResult handle_rcdata_lt_state (
1152
+ GumboParser* parser,
1153
+ GumboTokenizerState* tokenizer,
1154
+ int c,
1155
+ GumboToken* output
1156
+ ) {
1157
+ if (c == '/') {
1158
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1159
+ return CONTINUE;
1160
+ } else {
1161
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1162
+ return emit_from_mark(parser, output);
1163
+ }
1164
+ }
1165
+
1166
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1167
+ static StateResult handle_rcdata_end_tag_open_state (
1168
+ GumboParser* parser,
1169
+ GumboTokenizerState* tokenizer,
1170
+ int c,
1171
+ GumboToken* output
1172
+ ) {
1173
+ if (is_alpha(c)) {
1174
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1175
+ start_new_tag(parser, false);
1176
+ return CONTINUE;
1177
+ }
1178
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1179
+ return emit_from_mark(parser, output);
1180
+ }
1181
+
1182
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1183
+ static StateResult handle_rcdata_end_tag_name_state (
1184
+ GumboParser* parser,
1185
+ GumboTokenizerState* tokenizer,
1186
+ int c,
1187
+ GumboToken* output
1188
+ ) {
1189
+ UNUSED_IF_NDEBUG(tokenizer);
1190
+ if (is_alpha(c)) {
1191
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1192
+ return CONTINUE;
1193
+ }
1194
+ switch (c) {
1195
+ case '\t':
1196
+ case '\n':
1197
+ case '\f':
1198
+ case ' ':
1199
+ if (is_appropriate_end_tag(parser)) {
1200
+ finish_tag_name(parser);
1201
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1202
+ return CONTINUE;
1203
+ }
1204
+ break;
1205
+ case '/':
1206
+ if (is_appropriate_end_tag(parser)) {
1207
+ finish_tag_name(parser);
1208
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1209
+ return CONTINUE;
1210
+ }
1211
+ break;
1212
+ case '>':
1213
+ if (is_appropriate_end_tag(parser)) {
1214
+ finish_tag_name(parser);
1215
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1216
+ return emit_current_tag(parser, output);
1217
+ }
1218
+ break;
1219
+ }
1220
+ abandon_current_tag(parser);
1221
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1222
+ return emit_from_mark(parser, output);
1223
+ }
1224
+
1225
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1226
+ static StateResult handle_rawtext_lt_state (
1227
+ GumboParser* parser,
1228
+ GumboTokenizerState* tokenizer,
1229
+ int c,
1230
+ GumboToken* output
1231
+ ) {
1232
+ if (c == '/') {
1233
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1234
+ return CONTINUE;
1235
+ } else {
1236
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1237
+ return emit_from_mark(parser, output);
1238
+ }
1239
+ }
1240
+
1241
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1242
+ static StateResult handle_rawtext_end_tag_open_state (
1243
+ GumboParser* parser,
1244
+ GumboTokenizerState* tokenizer,
1245
+ int c,
1246
+ GumboToken* output
1247
+ ) {
1248
+ if (is_alpha(c)) {
1249
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1250
+ start_new_tag(parser, false);
1251
+ return CONTINUE;
1252
+ } else {
1253
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1254
+ return emit_from_mark(parser, output);
1255
+ }
1256
+ }
1257
+
1258
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1259
+ static StateResult handle_rawtext_end_tag_name_state (
1260
+ GumboParser* parser,
1261
+ GumboTokenizerState* tokenizer,
1262
+ int c,
1263
+ GumboToken* output
1264
+ ) {
1265
+ if (is_alpha(c)) {
1266
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1267
+ return CONTINUE;
1268
+ }
1269
+ switch (c) {
1270
+ case '\t':
1271
+ case '\n':
1272
+ case '\f':
1273
+ case ' ':
1274
+ if (is_appropriate_end_tag(parser)) {
1275
+ finish_tag_name(parser);
1276
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1277
+ return CONTINUE;
1278
+ }
1279
+ break;
1280
+ case '/':
1281
+ if (is_appropriate_end_tag(parser)) {
1282
+ finish_tag_name(parser);
1283
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1284
+ return CONTINUE;
1285
+ }
1286
+ break;
1287
+ case '>':
1288
+ if (is_appropriate_end_tag(parser)) {
1289
+ finish_tag_name(parser);
1290
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1291
+ return emit_current_tag(parser, output);
1292
+ }
1293
+ break;
1294
+ }
1295
+ abandon_current_tag(parser);
1296
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1297
+ return emit_from_mark(parser, output);
1298
+ }
1299
+
1300
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1301
+ static StateResult handle_script_data_lt_state (
1302
+ GumboParser* parser,
1303
+ GumboTokenizerState* tokenizer,
1304
+ int c,
1305
+ GumboToken* output
1306
+ ) {
1307
+ if (c == '/') {
1308
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1309
+ return CONTINUE;
1310
+ }
1311
+ if (c == '!') {
1312
+ // This is the only place we don't reconsume the input before emitting the
1313
+ // temporary buffer. Since the current position is stored and the current
1314
+ // character is not emitted, we need to advance the input and then
1315
+ // reconsume.
1316
+ utf8iterator_next(&tokenizer->_input);
1317
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1318
+ return emit_from_mark(parser, output);
1319
+ }
1320
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1321
+ return emit_from_mark(parser, output);
1322
+ }
1323
+
1324
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1325
+ static StateResult handle_script_data_end_tag_open_state (
1326
+ GumboParser* parser,
1327
+ GumboTokenizerState* tokenizer,
1328
+ int c,
1329
+ GumboToken* output
1330
+ ) {
1331
+ if (is_alpha(c)) {
1332
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1333
+ start_new_tag(parser, false);
1334
+ return CONTINUE;
1335
+ }
1336
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1337
+ return emit_from_mark(parser, output);
1338
+ }
1339
+
1340
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1341
+ static StateResult handle_script_data_end_tag_name_state (
1342
+ GumboParser* parser,
1343
+ GumboTokenizerState* tokenizer,
1344
+ int c,
1345
+ GumboToken* output
1346
+ ) {
1347
+ if (is_alpha(c)) {
1348
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1349
+ return CONTINUE;
1350
+ }
1351
+ switch (c) {
1352
+ case '\t':
1353
+ case '\n':
1354
+ case '\f':
1355
+ case ' ':
1356
+ if (is_appropriate_end_tag(parser)) {
1357
+ finish_tag_name(parser);
1358
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1359
+ return CONTINUE;
1360
+ }
1361
+ break;
1362
+ case '/':
1363
+ if (is_appropriate_end_tag(parser)) {
1364
+ finish_tag_name(parser);
1365
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1366
+ return CONTINUE;
1367
+ }
1368
+ break;
1369
+ case '>':
1370
+ if (is_appropriate_end_tag(parser)) {
1371
+ finish_tag_name(parser);
1372
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1373
+ return emit_current_tag(parser, output);
1374
+ }
1375
+ break;
1376
+ }
1377
+ abandon_current_tag(parser);
1378
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1379
+ return emit_from_mark(parser, output);
1380
+ }
1381
+
1382
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1383
+ static StateResult handle_script_data_escaped_start_state (
1384
+ GumboParser* parser,
1385
+ GumboTokenizerState* tokenizer,
1386
+ int c,
1387
+ GumboToken* output
1388
+ ) {
1389
+ if (c == '-') {
1390
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1391
+ return emit_char(parser, c, output);
1392
+ }
1393
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1394
+ return CONTINUE;
1395
+ }
1396
+
1397
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1398
+ static StateResult handle_script_data_escaped_start_dash_state (
1399
+ GumboParser* parser,
1400
+ GumboTokenizerState* tokenizer,
1401
+ int c,
1402
+ GumboToken* output
1403
+ ) {
1404
+ if (c == '-') {
1405
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1406
+ return emit_char(parser, c, output);
1407
+ } else {
1408
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1409
+ return CONTINUE;
1410
+ }
1411
+ }
1412
+
1413
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1414
+ static StateResult handle_script_data_escaped_state (
1415
+ GumboParser* parser,
1416
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1417
+ int c,
1418
+ GumboToken* output
1419
+ ) {
1420
+ switch (c) {
1421
+ case '-':
1422
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1423
+ return emit_char(parser, c, output);
1424
+ case '<':
1425
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1426
+ clear_temporary_buffer(parser);
1427
+ set_mark(parser);
1428
+ return CONTINUE;
1429
+ case '\0':
1430
+ return emit_replacement_char(parser, output);
1431
+ case -1:
1432
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1433
+ return emit_eof(parser, output);
1434
+ default:
1435
+ return emit_char(parser, c, output);
1436
+ }
1437
+ }
1438
+
1439
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1440
+ static StateResult handle_script_data_escaped_dash_state (
1441
+ GumboParser* parser,
1442
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1443
+ int c,
1444
+ GumboToken* output
1445
+ ) {
1446
+ switch (c) {
1447
+ case '-':
1448
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1449
+ return emit_char(parser, c, output);
1450
+ case '<':
1451
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1452
+ clear_temporary_buffer(parser);
1453
+ set_mark(parser);
1454
+ return CONTINUE;
1455
+ case '\0':
1456
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1457
+ return emit_replacement_char(parser, output);
1458
+ case -1:
1459
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1460
+ return emit_eof(parser, output);
1461
+ default:
1462
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1463
+ return emit_char(parser, c, output);
1464
+ }
1465
+ }
1466
+
1467
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1468
+ static StateResult handle_script_data_escaped_dash_dash_state (
1469
+ GumboParser* parser,
1470
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1471
+ int c,
1472
+ GumboToken* output
1473
+ ) {
1474
+ switch (c) {
1475
+ case '-':
1476
+ return emit_char(parser, c, output);
1477
+ case '<':
1478
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1479
+ clear_temporary_buffer(parser);
1480
+ set_mark(parser);
1481
+ return CONTINUE;
1482
+ case '>':
1483
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1484
+ return emit_char(parser, c, output);
1485
+ case '\0':
1486
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1487
+ return emit_replacement_char(parser, output);
1488
+ case -1:
1489
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1490
+ return emit_eof(parser, output);
1491
+ default:
1492
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1493
+ return emit_char(parser, c, output);
1494
+ }
1495
+ }
1496
+
1497
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1498
+ static StateResult handle_script_data_escaped_lt_state (
1499
+ GumboParser* parser,
1500
+ GumboTokenizerState* tokenizer,
1501
+ int c,
1502
+ GumboToken* output
1503
+ ) {
1504
+ assert(temporary_buffer_is_empty(parser));
1505
+ if (c == '/') {
1506
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1507
+ return CONTINUE;
1508
+ }
1509
+ if (is_alpha(c)) {
1510
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1511
+ return emit_from_mark(parser, output);
1512
+ }
1513
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1514
+ return emit_from_mark(parser, output);
1515
+ }
1516
+
1517
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1518
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1519
+ GumboParser* parser,
1520
+ GumboTokenizerState* tokenizer,
1521
+ int c,
1522
+ GumboToken* output
1523
+ ) {
1524
+ if (is_alpha(c)) {
1525
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1526
+ start_new_tag(parser, false);
1527
+ return CONTINUE;
1528
+ }
1529
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1530
+ return emit_from_mark(parser, output);
1531
+ }
1532
+
1533
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1534
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1535
+ GumboParser* parser,
1536
+ GumboTokenizerState* tokenizer,
1537
+ int c,
1538
+ GumboToken* output
1539
+ ) {
1540
+ if (is_alpha(c)) {
1541
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1542
+ return CONTINUE;
1543
+ }
1544
+ switch (c) {
1545
+ case '\t':
1546
+ case '\n':
1547
+ case '\f':
1548
+ case ' ':
1549
+ if (is_appropriate_end_tag(parser)) {
1550
+ finish_tag_name(parser);
1551
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1552
+ return CONTINUE;
1553
+ }
1554
+ break;
1555
+ case '/':
1556
+ if (is_appropriate_end_tag(parser)) {
1557
+ finish_tag_name(parser);
1558
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1559
+ return CONTINUE;
1560
+ }
1561
+ break;
1562
+ case '>':
1563
+ if (is_appropriate_end_tag(parser)) {
1564
+ finish_tag_name(parser);
1565
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1566
+ return emit_current_tag(parser, output);
1567
+ }
1568
+ break;
1569
+ }
1570
+ abandon_current_tag(parser);
1571
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1572
+ return emit_from_mark(parser, output);
1573
+ }
1574
+
1575
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1576
+ static StateResult handle_script_data_double_escaped_start_state (
1577
+ GumboParser* parser,
1578
+ GumboTokenizerState* tokenizer,
1579
+ int c,
1580
+ GumboToken* output
1581
+ ) {
1582
+ switch (c) {
1583
+ case '\t':
1584
+ case '\n':
1585
+ case '\f':
1586
+ case ' ':
1587
+ case '/':
1588
+ case '>':
1589
+ gumbo_tokenizer_set_state (
1590
+ parser,
1591
+ gumbo_string_equals (
1592
+ &kScriptTag,
1593
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1594
+ )
1595
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1596
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1597
+ );
1598
+ return emit_char(parser, c, output);
1599
+ }
1600
+ if (is_alpha(c)) {
1601
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1602
+ return emit_char(parser, c, output);
1603
+ }
1604
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1605
+ return CONTINUE;
1606
+ }
1607
+
1608
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1609
+ static StateResult handle_script_data_double_escaped_state (
1610
+ GumboParser* parser,
1611
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1612
+ int c,
1613
+ GumboToken* output
1614
+ ) {
1615
+ switch (c) {
1616
+ case '-':
1617
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1618
+ return emit_char(parser, c, output);
1619
+ case '<':
1620
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1621
+ return emit_char(parser, c, output);
1622
+ case '\0':
1623
+ return emit_replacement_char(parser, output);
1624
+ case -1:
1625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1626
+ return emit_eof(parser, output);
1627
+ default:
1628
+ return emit_char(parser, c, output);
1629
+ }
1630
+ }
1631
+
1632
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1633
+ static StateResult handle_script_data_double_escaped_dash_state (
1634
+ GumboParser* parser,
1635
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1636
+ int c,
1637
+ GumboToken* output
1638
+ ) {
1639
+ switch (c) {
1640
+ case '-':
1641
+ gumbo_tokenizer_set_state(
1642
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1643
+ return emit_char(parser, c, output);
1644
+ case '<':
1645
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1646
+ return emit_char(parser, c, output);
1647
+ case '\0':
1648
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1649
+ return emit_replacement_char(parser, output);
1650
+ case -1:
1651
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1652
+ return emit_eof(parser, output);
1653
+ default:
1654
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1655
+ return emit_char(parser, c, output);
1656
+ }
1657
+ }
1658
+
1659
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1660
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1661
+ GumboParser* parser,
1662
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1663
+ int c,
1664
+ GumboToken* output
1665
+ ) {
1666
+ switch (c) {
1667
+ case '-':
1668
+ return emit_char(parser, c, output);
1669
+ case '<':
1670
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1671
+ return emit_char(parser, c, output);
1672
+ case '>':
1673
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1674
+ return emit_char(parser, c, output);
1675
+ case '\0':
1676
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1677
+ return emit_replacement_char(parser, output);
1678
+ case -1:
1679
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1680
+ return emit_eof(parser, output);
1681
+ default:
1682
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1683
+ return emit_char(parser, c, output);
1684
+ }
1685
+ }
1686
+
1687
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1688
+ static StateResult handle_script_data_double_escaped_lt_state (
1689
+ GumboParser* parser,
1690
+ GumboTokenizerState* tokenizer,
1691
+ int c,
1692
+ GumboToken* output
1693
+ ) {
1694
+ if (c == '/') {
1695
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1696
+ clear_temporary_buffer(parser);
1697
+ return emit_char(parser, c, output);
1698
+ } else {
1699
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1700
+ return CONTINUE;
1701
+ }
1702
+ }
1703
+
1704
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1705
+ static StateResult handle_script_data_double_escaped_end_state (
1706
+ GumboParser* parser,
1707
+ GumboTokenizerState* tokenizer,
1708
+ int c,
1709
+ GumboToken* output
1710
+ ) {
1711
+ switch (c) {
1712
+ case '\t':
1713
+ case '\n':
1714
+ case '\f':
1715
+ case ' ':
1716
+ case '/':
1717
+ case '>':
1718
+ gumbo_tokenizer_set_state(
1719
+ parser, gumbo_string_equals(&kScriptTag,
1720
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1721
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1722
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1723
+ return emit_char(parser, c, output);
1724
+ }
1725
+ if (is_alpha(c)) {
1726
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1727
+ return emit_char(parser, c, output);
1728
+ }
1729
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1730
+ return CONTINUE;
1731
+ }
1732
+
1733
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1734
+ static StateResult handle_before_attr_name_state (
1735
+ GumboParser* parser,
1736
+ GumboTokenizerState* tokenizer,
1737
+ int c,
1738
+ GumboToken* output
1739
+ ) {
1740
+ switch (c) {
1741
+ case '\t':
1742
+ case '\n':
1743
+ case '\f':
1744
+ case ' ':
1745
+ return CONTINUE;
1746
+ case '/':
1747
+ case '>':
1748
+ case -1:
1749
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1750
+ return CONTINUE;
1751
+ case '=':
1752
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1753
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1754
+ append_char_to_tag_buffer(parser, c, true);
1755
+ return CONTINUE;
1756
+ default:
1757
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1758
+ return CONTINUE;
1759
+ }
1760
+ }
1761
+
1762
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1763
+ static StateResult handle_attr_name_state (
1764
+ GumboParser* parser,
1765
+ GumboTokenizerState* tokenizer,
1766
+ int c,
1767
+ GumboToken* output
1768
+ ) {
1769
+ switch (c) {
1770
+ case '\t':
1771
+ case '\n':
1772
+ case '\f':
1773
+ case ' ':
1774
+ case '/':
1775
+ case '>':
1776
+ case -1:
1777
+ finish_attribute_name(parser);
1778
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1779
+ return CONTINUE;
1780
+ case '=':
1781
+ finish_attribute_name(parser);
1782
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1783
+ return CONTINUE;
1784
+ case '\0':
1785
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1786
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1787
+ return CONTINUE;
1788
+ case '"':
1789
+ case '\'':
1790
+ case '<':
1791
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1792
+ // Fall through.
1793
+ default:
1794
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1795
+ return CONTINUE;
1796
+ }
1797
+ }
1798
+
1799
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1800
+ static StateResult handle_after_attr_name_state (
1801
+ GumboParser* parser,
1802
+ GumboTokenizerState* tokenizer,
1803
+ int c,
1804
+ GumboToken* output
1805
+ ) {
1806
+ switch (c) {
1807
+ case '\t':
1808
+ case '\n':
1809
+ case '\f':
1810
+ case ' ':
1811
+ return CONTINUE;
1812
+ case '/':
1813
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1814
+ return CONTINUE;
1815
+ case '=':
1816
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1817
+ return CONTINUE;
1818
+ case '>':
1819
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1820
+ return emit_current_tag(parser, output);
1821
+ case -1:
1822
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1823
+ abandon_current_tag(parser);
1824
+ return emit_eof(parser, output);
1825
+ default:
1826
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1827
+ return CONTINUE;
1828
+ }
1829
+ }
1830
+
1831
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
1832
+ static StateResult handle_before_attr_value_state (
1833
+ GumboParser* parser,
1834
+ GumboTokenizerState* tokenizer,
1835
+ int c,
1836
+ GumboToken* output
1837
+ ) {
1838
+ switch (c) {
1839
+ case '\t':
1840
+ case '\n':
1841
+ case '\f':
1842
+ case ' ':
1843
+ return CONTINUE;
1844
+ case '"':
1845
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1846
+ reset_tag_buffer_start_point(parser);
1847
+ return CONTINUE;
1848
+ case '\'':
1849
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1850
+ reset_tag_buffer_start_point(parser);
1851
+ return CONTINUE;
1852
+ case '>':
1853
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1854
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1855
+ return emit_current_tag(parser, output);
1856
+ }
1857
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1858
+ return CONTINUE;
1859
+ }
1860
+
1861
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
1862
+ static StateResult handle_attr_value_double_quoted_state (
1863
+ GumboParser* parser,
1864
+ GumboTokenizerState* tokenizer,
1865
+ int c,
1866
+ GumboToken* output
1867
+ ) {
1868
+ switch (c) {
1869
+ case '"':
1870
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1871
+ return CONTINUE;
1872
+ case '&':
1873
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1874
+ set_mark(parser);
1875
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1876
+ return CONTINUE;
1877
+ case '\0':
1878
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1879
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1880
+ return CONTINUE;
1881
+ case -1:
1882
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1883
+ abandon_current_tag(parser);
1884
+ return emit_eof(parser, output);
1885
+ default:
1886
+ append_char_to_tag_buffer(parser, c, false);
1887
+ return CONTINUE;
1888
+ }
1889
+ }
1890
+
1891
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
1892
+ static StateResult handle_attr_value_single_quoted_state (
1893
+ GumboParser* parser,
1894
+ GumboTokenizerState* tokenizer,
1895
+ int c,
1896
+ GumboToken* output
1897
+ ) {
1898
+ switch (c) {
1899
+ case '\'':
1900
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1901
+ return CONTINUE;
1902
+ case '&':
1903
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1904
+ set_mark(parser);
1905
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1906
+ return CONTINUE;
1907
+ case '\0':
1908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1909
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1910
+ return CONTINUE;
1911
+ case -1:
1912
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1913
+ abandon_current_tag(parser);
1914
+ return emit_eof(parser, output);
1915
+ default:
1916
+ append_char_to_tag_buffer(parser, c, false);
1917
+ return CONTINUE;
1918
+ }
1919
+ }
1920
+
1921
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
1922
+ static StateResult handle_attr_value_unquoted_state (
1923
+ GumboParser* parser,
1924
+ GumboTokenizerState* tokenizer,
1925
+ int c,
1926
+ GumboToken* output
1927
+ ) {
1928
+ switch (c) {
1929
+ case '\t':
1930
+ case '\n':
1931
+ case '\f':
1932
+ case ' ':
1933
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1934
+ finish_attribute_value(parser);
1935
+ return CONTINUE;
1936
+ case '&':
1937
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1938
+ set_mark(parser);
1939
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1940
+ return CONTINUE;
1941
+ case '>':
1942
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1943
+ finish_attribute_value(parser);
1944
+ return emit_current_tag(parser, output);
1945
+ case '\0':
1946
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1947
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1948
+ return CONTINUE;
1949
+ case -1:
1950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1951
+ abandon_current_tag(parser);
1952
+ return emit_eof(parser, output);
1953
+ case '"':
1954
+ case '\'':
1955
+ case '<':
1956
+ case '=':
1957
+ case '`':
1958
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
1959
+ // Fall through.
1960
+ default:
1961
+ append_char_to_tag_buffer(parser, c, true);
1962
+ return CONTINUE;
1963
+ }
1964
+ }
1965
+
1966
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
1967
+ static StateResult handle_after_attr_value_quoted_state (
1968
+ GumboParser* parser,
1969
+ GumboTokenizerState* tokenizer,
1970
+ int c,
1971
+ GumboToken* output
1972
+ ) {
1973
+ finish_attribute_value(parser);
1974
+ switch (c) {
1975
+ case '\t':
1976
+ case '\n':
1977
+ case '\f':
1978
+ case ' ':
1979
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1980
+ return CONTINUE;
1981
+ case '/':
1982
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1983
+ return CONTINUE;
1984
+ case '>':
1985
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1986
+ return emit_current_tag(parser, output);
1987
+ case -1:
1988
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1989
+ abandon_current_tag(parser);
1990
+ return emit_eof(parser, output);
1991
+ default:
1992
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1993
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1994
+ return CONTINUE;
1995
+ }
1996
+ }
1997
+
1998
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
1999
+ static StateResult handle_self_closing_start_tag_state (
2000
+ GumboParser* parser,
2001
+ GumboTokenizerState* tokenizer,
2002
+ int c,
2003
+ GumboToken* output
2004
+ ) {
2005
+ switch (c) {
2006
+ case '>':
2007
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2008
+ tokenizer->_tag_state._is_self_closing = true;
2009
+ return emit_current_tag(parser, output);
2010
+ case -1:
2011
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2012
+ abandon_current_tag(parser);
2013
+ return emit_eof(parser, output);
2014
+ default:
2015
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2016
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2017
+ return CONTINUE;
2018
+ }
2019
+ }
2020
+
2021
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2022
+ static StateResult handle_bogus_comment_state (
2023
+ GumboParser* parser,
2024
+ GumboTokenizerState* tokenizer,
2025
+ int c,
2026
+ GumboToken* output
2027
+ ) {
2028
+ switch (c) {
2029
+ case '>':
2030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2031
+ return emit_comment(parser, output);
2032
+ case -1:
2033
+ // We need to emit the comment and then the EOF, so reconsume in data
2034
+ // state.
2035
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2036
+ return emit_comment(parser, output);
2037
+ case '\0':
2038
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2039
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2040
+ return CONTINUE;
2041
+ default:
2042
+ append_char_to_temporary_buffer(parser, c);
2043
+ return CONTINUE;
2044
+ }
2045
+ }
2046
+
2047
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2048
+ static StateResult handle_markup_declaration_open_state (
2049
+ GumboParser* parser,
2050
+ GumboTokenizerState* tokenizer,
2051
+ int UNUSED_ARG(c),
2052
+ GumboToken* UNUSED_ARG(output)
2053
+ ) {
2054
+ if (
2055
+ utf8iterator_maybe_consume_match (
2056
+ &tokenizer->_input,
2057
+ "--",
2058
+ sizeof("--") - 1,
2059
+ /* case sensitive */ true
2060
+ )
2061
+ ) {
2062
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2063
+ return CONTINUE;
2064
+ }
2065
+ if (
2066
+ utf8iterator_maybe_consume_match (
2067
+ &tokenizer->_input,
2068
+ "DOCTYPE",
2069
+ sizeof("DOCTYPE") - 1,
2070
+ /* case sensitive */ false
2071
+ )
2072
+ ) {
2073
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2074
+ // If we get here, we know we'll eventually emit a doctype token, so now is
2075
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
2076
+ // since then they'll leak if ownership never gets transferred to the
2077
+ // doctype token.
2078
+ tokenizer->_doc_type_state.name = gumbo_strdup("");
2079
+ tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2080
+ tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2081
+ return CONTINUE;
2082
+ }
2083
+ if (
2084
+ utf8iterator_maybe_consume_match (
2085
+ &tokenizer->_input,
2086
+ "[CDATA[", sizeof("[CDATA[") - 1,
2087
+ /* case sensitive */ true
2088
+ )
2089
+ ) {
2090
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2091
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2092
+ tokenizer->_is_in_cdata = true;
2093
+ // Start the token after the <![CDATA[.
2094
+ reset_token_start_point(tokenizer);
2095
+ } else {
2096
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2097
+ clear_temporary_buffer(parser);
2098
+ append_string_to_temporary_buffer (
2099
+ parser,
2100
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2101
+ );
2102
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2103
+ }
2104
+ return CONTINUE;
2105
+ }
2106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2108
+ clear_temporary_buffer(parser);
2109
+ return CONTINUE;
2110
+ }
2111
+
2112
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
2113
+ static StateResult handle_comment_start_state (
2114
+ GumboParser* parser,
2115
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2116
+ int c,
2117
+ GumboToken* output
2118
+ ) {
2119
+ switch (c) {
2120
+ case '-':
2121
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2122
+ return CONTINUE;
2123
+ case '>':
2124
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2125
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2126
+ return emit_comment(parser, output);
2127
+ default:
2128
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2129
+ return CONTINUE;
2130
+ }
2131
+ }
2132
+
2133
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
2134
+ static StateResult handle_comment_start_dash_state (
2135
+ GumboParser* parser,
2136
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2137
+ int c,
2138
+ GumboToken* output
2139
+ ) {
2140
+ switch (c) {
2141
+ case '-':
2142
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2143
+ return CONTINUE;
2144
+ case '>':
2145
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2146
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2147
+ return emit_comment(parser, output);
2148
+ case -1:
2149
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2150
+ // Switch to data to emit the EOF next.
2151
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2152
+ return emit_comment(parser, output);
2153
+ default:
2154
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2155
+ append_char_to_temporary_buffer(parser, '-');
2156
+ return CONTINUE;
2157
+ }
2158
+ }
2159
+
2160
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
2161
+ static StateResult handle_comment_state (
2162
+ GumboParser* parser,
2163
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2164
+ int c,
2165
+ GumboToken* output
2166
+ ) {
2167
+ switch (c) {
2168
+ case '<':
2169
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2170
+ append_char_to_temporary_buffer(parser, c);
2171
+ return CONTINUE;
2172
+ case '-':
2173
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2174
+ return CONTINUE;
2175
+ case '\0':
2176
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2177
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2178
+ return CONTINUE;
2179
+ case -1:
2180
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2181
+ // Switch to data to emit the EOF token next.
2182
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2183
+ return emit_comment(parser, output);
2184
+ default:
2185
+ append_char_to_temporary_buffer(parser, c);
2186
+ return CONTINUE;
2187
+ }
2188
+ }
2189
+
2190
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2191
+ static StateResult handle_comment_lt_state (
2192
+ GumboParser* parser,
2193
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2194
+ int c,
2195
+ GumboToken* output
2196
+ ) {
2197
+ switch (c) {
2198
+ case '!':
2199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2200
+ append_char_to_temporary_buffer(parser, c);
2201
+ return CONTINUE;
2202
+ case '<':
2203
+ append_char_to_temporary_buffer(parser, c);
2204
+ return CONTINUE;
2205
+ default:
2206
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2207
+ return CONTINUE;
2208
+ }
2209
+ }
2210
+
2211
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2212
+ static StateResult handle_comment_lt_bang_state (
2213
+ GumboParser* parser,
2214
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2215
+ int c,
2216
+ GumboToken* output
2217
+ ) {
2218
+ switch (c) {
2219
+ case '-':
2220
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2221
+ return CONTINUE;
2222
+ default:
2223
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2224
+ return CONTINUE;
2225
+ }
2226
+ }
2227
+
2228
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2229
+ static StateResult handle_comment_lt_bang_dash_state (
2230
+ GumboParser* parser,
2231
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2232
+ int c,
2233
+ GumboToken* output
2234
+ ) {
2235
+ switch (c) {
2236
+ case '-':
2237
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2238
+ return CONTINUE;
2239
+ default:
2240
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2241
+ return CONTINUE;
2242
+ }
2243
+ }
2244
+
2245
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2246
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2247
+ GumboParser* parser,
2248
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2249
+ int c,
2250
+ GumboToken* output
2251
+ ) {
2252
+ switch (c) {
2253
+ case '>':
2254
+ case -1:
2255
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2256
+ return CONTINUE;
2257
+ default:
2258
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2259
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2260
+ return CONTINUE;
2261
+ }
2262
+ }
2263
+
2264
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
2265
+ static StateResult handle_comment_end_dash_state (
2266
+ GumboParser* parser,
2267
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2268
+ int c,
2269
+ GumboToken* output
2270
+ ) {
2271
+ switch (c) {
2272
+ case '-':
2273
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2274
+ return CONTINUE;
2275
+ case -1:
2276
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2277
+ // Switch to data to emit EOF next.
2278
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2279
+ return emit_comment(parser, output);
2280
+ default:
2281
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2282
+ append_char_to_temporary_buffer(parser, '-');
2283
+ return CONTINUE;
2284
+ }
2285
+ }
2286
+
2287
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
2288
+ static StateResult handle_comment_end_state (
2289
+ GumboParser* parser,
2290
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2291
+ int c,
2292
+ GumboToken* output
2293
+ ) {
2294
+ switch (c) {
2295
+ case '>':
2296
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2297
+ return emit_comment(parser, output);
2298
+ case '!':
2299
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2300
+ return CONTINUE;
2301
+ case '-':
2302
+ append_char_to_temporary_buffer(parser, '-');
2303
+ return CONTINUE;
2304
+ case -1:
2305
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2306
+ // Switch to data to emit EOF next.
2307
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2308
+ return emit_comment(parser, output);
2309
+ default:
2310
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2311
+ append_char_to_temporary_buffer(parser, '-');
2312
+ append_char_to_temporary_buffer(parser, '-');
2313
+ return CONTINUE;
2314
+ }
2315
+ }
2316
+
2317
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
2318
+ static StateResult handle_comment_end_bang_state (
2319
+ GumboParser* parser,
2320
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2321
+ int c,
2322
+ GumboToken* output
2323
+ ) {
2324
+ switch (c) {
2325
+ case '-':
2326
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2327
+ append_char_to_temporary_buffer(parser, '-');
2328
+ append_char_to_temporary_buffer(parser, '-');
2329
+ append_char_to_temporary_buffer(parser, '!');
2330
+ return CONTINUE;
2331
+ case '>':
2332
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2333
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2334
+ return emit_comment(parser, output);
2335
+ case -1:
2336
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2337
+ // Switch to data to emit EOF next.
2338
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2339
+ return emit_comment(parser, output);
2340
+ default:
2341
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2342
+ append_char_to_temporary_buffer(parser, '-');
2343
+ append_char_to_temporary_buffer(parser, '-');
2344
+ append_char_to_temporary_buffer(parser, '!');
2345
+ return CONTINUE;
2346
+ }
2347
+ }
2348
+
2349
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
2350
+ static StateResult handle_doctype_state (
2351
+ GumboParser* parser,
2352
+ GumboTokenizerState* tokenizer,
2353
+ int c,
2354
+ GumboToken* output
2355
+ ) {
2356
+ assert(temporary_buffer_is_empty(parser));
2357
+ switch (c) {
2358
+ case '\t':
2359
+ case '\n':
2360
+ case '\f':
2361
+ case ' ':
2362
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2363
+ return CONTINUE;
2364
+ case '>':
2365
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2366
+ return CONTINUE;
2367
+ case -1:
2368
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2369
+ tokenizer->_doc_type_state.force_quirks = true;
2370
+ // Switch to data to emit EOF next.
2371
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2372
+ return emit_doctype(parser, output);
2373
+ default:
2374
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2375
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2376
+ return CONTINUE;
2377
+ }
2378
+ }
2379
+
2380
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
2381
+ static StateResult handle_before_doctype_name_state (
2382
+ GumboParser* parser,
2383
+ GumboTokenizerState* tokenizer,
2384
+ int c,
2385
+ GumboToken* output
2386
+ ) {
2387
+ switch (c) {
2388
+ case '\t':
2389
+ case '\n':
2390
+ case '\f':
2391
+ case ' ':
2392
+ return CONTINUE;
2393
+ case '\0':
2394
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2395
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2396
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2397
+ return CONTINUE;
2398
+ case '>':
2399
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2400
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2401
+ tokenizer->_doc_type_state.force_quirks = true;
2402
+ return emit_doctype(parser, output);
2403
+ case -1:
2404
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2405
+ tokenizer->_doc_type_state.force_quirks = true;
2406
+ // Switch to data to emit EOF next.
2407
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2408
+ return emit_doctype(parser, output);
2409
+ default:
2410
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2411
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2412
+ return CONTINUE;
2413
+ }
2414
+ }
2415
+
2416
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
2417
+ static StateResult handle_doctype_name_state (
2418
+ GumboParser* parser,
2419
+ GumboTokenizerState* tokenizer,
2420
+ int c,
2421
+ GumboToken* output
2422
+ ) {
2423
+ switch (c) {
2424
+ case '\t':
2425
+ case '\n':
2426
+ case '\f':
2427
+ case ' ':
2428
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2429
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2430
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2431
+ return CONTINUE;
2432
+ case '>':
2433
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2434
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2435
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2436
+ return emit_doctype(parser, output);
2437
+ case '\0':
2438
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2439
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2440
+ return CONTINUE;
2441
+ case -1:
2442
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2443
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2444
+ tokenizer->_doc_type_state.force_quirks = true;
2445
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2446
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2447
+ return emit_doctype(parser, output);
2448
+ default:
2449
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2450
+ return CONTINUE;
2451
+ }
2452
+ }
2453
+
2454
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
2455
+ static StateResult handle_after_doctype_name_state (
2456
+ GumboParser* parser,
2457
+ GumboTokenizerState* tokenizer,
2458
+ int c,
2459
+ GumboToken* output
2460
+ ) {
2461
+ switch (c) {
2462
+ case '\t':
2463
+ case '\n':
2464
+ case '\f':
2465
+ case ' ':
2466
+ return CONTINUE;
2467
+ case '>':
2468
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2469
+ return emit_doctype(parser, output);
2470
+ case -1:
2471
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2472
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2473
+ tokenizer->_doc_type_state.force_quirks = true;
2474
+ return emit_doctype(parser, output);
2475
+ default:
2476
+ if (utf8iterator_maybe_consume_match(
2477
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2478
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2479
+ } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2480
+ sizeof("SYSTEM") - 1, false)) {
2481
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2482
+ } else {
2483
+ tokenizer_add_parse_error(
2484
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2485
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2486
+ tokenizer->_doc_type_state.force_quirks = true;
2487
+ }
2488
+ return CONTINUE;
2489
+ }
2490
+ }
2491
+
2492
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
2493
+ static StateResult handle_after_doctype_public_keyword_state (
2494
+ GumboParser* parser,
2495
+ GumboTokenizerState* tokenizer,
2496
+ int c,
2497
+ GumboToken* output
2498
+ ) {
2499
+ switch (c) {
2500
+ case '\t':
2501
+ case '\n':
2502
+ case '\f':
2503
+ case ' ':
2504
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2505
+ return CONTINUE;
2506
+ case '"':
2507
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2508
+ assert(temporary_buffer_is_empty(parser));
2509
+ gumbo_tokenizer_set_state(
2510
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2511
+ return CONTINUE;
2512
+ case '\'':
2513
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2514
+ assert(temporary_buffer_is_empty(parser));
2515
+ gumbo_tokenizer_set_state(
2516
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2517
+ return CONTINUE;
2518
+ case '>':
2519
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2520
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2521
+ tokenizer->_doc_type_state.force_quirks = true;
2522
+ return emit_doctype(parser, output);
2523
+ case -1:
2524
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2525
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2526
+ tokenizer->_doc_type_state.force_quirks = true;
2527
+ return emit_doctype(parser, output);
2528
+ default:
2529
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2530
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2531
+ tokenizer->_doc_type_state.force_quirks = true;
2532
+ return CONTINUE;
2533
+ }
2534
+ }
2535
+
2536
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
2537
+ static StateResult handle_before_doctype_public_id_state (
2538
+ GumboParser* parser,
2539
+ GumboTokenizerState* tokenizer,
2540
+ int c,
2541
+ GumboToken* output
2542
+ ) {
2543
+ switch (c) {
2544
+ case '\t':
2545
+ case '\n':
2546
+ case '\f':
2547
+ case ' ':
2548
+ return CONTINUE;
2549
+ case '"':
2550
+ assert(temporary_buffer_is_empty(parser));
2551
+ gumbo_tokenizer_set_state(
2552
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2553
+ return CONTINUE;
2554
+ case '\'':
2555
+ assert(temporary_buffer_is_empty(parser));
2556
+ gumbo_tokenizer_set_state(
2557
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2558
+ return CONTINUE;
2559
+ case '>':
2560
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2561
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2562
+ tokenizer->_doc_type_state.force_quirks = true;
2563
+ return emit_doctype(parser, output);
2564
+ case -1:
2565
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2566
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2567
+ tokenizer->_doc_type_state.force_quirks = true;
2568
+ return emit_doctype(parser, output);
2569
+ default:
2570
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2571
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2572
+ tokenizer->_doc_type_state.force_quirks = true;
2573
+ return CONTINUE;
2574
+ }
2575
+ }
2576
+
2577
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
2578
+ static StateResult handle_doctype_public_id_double_quoted_state (
2579
+ GumboParser* parser,
2580
+ GumboTokenizerState* tokenizer,
2581
+ int c,
2582
+ GumboToken* output
2583
+ ) {
2584
+ switch (c) {
2585
+ case '"':
2586
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2587
+ finish_doctype_public_id(parser);
2588
+ return CONTINUE;
2589
+ case '\0':
2590
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2591
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2592
+ return CONTINUE;
2593
+ case '>':
2594
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2595
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2596
+ tokenizer->_doc_type_state.force_quirks = true;
2597
+ finish_doctype_public_id(parser);
2598
+ return emit_doctype(parser, output);
2599
+ case -1:
2600
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2601
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2602
+ tokenizer->_doc_type_state.force_quirks = true;
2603
+ finish_doctype_public_id(parser);
2604
+ return emit_doctype(parser, output);
2605
+ default:
2606
+ append_char_to_temporary_buffer(parser, c);
2607
+ return CONTINUE;
2608
+ }
2609
+ }
2610
+
2611
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
2612
+ static StateResult handle_doctype_public_id_single_quoted_state (
2613
+ GumboParser* parser,
2614
+ GumboTokenizerState* tokenizer,
2615
+ int c,
2616
+ GumboToken* output
2617
+ ) {
2618
+ switch (c) {
2619
+ case '\'':
2620
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2621
+ finish_doctype_public_id(parser);
2622
+ return CONTINUE;
2623
+ case '\0':
2624
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2625
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2626
+ return CONTINUE;
2627
+ case '>':
2628
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2629
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2630
+ tokenizer->_doc_type_state.force_quirks = true;
2631
+ finish_doctype_public_id(parser);
2632
+ return emit_doctype(parser, output);
2633
+ case -1:
2634
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2635
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2636
+ tokenizer->_doc_type_state.force_quirks = true;
2637
+ finish_doctype_public_id(parser);
2638
+ return emit_doctype(parser, output);
2639
+ default:
2640
+ append_char_to_temporary_buffer(parser, c);
2641
+ return CONTINUE;
2642
+ }
2643
+ }
2644
+
2645
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
2646
+ static StateResult handle_after_doctype_public_id_state (
2647
+ GumboParser* parser,
2648
+ GumboTokenizerState* tokenizer,
2649
+ int c,
2650
+ GumboToken* output
2651
+ ) {
2652
+ switch (c) {
2653
+ case '\t':
2654
+ case '\n':
2655
+ case '\f':
2656
+ case ' ':
2657
+ gumbo_tokenizer_set_state(
2658
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2659
+ return CONTINUE;
2660
+ case '>':
2661
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2662
+ return emit_doctype(parser, output);
2663
+ case '"':
2664
+ tokenizer_add_parse_error (
2665
+ parser,
2666
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2667
+ );
2668
+ assert(temporary_buffer_is_empty(parser));
2669
+ gumbo_tokenizer_set_state(
2670
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2671
+ return CONTINUE;
2672
+ case '\'':
2673
+ tokenizer_add_parse_error (
2674
+ parser,
2675
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2676
+ );
2677
+ assert(temporary_buffer_is_empty(parser));
2678
+ gumbo_tokenizer_set_state(
2679
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2680
+ return CONTINUE;
2681
+ case -1:
2682
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2683
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2684
+ tokenizer->_doc_type_state.force_quirks = true;
2685
+ return emit_doctype(parser, output);
2686
+ default:
2687
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2688
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2689
+ tokenizer->_doc_type_state.force_quirks = true;
2690
+ return CONTINUE;
2691
+ }
2692
+ }
2693
+
2694
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
2695
+ static StateResult handle_between_doctype_public_system_id_state (
2696
+ GumboParser* parser,
2697
+ GumboTokenizerState* tokenizer,
2698
+ int c,
2699
+ GumboToken* output
2700
+ ) {
2701
+ switch (c) {
2702
+ case '\t':
2703
+ case '\n':
2704
+ case '\f':
2705
+ case ' ':
2706
+ return CONTINUE;
2707
+ case '>':
2708
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2709
+ return emit_doctype(parser, output);
2710
+ case '"':
2711
+ assert(temporary_buffer_is_empty(parser));
2712
+ gumbo_tokenizer_set_state(
2713
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2714
+ return CONTINUE;
2715
+ case '\'':
2716
+ assert(temporary_buffer_is_empty(parser));
2717
+ gumbo_tokenizer_set_state(
2718
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2719
+ return CONTINUE;
2720
+ case -1:
2721
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2722
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2723
+ tokenizer->_doc_type_state.force_quirks = true;
2724
+ return emit_doctype(parser, output);
2725
+ default:
2726
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2727
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2728
+ tokenizer->_doc_type_state.force_quirks = true;
2729
+ return CONTINUE;
2730
+ }
2731
+ }
2732
+
2733
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
2734
+ static StateResult handle_after_doctype_system_keyword_state (
2735
+ GumboParser* parser,
2736
+ GumboTokenizerState* tokenizer,
2737
+ int c,
2738
+ GumboToken* output
2739
+ ) {
2740
+ switch (c) {
2741
+ case '\t':
2742
+ case '\n':
2743
+ case '\f':
2744
+ case ' ':
2745
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2746
+ return CONTINUE;
2747
+ case '"':
2748
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2749
+ assert(temporary_buffer_is_empty(parser));
2750
+ gumbo_tokenizer_set_state(
2751
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2752
+ return CONTINUE;
2753
+ case '\'':
2754
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2755
+ assert(temporary_buffer_is_empty(parser));
2756
+ gumbo_tokenizer_set_state(
2757
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2758
+ return CONTINUE;
2759
+ case '>':
2760
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2761
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2762
+ tokenizer->_doc_type_state.force_quirks = true;
2763
+ return emit_doctype(parser, output);
2764
+ case -1:
2765
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2766
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2767
+ tokenizer->_doc_type_state.force_quirks = true;
2768
+ return emit_doctype(parser, output);
2769
+ default:
2770
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2771
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2772
+ tokenizer->_doc_type_state.force_quirks = true;
2773
+ return CONTINUE;
2774
+ }
2775
+ }
2776
+
2777
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
2778
+ static StateResult handle_before_doctype_system_id_state (
2779
+ GumboParser* parser,
2780
+ GumboTokenizerState* tokenizer,
2781
+ int c,
2782
+ GumboToken* output
2783
+ ) {
2784
+ switch (c) {
2785
+ case '\t':
2786
+ case '\n':
2787
+ case '\f':
2788
+ case ' ':
2789
+ return CONTINUE;
2790
+ case '"':
2791
+ assert(temporary_buffer_is_empty(parser));
2792
+ gumbo_tokenizer_set_state(
2793
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2794
+ return CONTINUE;
2795
+ case '\'':
2796
+ assert(temporary_buffer_is_empty(parser));
2797
+ gumbo_tokenizer_set_state(
2798
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2799
+ return CONTINUE;
2800
+ case '>':
2801
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2802
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2803
+ tokenizer->_doc_type_state.force_quirks = true;
2804
+ return emit_doctype(parser, output);
2805
+ case -1:
2806
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2807
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2808
+ tokenizer->_doc_type_state.force_quirks = true;
2809
+ return emit_doctype(parser, output);
2810
+ default:
2811
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2812
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2813
+ tokenizer->_doc_type_state.force_quirks = true;
2814
+ return CONTINUE;
2815
+ }
2816
+ }
2817
+
2818
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
2819
+ static StateResult handle_doctype_system_id_double_quoted_state (
2820
+ GumboParser* parser,
2821
+ GumboTokenizerState* tokenizer,
2822
+ int c,
2823
+ GumboToken* output
2824
+ ) {
2825
+ switch (c) {
2826
+ case '"':
2827
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2828
+ finish_doctype_system_id(parser);
2829
+ return CONTINUE;
2830
+ case '\0':
2831
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2832
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2833
+ return CONTINUE;
2834
+ case '>':
2835
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
2836
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2837
+ tokenizer->_doc_type_state.force_quirks = true;
2838
+ finish_doctype_system_id(parser);
2839
+ return emit_doctype(parser, output);
2840
+ case -1:
2841
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2842
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2843
+ tokenizer->_doc_type_state.force_quirks = true;
2844
+ finish_doctype_system_id(parser);
2845
+ return emit_doctype(parser, output);
2846
+ default:
2847
+ append_char_to_temporary_buffer(parser, c);
2848
+ return CONTINUE;
2849
+ }
2850
+ }
2851
+
2852
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
2853
+ static StateResult handle_doctype_system_id_single_quoted_state (
2854
+ GumboParser* parser,
2855
+ GumboTokenizerState* tokenizer,
2856
+ int c,
2857
+ GumboToken* output
2858
+ ) {
2859
+ switch (c) {
2860
+ case '\'':
2861
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2862
+ finish_doctype_system_id(parser);
2863
+ return CONTINUE;
2864
+ case '\0':
2865
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2866
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2867
+ return CONTINUE;
2868
+ case '>':
2869
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
2870
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2871
+ tokenizer->_doc_type_state.force_quirks = true;
2872
+ finish_doctype_system_id(parser);
2873
+ return emit_doctype(parser, output);
2874
+ case -1:
2875
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2876
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2877
+ tokenizer->_doc_type_state.force_quirks = true;
2878
+ finish_doctype_system_id(parser);
2879
+ return emit_doctype(parser, output);
2880
+ default:
2881
+ append_char_to_temporary_buffer(parser, c);
2882
+ return CONTINUE;
2883
+ }
2884
+ }
2885
+
2886
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
2887
+ static StateResult handle_after_doctype_system_id_state (
2888
+ GumboParser* parser,
2889
+ GumboTokenizerState* tokenizer,
2890
+ int c,
2891
+ GumboToken* output
2892
+ ) {
2893
+ switch (c) {
2894
+ case '\t':
2895
+ case '\n':
2896
+ case '\f':
2897
+ case ' ':
2898
+ return CONTINUE;
2899
+ case '>':
2900
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2901
+ return emit_doctype(parser, output);
2902
+ case -1:
2903
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2904
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2905
+ tokenizer->_doc_type_state.force_quirks = true;
2906
+ return emit_doctype(parser, output);
2907
+ default:
2908
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2909
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2910
+ return CONTINUE;
2911
+ }
2912
+ }
2913
+
2914
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
2915
+ static StateResult handle_bogus_doctype_state (
2916
+ GumboParser* parser,
2917
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2918
+ int c,
2919
+ GumboToken* output
2920
+ ) {
2921
+ switch (c) {
2922
+ case '>':
2923
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2924
+ return emit_doctype(parser, output);
2925
+ case '\0':
2926
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2927
+ return CONTINUE;
2928
+ case -1:
2929
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2930
+ return emit_doctype(parser, output);
2931
+ default:
2932
+ return CONTINUE;
2933
+ }
2934
+ }
2935
+
2936
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
2937
+ static StateResult handle_cdata_section_state (
2938
+ GumboParser* parser,
2939
+ GumboTokenizerState* tokenizer,
2940
+ int c,
2941
+ GumboToken* output
2942
+ ) {
2943
+ switch (c) {
2944
+ case ']':
2945
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2946
+ set_mark(parser);
2947
+ return CONTINUE;
2948
+ case -1:
2949
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2950
+ return emit_eof(parser, output);
2951
+ default:
2952
+ return emit_char(parser, c, output);
2953
+ }
2954
+ }
2955
+
2956
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2957
+ static StateResult handle_cdata_section_bracket_state (
2958
+ GumboParser* parser,
2959
+ GumboTokenizerState* tokenizer,
2960
+ int c,
2961
+ GumboToken* output
2962
+ ) {
2963
+ switch (c) {
2964
+ case ']':
2965
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2966
+ return CONTINUE;
2967
+ default:
2968
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2969
+ // Emit the ].
2970
+ return emit_from_mark(parser, output);
2971
+ }
2972
+ }
2973
+
2974
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2975
+ static StateResult handle_cdata_section_end_state (
2976
+ GumboParser* parser,
2977
+ GumboTokenizerState* tokenizer,
2978
+ int c,
2979
+ GumboToken* output
2980
+ ) {
2981
+ switch (c) {
2982
+ case ']':
2983
+ {
2984
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2985
+ // of the three in a row we've seen. So let's emit one token from the
2986
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2987
+ // advance one). Next, let's clear the temporary buffer which will set the
2988
+ // mark to the middle of the three brackets. Finally, let's move to the
2989
+ // appropriate state.
2990
+ StateResult result = emit_from_mark(parser, output);
2991
+ tokenizer->_resume_pos = NULL;
2992
+ set_mark(parser);
2993
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2994
+ return result;
2995
+ }
2996
+ case '>':
2997
+ // We're done with CDATA so move past the >, reset the token start point
2998
+ // to point after the >, and then reconsume in the data state.
2999
+ utf8iterator_next(&tokenizer->_input);
3000
+ reset_token_start_point(tokenizer);
3001
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3002
+ tokenizer->_is_in_cdata = false;
3003
+ return CONTINUE;
3004
+ default:
3005
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3006
+ return emit_from_mark(parser, output);
3007
+ }
3008
+ }
3009
+
3010
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3011
+ static StateResult handle_character_reference_state (
3012
+ GumboParser* parser,
3013
+ GumboTokenizerState* tokenizer,
3014
+ int c,
3015
+ GumboToken* output
3016
+ ) {
3017
+ if (gumbo_ascii_isalnum(c)) {
3018
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3019
+ return CONTINUE;
3020
+ }
3021
+ if (c == '#') {
3022
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3023
+ return CONTINUE;
3024
+ }
3025
+ reconsume_in_state(parser, tokenizer->_return_state);
3026
+ return flush_code_points_consumed_as_character_reference(parser, output);
3027
+ }
3028
+
3029
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3030
+ static StateResult handle_named_character_reference_state (
3031
+ GumboParser* parser,
3032
+ GumboTokenizerState* tokenizer,
3033
+ int c,
3034
+ GumboToken* output
3035
+ ) {
3036
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3037
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3038
+ int code_point[2];
3039
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3040
+
3041
+ if (size > 0) {
3042
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3043
+ int next = utf8iterator_current(&tokenizer->_input);
3044
+ reconsume_in_state(parser, tokenizer->_return_state);
3045
+ if (character_reference_part_of_attribute(parser)
3046
+ && cur[size-1] != ';'
3047
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3048
+ GumboStringPiece str = { .data = cur, .length = size };
3049
+ append_string_to_temporary_buffer(parser, &str);
3050
+ return flush_code_points_consumed_as_character_reference(parser, output);
3051
+ }
3052
+ if (cur[size-1] != ';')
3053
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3054
+ reconsume_in_state(parser, tokenizer->_return_state);
3055
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3056
+ }
3057
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3058
+ return flush_code_points_consumed_as_character_reference(parser, output);
3059
+ }
3060
+
3061
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3062
+ static StateResult handle_ambiguous_ampersand_state (
3063
+ GumboParser* parser,
3064
+ GumboTokenizerState* tokenizer,
3065
+ int c,
3066
+ GumboToken* output
3067
+ ) {
3068
+ if (gumbo_ascii_isalnum(c)) {
3069
+ if (character_reference_part_of_attribute(parser)) {
3070
+ append_char_to_tag_buffer(parser, c, true);
3071
+ return CONTINUE;
3072
+ }
3073
+ return emit_char(parser, c, output);
3074
+ }
3075
+ if (c == ';') {
3076
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3077
+ // fall through
3078
+ }
3079
+ reconsume_in_state(parser, tokenizer->_return_state);
3080
+ return CONTINUE;
3081
+ }
3082
+
3083
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3084
+ static StateResult handle_numeric_character_reference_state (
3085
+ GumboParser* parser,
3086
+ GumboTokenizerState* tokenizer,
3087
+ int c,
3088
+ GumboToken* output
3089
+ ) {
3090
+ tokenizer->_character_reference_code = 0;
3091
+ switch (c) {
3092
+ case 'x':
3093
+ case 'X':
3094
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3095
+ return CONTINUE;
3096
+ default:
3097
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3098
+ return CONTINUE;
3099
+ }
3100
+ }
3101
+
3102
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-start-state
3103
+ static StateResult handle_hexadecimal_character_reference_start_state (
3104
+ GumboParser* parser,
3105
+ GumboTokenizerState* tokenizer,
3106
+ int c,
3107
+ GumboToken* output
3108
+ ) {
3109
+ if (gumbo_ascii_isxdigit(c)) {
3110
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3111
+ return CONTINUE;
3112
+ }
3113
+ tokenizer_add_char_ref_error (
3114
+ parser,
3115
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3116
+ -1
3117
+ );
3118
+ reconsume_in_state(parser, tokenizer->_return_state);
3119
+ return flush_code_points_consumed_as_character_reference(parser, output);
3120
+ }
3121
+
3122
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3123
+ static StateResult handle_decimal_character_reference_start_state (
3124
+ GumboParser* parser,
3125
+ GumboTokenizerState* tokenizer,
3126
+ int c,
3127
+ GumboToken* output
3128
+ ) {
3129
+ if (gumbo_ascii_isdigit(c)) {
3130
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3131
+ return CONTINUE;
3132
+ }
3133
+ tokenizer_add_char_ref_error (
3134
+ parser,
3135
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3136
+ -1
3137
+ );
3138
+ reconsume_in_state(parser, tokenizer->_return_state);
3139
+ return flush_code_points_consumed_as_character_reference(parser, output);
3140
+ }
3141
+
3142
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-state
3143
+ static StateResult handle_hexadecimal_character_reference_state (
3144
+ GumboParser* parser,
3145
+ GumboTokenizerState* tokenizer,
3146
+ int c,
3147
+ GumboToken* output
3148
+ ) {
3149
+ if (gumbo_ascii_isdigit(c)) {
3150
+ tokenizer->_character_reference_code =
3151
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3152
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3153
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3154
+ return CONTINUE;
3155
+ }
3156
+ if (gumbo_ascii_isupper_xdigit(c)) {
3157
+ tokenizer->_character_reference_code =
3158
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3159
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3160
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3161
+ return CONTINUE;
3162
+ }
3163
+ if (gumbo_ascii_islower_xdigit(c)) {
3164
+ tokenizer->_character_reference_code =
3165
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3166
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3167
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3168
+ return CONTINUE;
3169
+ }
3170
+ if (c == ';') {
3171
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3172
+ return CONTINUE;
3173
+ }
3174
+ tokenizer_add_char_ref_error(
3175
+ parser,
3176
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3177
+ tokenizer->_character_reference_code
3178
+ );
3179
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3180
+ return CONTINUE;
3181
+ }
3182
+
3183
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3184
+ static StateResult handle_decimal_character_reference_state (
3185
+ GumboParser* parser,
3186
+ GumboTokenizerState* tokenizer,
3187
+ int c,
3188
+ GumboToken* output
3189
+ ) {
3190
+ if (gumbo_ascii_isdigit(c)) {
3191
+ tokenizer->_character_reference_code =
3192
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3193
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3194
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3195
+ return CONTINUE;
3196
+ }
3197
+ if (c == ';') {
3198
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3199
+ return CONTINUE;
3200
+ }
3201
+ tokenizer_add_char_ref_error(
3202
+ parser,
3203
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3204
+ tokenizer->_character_reference_code
3205
+ );
3206
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3207
+ return CONTINUE;
3208
+ }
3209
+
3210
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3211
+ static StateResult handle_numeric_character_reference_end_state (
3212
+ GumboParser* parser,
3213
+ GumboTokenizerState* tokenizer,
3214
+ int c,
3215
+ GumboToken* output
3216
+ ) {
3217
+ c = tokenizer->_character_reference_code;
3218
+ if (c == 0) {
3219
+ tokenizer_add_char_ref_error(
3220
+ parser,
3221
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3222
+ c
3223
+ );
3224
+ c = kUtf8ReplacementChar;
3225
+ } else if (c > kUtf8MaxChar) {
3226
+ tokenizer_add_char_ref_error(
3227
+ parser,
3228
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3229
+ c
3230
+ );
3231
+ c = kUtf8ReplacementChar;
3232
+ } else if (utf8_is_surrogate(c)) {
3233
+ tokenizer_add_char_ref_error(
3234
+ parser,
3235
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3236
+ c
3237
+ );
3238
+ c = kUtf8ReplacementChar;
3239
+ } else if (utf8_is_noncharacter(c)) {
3240
+ tokenizer_add_char_ref_error(
3241
+ parser,
3242
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3243
+ c
3244
+ );
3245
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3246
+ tokenizer_add_char_ref_error(
3247
+ parser,
3248
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3249
+ c
3250
+ );
3251
+ switch (c) {
3252
+ case 0x80: c = 0x20AC; break;
3253
+ case 0x82: c = 0x201A; break;
3254
+ case 0x83: c = 0x0192; break;
3255
+ case 0x84: c = 0x201E; break;
3256
+ case 0x85: c = 0x2026; break;
3257
+ case 0x86: c = 0x2020; break;
3258
+ case 0x87: c = 0x2021; break;
3259
+ case 0x88: c = 0x02C6; break;
3260
+ case 0x89: c = 0x2030; break;
3261
+ case 0x8A: c = 0x0160; break;
3262
+ case 0x8B: c = 0x2039; break;
3263
+ case 0x8C: c = 0x0152; break;
3264
+ case 0x8E: c = 0x017D; break;
3265
+ case 0x91: c = 0x2018; break;
3266
+ case 0x92: c = 0x2019; break;
3267
+ case 0x93: c = 0x201C; break;
3268
+ case 0x94: c = 0x201D; break;
3269
+ case 0x95: c = 0x2022; break;
3270
+ case 0x96: c = 0x2013; break;
3271
+ case 0x97: c = 0x2014; break;
3272
+ case 0x98: c = 0x02DC; break;
3273
+ case 0x99: c = 0x2122; break;
3274
+ case 0x9A: c = 0x0161; break;
3275
+ case 0x9B: c = 0x203A; break;
3276
+ case 0x9C: c = 0x0153; break;
3277
+ case 0x9E: c = 0x017E; break;
3278
+ case 0x9F: c = 0x0178; break;
3279
+ }
3280
+ }
3281
+ reconsume_in_state(parser, tokenizer->_return_state);
3282
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3283
+ }
3284
+
3285
+ typedef StateResult (*GumboLexerStateFunction) (
3286
+ GumboParser* parser,
3287
+ GumboTokenizerState* tokenizer,
3288
+ int c,
3289
+ GumboToken* output
3290
+ );
3291
+
3292
+ static GumboLexerStateFunction dispatch_table[] = {
3293
+ [GUMBO_LEX_DATA] = handle_data_state,
3294
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3295
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3296
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3297
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3298
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3299
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3300
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3301
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3302
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3304
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3305
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3307
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3324
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3325
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3326
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3327
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3328
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3331
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3332
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3333
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3334
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3335
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3336
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3337
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3338
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3339
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3342
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3344
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3345
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3346
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3347
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3348
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3350
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3351
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3353
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3354
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3355
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3356
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3357
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3359
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3360
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3361
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3362
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3364
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3365
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3366
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3367
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3368
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3369
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3370
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3371
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3372
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3373
+ };
3374
+
3375
+ void gumbo_lex(GumboParser* parser, GumboToken* output) {
3376
+ // Because of the spec requirements that...
3377
+ //
3378
+ // 1. Tokens be handled immediately by the parser upon emission.
3379
+ // 2. Some states (eg. CDATA, or various error conditions) require the
3380
+ // emission of multiple tokens in the same states.
3381
+ // 3. The tokenizer often has to reconsume the same character in a different
3382
+ // state.
3383
+ //
3384
+ // ...all state must be held in the GumboTokenizer struct instead of in local
3385
+ // variables in this function. That allows us to return from this method with
3386
+ // a token, and then immediately jump back to the same state with the same
3387
+ // input if we need to return a different token. The various emit_* functions
3388
+ // are responsible for changing state (eg. flushing the chardata buffer,
3389
+ // reading the next input character) to avoid an infinite loop.
3390
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
3391
+
3392
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
3393
+ tokenizer->_reconsume_current_input = true;
3394
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
3395
+ // And now that we've avoided advancing the input, make sure we set
3396
+ // _reconsume_current_input back to false to make sure the *next* character
3397
+ // isn't consumed twice.
3398
+ tokenizer->_reconsume_current_input = false;
3399
+ tokenizer->_buffered_emit_char = kGumboNoChar;
3400
+ return;
3401
+ }
3402
+
3403
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3404
+ return;
3405
+ }
3406
+
3407
+ while (1) {
3408
+ assert(!tokenizer->_resume_pos);
3409
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3410
+ int c = utf8iterator_current(&tokenizer->_input);
3411
+ GumboTokenizerEnum state = tokenizer->_state;
3412
+ gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state);
3413
+ StateResult result = dispatch_table[state](parser, tokenizer, c, output);
3414
+ // We need to clear reconsume_current_input before returning to prevent
3415
+ // certain infinite loop states.
3416
+ bool should_advance = !tokenizer->_reconsume_current_input;
3417
+ tokenizer->_reconsume_current_input = false;
3418
+
3419
+ if (result == EMIT_TOKEN)
3420
+ return;
3421
+
3422
+ if (should_advance) {
3423
+ utf8iterator_next(&tokenizer->_input);
3424
+ }
3425
+ }
3426
+ }
3427
+
3428
+ void gumbo_token_destroy(GumboToken* token) {
3429
+ if (!token) return;
3430
+
3431
+ switch (token->type) {
3432
+ case GUMBO_TOKEN_DOCTYPE:
3433
+ gumbo_free((void*) token->v.doc_type.name);
3434
+ gumbo_free((void*) token->v.doc_type.public_identifier);
3435
+ gumbo_free((void*) token->v.doc_type.system_identifier);
3436
+ return;
3437
+ case GUMBO_TOKEN_START_TAG:
3438
+ for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
3439
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
3440
+ if (attr) {
3441
+ // May have been nulled out if this token was merged with another.
3442
+ gumbo_destroy_attribute(attr);
3443
+ }
3444
+ }
3445
+ gumbo_free((void*) token->v.start_tag.attributes.data);
3446
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3447
+ gumbo_free(token->v.start_tag.name);
3448
+ token->v.start_tag.name = NULL;
3449
+ }
3450
+ return;
3451
+ case GUMBO_TOKEN_END_TAG:
3452
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3453
+ gumbo_free(token->v.end_tag.name);
3454
+ token->v.end_tag.name = NULL;
3455
+ }
3456
+ break;
3457
+ case GUMBO_TOKEN_COMMENT:
3458
+ gumbo_free((void*) token->v.text);
3459
+ return;
3460
+ default:
3461
+ return;
3462
+ }
3463
+ }