nokogiri 1.10.9 → 1.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (230) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +190 -95
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +34 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +909 -422
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  18. data/ext/nokogiri/nokogiri.c +258 -105
  19. data/ext/nokogiri/nokogiri.h +207 -90
  20. data/ext/nokogiri/test_global_handlers.c +40 -0
  21. data/ext/nokogiri/xml_attr.c +18 -18
  22. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  23. data/ext/nokogiri/xml_cdata.c +33 -33
  24. data/ext/nokogiri/xml_comment.c +19 -31
  25. data/ext/nokogiri/xml_document.c +499 -323
  26. data/ext/nokogiri/xml_document_fragment.c +17 -36
  27. data/ext/nokogiri/xml_dtd.c +65 -59
  28. data/ext/nokogiri/xml_element_content.c +63 -55
  29. data/ext/nokogiri/xml_element_decl.c +31 -31
  30. data/ext/nokogiri/xml_encoding_handler.c +54 -21
  31. data/ext/nokogiri/xml_entity_decl.c +37 -35
  32. data/ext/nokogiri/xml_entity_reference.c +17 -19
  33. data/ext/nokogiri/xml_namespace.c +131 -61
  34. data/ext/nokogiri/xml_node.c +1429 -723
  35. data/ext/nokogiri/xml_node_set.c +257 -225
  36. data/ext/nokogiri/xml_processing_instruction.c +18 -20
  37. data/ext/nokogiri/xml_reader.c +340 -231
  38. data/ext/nokogiri/xml_relax_ng.c +87 -99
  39. data/ext/nokogiri/xml_sax_parser.c +269 -176
  40. data/ext/nokogiri/xml_sax_parser_context.c +286 -152
  41. data/ext/nokogiri/xml_sax_push_parser.c +111 -64
  42. data/ext/nokogiri/xml_schema.c +132 -140
  43. data/ext/nokogiri/xml_syntax_error.c +52 -23
  44. data/ext/nokogiri/xml_text.c +37 -30
  45. data/ext/nokogiri/xml_xpath_context.c +373 -185
  46. data/ext/nokogiri/xslt_stylesheet.c +342 -191
  47. data/gumbo-parser/CHANGES.md +63 -0
  48. data/gumbo-parser/Makefile +129 -0
  49. data/gumbo-parser/THANKS +27 -0
  50. data/gumbo-parser/src/Makefile +34 -0
  51. data/gumbo-parser/src/README.md +41 -0
  52. data/gumbo-parser/src/ascii.c +75 -0
  53. data/gumbo-parser/src/ascii.h +115 -0
  54. data/gumbo-parser/src/attribute.c +42 -0
  55. data/gumbo-parser/src/attribute.h +17 -0
  56. data/gumbo-parser/src/char_ref.c +22225 -0
  57. data/gumbo-parser/src/char_ref.h +29 -0
  58. data/gumbo-parser/src/char_ref.rl +2154 -0
  59. data/gumbo-parser/src/error.c +658 -0
  60. data/gumbo-parser/src/error.h +152 -0
  61. data/gumbo-parser/src/foreign_attrs.c +103 -0
  62. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/nokogiri_gumbo.h +953 -0
  66. data/gumbo-parser/src/parser.c +4932 -0
  67. data/gumbo-parser/src/parser.h +41 -0
  68. data/gumbo-parser/src/replacement.h +33 -0
  69. data/gumbo-parser/src/string_buffer.c +103 -0
  70. data/gumbo-parser/src/string_buffer.h +68 -0
  71. data/gumbo-parser/src/string_piece.c +48 -0
  72. data/gumbo-parser/src/svg_attrs.c +174 -0
  73. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  74. data/gumbo-parser/src/svg_tags.c +137 -0
  75. data/gumbo-parser/src/svg_tags.gperf +55 -0
  76. data/gumbo-parser/src/tag.c +223 -0
  77. data/gumbo-parser/src/tag_lookup.c +382 -0
  78. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  79. data/gumbo-parser/src/tag_lookup.h +13 -0
  80. data/gumbo-parser/src/token_buffer.c +79 -0
  81. data/gumbo-parser/src/token_buffer.h +71 -0
  82. data/gumbo-parser/src/token_type.h +17 -0
  83. data/gumbo-parser/src/tokenizer.c +3464 -0
  84. data/gumbo-parser/src/tokenizer.h +112 -0
  85. data/gumbo-parser/src/tokenizer_states.h +339 -0
  86. data/gumbo-parser/src/utf8.c +245 -0
  87. data/gumbo-parser/src/utf8.h +164 -0
  88. data/gumbo-parser/src/util.c +66 -0
  89. data/gumbo-parser/src/util.h +34 -0
  90. data/gumbo-parser/src/vector.c +111 -0
  91. data/gumbo-parser/src/vector.h +45 -0
  92. data/lib/nokogiri/class_resolver.rb +67 -0
  93. data/lib/nokogiri/css/node.rb +14 -8
  94. data/lib/nokogiri/css/parser.rb +399 -377
  95. data/lib/nokogiri/css/parser.y +250 -245
  96. data/lib/nokogiri/css/parser_extras.rb +16 -71
  97. data/lib/nokogiri/css/selector_cache.rb +38 -0
  98. data/lib/nokogiri/css/syntax_error.rb +3 -1
  99. data/lib/nokogiri/css/tokenizer.rb +7 -5
  100. data/lib/nokogiri/css/tokenizer.rex +11 -9
  101. data/lib/nokogiri/css/xpath_visitor.rb +242 -96
  102. data/lib/nokogiri/css.rb +122 -17
  103. data/lib/nokogiri/decorators/slop.rb +11 -11
  104. data/lib/nokogiri/encoding_handler.rb +57 -0
  105. data/lib/nokogiri/extension.rb +32 -0
  106. data/lib/nokogiri/gumbo.rb +15 -0
  107. data/lib/nokogiri/html.rb +38 -27
  108. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  109. data/lib/nokogiri/html4/document.rb +235 -0
  110. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  111. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  112. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  113. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  114. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  115. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  116. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  117. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  118. data/lib/nokogiri/html4.rb +42 -0
  119. data/lib/nokogiri/html5/builder.rb +40 -0
  120. data/lib/nokogiri/html5/document.rb +199 -0
  121. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  122. data/lib/nokogiri/html5/node.rb +103 -0
  123. data/lib/nokogiri/html5.rb +368 -0
  124. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  125. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  126. data/lib/nokogiri/syntax_error.rb +2 -0
  127. data/lib/nokogiri/version/constant.rb +6 -0
  128. data/lib/nokogiri/version/info.rb +224 -0
  129. data/lib/nokogiri/version.rb +3 -108
  130. data/lib/nokogiri/xml/attr.rb +55 -3
  131. data/lib/nokogiri/xml/attribute_decl.rb +6 -2
  132. data/lib/nokogiri/xml/builder.rb +83 -35
  133. data/lib/nokogiri/xml/cdata.rb +3 -1
  134. data/lib/nokogiri/xml/character_data.rb +2 -0
  135. data/lib/nokogiri/xml/document.rb +359 -130
  136. data/lib/nokogiri/xml/document_fragment.rb +170 -54
  137. data/lib/nokogiri/xml/dtd.rb +4 -2
  138. data/lib/nokogiri/xml/element_content.rb +12 -2
  139. data/lib/nokogiri/xml/element_decl.rb +6 -2
  140. data/lib/nokogiri/xml/entity_decl.rb +7 -3
  141. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  142. data/lib/nokogiri/xml/namespace.rb +44 -0
  143. data/lib/nokogiri/xml/node/save_options.rb +23 -8
  144. data/lib/nokogiri/xml/node.rb +1168 -420
  145. data/lib/nokogiri/xml/node_set.rb +145 -67
  146. data/lib/nokogiri/xml/notation.rb +13 -0
  147. data/lib/nokogiri/xml/parse_options.rb +145 -52
  148. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  149. data/lib/nokogiri/xml/pp/node.rb +47 -30
  150. data/lib/nokogiri/xml/pp.rb +4 -2
  151. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  152. data/lib/nokogiri/xml/reader.rb +68 -41
  153. data/lib/nokogiri/xml/relax_ng.rb +60 -17
  154. data/lib/nokogiri/xml/sax/document.rb +198 -111
  155. data/lib/nokogiri/xml/sax/parser.rb +144 -67
  156. data/lib/nokogiri/xml/sax/parser_context.rb +119 -6
  157. data/lib/nokogiri/xml/sax/push_parser.rb +9 -5
  158. data/lib/nokogiri/xml/sax.rb +54 -4
  159. data/lib/nokogiri/xml/schema.rb +116 -39
  160. data/lib/nokogiri/xml/searchable.rb +139 -95
  161. data/lib/nokogiri/xml/syntax_error.rb +29 -5
  162. data/lib/nokogiri/xml/text.rb +2 -0
  163. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  164. data/lib/nokogiri/xml/xpath.rb +15 -4
  165. data/lib/nokogiri/xml/xpath_context.rb +15 -4
  166. data/lib/nokogiri/xml.rb +45 -55
  167. data/lib/nokogiri/xslt/stylesheet.rb +32 -8
  168. data/lib/nokogiri/xslt.rb +103 -30
  169. data/lib/nokogiri.rb +59 -75
  170. data/lib/xsd/xmlparser/nokogiri.rb +32 -29
  171. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  172. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  173. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  174. data/patches/libxml2/0019-xpath-Use-separate-static-hash-table-for-standard-fu.patch +244 -0
  175. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  176. data/ports/archives/libxml2-2.13.6.tar.xz +0 -0
  177. data/ports/archives/libxslt-1.1.42.tar.xz +0 -0
  178. metadata +123 -295
  179. data/ext/nokogiri/html_document.c +0 -170
  180. data/ext/nokogiri/html_document.h +0 -10
  181. data/ext/nokogiri/html_element_description.c +0 -279
  182. data/ext/nokogiri/html_element_description.h +0 -10
  183. data/ext/nokogiri/html_entity_lookup.c +0 -32
  184. data/ext/nokogiri/html_entity_lookup.h +0 -8
  185. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  186. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  187. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  188. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  189. data/ext/nokogiri/xml_attr.h +0 -9
  190. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  191. data/ext/nokogiri/xml_cdata.h +0 -9
  192. data/ext/nokogiri/xml_comment.h +0 -9
  193. data/ext/nokogiri/xml_document.h +0 -23
  194. data/ext/nokogiri/xml_document_fragment.h +0 -10
  195. data/ext/nokogiri/xml_dtd.h +0 -10
  196. data/ext/nokogiri/xml_element_content.h +0 -10
  197. data/ext/nokogiri/xml_element_decl.h +0 -9
  198. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  199. data/ext/nokogiri/xml_entity_decl.h +0 -10
  200. data/ext/nokogiri/xml_entity_reference.h +0 -9
  201. data/ext/nokogiri/xml_io.c +0 -61
  202. data/ext/nokogiri/xml_io.h +0 -11
  203. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  204. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  205. data/ext/nokogiri/xml_namespace.h +0 -14
  206. data/ext/nokogiri/xml_node.h +0 -13
  207. data/ext/nokogiri/xml_node_set.h +0 -12
  208. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  209. data/ext/nokogiri/xml_reader.h +0 -10
  210. data/ext/nokogiri/xml_relax_ng.h +0 -9
  211. data/ext/nokogiri/xml_sax_parser.h +0 -39
  212. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  213. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  214. data/ext/nokogiri/xml_schema.h +0 -9
  215. data/ext/nokogiri/xml_syntax_error.h +0 -13
  216. data/ext/nokogiri/xml_text.h +0 -9
  217. data/ext/nokogiri/xml_xpath_context.h +0 -10
  218. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  219. data/lib/nokogiri/html/document.rb +0 -335
  220. data/lib/nokogiri/html/document_fragment.rb +0 -49
  221. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  222. data/lib/nokogiri/html/sax/parser.rb +0 -62
  223. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  224. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  225. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  226. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  227. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  228. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  229. /data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
  230. /data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
@@ -0,0 +1,3464 @@
1
+ /*
2
+ Copyright 2010 Google Inc.
3
+ Copyright 2017-2018 Craig Barnes
4
+ Copyright 2018 Stephen Checkoway
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this file except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ https://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+ */
18
+
19
+ /*
20
+ Coding conventions specific to this file:
21
+
22
+ 1. Functions that fill in a token should be named emit_*, and should be
23
+ followed immediately by a return from the tokenizer.
24
+ 2. Functions that shuffle data from temporaries to final API structures
25
+ should be named finish_*, and be called just before the tokenizer exits the
26
+ state that accumulates the temporary.
27
+ 3. All internal data structures should be kept in an initialized state from
28
+ tokenizer creation onwards, ready to accept input. When a buffer's flushed
29
+ and reset, it should be deallocated and immediately reinitialized.
30
+ 4. Make sure there are appropriate break statements following each state.
31
+ 5. Assertions on the state of the temporary and tag buffers are usually a
32
+ good idea, and should go at the entry point of each state when added.
33
+ 6. Statement order within states goes:
34
+ 1. Add parse errors, if appropriate.
35
+ 2. Call finish_* functions to build up tag state.
36
+ 3. Switch to new state. Set _reconsume flag if appropriate.
37
+ 4. Perform any other temporary buffer manipulation.
38
+ 5. Emit tokens.
39
+ 6. Return/break.
40
+ This order ensures that we can verify that every emit is followed by
41
+ a return, ensures that the correct state is recorded with any parse
42
+ errors, and prevents parse error position from being messed up by
43
+ possible mark/resets in temporary buffer manipulation.
44
+ */
45
+
46
+ #include <assert.h>
47
+ #include <string.h>
48
+ #include "tokenizer.h"
49
+ #include "ascii.h"
50
+ #include "attribute.h"
51
+ #include "char_ref.h"
52
+ #include "error.h"
53
+ #include "nokogiri_gumbo.h"
54
+ #include "parser.h"
55
+ #include "string_buffer.h"
56
+ #include "token_type.h"
57
+ #include "tokenizer_states.h"
58
+ #include "utf8.h"
59
+ #include "util.h"
60
+ #include "vector.h"
61
+
62
+ // Compared against _temporary_buffer to determine if we're in
63
+ // double-escaped script mode.
64
+ static const GumboStringPiece kScriptTag = {.data = "script", .length = 6};
65
+
66
+ // An enum for the return value of each individual state. Each of the emit_*
67
+ // functions should return EMIT_TOKEN and should be called as
68
+ // return emit_foo(parser, ..., output);
69
+ // Each of the handle_*_state functions that do not return emit_* should
70
+ // instead return CONTINUE to indicate to gumbo_lex to continue lexing.
71
+ typedef enum {
72
+ EMIT_TOKEN,
73
+ CONTINUE,
74
+ } StateResult;
75
+
76
+ // This is a struct containing state necessary to build up a tag token,
77
+ // character by character.
78
+ typedef struct GumboInternalTagState {
79
+ // A buffer to accumulate characters for various GumboStringPiece fields.
80
+ GumboStringBuffer _buffer;
81
+
82
+ // A pointer to the start of the original text corresponding to the contents
83
+ // of the buffer.
84
+ const char* _original_text;
85
+
86
+ // The current tag enum, computed once the tag name state has finished so that
87
+ // the buffer can be re-used for building up attributes.
88
+ GumboTag _tag;
89
+
90
+ // The current tag name. It's set at the same time that _tag is set if _tag
91
+ // is set to GUMBO_TAG_UNKNOWN.
92
+ char *_name;
93
+
94
+ // The starting location of the text in the buffer.
95
+ GumboSourcePosition _start_pos;
96
+
97
+ // The current list of attributes. This is copied (and ownership of its data
98
+ // transferred) to the GumboStartTag token upon completion of the tag. New
99
+ // attributes are added as soon as their attribute name state is complete, and
100
+ // values are filled in by operating on _attributes.data[_attributes.length-1].
101
+ GumboVector /* GumboAttribute */ _attributes;
102
+
103
+ // If true, the next attribute value to be finished should be dropped. This
104
+ // happens if a duplicate attribute name is encountered - we want to consume
105
+ // the attribute value, but shouldn't overwrite the existing value.
106
+ bool _drop_next_attr_value;
107
+
108
+ // The last start tag to have been emitted by the tokenizer. This is
109
+ // necessary to check for appropriate end tags.
110
+ GumboTag _last_start_tag;
111
+
112
+ // If true, then this is a start tag. If false, it's an end tag. This is
113
+ // necessary to generate the appropriate token type at tag-closing time.
114
+ bool _is_start_tag;
115
+
116
+ // If true, then this tag is "self-closing" and doesn't have an end tag.
117
+ bool _is_self_closing;
118
+ } GumboTagState;
119
+
120
+ // This is the main tokenizer state struct, containing all state used in
121
+ // tokenizing the input stream.
122
+ typedef struct GumboInternalTokenizerState {
123
+ // The current lexer state. Starts in GUMBO_LEX_DATA.
124
+ GumboTokenizerEnum _state;
125
+
126
+ // A flag indicating whether the current input character needs to be reconsumed
127
+ // in another state, or whether the next input character should be read for
128
+ // the next iteration of the state loop. This is set when the spec reads
129
+ // "Reconsume the current input character in..."
130
+ bool _reconsume_current_input;
131
+
132
+ // A flag indicating whether the adjusted current node is a foreign element.
133
+ // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and
134
+ // checked in the markup declaration state.
135
+ bool _is_adjusted_current_node_foreign;
136
+
137
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
138
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
139
+ bool _is_in_cdata;
140
+
141
+ // Certain states (notably character references) may emit two character tokens
142
+ // at once, but the contract for lex() fills in only one token at a time. The
143
+ // extra character is buffered here, and then this is checked on entry to
144
+ // lex(). If a character is stored here, it's immediately emitted and control
145
+ // returns from the lexer. kGumboNoChar is used to represent 'no character
146
+ // stored.'
147
+ //
148
+ // Note that characters emitted through this mechanism will have their source
149
+ // position marked as the character under the mark, i.e. multiple characters
150
+ // may be emitted with the same position. This is desirable for character
151
+ // references, but unsuitable for many other cases. Use the _temporary_buffer
152
+ // mechanism if the buffered characters must have their original positions in
153
+ // the document.
154
+ int _buffered_emit_char;
155
+
156
+ // A temporary buffer to accumulate characters, as described by the "temporary
157
+ // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox
158
+ // way: In situations where the spec calls for inserting characters into the
159
+ // temporary buffer that exactly match the input in order to emit them as
160
+ // character tokens, we don't actually do it.
161
+ // Instead, we mark the input and reset the input to it using set_mark() and
162
+ // emit_from_mark(). We do use the temporary buffer for other uses such as
163
+ // DOCTYPEs, comments, and detecting escaped <script> tags.
164
+ GumboStringBuffer _temporary_buffer;
165
+
166
+ // The position to resume normal operation after we start emitting from the
167
+ // mark. NULL whenever we're not emitting from the mark.
168
+ const char* _resume_pos;
169
+
170
+ // The character reference state uses a return state to return to the state
171
+ // it was invoked from.
172
+ GumboTokenizerEnum _return_state;
173
+
174
+ // Numeric character reference.
175
+ uint32_t _character_reference_code;
176
+
177
+ // Pointer to the beginning of the current token in the original buffer; used
178
+ // to record the original text.
179
+ const char* _token_start;
180
+
181
+ // GumboSourcePosition recording the source location of the start of the
182
+ // current token.
183
+ GumboSourcePosition _token_start_pos;
184
+
185
+ // Current tag state.
186
+ GumboTagState _tag_state;
187
+
188
+ // Doctype state. We use the temporary buffer to accumulate characters (it's
189
+ // not used for anything else in the doctype states), and then freshly
190
+ // allocate the strings in the doctype token, then copy it over on emit.
191
+ GumboTokenDocType _doc_type_state;
192
+
193
+ // The UTF8Iterator over the tokenizer input.
194
+ Utf8Iterator _input;
195
+ } GumboTokenizerState;
196
+
197
+ // Adds a parse error to the parser's error struct.
198
+ static void tokenizer_add_parse_error (
199
+ GumboParser* parser,
200
+ GumboErrorType type
201
+ ) {
202
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
203
+ GumboError* error = gumbo_add_error(parser);
204
+ if (!error) {
205
+ return;
206
+ }
207
+ const Utf8Iterator* input = &tokenizer->_input;
208
+ utf8iterator_get_position(input, &error->position);
209
+ error->original_text.data = utf8iterator_get_char_pointer(input);
210
+ error->original_text.length = utf8iterator_get_width(input);
211
+ error->type = type;
212
+ error->v.tokenizer.state = tokenizer->_state;
213
+ error->v.tokenizer.codepoint = utf8iterator_current(input);
214
+ }
215
+
216
+ // Adds an error pointing at the start of the character reference.
217
+ static void tokenizer_add_char_ref_error (
218
+ struct GumboInternalParser* parser,
219
+ GumboErrorType type,
220
+ int codepoint
221
+ ) {
222
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
223
+ GumboError* error = gumbo_add_error(parser);
224
+ if (!error)
225
+ return;
226
+ Utf8Iterator* input = &tokenizer->_input;
227
+ error->type = type;
228
+ error->position = utf8iterator_get_mark_position(input);
229
+ const char* mark = utf8iterator_get_mark_pointer(input);
230
+ error->original_text.data = mark;
231
+ error->original_text.length = utf8iterator_get_char_pointer(input) - mark;
232
+ error->v.tokenizer.state = tokenizer->_state;
233
+ error->v.tokenizer.codepoint = codepoint;
234
+ }
235
+
236
+ // Adds an error pointing at the start of the token.
237
+ static void tokenizer_add_token_parse_error (
238
+ GumboParser* parser,
239
+ GumboErrorType type
240
+ ) {
241
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
242
+ GumboError* error = gumbo_add_error(parser);
243
+ if (!error)
244
+ return;
245
+ Utf8Iterator* input = &tokenizer->_input;
246
+ error->type = type;
247
+ error->position = tokenizer->_token_start_pos;
248
+ error->original_text.data = tokenizer->_token_start;
249
+ error->original_text.length =
250
+ utf8iterator_get_char_pointer(input) - tokenizer->_token_start;
251
+ error->v.tokenizer.state = tokenizer->_state;
252
+ error->v.tokenizer.codepoint = 0;
253
+ }
254
+
255
+ static bool is_alpha(int c) {
256
+ return gumbo_ascii_isalpha(c);
257
+ }
258
+
259
+ static int ensure_lowercase(int c) {
260
+ return gumbo_ascii_tolower(c);
261
+ }
262
+
263
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
264
+ if (is_in_cdata && c > 0) {
265
+ return GUMBO_TOKEN_CDATA;
266
+ }
267
+
268
+ switch (c) {
269
+ case '\t':
270
+ case '\n':
271
+ case '\r':
272
+ case '\f':
273
+ case ' ':
274
+ return GUMBO_TOKEN_WHITESPACE;
275
+ case 0:
276
+ gumbo_debug("Emitted null byte.\n");
277
+ return GUMBO_TOKEN_NULL;
278
+ case -1:
279
+ return GUMBO_TOKEN_EOF;
280
+ default:
281
+ return GUMBO_TOKEN_CHARACTER;
282
+ }
283
+ }
284
+
285
+ // Clears the temporary buffer so that it can start accumulating characters afresh.
286
+ static void clear_temporary_buffer(GumboParser* parser) {
287
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
288
+ gumbo_string_buffer_clear(&tokenizer->_temporary_buffer);
289
+ }
290
+
291
+ // Appends a codepoint to the temporary buffer.
292
+ static void append_char_to_temporary_buffer (
293
+ GumboParser* parser,
294
+ int codepoint
295
+ ) {
296
+ gumbo_string_buffer_append_codepoint (
297
+ codepoint,
298
+ &parser->_tokenizer_state->_temporary_buffer
299
+ );
300
+ }
301
+
302
+ static void append_string_to_temporary_buffer (
303
+ GumboParser* parser,
304
+ const GumboStringPiece* str
305
+ ) {
306
+ gumbo_string_buffer_append_string (
307
+ str,
308
+ &parser->_tokenizer_state->_temporary_buffer
309
+ );
310
+ }
311
+
312
+
313
+ static bool temporary_buffer_is_empty(const GumboParser* parser) {
314
+ return parser->_tokenizer_state->_temporary_buffer.length == 0;
315
+ }
316
+
317
+ static void doc_type_state_init(GumboParser* parser) {
318
+ GumboTokenDocType* doc_type_state =
319
+ &parser->_tokenizer_state->_doc_type_state;
320
+ // We initialize these to NULL here so that we don't end up leaking memory if
321
+ // we never see a doctype token. When we do see a doctype token, we reset
322
+ // them to a freshly-allocated empty string so that we can present a uniform
323
+ // interface to client code and not make them check for null. Ownership is
324
+ // transferred to the doctype token when it's emitted.
325
+ doc_type_state->name = NULL;
326
+ doc_type_state->public_identifier = NULL;
327
+ doc_type_state->system_identifier = NULL;
328
+ doc_type_state->force_quirks = false;
329
+ doc_type_state->has_public_identifier = false;
330
+ doc_type_state->has_system_identifier = false;
331
+ }
332
+
333
+ // Sets the token original_text and position to the current iterator position.
334
+ // This is necessary because [CDATA[ sections may include text that is ignored
335
+ // by the tokenizer.
336
+ static void reset_token_start_point(GumboTokenizerState* tokenizer) {
337
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
338
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
339
+ }
340
+
341
+ // Sets the tag buffer original text and start point to the current iterator
342
+ // position. This is necessary because attribute names & values may have
343
+ // whitespace preceding them, and so we can't assume that the actual token
344
+ // starting point was the end of the last tag buffer usage.
345
+ static void reset_tag_buffer_start_point(GumboParser* parser) {
346
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
347
+ GumboTagState* tag_state = &tokenizer->_tag_state;
348
+
349
+ utf8iterator_get_position(&tokenizer->_input, &tag_state->_start_pos);
350
+ tag_state->_original_text = utf8iterator_get_char_pointer(&tokenizer->_input);
351
+ }
352
+
353
+ // Moves the temporary buffer contents over to the specified output string,
354
+ // and clears the temporary buffer.
355
+ static void finish_temporary_buffer(GumboParser* parser, const char** output) {
356
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
357
+ *output = gumbo_string_buffer_to_string(&tokenizer->_temporary_buffer);
358
+ clear_temporary_buffer(parser);
359
+ }
360
+
361
+ // Advances the iterator past the end of the token, and then fills in the
362
+ // relevant position fields. It's assumed that after every emit, the tokenizer
363
+ // will immediately return (letting the tree-construction stage read the filled
364
+ // in Token). Thus, it's safe to advance the input stream here, since it will
365
+ // bypass the advance at the bottom of the state machine loop.
366
+ //
367
+ // Since this advances the iterator and resets the current input, make sure to
368
+ // call it after you've recorded any other data you need for the token.
369
+ static void finish_token(GumboParser* parser, GumboToken* token) {
370
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
371
+ if (!tokenizer->_reconsume_current_input) {
372
+ utf8iterator_next(&tokenizer->_input);
373
+ }
374
+
375
+ token->position = tokenizer->_token_start_pos;
376
+ token->original_text.data = tokenizer->_token_start;
377
+ reset_token_start_point(tokenizer);
378
+ token->original_text.length =
379
+ tokenizer->_token_start - token->original_text.data;
380
+ if (token->original_text.length > 0 &&
381
+ token->original_text.data[token->original_text.length - 1] == '\r') {
382
+ // The UTF8 iterator will ignore carriage returns in the input stream, which
383
+ // means that the next token may start one past a \r character. The pointer
384
+ // arithmetic above results in that \r being appended to the original text
385
+ // of the preceding token, so we have to adjust its length here to chop the
386
+ // \r off.
387
+ --token->original_text.length;
388
+ }
389
+ }
390
+
391
+ // Records the doctype public ID, assumed to be in the temporary buffer.
392
+ // Convenience method that also sets has_public_identifier to true.
393
+ static void finish_doctype_public_id(GumboParser* parser) {
394
+ GumboTokenDocType* doc_type_state =
395
+ &parser->_tokenizer_state->_doc_type_state;
396
+ gumbo_free((void*) doc_type_state->public_identifier);
397
+ finish_temporary_buffer(parser, &doc_type_state->public_identifier);
398
+ doc_type_state->has_public_identifier = true;
399
+ }
400
+
401
+ // Records the doctype system ID, assumed to be in the temporary buffer.
402
+ // Convenience method that also sets has_system_identifier to true.
403
+ static void finish_doctype_system_id(GumboParser* parser) {
404
+ GumboTokenDocType* doc_type_state =
405
+ &parser->_tokenizer_state->_doc_type_state;
406
+ gumbo_free((void*) doc_type_state->system_identifier);
407
+ finish_temporary_buffer(parser, &doc_type_state->system_identifier);
408
+ doc_type_state->has_system_identifier = true;
409
+ }
410
+
411
+ // Writes a single specified character to the output token.
412
+ static StateResult emit_char(GumboParser* parser, int c, GumboToken* output) {
413
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
414
+ output->v.character = c;
415
+ finish_token(parser, output);
416
+ return EMIT_TOKEN;
417
+ }
418
+
419
+ // Writes a replacement character token and records a parse error.
420
+ // Always returns EMIT_TOKEN, per gumbo_lex return value.
421
+ static StateResult emit_replacement_char(
422
+ GumboParser* parser, GumboToken* output) {
423
+ // In all cases, this is because of a null byte in the input stream.
424
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
425
+ emit_char(parser, kUtf8ReplacementChar, output);
426
+ return EMIT_TOKEN;
427
+ }
428
+
429
+ // Writes an EOF character token. Always returns EMIT_TOKEN.
430
+ static StateResult emit_eof(GumboParser* parser, GumboToken* output) {
431
+ return emit_char(parser, -1, output);
432
+ }
433
+
434
+ // Writes out a doctype token, copying it from the tokenizer state.
435
+ static StateResult emit_doctype(GumboParser* parser, GumboToken* output) {
436
+ output->type = GUMBO_TOKEN_DOCTYPE;
437
+ output->v.doc_type = parser->_tokenizer_state->_doc_type_state;
438
+ finish_token(parser, output);
439
+ doc_type_state_init(parser);
440
+ return EMIT_TOKEN;
441
+ }
442
+
443
+ // Clears _name and, in debug builds only, resets the attribute vector to empty
444
+ // so it can be asserted on tag creation, verifying that there are no memory leaks.
445
+ static void mark_tag_state_as_empty(GumboTagState* tag_state) {
446
+ UNUSED_IF_NDEBUG(tag_state);
447
+ tag_state->_name = NULL;
448
+ #ifndef NDEBUG
449
+ tag_state->_attributes = kGumboEmptyVector;
450
+ #endif
451
+ }
452
+
453
+ // Writes out the current tag as a start or end tag token.
454
+ // Always returns EMIT_TOKEN.
455
+ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
456
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
457
+ if (tag_state->_is_start_tag) {
458
+ output->type = GUMBO_TOKEN_START_TAG;
459
+ output->v.start_tag.tag = tag_state->_tag;
460
+ output->v.start_tag.name = tag_state->_name;
461
+ output->v.start_tag.attributes = tag_state->_attributes;
462
+ output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
463
+ tag_state->_last_start_tag = tag_state->_tag;
464
+ mark_tag_state_as_empty(tag_state);
465
+ gumbo_debug(
466
+ "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
467
+ } else {
468
+ output->type = GUMBO_TOKEN_END_TAG;
469
+ output->v.end_tag.tag = tag_state->_tag;
470
+ output->v.end_tag.name = tag_state->_name;
471
+ if (tag_state->_is_self_closing)
472
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS);
473
+ if (tag_state->_attributes.length > 0)
474
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_END_TAG_WITH_ATTRIBUTES);
475
+ // In end tags, ownership of the attributes vector is not transferred to the
476
+ // token, but it's still initialized as normal, so it must be manually
477
+ // deallocated. There may also be attributes to destroy, in certain broken
478
+ // cases like </div</th> (the "th" is an attribute there).
479
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
480
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
481
+ }
482
+ gumbo_free(tag_state->_attributes.data);
483
+ mark_tag_state_as_empty(tag_state);
484
+ gumbo_debug(
485
+ "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
486
+ }
487
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
488
+ finish_token(parser, output);
489
+ gumbo_debug (
490
+ "Original text = %.*s.\n",
491
+ (int) output->original_text.length,
492
+ output->original_text.data
493
+ );
494
+ assert(output->original_text.length >= 2);
495
+ assert(output->original_text.data[0] == '<');
496
+ assert(output->original_text.data[output->original_text.length - 1] == '>');
497
+ return EMIT_TOKEN;
498
+ }
499
+
500
+ // In some states, we speculatively start a tag, but don't know whether it'll be
501
+ // emitted as tag token or as a series of character tokens until we finish it.
502
+ // We need to abandon the tag we'd started & free its memory in that case to
503
+ // avoid a memory leak.
504
+ static void abandon_current_tag(GumboParser* parser) {
505
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
506
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
507
+ gumbo_destroy_attribute(tag_state->_attributes.data[i]);
508
+ }
509
+ gumbo_free(tag_state->_name);
510
+ gumbo_free(tag_state->_attributes.data);
511
+ mark_tag_state_as_empty(tag_state);
512
+ gumbo_string_buffer_destroy(&tag_state->_buffer);
513
+ gumbo_debug("Abandoning current tag.\n");
514
+ }
515
+
516
+ // Emits a comment token. Comments use the temporary buffer to accumulate their
517
+ // data, and then it's copied over and released to the 'text' field of the
518
+ // GumboToken union. Always returns EMIT_TOKEN.
519
+ static StateResult emit_comment(GumboParser* parser, GumboToken* output) {
520
+ output->type = GUMBO_TOKEN_COMMENT;
521
+ finish_temporary_buffer(parser, &output->v.text);
522
+ finish_token(parser, output);
523
+ return EMIT_TOKEN;
524
+ }
525
+
526
+ static void set_mark(GumboParser* parser) {
527
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
528
+ utf8iterator_mark(&tokenizer->_input);
529
+ }
530
+
531
+ // Checks to see we should be emitting characters from the mark, and fills the
532
+ // output token with the next output character if so.
533
+ // Returns EMIT_TOKEN if a character has been emitted and the tokenizer should
534
+ // immediately return, CONTINUE if we should resume normal operation.
535
+ static StateResult maybe_emit_from_mark (
536
+ GumboParser* parser,
537
+ GumboToken* output
538
+ ) {
539
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
540
+ const char* pos = tokenizer->_resume_pos;
541
+
542
+ if (!pos)
543
+ return CONTINUE;
544
+ if (utf8iterator_get_char_pointer(&tokenizer->_input) >= pos) {
545
+ tokenizer->_resume_pos = NULL;
546
+ return CONTINUE;
547
+ }
548
+
549
+ // emit_char advances the input stream. _reconsume_current_input should
550
+ // *never* be set when emitting from the mark since those characters have
551
+ // already been advanced past.
552
+ assert(!tokenizer->_reconsume_current_input);
553
+ return emit_char(parser, utf8iterator_current(&tokenizer->_input), output);
554
+ }
555
+
556
+ // Sets up the tokenizer to begin emitting from the mark up to, but not
557
+ // including, the current code point. This resets the input iterator stream to
558
+ // the mark, sets up _resume_pos, and then emits the first character in it.
559
+ // Returns EMIT_TOKEN.
560
+ static StateResult emit_from_mark(GumboParser* parser, GumboToken* output) {
561
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
562
+ tokenizer->_resume_pos = utf8iterator_get_char_pointer(&tokenizer->_input);
563
+ utf8iterator_reset(&tokenizer->_input);
564
+ // Now that we have reset the input, we need to advance through it.
565
+ tokenizer->_reconsume_current_input = false;
566
+ StateResult result = maybe_emit_from_mark(parser, output);
567
+ assert(result == EMIT_TOKEN);
568
+ return result;
569
+ }
570
+
571
+ // Appends a codepoint to the current tag buffer. If
572
+ // reinitialize_position_on_first is set, this also initializes the tag buffer
573
+ // start point; the only time you would *not* want to pass true for this
574
+ // parameter is if you want the original_text to include character (like an
575
+ // opening quote) that doesn't appear in the value.
576
+ static void append_char_to_tag_buffer (
577
+ GumboParser* parser,
578
+ int codepoint,
579
+ bool reinitialize_position_on_first
580
+ ) {
581
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
582
+ if (buffer->length == 0 && reinitialize_position_on_first) {
583
+ reset_tag_buffer_start_point(parser);
584
+ }
585
+ gumbo_string_buffer_append_codepoint(codepoint, buffer);
586
+ }
587
+
588
+ // Like above but append a string.
589
+ static void append_string_to_tag_buffer (
590
+ GumboParser* parser,
591
+ GumboStringPiece* str,
592
+ bool reinitialize_position_on_first
593
+ ) {
594
+ GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
595
+ if (buffer->length == 0 && reinitialize_position_on_first) {
596
+ reset_tag_buffer_start_point(parser);
597
+ }
598
+ gumbo_string_buffer_append_string(str, buffer);
599
+ }
600
+
601
+ // (Re-)initialize the tag buffer. This also resets the original_text pointer
602
+ // and _start_pos field to point to the current position.
603
+ static void initialize_tag_buffer(GumboParser* parser) {
604
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
605
+ GumboTagState* tag_state = &tokenizer->_tag_state;
606
+
607
+ gumbo_string_buffer_init(&tag_state->_buffer);
608
+ reset_tag_buffer_start_point(parser);
609
+ }
610
+
611
+ // https://html.spec.whatwg.org/multipage/parsing.html#charref-in-attribute
612
+ static bool character_reference_part_of_attribute(GumboParser* parser) {
613
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
614
+ switch (tokenizer->_return_state) {
615
+ case GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED:
616
+ case GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED:
617
+ case GUMBO_LEX_ATTR_VALUE_UNQUOTED:
618
+ return true;
619
+ default:
620
+ return false;
621
+ }
622
+ }
623
+
624
+ // https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
625
+ // For each code point in the temporary buffer, add to the current attribute
626
+ // value if the character reference was consumed as part of an attribute or
627
+ // emit the code point as a character token.
628
+ static StateResult flush_code_points_consumed_as_character_reference (
629
+ GumboParser* parser,
630
+ GumboToken* output
631
+ ) {
632
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
633
+ if (character_reference_part_of_attribute(parser)) {
634
+ const char *start = utf8iterator_get_mark_pointer(&tokenizer->_input);
635
+ assert(start);
636
+ GumboStringPiece str = {
637
+ .data = start,
638
+ .length = utf8iterator_get_char_pointer(&tokenizer->_input) - start,
639
+ };
640
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
641
+ append_string_to_tag_buffer(parser, &str, unquoted);
642
+ return CONTINUE;
643
+ }
644
+ return emit_from_mark(parser, output);
645
+ }
646
+
647
+ // After a character reference has been successfully constructed, the standard
648
+ // says to set the temporary buffer equal to the empty string, append the code
649
+ // point(s) associated with the reference and flush code points consumed as a
650
+ // character reference.
651
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
652
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
653
+ // That doesn't work for us because we use the temporary buffer in lock step
654
+ // with the input for position and that would fail if we inserted a different
655
+ // number of code points. So duplicate a bit of the above logic.
656
+ static StateResult flush_char_ref (
657
+ GumboParser* parser,
658
+ int first,
659
+ int second,
660
+ GumboToken* output
661
+ ) {
662
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
663
+ if (character_reference_part_of_attribute(parser)) {
664
+ bool unquoted = tokenizer->_return_state == GUMBO_LEX_ATTR_VALUE_UNQUOTED;
665
+ append_char_to_tag_buffer(parser, first, unquoted);
666
+ if (second != kGumboNoChar)
667
+ append_char_to_tag_buffer(parser, second, unquoted);
668
+ return CONTINUE;
669
+ }
670
+ tokenizer->_buffered_emit_char = second;
671
+ return emit_char(parser, first, output);
672
+ }
673
+
674
+
675
+ // Initializes the tag_state to start a new tag, keeping track of the opening
676
+ // positions and original text. Takes a boolean indicating whether this is a
677
+ // start or end tag.
678
+ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
679
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
680
+ GumboTagState* tag_state = &tokenizer->_tag_state;
681
+ int c = utf8iterator_current(&tokenizer->_input);
682
+ assert(is_alpha(c));
683
+ c = ensure_lowercase(c);
684
+ assert(is_alpha(c));
685
+
686
+ initialize_tag_buffer(parser);
687
+
688
+ assert(tag_state->_name == NULL);
689
+ assert(tag_state->_attributes.data == NULL);
690
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
691
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
692
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
693
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
694
+ gumbo_vector_init(1, &tag_state->_attributes);
695
+ tag_state->_drop_next_attr_value = false;
696
+ tag_state->_is_start_tag = is_start_tag;
697
+ tag_state->_is_self_closing = false;
698
+ gumbo_debug("Starting new tag.\n");
699
+ }
700
+
701
+ // Fills in the specified char* with the contents of the tag buffer.
702
+ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
703
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
704
+ GumboTagState* tag_state = &tokenizer->_tag_state;
705
+ *output = gumbo_string_buffer_to_string(&tag_state->_buffer);
706
+ }
707
+
708
+ // Fills in:
709
+ // * The original_text GumboStringPiece with the portion of the original
710
+ // buffer that corresponds to the tag buffer.
711
+ // * The start_pos GumboSourcePosition with the start position of the tag
712
+ // buffer.
713
+ // * The end_pos GumboSourcePosition with the current source position.
714
+ static void copy_over_original_tag_text (
715
+ GumboParser* parser,
716
+ GumboStringPiece* original_text,
717
+ GumboSourcePosition* start_pos,
718
+ GumboSourcePosition* end_pos
719
+ ) {
720
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
721
+ GumboTagState* tag_state = &tokenizer->_tag_state;
722
+
723
+ original_text->data = tag_state->_original_text;
724
+ original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
725
+ tag_state->_original_text;
726
+ if (
727
+ original_text->length
728
+ && original_text->data[original_text->length - 1] == '\r'
729
+ ) {
730
+ // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
731
+ // appended to the end of original text even when it's really the first part
732
+ // of the next character. If we detect this situation, shrink the length of
733
+ // the original text by 1 to remove the carriage return.
734
+ --original_text->length;
735
+ }
736
+ *start_pos = tag_state->_start_pos;
737
+ utf8iterator_get_position(&tokenizer->_input, end_pos);
738
+ }
739
+
740
+ // Releases and then re-initializes the tag buffer.
741
+ static void reinitialize_tag_buffer(GumboParser* parser) {
742
+ gumbo_free(parser->_tokenizer_state->_tag_state._buffer.data);
743
+ initialize_tag_buffer(parser);
744
+ }
745
+
746
+ // Moves some data from the temporary buffer over the the tag-based fields in
747
+ // TagState.
748
+ static void finish_tag_name(GumboParser* parser) {
749
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
750
+ GumboTagState* tag_state = &tokenizer->_tag_state;
751
+
752
+ const char *data = tag_state->_buffer.data;
753
+ size_t length = tag_state->_buffer.length;
754
+ tag_state->_tag = gumbo_tagn_enum(data, length);
755
+ if (tag_state->_tag == GUMBO_TAG_UNKNOWN) {
756
+ char *name = gumbo_alloc(length + 1);
757
+ memcpy(name, data, length);
758
+ name[length] = 0;
759
+ tag_state->_name = name;
760
+ }
761
+ reinitialize_tag_buffer(parser);
762
+ }
763
+
764
+ // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
765
+ static void add_duplicate_attr_error(GumboParser* parser) {
766
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
767
+ GumboError* error = gumbo_add_error(parser);
768
+ if (!error) {
769
+ return;
770
+ }
771
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
772
+ error->type = GUMBO_ERR_DUPLICATE_ATTRIBUTE;
773
+ error->position = tag_state->_start_pos;
774
+ error->original_text.data = tag_state->_original_text;
775
+ error->original_text.length =
776
+ utf8iterator_get_char_pointer(&tokenizer->_input) - error->original_text.data;
777
+ error->v.tokenizer.state = tokenizer->_state;
778
+ }
779
+
780
+ // Creates a new attribute in the current tag, copying the current tag buffer to
781
+ // the attribute's name. The attribute's value starts out as the empty string
782
+ // (following the "Boolean attributes" section of the spec) and is only
783
+ // overwritten on finish_attribute_value(). If the attribute has already been
784
+ // specified, the new attribute is dropped and a parse error is added
785
+ static void finish_attribute_name(GumboParser* parser) {
786
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
787
+ GumboTagState* tag_state = &tokenizer->_tag_state;
788
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
789
+
790
+ int max_attributes = parser->_options->max_attributes;
791
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
792
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
793
+ gumbo_debug("Attributes limit exceeded.\n");
794
+ reinitialize_tag_buffer(parser);
795
+ tag_state->_drop_next_attr_value = true;
796
+ return;
797
+ }
798
+
799
+ // May've been set by a previous attribute without a value; reset it here.
800
+ tag_state->_drop_next_attr_value = false;
801
+ assert(tag_state->_attributes.data);
802
+ assert(tag_state->_attributes.capacity);
803
+
804
+ for (unsigned int i = 0; i < attributes->length; ++i) {
805
+ GumboAttribute* attr = attributes->data[i];
806
+ if (
807
+ strlen(attr->name) == tag_state->_buffer.length
808
+ && 0 == memcmp (
809
+ attr->name,
810
+ tag_state->_buffer.data,
811
+ tag_state->_buffer.length
812
+ )
813
+ ) {
814
+ // Identical attribute; bail.
815
+ add_duplicate_attr_error(parser);
816
+ reinitialize_tag_buffer(parser);
817
+ tag_state->_drop_next_attr_value = true;
818
+ return;
819
+ }
820
+ }
821
+
822
+ GumboAttribute* attr = gumbo_alloc(sizeof(GumboAttribute));
823
+ attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
824
+ copy_over_tag_buffer(parser, &attr->name);
825
+ copy_over_original_tag_text (
826
+ parser,
827
+ &attr->original_name,
828
+ &attr->name_start,
829
+ &attr->name_end
830
+ );
831
+ attr->value = gumbo_strdup("");
832
+ copy_over_original_tag_text (
833
+ parser,
834
+ &attr->original_value,
835
+ &attr->name_start,
836
+ &attr->name_end
837
+ );
838
+ gumbo_vector_add(attr, attributes);
839
+ reinitialize_tag_buffer(parser);
840
+ }
841
+
842
+ // Finishes an attribute value. This sets the value of the most recently added
843
+ // attribute to the current contents of the tag buffer.
844
+ static void finish_attribute_value(GumboParser* parser) {
845
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
846
+ if (tag_state->_drop_next_attr_value) {
847
+ // Duplicate attribute name detected in an earlier state, so we have to
848
+ // ignore the value.
849
+ tag_state->_drop_next_attr_value = false;
850
+ reinitialize_tag_buffer(parser);
851
+ return;
852
+ }
853
+
854
+ GumboAttribute* attr =
855
+ tag_state->_attributes.data[tag_state->_attributes.length - 1];
856
+ gumbo_free((void*) attr->value);
857
+ copy_over_tag_buffer(parser, &attr->value);
858
+ copy_over_original_tag_text(
859
+ parser, &attr->original_value, &attr->value_start, &attr->value_end);
860
+ reinitialize_tag_buffer(parser);
861
+ }
862
+
863
+ // Returns true if the current end tag matches the last start tag emitted.
864
+ static bool is_appropriate_end_tag(GumboParser* parser) {
865
+ GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
866
+ assert(!tag_state->_is_start_tag);
867
+ return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
868
+ tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
869
+ tag_state->_buffer.length);
870
+ }
871
+
872
+ void gumbo_tokenizer_state_init (
873
+ GumboParser* parser,
874
+ const char* text,
875
+ size_t text_length
876
+ ) {
877
+ GumboTokenizerState* tokenizer = gumbo_alloc(sizeof(GumboTokenizerState));
878
+ parser->_tokenizer_state = tokenizer;
879
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
880
+ tokenizer->_return_state = GUMBO_LEX_DATA;
881
+ tokenizer->_character_reference_code = 0;
882
+ tokenizer->_reconsume_current_input = false;
883
+ tokenizer->_is_adjusted_current_node_foreign = false;
884
+ tokenizer->_is_in_cdata = false;
885
+ tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
886
+ tokenizer->_tag_state._name = NULL;
887
+
888
+ tokenizer->_buffered_emit_char = kGumboNoChar;
889
+ gumbo_string_buffer_init(&tokenizer->_temporary_buffer);
890
+ tokenizer->_resume_pos = NULL;
891
+
892
+ mark_tag_state_as_empty(&tokenizer->_tag_state);
893
+
894
+ utf8iterator_init(parser, text, text_length, &tokenizer->_input);
895
+ utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
896
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
897
+ doc_type_state_init(parser);
898
+ }
899
+
900
+ void gumbo_tokenizer_state_destroy(GumboParser* parser) {
901
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
902
+ assert(tokenizer->_doc_type_state.name == NULL);
903
+ assert(tokenizer->_doc_type_state.public_identifier == NULL);
904
+ assert(tokenizer->_doc_type_state.system_identifier == NULL);
905
+ gumbo_string_buffer_destroy(&tokenizer->_temporary_buffer);
906
+ assert(tokenizer->_tag_state._name == NULL);
907
+ assert(tokenizer->_tag_state._attributes.data == NULL);
908
+ gumbo_free(tokenizer);
909
+ }
910
+
911
+ void gumbo_tokenizer_set_state(GumboParser* parser, GumboTokenizerEnum state) {
912
+ parser->_tokenizer_state->_state = state;
913
+ }
914
+
915
+ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
916
+ GumboParser* parser,
917
+ bool is_foreign
918
+ ) {
919
+ if (is_foreign != parser->_tokenizer_state->_is_adjusted_current_node_foreign) {
920
+ gumbo_debug (
921
+ "Toggling is_current_node_foreign to %s.\n",
922
+ is_foreign ? "true" : "false"
923
+ );
924
+ }
925
+ parser->_tokenizer_state->_is_adjusted_current_node_foreign = is_foreign;
926
+ }
927
+
928
+ static void reconsume_in_state(GumboParser* parser, GumboTokenizerEnum state) {
929
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
930
+ tokenizer->_reconsume_current_input = true;
931
+ tokenizer->_state = state;
932
+ }
933
+
934
+ // https://html.spec.whatwg.org/multipage/parsing.html#data-state
935
+ static StateResult handle_data_state (
936
+ GumboParser* parser,
937
+ GumboTokenizerState* tokenizer,
938
+ int c,
939
+ GumboToken* output
940
+ ) {
941
+ switch (c) {
942
+ case '&':
943
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
944
+ set_mark(parser);
945
+ tokenizer->_return_state = GUMBO_LEX_DATA;
946
+ return CONTINUE;
947
+ case '<':
948
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_TAG_OPEN);
949
+ set_mark(parser);
950
+ return CONTINUE;
951
+ case '\0':
952
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
953
+ return emit_char(parser, c, output);
954
+ case -1:
955
+ return emit_eof(parser, output);
956
+ default:
957
+ return emit_char(parser, c, output);
958
+ }
959
+ }
960
+
961
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
962
+ static StateResult handle_rcdata_state (
963
+ GumboParser* parser,
964
+ GumboTokenizerState* tokenizer,
965
+ int c,
966
+ GumboToken* output
967
+ ) {
968
+ switch (c) {
969
+ case '&':
970
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
971
+ set_mark(parser);
972
+ tokenizer->_return_state = GUMBO_LEX_RCDATA;
973
+ return CONTINUE;
974
+ case '<':
975
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_LT);
976
+ set_mark(parser);
977
+ return CONTINUE;
978
+ case '\0':
979
+ return emit_replacement_char(parser, output);
980
+ case -1:
981
+ return emit_eof(parser, output);
982
+ default:
983
+ return emit_char(parser, c, output);
984
+ }
985
+ }
986
+
987
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
988
+ static StateResult handle_rawtext_state (
989
+ GumboParser* parser,
990
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
991
+ int c,
992
+ GumboToken* output
993
+ ) {
994
+ switch (c) {
995
+ case '<':
996
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
997
+ set_mark(parser);
998
+ return CONTINUE;
999
+ case '\0':
1000
+ return emit_replacement_char(parser, output);
1001
+ case -1:
1002
+ return emit_eof(parser, output);
1003
+ default:
1004
+ return emit_char(parser, c, output);
1005
+ }
1006
+ }
1007
+
1008
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1009
+ static StateResult handle_script_data_state (
1010
+ GumboParser* parser,
1011
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1012
+ int c,
1013
+ GumboToken* output
1014
+ ) {
1015
+ switch (c) {
1016
+ case '<':
1017
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_LT);
1018
+ set_mark(parser);
1019
+ return CONTINUE;
1020
+ case '\0':
1021
+ return emit_replacement_char(parser, output);
1022
+ case -1:
1023
+ return emit_eof(parser, output);
1024
+ default:
1025
+ return emit_char(parser, c, output);
1026
+ }
1027
+ }
1028
+
1029
+ // https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
1030
+ static StateResult handle_plaintext_state (
1031
+ GumboParser* parser,
1032
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1033
+ int c,
1034
+ GumboToken* output
1035
+ ) {
1036
+ switch (c) {
1037
+ case '\0':
1038
+ return emit_replacement_char(parser, output);
1039
+ case -1:
1040
+ return emit_eof(parser, output);
1041
+ default:
1042
+ return emit_char(parser, c, output);
1043
+ }
1044
+ }
1045
+
1046
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
1047
+ static StateResult handle_tag_open_state (
1048
+ GumboParser* parser,
1049
+ GumboTokenizerState* tokenizer,
1050
+ int c,
1051
+ GumboToken* output
1052
+ ) {
1053
+ switch (c) {
1054
+ case '!':
1055
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_MARKUP_DECLARATION_OPEN);
1056
+ clear_temporary_buffer(parser);
1057
+ return CONTINUE;
1058
+ case '/':
1059
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_END_TAG_OPEN);
1060
+ return CONTINUE;
1061
+ case '?':
1062
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME);
1063
+ clear_temporary_buffer(parser);
1064
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1065
+ return CONTINUE;
1066
+ case -1:
1067
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1068
+ // Switch to data to emit EOF.
1069
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1070
+ return emit_from_mark(parser, output);
1071
+ default:
1072
+ if (is_alpha(c)) {
1073
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1074
+ start_new_tag(parser, true);
1075
+ return CONTINUE;
1076
+ }
1077
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1078
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1079
+ return emit_from_mark(parser, output);
1080
+ }
1081
+ }
1082
+
1083
+ // https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
1084
+ static StateResult handle_end_tag_open_state (
1085
+ GumboParser* parser,
1086
+ GumboTokenizerState* tokenizer,
1087
+ int c,
1088
+ GumboToken* output
1089
+ ) {
1090
+ switch (c) {
1091
+ case '>':
1092
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_END_TAG_NAME);
1093
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1094
+ return CONTINUE;
1095
+ case -1:
1096
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_BEFORE_TAG_NAME);
1097
+ // Similar to the tag open state except we need to emit '<' and '/'
1098
+ // before the EOF.
1099
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
1100
+ return emit_from_mark(parser, output);
1101
+ default:
1102
+ if (is_alpha(c)) {
1103
+ reconsume_in_state(parser, GUMBO_LEX_TAG_NAME);
1104
+ start_new_tag(parser, false);
1105
+ } else {
1106
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME);
1107
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
1108
+ clear_temporary_buffer(parser);
1109
+ }
1110
+ return CONTINUE;
1111
+ }
1112
+ }
1113
+
1114
+ // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
1115
+ static StateResult handle_tag_name_state (
1116
+ GumboParser* parser,
1117
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1118
+ int c,
1119
+ GumboToken* output
1120
+ ) {
1121
+ switch (c) {
1122
+ case '\t':
1123
+ case '\n':
1124
+ case '\f':
1125
+ case ' ':
1126
+ finish_tag_name(parser);
1127
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1128
+ return CONTINUE;
1129
+ case '/':
1130
+ finish_tag_name(parser);
1131
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1132
+ return CONTINUE;
1133
+ case '>':
1134
+ finish_tag_name(parser);
1135
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1136
+ return emit_current_tag(parser, output);
1137
+ case '\0':
1138
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1139
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1140
+ return CONTINUE;
1141
+ case -1:
1142
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1143
+ abandon_current_tag(parser);
1144
+ return emit_eof(parser, output);
1145
+ default:
1146
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1147
+ return CONTINUE;
1148
+ }
1149
+ }
1150
+
1151
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
1152
+ static StateResult handle_rcdata_lt_state (
1153
+ GumboParser* parser,
1154
+ GumboTokenizerState* tokenizer,
1155
+ int c,
1156
+ GumboToken* output
1157
+ ) {
1158
+ if (c == '/') {
1159
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
1160
+ return CONTINUE;
1161
+ } else {
1162
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1163
+ return emit_from_mark(parser, output);
1164
+ }
1165
+ }
1166
+
1167
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
1168
+ static StateResult handle_rcdata_end_tag_open_state (
1169
+ GumboParser* parser,
1170
+ GumboTokenizerState* tokenizer,
1171
+ int c,
1172
+ GumboToken* output
1173
+ ) {
1174
+ if (is_alpha(c)) {
1175
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
1176
+ start_new_tag(parser, false);
1177
+ return CONTINUE;
1178
+ }
1179
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1180
+ return emit_from_mark(parser, output);
1181
+ }
1182
+
1183
+ // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
1184
+ static StateResult handle_rcdata_end_tag_name_state (
1185
+ GumboParser* parser,
1186
+ GumboTokenizerState* tokenizer,
1187
+ int c,
1188
+ GumboToken* output
1189
+ ) {
1190
+ UNUSED_IF_NDEBUG(tokenizer);
1191
+ if (is_alpha(c)) {
1192
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1193
+ return CONTINUE;
1194
+ }
1195
+ switch (c) {
1196
+ case '\t':
1197
+ case '\n':
1198
+ case '\f':
1199
+ case ' ':
1200
+ if (is_appropriate_end_tag(parser)) {
1201
+ finish_tag_name(parser);
1202
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1203
+ return CONTINUE;
1204
+ }
1205
+ break;
1206
+ case '/':
1207
+ if (is_appropriate_end_tag(parser)) {
1208
+ finish_tag_name(parser);
1209
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1210
+ return CONTINUE;
1211
+ }
1212
+ break;
1213
+ case '>':
1214
+ if (is_appropriate_end_tag(parser)) {
1215
+ finish_tag_name(parser);
1216
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1217
+ return emit_current_tag(parser, output);
1218
+ }
1219
+ break;
1220
+ }
1221
+ abandon_current_tag(parser);
1222
+ reconsume_in_state(parser, GUMBO_LEX_RCDATA);
1223
+ return emit_from_mark(parser, output);
1224
+ }
1225
+
1226
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
1227
+ static StateResult handle_rawtext_lt_state (
1228
+ GumboParser* parser,
1229
+ GumboTokenizerState* tokenizer,
1230
+ int c,
1231
+ GumboToken* output
1232
+ ) {
1233
+ if (c == '/') {
1234
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
1235
+ return CONTINUE;
1236
+ } else {
1237
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1238
+ return emit_from_mark(parser, output);
1239
+ }
1240
+ }
1241
+
1242
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
1243
+ static StateResult handle_rawtext_end_tag_open_state (
1244
+ GumboParser* parser,
1245
+ GumboTokenizerState* tokenizer,
1246
+ int c,
1247
+ GumboToken* output
1248
+ ) {
1249
+ if (is_alpha(c)) {
1250
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
1251
+ start_new_tag(parser, false);
1252
+ return CONTINUE;
1253
+ } else {
1254
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1255
+ return emit_from_mark(parser, output);
1256
+ }
1257
+ }
1258
+
1259
+ // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
1260
+ static StateResult handle_rawtext_end_tag_name_state (
1261
+ GumboParser* parser,
1262
+ GumboTokenizerState* tokenizer,
1263
+ int c,
1264
+ GumboToken* output
1265
+ ) {
1266
+ if (is_alpha(c)) {
1267
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1268
+ return CONTINUE;
1269
+ }
1270
+ switch (c) {
1271
+ case '\t':
1272
+ case '\n':
1273
+ case '\f':
1274
+ case ' ':
1275
+ if (is_appropriate_end_tag(parser)) {
1276
+ finish_tag_name(parser);
1277
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1278
+ return CONTINUE;
1279
+ }
1280
+ break;
1281
+ case '/':
1282
+ if (is_appropriate_end_tag(parser)) {
1283
+ finish_tag_name(parser);
1284
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1285
+ return CONTINUE;
1286
+ }
1287
+ break;
1288
+ case '>':
1289
+ if (is_appropriate_end_tag(parser)) {
1290
+ finish_tag_name(parser);
1291
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1292
+ return emit_current_tag(parser, output);
1293
+ }
1294
+ break;
1295
+ }
1296
+ abandon_current_tag(parser);
1297
+ reconsume_in_state(parser, GUMBO_LEX_RAWTEXT);
1298
+ return emit_from_mark(parser, output);
1299
+ }
1300
+
1301
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
1302
+ static StateResult handle_script_data_lt_state (
1303
+ GumboParser* parser,
1304
+ GumboTokenizerState* tokenizer,
1305
+ int c,
1306
+ GumboToken* output
1307
+ ) {
1308
+ if (c == '/') {
1309
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN);
1310
+ return CONTINUE;
1311
+ }
1312
+ if (c == '!') {
1313
+ // This is the only place we don't reconsume the input before emitting the
1314
+ // temporary buffer. Since the current position is stored and the current
1315
+ // character is not emitted, we need to advance the input and then
1316
+ // reconsume.
1317
+ utf8iterator_next(&tokenizer->_input);
1318
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START);
1319
+ return emit_from_mark(parser, output);
1320
+ }
1321
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1322
+ return emit_from_mark(parser, output);
1323
+ }
1324
+
1325
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
1326
+ static StateResult handle_script_data_end_tag_open_state (
1327
+ GumboParser* parser,
1328
+ GumboTokenizerState* tokenizer,
1329
+ int c,
1330
+ GumboToken* output
1331
+ ) {
1332
+ if (is_alpha(c)) {
1333
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME);
1334
+ start_new_tag(parser, false);
1335
+ return CONTINUE;
1336
+ }
1337
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1338
+ return emit_from_mark(parser, output);
1339
+ }
1340
+
1341
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
1342
+ static StateResult handle_script_data_end_tag_name_state (
1343
+ GumboParser* parser,
1344
+ GumboTokenizerState* tokenizer,
1345
+ int c,
1346
+ GumboToken* output
1347
+ ) {
1348
+ if (is_alpha(c)) {
1349
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1350
+ return CONTINUE;
1351
+ }
1352
+ switch (c) {
1353
+ case '\t':
1354
+ case '\n':
1355
+ case '\f':
1356
+ case ' ':
1357
+ if (is_appropriate_end_tag(parser)) {
1358
+ finish_tag_name(parser);
1359
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1360
+ return CONTINUE;
1361
+ }
1362
+ break;
1363
+ case '/':
1364
+ if (is_appropriate_end_tag(parser)) {
1365
+ finish_tag_name(parser);
1366
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1367
+ return CONTINUE;
1368
+ }
1369
+ break;
1370
+ case '>':
1371
+ if (is_appropriate_end_tag(parser)) {
1372
+ finish_tag_name(parser);
1373
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1374
+ return emit_current_tag(parser, output);
1375
+ }
1376
+ break;
1377
+ }
1378
+ abandon_current_tag(parser);
1379
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1380
+ return emit_from_mark(parser, output);
1381
+ }
1382
+
1383
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
1384
+ static StateResult handle_script_data_escaped_start_state (
1385
+ GumboParser* parser,
1386
+ GumboTokenizerState* tokenizer,
1387
+ int c,
1388
+ GumboToken* output
1389
+ ) {
1390
+ if (c == '-') {
1391
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH);
1392
+ return emit_char(parser, c, output);
1393
+ }
1394
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1395
+ return CONTINUE;
1396
+ }
1397
+
1398
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
1399
+ static StateResult handle_script_data_escaped_start_dash_state (
1400
+ GumboParser* parser,
1401
+ GumboTokenizerState* tokenizer,
1402
+ int c,
1403
+ GumboToken* output
1404
+ ) {
1405
+ if (c == '-') {
1406
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1407
+ return emit_char(parser, c, output);
1408
+ } else {
1409
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA);
1410
+ return CONTINUE;
1411
+ }
1412
+ }
1413
+
1414
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
1415
+ static StateResult handle_script_data_escaped_state (
1416
+ GumboParser* parser,
1417
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1418
+ int c,
1419
+ GumboToken* output
1420
+ ) {
1421
+ switch (c) {
1422
+ case '-':
1423
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH);
1424
+ return emit_char(parser, c, output);
1425
+ case '<':
1426
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1427
+ clear_temporary_buffer(parser);
1428
+ set_mark(parser);
1429
+ return CONTINUE;
1430
+ case '\0':
1431
+ return emit_replacement_char(parser, output);
1432
+ case -1:
1433
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1434
+ return emit_eof(parser, output);
1435
+ default:
1436
+ return emit_char(parser, c, output);
1437
+ }
1438
+ }
1439
+
1440
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
1441
+ static StateResult handle_script_data_escaped_dash_state (
1442
+ GumboParser* parser,
1443
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1444
+ int c,
1445
+ GumboToken* output
1446
+ ) {
1447
+ switch (c) {
1448
+ case '-':
1449
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH);
1450
+ return emit_char(parser, c, output);
1451
+ case '<':
1452
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1453
+ clear_temporary_buffer(parser);
1454
+ set_mark(parser);
1455
+ return CONTINUE;
1456
+ case '\0':
1457
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1458
+ return emit_replacement_char(parser, output);
1459
+ case -1:
1460
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1461
+ return emit_eof(parser, output);
1462
+ default:
1463
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1464
+ return emit_char(parser, c, output);
1465
+ }
1466
+ }
1467
+
1468
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
1469
+ static StateResult handle_script_data_escaped_dash_dash_state (
1470
+ GumboParser* parser,
1471
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1472
+ int c,
1473
+ GumboToken* output
1474
+ ) {
1475
+ switch (c) {
1476
+ case '-':
1477
+ return emit_char(parser, c, output);
1478
+ case '<':
1479
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT);
1480
+ clear_temporary_buffer(parser);
1481
+ set_mark(parser);
1482
+ return CONTINUE;
1483
+ case '>':
1484
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1485
+ return emit_char(parser, c, output);
1486
+ case '\0':
1487
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1488
+ return emit_replacement_char(parser, output);
1489
+ case -1:
1490
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1491
+ return emit_eof(parser, output);
1492
+ default:
1493
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1494
+ return emit_char(parser, c, output);
1495
+ }
1496
+ }
1497
+
1498
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
1499
+ static StateResult handle_script_data_escaped_lt_state (
1500
+ GumboParser* parser,
1501
+ GumboTokenizerState* tokenizer,
1502
+ int c,
1503
+ GumboToken* output
1504
+ ) {
1505
+ assert(temporary_buffer_is_empty(parser));
1506
+ if (c == '/') {
1507
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN);
1508
+ return CONTINUE;
1509
+ }
1510
+ if (is_alpha(c)) {
1511
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START);
1512
+ return emit_from_mark(parser, output);
1513
+ }
1514
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1515
+ return emit_from_mark(parser, output);
1516
+ }
1517
+
1518
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
1519
+ static StateResult handle_script_data_escaped_end_tag_open_state (
1520
+ GumboParser* parser,
1521
+ GumboTokenizerState* tokenizer,
1522
+ int c,
1523
+ GumboToken* output
1524
+ ) {
1525
+ if (is_alpha(c)) {
1526
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME);
1527
+ start_new_tag(parser, false);
1528
+ return CONTINUE;
1529
+ }
1530
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1531
+ return emit_from_mark(parser, output);
1532
+ }
1533
+
1534
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
1535
+ static StateResult handle_script_data_escaped_end_tag_name_state (
1536
+ GumboParser* parser,
1537
+ GumboTokenizerState* tokenizer,
1538
+ int c,
1539
+ GumboToken* output
1540
+ ) {
1541
+ if (is_alpha(c)) {
1542
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1543
+ return CONTINUE;
1544
+ }
1545
+ switch (c) {
1546
+ case '\t':
1547
+ case '\n':
1548
+ case '\f':
1549
+ case ' ':
1550
+ if (is_appropriate_end_tag(parser)) {
1551
+ finish_tag_name(parser);
1552
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1553
+ return CONTINUE;
1554
+ }
1555
+ break;
1556
+ case '/':
1557
+ if (is_appropriate_end_tag(parser)) {
1558
+ finish_tag_name(parser);
1559
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1560
+ return CONTINUE;
1561
+ }
1562
+ break;
1563
+ case '>':
1564
+ if (is_appropriate_end_tag(parser)) {
1565
+ finish_tag_name(parser);
1566
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1567
+ return emit_current_tag(parser, output);
1568
+ }
1569
+ break;
1570
+ }
1571
+ abandon_current_tag(parser);
1572
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1573
+ return emit_from_mark(parser, output);
1574
+ }
1575
+
1576
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
1577
+ static StateResult handle_script_data_double_escaped_start_state (
1578
+ GumboParser* parser,
1579
+ GumboTokenizerState* tokenizer,
1580
+ int c,
1581
+ GumboToken* output
1582
+ ) {
1583
+ switch (c) {
1584
+ case '\t':
1585
+ case '\n':
1586
+ case '\f':
1587
+ case ' ':
1588
+ case '/':
1589
+ case '>':
1590
+ gumbo_tokenizer_set_state (
1591
+ parser,
1592
+ gumbo_string_equals (
1593
+ &kScriptTag,
1594
+ (GumboStringPiece*) &tokenizer->_temporary_buffer
1595
+ )
1596
+ ? GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED
1597
+ : GUMBO_LEX_SCRIPT_DATA_ESCAPED
1598
+ );
1599
+ return emit_char(parser, c, output);
1600
+ }
1601
+ if (is_alpha(c)) {
1602
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1603
+ return emit_char(parser, c, output);
1604
+ }
1605
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_ESCAPED);
1606
+ return CONTINUE;
1607
+ }
1608
+
1609
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
1610
+ static StateResult handle_script_data_double_escaped_state (
1611
+ GumboParser* parser,
1612
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1613
+ int c,
1614
+ GumboToken* output
1615
+ ) {
1616
+ switch (c) {
1617
+ case '-':
1618
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH);
1619
+ return emit_char(parser, c, output);
1620
+ case '<':
1621
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1622
+ return emit_char(parser, c, output);
1623
+ case '\0':
1624
+ return emit_replacement_char(parser, output);
1625
+ case -1:
1626
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1627
+ return emit_eof(parser, output);
1628
+ default:
1629
+ return emit_char(parser, c, output);
1630
+ }
1631
+ }
1632
+
1633
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
1634
+ static StateResult handle_script_data_double_escaped_dash_state (
1635
+ GumboParser* parser,
1636
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1637
+ int c,
1638
+ GumboToken* output
1639
+ ) {
1640
+ switch (c) {
1641
+ case '-':
1642
+ gumbo_tokenizer_set_state(
1643
+ parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH);
1644
+ return emit_char(parser, c, output);
1645
+ case '<':
1646
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1647
+ return emit_char(parser, c, output);
1648
+ case '\0':
1649
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1650
+ return emit_replacement_char(parser, output);
1651
+ case -1:
1652
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1653
+ return emit_eof(parser, output);
1654
+ default:
1655
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1656
+ return emit_char(parser, c, output);
1657
+ }
1658
+ }
1659
+
1660
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
1661
+ static StateResult handle_script_data_double_escaped_dash_dash_state (
1662
+ GumboParser* parser,
1663
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
1664
+ int c,
1665
+ GumboToken* output
1666
+ ) {
1667
+ switch (c) {
1668
+ case '-':
1669
+ return emit_char(parser, c, output);
1670
+ case '<':
1671
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT);
1672
+ return emit_char(parser, c, output);
1673
+ case '>':
1674
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA);
1675
+ return emit_char(parser, c, output);
1676
+ case '\0':
1677
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1678
+ return emit_replacement_char(parser, output);
1679
+ case -1:
1680
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
1681
+ return emit_eof(parser, output);
1682
+ default:
1683
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1684
+ return emit_char(parser, c, output);
1685
+ }
1686
+ }
1687
+
1688
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
1689
+ static StateResult handle_script_data_double_escaped_lt_state (
1690
+ GumboParser* parser,
1691
+ GumboTokenizerState* tokenizer,
1692
+ int c,
1693
+ GumboToken* output
1694
+ ) {
1695
+ if (c == '/') {
1696
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END);
1697
+ clear_temporary_buffer(parser);
1698
+ return emit_char(parser, c, output);
1699
+ } else {
1700
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1701
+ return CONTINUE;
1702
+ }
1703
+ }
1704
+
1705
+ // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
1706
+ static StateResult handle_script_data_double_escaped_end_state (
1707
+ GumboParser* parser,
1708
+ GumboTokenizerState* tokenizer,
1709
+ int c,
1710
+ GumboToken* output
1711
+ ) {
1712
+ switch (c) {
1713
+ case '\t':
1714
+ case '\n':
1715
+ case '\f':
1716
+ case ' ':
1717
+ case '/':
1718
+ case '>':
1719
+ gumbo_tokenizer_set_state(
1720
+ parser, gumbo_string_equals(&kScriptTag,
1721
+ (GumboStringPiece*) &tokenizer->_temporary_buffer)
1722
+ ? GUMBO_LEX_SCRIPT_DATA_ESCAPED
1723
+ : GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1724
+ return emit_char(parser, c, output);
1725
+ }
1726
+ if (is_alpha(c)) {
1727
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
1728
+ return emit_char(parser, c, output);
1729
+ }
1730
+ reconsume_in_state(parser, GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED);
1731
+ return CONTINUE;
1732
+ }
1733
+
1734
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
1735
+ static StateResult handle_before_attr_name_state (
1736
+ GumboParser* parser,
1737
+ GumboTokenizerState* tokenizer,
1738
+ int c,
1739
+ GumboToken* output
1740
+ ) {
1741
+ switch (c) {
1742
+ case '\t':
1743
+ case '\n':
1744
+ case '\f':
1745
+ case ' ':
1746
+ return CONTINUE;
1747
+ case '/':
1748
+ case '>':
1749
+ case -1:
1750
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1751
+ return CONTINUE;
1752
+ case '=':
1753
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME);
1754
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1755
+ append_char_to_tag_buffer(parser, c, true);
1756
+ return CONTINUE;
1757
+ default:
1758
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1759
+ return CONTINUE;
1760
+ }
1761
+ }
1762
+
1763
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
1764
+ static StateResult handle_attr_name_state (
1765
+ GumboParser* parser,
1766
+ GumboTokenizerState* tokenizer,
1767
+ int c,
1768
+ GumboToken* output
1769
+ ) {
1770
+ switch (c) {
1771
+ case '\t':
1772
+ case '\n':
1773
+ case '\f':
1774
+ case ' ':
1775
+ case '/':
1776
+ case '>':
1777
+ case -1:
1778
+ finish_attribute_name(parser);
1779
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_ATTR_NAME);
1780
+ return CONTINUE;
1781
+ case '=':
1782
+ finish_attribute_name(parser);
1783
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1784
+ return CONTINUE;
1785
+ case '\0':
1786
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1787
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1788
+ return CONTINUE;
1789
+ case '"':
1790
+ case '\'':
1791
+ case '<':
1792
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME);
1793
+ // Fall through.
1794
+ default:
1795
+ append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1796
+ return CONTINUE;
1797
+ }
1798
+ }
1799
+
1800
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
1801
+ static StateResult handle_after_attr_name_state (
1802
+ GumboParser* parser,
1803
+ GumboTokenizerState* tokenizer,
1804
+ int c,
1805
+ GumboToken* output
1806
+ ) {
1807
+ switch (c) {
1808
+ case '\t':
1809
+ case '\n':
1810
+ case '\f':
1811
+ case ' ':
1812
+ return CONTINUE;
1813
+ case '/':
1814
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1815
+ return CONTINUE;
1816
+ case '=':
1817
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_VALUE);
1818
+ return CONTINUE;
1819
+ case '>':
1820
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1821
+ return emit_current_tag(parser, output);
1822
+ case -1:
1823
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1824
+ abandon_current_tag(parser);
1825
+ return emit_eof(parser, output);
1826
+ default:
1827
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_NAME);
1828
+ return CONTINUE;
1829
+ }
1830
+ }
1831
+
1832
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
1833
+ static StateResult handle_before_attr_value_state (
1834
+ GumboParser* parser,
1835
+ GumboTokenizerState* tokenizer,
1836
+ int c,
1837
+ GumboToken* output
1838
+ ) {
1839
+ switch (c) {
1840
+ case '\t':
1841
+ case '\n':
1842
+ case '\f':
1843
+ case ' ':
1844
+ return CONTINUE;
1845
+ case '"':
1846
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED);
1847
+ reset_tag_buffer_start_point(parser);
1848
+ return CONTINUE;
1849
+ case '\'':
1850
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED);
1851
+ reset_tag_buffer_start_point(parser);
1852
+ return CONTINUE;
1853
+ case '>':
1854
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_ATTRIBUTE_VALUE);
1855
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1856
+ return emit_current_tag(parser, output);
1857
+ }
1858
+ reconsume_in_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1859
+ return CONTINUE;
1860
+ }
1861
+
1862
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
1863
+ static StateResult handle_attr_value_double_quoted_state (
1864
+ GumboParser* parser,
1865
+ GumboTokenizerState* tokenizer,
1866
+ int c,
1867
+ GumboToken* output
1868
+ ) {
1869
+ switch (c) {
1870
+ case '"':
1871
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1872
+ return CONTINUE;
1873
+ case '&':
1874
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1875
+ set_mark(parser);
1876
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED;
1877
+ return CONTINUE;
1878
+ case '\0':
1879
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1880
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1881
+ return CONTINUE;
1882
+ case -1:
1883
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1884
+ abandon_current_tag(parser);
1885
+ return emit_eof(parser, output);
1886
+ default:
1887
+ append_char_to_tag_buffer(parser, c, false);
1888
+ return CONTINUE;
1889
+ }
1890
+ }
1891
+
1892
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
1893
+ static StateResult handle_attr_value_single_quoted_state (
1894
+ GumboParser* parser,
1895
+ GumboTokenizerState* tokenizer,
1896
+ int c,
1897
+ GumboToken* output
1898
+ ) {
1899
+ switch (c) {
1900
+ case '\'':
1901
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
1902
+ return CONTINUE;
1903
+ case '&':
1904
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1905
+ set_mark(parser);
1906
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED;
1907
+ return CONTINUE;
1908
+ case '\0':
1909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1910
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, false);
1911
+ return CONTINUE;
1912
+ case -1:
1913
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1914
+ abandon_current_tag(parser);
1915
+ return emit_eof(parser, output);
1916
+ default:
1917
+ append_char_to_tag_buffer(parser, c, false);
1918
+ return CONTINUE;
1919
+ }
1920
+ }
1921
+
1922
+ // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
1923
+ static StateResult handle_attr_value_unquoted_state (
1924
+ GumboParser* parser,
1925
+ GumboTokenizerState* tokenizer,
1926
+ int c,
1927
+ GumboToken* output
1928
+ ) {
1929
+ switch (c) {
1930
+ case '\t':
1931
+ case '\n':
1932
+ case '\f':
1933
+ case ' ':
1934
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1935
+ finish_attribute_value(parser);
1936
+ return CONTINUE;
1937
+ case '&':
1938
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHARACTER_REFERENCE);
1939
+ set_mark(parser);
1940
+ tokenizer->_return_state = GUMBO_LEX_ATTR_VALUE_UNQUOTED;
1941
+ return CONTINUE;
1942
+ case '>':
1943
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1944
+ finish_attribute_value(parser);
1945
+ return emit_current_tag(parser, output);
1946
+ case '\0':
1947
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
1948
+ append_char_to_tag_buffer(parser, kUtf8ReplacementChar, true);
1949
+ return CONTINUE;
1950
+ case -1:
1951
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1952
+ abandon_current_tag(parser);
1953
+ return emit_eof(parser, output);
1954
+ case '"':
1955
+ case '\'':
1956
+ case '<':
1957
+ case '=':
1958
+ case '`':
1959
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE);
1960
+ // Fall through.
1961
+ default:
1962
+ append_char_to_tag_buffer(parser, c, true);
1963
+ return CONTINUE;
1964
+ }
1965
+ }
1966
+
1967
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
1968
+ static StateResult handle_after_attr_value_quoted_state (
1969
+ GumboParser* parser,
1970
+ GumboTokenizerState* tokenizer,
1971
+ int c,
1972
+ GumboToken* output
1973
+ ) {
1974
+ finish_attribute_value(parser);
1975
+ switch (c) {
1976
+ case '\t':
1977
+ case '\n':
1978
+ case '\f':
1979
+ case ' ':
1980
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1981
+ return CONTINUE;
1982
+ case '/':
1983
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SELF_CLOSING_START_TAG);
1984
+ return CONTINUE;
1985
+ case '>':
1986
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
1987
+ return emit_current_tag(parser, output);
1988
+ case -1:
1989
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
1990
+ abandon_current_tag(parser);
1991
+ return emit_eof(parser, output);
1992
+ default:
1993
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES);
1994
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
1995
+ return CONTINUE;
1996
+ }
1997
+ }
1998
+
1999
+ // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
2000
+ static StateResult handle_self_closing_start_tag_state (
2001
+ GumboParser* parser,
2002
+ GumboTokenizerState* tokenizer,
2003
+ int c,
2004
+ GumboToken* output
2005
+ ) {
2006
+ switch (c) {
2007
+ case '>':
2008
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2009
+ tokenizer->_tag_state._is_self_closing = true;
2010
+ return emit_current_tag(parser, output);
2011
+ case -1:
2012
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_TAG);
2013
+ abandon_current_tag(parser);
2014
+ return emit_eof(parser, output);
2015
+ default:
2016
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG);
2017
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_ATTR_NAME);
2018
+ return CONTINUE;
2019
+ }
2020
+ }
2021
+
2022
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
2023
+ static StateResult handle_bogus_comment_state (
2024
+ GumboParser* parser,
2025
+ GumboTokenizerState* tokenizer,
2026
+ int c,
2027
+ GumboToken* output
2028
+ ) {
2029
+ switch (c) {
2030
+ case '>':
2031
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2032
+ return emit_comment(parser, output);
2033
+ case -1:
2034
+ // We need to emit the comment and then the EOF, so reconsume in data
2035
+ // state.
2036
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2037
+ return emit_comment(parser, output);
2038
+ case '\0':
2039
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2040
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2041
+ return CONTINUE;
2042
+ default:
2043
+ append_char_to_temporary_buffer(parser, c);
2044
+ return CONTINUE;
2045
+ }
2046
+ }
2047
+
2048
+ // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
2049
+ static StateResult handle_markup_declaration_open_state (
2050
+ GumboParser* parser,
2051
+ GumboTokenizerState* tokenizer,
2052
+ int UNUSED_ARG(c),
2053
+ GumboToken* UNUSED_ARG(output)
2054
+ ) {
2055
+ if (
2056
+ utf8iterator_maybe_consume_match (
2057
+ &tokenizer->_input,
2058
+ "--",
2059
+ sizeof("--") - 1,
2060
+ /* case sensitive */ true
2061
+ )
2062
+ ) {
2063
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_START);
2064
+ return CONTINUE;
2065
+ }
2066
+ if (
2067
+ utf8iterator_maybe_consume_match (
2068
+ &tokenizer->_input,
2069
+ "DOCTYPE",
2070
+ sizeof("DOCTYPE") - 1,
2071
+ /* case sensitive */ false
2072
+ )
2073
+ ) {
2074
+ reconsume_in_state(parser, GUMBO_LEX_DOCTYPE);
2075
+ // If we get here, we know we'll eventually emit a doctype token, so now is
2076
+ // the time to initialize the doctype strings. (Not in doctype_state_init,
2077
+ // since then they'll leak if ownership never gets transferred to the
2078
+ // doctype token.
2079
+ tokenizer->_doc_type_state.name = gumbo_strdup("");
2080
+ tokenizer->_doc_type_state.public_identifier = gumbo_strdup("");
2081
+ tokenizer->_doc_type_state.system_identifier = gumbo_strdup("");
2082
+ return CONTINUE;
2083
+ }
2084
+ if (
2085
+ utf8iterator_maybe_consume_match (
2086
+ &tokenizer->_input,
2087
+ "[CDATA[", sizeof("[CDATA[") - 1,
2088
+ /* case sensitive */ true
2089
+ )
2090
+ ) {
2091
+ if (tokenizer->_is_adjusted_current_node_foreign) {
2092
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2093
+ tokenizer->_is_in_cdata = true;
2094
+ // Start the token after the <![CDATA[.
2095
+ reset_token_start_point(tokenizer);
2096
+ } else {
2097
+ tokenizer_add_token_parse_error(parser, GUMBO_ERR_CDATA_IN_HTML_CONTENT);
2098
+ clear_temporary_buffer(parser);
2099
+ append_string_to_temporary_buffer (
2100
+ parser,
2101
+ &(const GumboStringPiece) { .data = "[CDATA[", .length = 7 }
2102
+ );
2103
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2104
+ }
2105
+ return CONTINUE;
2106
+ }
2107
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_OPENED_COMMENT);
2108
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_COMMENT);
2109
+ clear_temporary_buffer(parser);
2110
+ return CONTINUE;
2111
+ }
2112
+
2113
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
2114
+ static StateResult handle_comment_start_state (
2115
+ GumboParser* parser,
2116
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2117
+ int c,
2118
+ GumboToken* output
2119
+ ) {
2120
+ switch (c) {
2121
+ case '-':
2122
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
2123
+ return CONTINUE;
2124
+ case '>':
2125
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2126
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2127
+ return emit_comment(parser, output);
2128
+ default:
2129
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2130
+ return CONTINUE;
2131
+ }
2132
+ }
2133
+
2134
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
2135
+ static StateResult handle_comment_start_dash_state (
2136
+ GumboParser* parser,
2137
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2138
+ int c,
2139
+ GumboToken* output
2140
+ ) {
2141
+ switch (c) {
2142
+ case '-':
2143
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2144
+ return CONTINUE;
2145
+ case '>':
2146
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT);
2147
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2148
+ return emit_comment(parser, output);
2149
+ case -1:
2150
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2151
+ // Switch to data to emit the EOF next.
2152
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2153
+ return emit_comment(parser, output);
2154
+ default:
2155
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2156
+ append_char_to_temporary_buffer(parser, '-');
2157
+ return CONTINUE;
2158
+ }
2159
+ }
2160
+
2161
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-state
2162
+ static StateResult handle_comment_state (
2163
+ GumboParser* parser,
2164
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2165
+ int c,
2166
+ GumboToken* output
2167
+ ) {
2168
+ switch (c) {
2169
+ case '<':
2170
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT);
2171
+ append_char_to_temporary_buffer(parser, c);
2172
+ return CONTINUE;
2173
+ case '-':
2174
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2175
+ return CONTINUE;
2176
+ case '\0':
2177
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2178
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2179
+ return CONTINUE;
2180
+ case -1:
2181
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2182
+ // Switch to data to emit the EOF token next.
2183
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2184
+ return emit_comment(parser, output);
2185
+ default:
2186
+ append_char_to_temporary_buffer(parser, c);
2187
+ return CONTINUE;
2188
+ }
2189
+ }
2190
+
2191
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
2192
+ static StateResult handle_comment_lt_state (
2193
+ GumboParser* parser,
2194
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2195
+ int c,
2196
+ GumboToken* output
2197
+ ) {
2198
+ switch (c) {
2199
+ case '!':
2200
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG);
2201
+ append_char_to_temporary_buffer(parser, c);
2202
+ return CONTINUE;
2203
+ case '<':
2204
+ append_char_to_temporary_buffer(parser, c);
2205
+ return CONTINUE;
2206
+ default:
2207
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2208
+ return CONTINUE;
2209
+ }
2210
+ }
2211
+
2212
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
2213
+ static StateResult handle_comment_lt_bang_state (
2214
+ GumboParser* parser,
2215
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2216
+ int c,
2217
+ GumboToken* output
2218
+ ) {
2219
+ switch (c) {
2220
+ case '-':
2221
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH);
2222
+ return CONTINUE;
2223
+ default:
2224
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2225
+ return CONTINUE;
2226
+ }
2227
+ }
2228
+
2229
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
2230
+ static StateResult handle_comment_lt_bang_dash_state (
2231
+ GumboParser* parser,
2232
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2233
+ int c,
2234
+ GumboToken* output
2235
+ ) {
2236
+ switch (c) {
2237
+ case '-':
2238
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH);
2239
+ return CONTINUE;
2240
+ default:
2241
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2242
+ return CONTINUE;
2243
+ }
2244
+ }
2245
+
2246
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
2247
+ static StateResult handle_comment_lt_bang_dash_dash_state (
2248
+ GumboParser* parser,
2249
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2250
+ int c,
2251
+ GumboToken* output
2252
+ ) {
2253
+ switch (c) {
2254
+ case '>':
2255
+ case -1:
2256
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2257
+ return CONTINUE;
2258
+ default:
2259
+ tokenizer_add_parse_error(parser, GUMBO_ERR_NESTED_COMMENT);
2260
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT_END);
2261
+ return CONTINUE;
2262
+ }
2263
+ }
2264
+
2265
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
2266
+ static StateResult handle_comment_end_dash_state (
2267
+ GumboParser* parser,
2268
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2269
+ int c,
2270
+ GumboToken* output
2271
+ ) {
2272
+ switch (c) {
2273
+ case '-':
2274
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
2275
+ return CONTINUE;
2276
+ case -1:
2277
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2278
+ // Switch to data to emit EOF next.
2279
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2280
+ return emit_comment(parser, output);
2281
+ default:
2282
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2283
+ append_char_to_temporary_buffer(parser, '-');
2284
+ return CONTINUE;
2285
+ }
2286
+ }
2287
+
2288
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
2289
+ static StateResult handle_comment_end_state (
2290
+ GumboParser* parser,
2291
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2292
+ int c,
2293
+ GumboToken* output
2294
+ ) {
2295
+ switch (c) {
2296
+ case '>':
2297
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2298
+ return emit_comment(parser, output);
2299
+ case '!':
2300
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2301
+ return CONTINUE;
2302
+ case '-':
2303
+ append_char_to_temporary_buffer(parser, '-');
2304
+ return CONTINUE;
2305
+ case -1:
2306
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2307
+ // Switch to data to emit EOF next.
2308
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2309
+ return emit_comment(parser, output);
2310
+ default:
2311
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2312
+ append_char_to_temporary_buffer(parser, '-');
2313
+ append_char_to_temporary_buffer(parser, '-');
2314
+ return CONTINUE;
2315
+ }
2316
+ }
2317
+
2318
+ // https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
2319
+ static StateResult handle_comment_end_bang_state (
2320
+ GumboParser* parser,
2321
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2322
+ int c,
2323
+ GumboToken* output
2324
+ ) {
2325
+ switch (c) {
2326
+ case '-':
2327
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
2328
+ append_char_to_temporary_buffer(parser, '-');
2329
+ append_char_to_temporary_buffer(parser, '-');
2330
+ append_char_to_temporary_buffer(parser, '!');
2331
+ return CONTINUE;
2332
+ case '>':
2333
+ tokenizer_add_parse_error(parser, GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT);
2334
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2335
+ return emit_comment(parser, output);
2336
+ case -1:
2337
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_COMMENT);
2338
+ // Switch to data to emit EOF next.
2339
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2340
+ return emit_comment(parser, output);
2341
+ default:
2342
+ reconsume_in_state(parser, GUMBO_LEX_COMMENT);
2343
+ append_char_to_temporary_buffer(parser, '-');
2344
+ append_char_to_temporary_buffer(parser, '-');
2345
+ append_char_to_temporary_buffer(parser, '!');
2346
+ return CONTINUE;
2347
+ }
2348
+ }
2349
+
2350
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
2351
+ static StateResult handle_doctype_state (
2352
+ GumboParser* parser,
2353
+ GumboTokenizerState* tokenizer,
2354
+ int c,
2355
+ GumboToken* output
2356
+ ) {
2357
+ assert(temporary_buffer_is_empty(parser));
2358
+ switch (c) {
2359
+ case '\t':
2360
+ case '\n':
2361
+ case '\f':
2362
+ case ' ':
2363
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2364
+ return CONTINUE;
2365
+ case '>':
2366
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2367
+ return CONTINUE;
2368
+ case -1:
2369
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2370
+ tokenizer->_doc_type_state.force_quirks = true;
2371
+ // Switch to data to emit EOF next.
2372
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2373
+ return emit_doctype(parser, output);
2374
+ default:
2375
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME);
2376
+ reconsume_in_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_NAME);
2377
+ return CONTINUE;
2378
+ }
2379
+ }
2380
+
2381
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
2382
+ static StateResult handle_before_doctype_name_state (
2383
+ GumboParser* parser,
2384
+ GumboTokenizerState* tokenizer,
2385
+ int c,
2386
+ GumboToken* output
2387
+ ) {
2388
+ switch (c) {
2389
+ case '\t':
2390
+ case '\n':
2391
+ case '\f':
2392
+ case ' ':
2393
+ return CONTINUE;
2394
+ case '\0':
2395
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2396
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2397
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2398
+ return CONTINUE;
2399
+ case '>':
2400
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_NAME);
2401
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2402
+ tokenizer->_doc_type_state.force_quirks = true;
2403
+ return emit_doctype(parser, output);
2404
+ case -1:
2405
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2406
+ tokenizer->_doc_type_state.force_quirks = true;
2407
+ // Switch to data to emit EOF next.
2408
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2409
+ return emit_doctype(parser, output);
2410
+ default:
2411
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE_NAME);
2412
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2413
+ return CONTINUE;
2414
+ }
2415
+ }
2416
+
2417
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
2418
+ static StateResult handle_doctype_name_state (
2419
+ GumboParser* parser,
2420
+ GumboTokenizerState* tokenizer,
2421
+ int c,
2422
+ GumboToken* output
2423
+ ) {
2424
+ switch (c) {
2425
+ case '\t':
2426
+ case '\n':
2427
+ case '\f':
2428
+ case ' ':
2429
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2430
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2431
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2432
+ return CONTINUE;
2433
+ case '>':
2434
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2435
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2436
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2437
+ return emit_doctype(parser, output);
2438
+ case '\0':
2439
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2440
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2441
+ return CONTINUE;
2442
+ case -1:
2443
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2444
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2445
+ tokenizer->_doc_type_state.force_quirks = true;
2446
+ gumbo_free((void*) tokenizer->_doc_type_state.name);
2447
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2448
+ return emit_doctype(parser, output);
2449
+ default:
2450
+ append_char_to_temporary_buffer(parser, ensure_lowercase(c));
2451
+ return CONTINUE;
2452
+ }
2453
+ }
2454
+
2455
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
2456
+ static StateResult handle_after_doctype_name_state (
2457
+ GumboParser* parser,
2458
+ GumboTokenizerState* tokenizer,
2459
+ int c,
2460
+ GumboToken* output
2461
+ ) {
2462
+ switch (c) {
2463
+ case '\t':
2464
+ case '\n':
2465
+ case '\f':
2466
+ case ' ':
2467
+ return CONTINUE;
2468
+ case '>':
2469
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2470
+ return emit_doctype(parser, output);
2471
+ case -1:
2472
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2473
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2474
+ tokenizer->_doc_type_state.force_quirks = true;
2475
+ return emit_doctype(parser, output);
2476
+ default:
2477
+ if (utf8iterator_maybe_consume_match(
2478
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2479
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2480
+ } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2481
+ sizeof("SYSTEM") - 1, false)) {
2482
+ reconsume_in_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2483
+ } else {
2484
+ tokenizer_add_parse_error(
2485
+ parser, GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME);
2486
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2487
+ tokenizer->_doc_type_state.force_quirks = true;
2488
+ }
2489
+ return CONTINUE;
2490
+ }
2491
+ }
2492
+
2493
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
2494
+ static StateResult handle_after_doctype_public_keyword_state (
2495
+ GumboParser* parser,
2496
+ GumboTokenizerState* tokenizer,
2497
+ int c,
2498
+ GumboToken* output
2499
+ ) {
2500
+ switch (c) {
2501
+ case '\t':
2502
+ case '\n':
2503
+ case '\f':
2504
+ case ' ':
2505
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2506
+ return CONTINUE;
2507
+ case '"':
2508
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2509
+ assert(temporary_buffer_is_empty(parser));
2510
+ gumbo_tokenizer_set_state(
2511
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2512
+ return CONTINUE;
2513
+ case '\'':
2514
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2515
+ assert(temporary_buffer_is_empty(parser));
2516
+ gumbo_tokenizer_set_state(
2517
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2518
+ return CONTINUE;
2519
+ case '>':
2520
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2521
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2522
+ tokenizer->_doc_type_state.force_quirks = true;
2523
+ return emit_doctype(parser, output);
2524
+ case -1:
2525
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2526
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2527
+ tokenizer->_doc_type_state.force_quirks = true;
2528
+ return emit_doctype(parser, output);
2529
+ default:
2530
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2531
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2532
+ tokenizer->_doc_type_state.force_quirks = true;
2533
+ return CONTINUE;
2534
+ }
2535
+ }
2536
+
2537
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
2538
+ static StateResult handle_before_doctype_public_id_state (
2539
+ GumboParser* parser,
2540
+ GumboTokenizerState* tokenizer,
2541
+ int c,
2542
+ GumboToken* output
2543
+ ) {
2544
+ switch (c) {
2545
+ case '\t':
2546
+ case '\n':
2547
+ case '\f':
2548
+ case ' ':
2549
+ return CONTINUE;
2550
+ case '"':
2551
+ assert(temporary_buffer_is_empty(parser));
2552
+ gumbo_tokenizer_set_state(
2553
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED);
2554
+ return CONTINUE;
2555
+ case '\'':
2556
+ assert(temporary_buffer_is_empty(parser));
2557
+ gumbo_tokenizer_set_state(
2558
+ parser, GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED);
2559
+ return CONTINUE;
2560
+ case '>':
2561
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER);
2562
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2563
+ tokenizer->_doc_type_state.force_quirks = true;
2564
+ return emit_doctype(parser, output);
2565
+ case -1:
2566
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2567
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2568
+ tokenizer->_doc_type_state.force_quirks = true;
2569
+ return emit_doctype(parser, output);
2570
+ default:
2571
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER);
2572
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2573
+ tokenizer->_doc_type_state.force_quirks = true;
2574
+ return CONTINUE;
2575
+ }
2576
+ }
2577
+
2578
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
2579
+ static StateResult handle_doctype_public_id_double_quoted_state (
2580
+ GumboParser* parser,
2581
+ GumboTokenizerState* tokenizer,
2582
+ int c,
2583
+ GumboToken* output
2584
+ ) {
2585
+ switch (c) {
2586
+ case '"':
2587
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2588
+ finish_doctype_public_id(parser);
2589
+ return CONTINUE;
2590
+ case '\0':
2591
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2592
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2593
+ return CONTINUE;
2594
+ case '>':
2595
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2596
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2597
+ tokenizer->_doc_type_state.force_quirks = true;
2598
+ finish_doctype_public_id(parser);
2599
+ return emit_doctype(parser, output);
2600
+ case -1:
2601
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2602
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2603
+ tokenizer->_doc_type_state.force_quirks = true;
2604
+ finish_doctype_public_id(parser);
2605
+ return emit_doctype(parser, output);
2606
+ default:
2607
+ append_char_to_temporary_buffer(parser, c);
2608
+ return CONTINUE;
2609
+ }
2610
+ }
2611
+
2612
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
2613
+ static StateResult handle_doctype_public_id_single_quoted_state (
2614
+ GumboParser* parser,
2615
+ GumboTokenizerState* tokenizer,
2616
+ int c,
2617
+ GumboToken* output
2618
+ ) {
2619
+ switch (c) {
2620
+ case '\'':
2621
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
2622
+ finish_doctype_public_id(parser);
2623
+ return CONTINUE;
2624
+ case '\0':
2625
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2626
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2627
+ return CONTINUE;
2628
+ case '>':
2629
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER);
2630
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2631
+ tokenizer->_doc_type_state.force_quirks = true;
2632
+ finish_doctype_public_id(parser);
2633
+ return emit_doctype(parser, output);
2634
+ case -1:
2635
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2636
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2637
+ tokenizer->_doc_type_state.force_quirks = true;
2638
+ finish_doctype_public_id(parser);
2639
+ return emit_doctype(parser, output);
2640
+ default:
2641
+ append_char_to_temporary_buffer(parser, c);
2642
+ return CONTINUE;
2643
+ }
2644
+ }
2645
+
2646
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
2647
+ static StateResult handle_after_doctype_public_id_state (
2648
+ GumboParser* parser,
2649
+ GumboTokenizerState* tokenizer,
2650
+ int c,
2651
+ GumboToken* output
2652
+ ) {
2653
+ switch (c) {
2654
+ case '\t':
2655
+ case '\n':
2656
+ case '\f':
2657
+ case ' ':
2658
+ gumbo_tokenizer_set_state(
2659
+ parser, GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID);
2660
+ return CONTINUE;
2661
+ case '>':
2662
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2663
+ return emit_doctype(parser, output);
2664
+ case '"':
2665
+ tokenizer_add_parse_error (
2666
+ parser,
2667
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2668
+ );
2669
+ assert(temporary_buffer_is_empty(parser));
2670
+ gumbo_tokenizer_set_state(
2671
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2672
+ return CONTINUE;
2673
+ case '\'':
2674
+ tokenizer_add_parse_error (
2675
+ parser,
2676
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS
2677
+ );
2678
+ assert(temporary_buffer_is_empty(parser));
2679
+ gumbo_tokenizer_set_state(
2680
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2681
+ return CONTINUE;
2682
+ case -1:
2683
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2684
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2685
+ tokenizer->_doc_type_state.force_quirks = true;
2686
+ return emit_doctype(parser, output);
2687
+ default:
2688
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2689
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2690
+ tokenizer->_doc_type_state.force_quirks = true;
2691
+ return CONTINUE;
2692
+ }
2693
+ }
2694
+
2695
+ // https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
2696
+ static StateResult handle_between_doctype_public_system_id_state (
2697
+ GumboParser* parser,
2698
+ GumboTokenizerState* tokenizer,
2699
+ int c,
2700
+ GumboToken* output
2701
+ ) {
2702
+ switch (c) {
2703
+ case '\t':
2704
+ case '\n':
2705
+ case '\f':
2706
+ case ' ':
2707
+ return CONTINUE;
2708
+ case '>':
2709
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2710
+ return emit_doctype(parser, output);
2711
+ case '"':
2712
+ assert(temporary_buffer_is_empty(parser));
2713
+ gumbo_tokenizer_set_state(
2714
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2715
+ return CONTINUE;
2716
+ case '\'':
2717
+ assert(temporary_buffer_is_empty(parser));
2718
+ gumbo_tokenizer_set_state(
2719
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2720
+ return CONTINUE;
2721
+ case -1:
2722
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2723
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2724
+ tokenizer->_doc_type_state.force_quirks = true;
2725
+ return emit_doctype(parser, output);
2726
+ default:
2727
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2728
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2729
+ tokenizer->_doc_type_state.force_quirks = true;
2730
+ return CONTINUE;
2731
+ }
2732
+ }
2733
+
2734
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
2735
+ static StateResult handle_after_doctype_system_keyword_state (
2736
+ GumboParser* parser,
2737
+ GumboTokenizerState* tokenizer,
2738
+ int c,
2739
+ GumboToken* output
2740
+ ) {
2741
+ switch (c) {
2742
+ case '\t':
2743
+ case '\n':
2744
+ case '\f':
2745
+ case ' ':
2746
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID);
2747
+ return CONTINUE;
2748
+ case '"':
2749
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2750
+ assert(temporary_buffer_is_empty(parser));
2751
+ gumbo_tokenizer_set_state(
2752
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2753
+ return CONTINUE;
2754
+ case '\'':
2755
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2756
+ assert(temporary_buffer_is_empty(parser));
2757
+ gumbo_tokenizer_set_state(
2758
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2759
+ return CONTINUE;
2760
+ case '>':
2761
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2762
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2763
+ tokenizer->_doc_type_state.force_quirks = true;
2764
+ return emit_doctype(parser, output);
2765
+ case -1:
2766
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2767
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2768
+ tokenizer->_doc_type_state.force_quirks = true;
2769
+ return emit_doctype(parser, output);
2770
+ default:
2771
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2772
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2773
+ tokenizer->_doc_type_state.force_quirks = true;
2774
+ return CONTINUE;
2775
+ }
2776
+ }
2777
+
2778
+ // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
2779
+ static StateResult handle_before_doctype_system_id_state (
2780
+ GumboParser* parser,
2781
+ GumboTokenizerState* tokenizer,
2782
+ int c,
2783
+ GumboToken* output
2784
+ ) {
2785
+ switch (c) {
2786
+ case '\t':
2787
+ case '\n':
2788
+ case '\f':
2789
+ case ' ':
2790
+ return CONTINUE;
2791
+ case '"':
2792
+ assert(temporary_buffer_is_empty(parser));
2793
+ gumbo_tokenizer_set_state(
2794
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED);
2795
+ return CONTINUE;
2796
+ case '\'':
2797
+ assert(temporary_buffer_is_empty(parser));
2798
+ gumbo_tokenizer_set_state(
2799
+ parser, GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED);
2800
+ return CONTINUE;
2801
+ case '>':
2802
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER);
2803
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2804
+ tokenizer->_doc_type_state.force_quirks = true;
2805
+ return emit_doctype(parser, output);
2806
+ case -1:
2807
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2808
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2809
+ tokenizer->_doc_type_state.force_quirks = true;
2810
+ return emit_doctype(parser, output);
2811
+ default:
2812
+ tokenizer_add_parse_error(parser, GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER);
2813
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2814
+ tokenizer->_doc_type_state.force_quirks = true;
2815
+ return CONTINUE;
2816
+ }
2817
+ }
2818
+
2819
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
2820
+ static StateResult handle_doctype_system_id_double_quoted_state (
2821
+ GumboParser* parser,
2822
+ GumboTokenizerState* tokenizer,
2823
+ int c,
2824
+ GumboToken* output
2825
+ ) {
2826
+ switch (c) {
2827
+ case '"':
2828
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2829
+ finish_doctype_system_id(parser);
2830
+ return CONTINUE;
2831
+ case '\0':
2832
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2833
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2834
+ return CONTINUE;
2835
+ case '>':
2836
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
2837
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2838
+ tokenizer->_doc_type_state.force_quirks = true;
2839
+ finish_doctype_system_id(parser);
2840
+ return emit_doctype(parser, output);
2841
+ case -1:
2842
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2843
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2844
+ tokenizer->_doc_type_state.force_quirks = true;
2845
+ finish_doctype_system_id(parser);
2846
+ return emit_doctype(parser, output);
2847
+ default:
2848
+ append_char_to_temporary_buffer(parser, c);
2849
+ return CONTINUE;
2850
+ }
2851
+ }
2852
+
2853
+ // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
2854
+ static StateResult handle_doctype_system_id_single_quoted_state (
2855
+ GumboParser* parser,
2856
+ GumboTokenizerState* tokenizer,
2857
+ int c,
2858
+ GumboToken* output
2859
+ ) {
2860
+ switch (c) {
2861
+ case '\'':
2862
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
2863
+ finish_doctype_system_id(parser);
2864
+ return CONTINUE;
2865
+ case '\0':
2866
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2867
+ append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2868
+ return CONTINUE;
2869
+ case '>':
2870
+ tokenizer_add_parse_error(parser, GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER);
2871
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2872
+ tokenizer->_doc_type_state.force_quirks = true;
2873
+ finish_doctype_system_id(parser);
2874
+ return emit_doctype(parser, output);
2875
+ case -1:
2876
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2877
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2878
+ tokenizer->_doc_type_state.force_quirks = true;
2879
+ finish_doctype_system_id(parser);
2880
+ return emit_doctype(parser, output);
2881
+ default:
2882
+ append_char_to_temporary_buffer(parser, c);
2883
+ return CONTINUE;
2884
+ }
2885
+ }
2886
+
2887
+ // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
2888
+ static StateResult handle_after_doctype_system_id_state (
2889
+ GumboParser* parser,
2890
+ GumboTokenizerState* tokenizer,
2891
+ int c,
2892
+ GumboToken* output
2893
+ ) {
2894
+ switch (c) {
2895
+ case '\t':
2896
+ case '\n':
2897
+ case '\f':
2898
+ case ' ':
2899
+ return CONTINUE;
2900
+ case '>':
2901
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2902
+ return emit_doctype(parser, output);
2903
+ case -1:
2904
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_DOCTYPE);
2905
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2906
+ tokenizer->_doc_type_state.force_quirks = true;
2907
+ return emit_doctype(parser, output);
2908
+ default:
2909
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER);
2910
+ reconsume_in_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2911
+ return CONTINUE;
2912
+ }
2913
+ }
2914
+
2915
+ // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
2916
+ static StateResult handle_bogus_doctype_state (
2917
+ GumboParser* parser,
2918
+ GumboTokenizerState* UNUSED_ARG(tokenizer),
2919
+ int c,
2920
+ GumboToken* output
2921
+ ) {
2922
+ switch (c) {
2923
+ case '>':
2924
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2925
+ return emit_doctype(parser, output);
2926
+ case '\0':
2927
+ tokenizer_add_parse_error(parser, GUMBO_ERR_UNEXPECTED_NULL_CHARACTER);
2928
+ return CONTINUE;
2929
+ case -1:
2930
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
2931
+ return emit_doctype(parser, output);
2932
+ default:
2933
+ return CONTINUE;
2934
+ }
2935
+ }
2936
+
2937
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
2938
+ static StateResult handle_cdata_section_state (
2939
+ GumboParser* parser,
2940
+ GumboTokenizerState* tokenizer,
2941
+ int c,
2942
+ GumboToken* output
2943
+ ) {
2944
+ switch (c) {
2945
+ case ']':
2946
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_BRACKET);
2947
+ set_mark(parser);
2948
+ return CONTINUE;
2949
+ case -1:
2950
+ tokenizer_add_parse_error(parser, GUMBO_ERR_EOF_IN_CDATA);
2951
+ return emit_eof(parser, output);
2952
+ default:
2953
+ return emit_char(parser, c, output);
2954
+ }
2955
+ }
2956
+
2957
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
2958
+ static StateResult handle_cdata_section_bracket_state (
2959
+ GumboParser* parser,
2960
+ GumboTokenizerState* tokenizer,
2961
+ int c,
2962
+ GumboToken* output
2963
+ ) {
2964
+ switch (c) {
2965
+ case ']':
2966
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION_END);
2967
+ return CONTINUE;
2968
+ default:
2969
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
2970
+ // Emit the ].
2971
+ return emit_from_mark(parser, output);
2972
+ }
2973
+ }
2974
+
2975
+ // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
2976
+ static StateResult handle_cdata_section_end_state (
2977
+ GumboParser* parser,
2978
+ GumboTokenizerState* tokenizer,
2979
+ int c,
2980
+ GumboToken* output
2981
+ ) {
2982
+ switch (c) {
2983
+ case ']':
2984
+ {
2985
+ // XXX: This is terrible. We want to emit a ] corresponding to the first
2986
+ // of the three in a row we've seen. So let's emit one token from the
2987
+ // temporary buffer (which will rewind 3 characters, emit the ] and
2988
+ // advance one). Next, let's clear the temporary buffer which will set the
2989
+ // mark to the middle of the three brackets. Finally, let's move to the
2990
+ // appropriate state.
2991
+ StateResult result = emit_from_mark(parser, output);
2992
+ tokenizer->_resume_pos = NULL;
2993
+ set_mark(parser);
2994
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA_SECTION);
2995
+ return result;
2996
+ }
2997
+ case '>':
2998
+ // We're done with CDATA so move past the >, reset the token start point
2999
+ // to point after the >, and then reconsume in the data state.
3000
+ utf8iterator_next(&tokenizer->_input);
3001
+ reset_token_start_point(tokenizer);
3002
+ reconsume_in_state(parser, GUMBO_LEX_DATA);
3003
+ tokenizer->_is_in_cdata = false;
3004
+ return CONTINUE;
3005
+ default:
3006
+ reconsume_in_state(parser, GUMBO_LEX_CDATA_SECTION);
3007
+ return emit_from_mark(parser, output);
3008
+ }
3009
+ }
3010
+
3011
+ // https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
3012
+ static StateResult handle_character_reference_state (
3013
+ GumboParser* parser,
3014
+ GumboTokenizerState* tokenizer,
3015
+ int c,
3016
+ GumboToken* output
3017
+ ) {
3018
+ if (gumbo_ascii_isalnum(c)) {
3019
+ reconsume_in_state(parser, GUMBO_LEX_NAMED_CHARACTER_REFERENCE);
3020
+ return CONTINUE;
3021
+ }
3022
+ if (c == '#') {
3023
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE);
3024
+ return CONTINUE;
3025
+ }
3026
+ reconsume_in_state(parser, tokenizer->_return_state);
3027
+ return flush_code_points_consumed_as_character_reference(parser, output);
3028
+ }
3029
+
3030
+ // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
3031
+ static StateResult handle_named_character_reference_state (
3032
+ GumboParser* parser,
3033
+ GumboTokenizerState* tokenizer,
3034
+ int c,
3035
+ GumboToken* output
3036
+ ) {
3037
+ const char *cur = utf8iterator_get_char_pointer(&tokenizer->_input);
3038
+ const char *end = utf8iterator_get_end_pointer(&tokenizer->_input);
3039
+ int code_point[2];
3040
+ size_t size = match_named_char_ref(cur, end - cur, code_point);
3041
+
3042
+ if (size > 0) {
3043
+ utf8iterator_maybe_consume_match(&tokenizer->_input, cur, size, true);
3044
+ int next = utf8iterator_current(&tokenizer->_input);
3045
+ reconsume_in_state(parser, tokenizer->_return_state);
3046
+ if (character_reference_part_of_attribute(parser)
3047
+ && cur[size-1] != ';'
3048
+ && (next == '=' || gumbo_ascii_isalnum(next))) {
3049
+ GumboStringPiece str = { .data = cur, .length = size };
3050
+ append_string_to_temporary_buffer(parser, &str);
3051
+ return flush_code_points_consumed_as_character_reference(parser, output);
3052
+ }
3053
+ if (cur[size-1] != ';')
3054
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE, -1);
3055
+ reconsume_in_state(parser, tokenizer->_return_state);
3056
+ return flush_char_ref(parser, code_point[0], code_point[1], output);
3057
+ }
3058
+ reconsume_in_state(parser, GUMBO_LEX_AMBIGUOUS_AMPERSAND);
3059
+ return flush_code_points_consumed_as_character_reference(parser, output);
3060
+ }
3061
+
3062
+ // https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
3063
+ static StateResult handle_ambiguous_ampersand_state (
3064
+ GumboParser* parser,
3065
+ GumboTokenizerState* tokenizer,
3066
+ int c,
3067
+ GumboToken* output
3068
+ ) {
3069
+ if (gumbo_ascii_isalnum(c)) {
3070
+ if (character_reference_part_of_attribute(parser)) {
3071
+ append_char_to_tag_buffer(parser, c, true);
3072
+ return CONTINUE;
3073
+ }
3074
+ return emit_char(parser, c, output);
3075
+ }
3076
+ if (c == ';') {
3077
+ tokenizer_add_char_ref_error(parser, GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE, -1);
3078
+ // fall through
3079
+ }
3080
+ reconsume_in_state(parser, tokenizer->_return_state);
3081
+ return CONTINUE;
3082
+ }
3083
+
3084
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
3085
+ static StateResult handle_numeric_character_reference_state (
3086
+ GumboParser* parser,
3087
+ GumboTokenizerState* tokenizer,
3088
+ int c,
3089
+ GumboToken* output
3090
+ ) {
3091
+ tokenizer->_character_reference_code = 0;
3092
+ switch (c) {
3093
+ case 'x':
3094
+ case 'X':
3095
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START);
3096
+ return CONTINUE;
3097
+ default:
3098
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START);
3099
+ return CONTINUE;
3100
+ }
3101
+ }
3102
+
3103
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-start-state
3104
+ static StateResult handle_hexadecimal_character_reference_start_state (
3105
+ GumboParser* parser,
3106
+ GumboTokenizerState* tokenizer,
3107
+ int c,
3108
+ GumboToken* output
3109
+ ) {
3110
+ if (gumbo_ascii_isxdigit(c)) {
3111
+ reconsume_in_state(parser, GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE);
3112
+ return CONTINUE;
3113
+ }
3114
+ tokenizer_add_char_ref_error (
3115
+ parser,
3116
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3117
+ -1
3118
+ );
3119
+ reconsume_in_state(parser, tokenizer->_return_state);
3120
+ return flush_code_points_consumed_as_character_reference(parser, output);
3121
+ }
3122
+
3123
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
3124
+ static StateResult handle_decimal_character_reference_start_state (
3125
+ GumboParser* parser,
3126
+ GumboTokenizerState* tokenizer,
3127
+ int c,
3128
+ GumboToken* output
3129
+ ) {
3130
+ if (gumbo_ascii_isdigit(c)) {
3131
+ reconsume_in_state(parser, GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE);
3132
+ return CONTINUE;
3133
+ }
3134
+ tokenizer_add_char_ref_error (
3135
+ parser,
3136
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
3137
+ -1
3138
+ );
3139
+ reconsume_in_state(parser, tokenizer->_return_state);
3140
+ return flush_code_points_consumed_as_character_reference(parser, output);
3141
+ }
3142
+
3143
+ // https://html.spec.whatwg.org/multipage/parsing.html#hexademical-character-reference-state
3144
+ static StateResult handle_hexadecimal_character_reference_state (
3145
+ GumboParser* parser,
3146
+ GumboTokenizerState* tokenizer,
3147
+ int c,
3148
+ GumboToken* output
3149
+ ) {
3150
+ if (gumbo_ascii_isdigit(c)) {
3151
+ tokenizer->_character_reference_code =
3152
+ tokenizer->_character_reference_code * 16 + (c - 0x0030);
3153
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3154
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3155
+ return CONTINUE;
3156
+ }
3157
+ if (gumbo_ascii_isupper_xdigit(c)) {
3158
+ tokenizer->_character_reference_code =
3159
+ tokenizer->_character_reference_code * 16 + (c - 0x0037);
3160
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3161
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3162
+ return CONTINUE;
3163
+ }
3164
+ if (gumbo_ascii_islower_xdigit(c)) {
3165
+ tokenizer->_character_reference_code =
3166
+ tokenizer->_character_reference_code * 16 + (c - 0x0057);
3167
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3168
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3169
+ return CONTINUE;
3170
+ }
3171
+ if (c == ';') {
3172
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3173
+ return CONTINUE;
3174
+ }
3175
+ tokenizer_add_char_ref_error(
3176
+ parser,
3177
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3178
+ tokenizer->_character_reference_code
3179
+ );
3180
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3181
+ return CONTINUE;
3182
+ }
3183
+
3184
+ // https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
3185
+ static StateResult handle_decimal_character_reference_state (
3186
+ GumboParser* parser,
3187
+ GumboTokenizerState* tokenizer,
3188
+ int c,
3189
+ GumboToken* output
3190
+ ) {
3191
+ if (gumbo_ascii_isdigit(c)) {
3192
+ tokenizer->_character_reference_code =
3193
+ tokenizer->_character_reference_code * 10 + (c - 0x0030);
3194
+ if (tokenizer->_character_reference_code > kUtf8MaxChar)
3195
+ tokenizer->_character_reference_code = kUtf8MaxChar+1;
3196
+ return CONTINUE;
3197
+ }
3198
+ if (c == ';') {
3199
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3200
+ return CONTINUE;
3201
+ }
3202
+ tokenizer_add_char_ref_error(
3203
+ parser,
3204
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
3205
+ tokenizer->_character_reference_code
3206
+ );
3207
+ reconsume_in_state(parser, GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END);
3208
+ return CONTINUE;
3209
+ }
3210
+
3211
+ // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
3212
+ static StateResult handle_numeric_character_reference_end_state (
3213
+ GumboParser* parser,
3214
+ GumboTokenizerState* tokenizer,
3215
+ int c,
3216
+ GumboToken* output
3217
+ ) {
3218
+ c = tokenizer->_character_reference_code;
3219
+ if (c == 0) {
3220
+ tokenizer_add_char_ref_error(
3221
+ parser,
3222
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
3223
+ c
3224
+ );
3225
+ c = kUtf8ReplacementChar;
3226
+ } else if (c > kUtf8MaxChar) {
3227
+ tokenizer_add_char_ref_error(
3228
+ parser,
3229
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
3230
+ c
3231
+ );
3232
+ c = kUtf8ReplacementChar;
3233
+ } else if (utf8_is_surrogate(c)) {
3234
+ tokenizer_add_char_ref_error(
3235
+ parser,
3236
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
3237
+ c
3238
+ );
3239
+ c = kUtf8ReplacementChar;
3240
+ } else if (utf8_is_noncharacter(c)) {
3241
+ tokenizer_add_char_ref_error(
3242
+ parser,
3243
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
3244
+ c
3245
+ );
3246
+ } else if (c == 0x0D || (utf8_is_control(c) && !gumbo_ascii_isspace(c))) {
3247
+ tokenizer_add_char_ref_error(
3248
+ parser,
3249
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
3250
+ c
3251
+ );
3252
+ switch (c) {
3253
+ case 0x80: c = 0x20AC; break;
3254
+ case 0x82: c = 0x201A; break;
3255
+ case 0x83: c = 0x0192; break;
3256
+ case 0x84: c = 0x201E; break;
3257
+ case 0x85: c = 0x2026; break;
3258
+ case 0x86: c = 0x2020; break;
3259
+ case 0x87: c = 0x2021; break;
3260
+ case 0x88: c = 0x02C6; break;
3261
+ case 0x89: c = 0x2030; break;
3262
+ case 0x8A: c = 0x0160; break;
3263
+ case 0x8B: c = 0x2039; break;
3264
+ case 0x8C: c = 0x0152; break;
3265
+ case 0x8E: c = 0x017D; break;
3266
+ case 0x91: c = 0x2018; break;
3267
+ case 0x92: c = 0x2019; break;
3268
+ case 0x93: c = 0x201C; break;
3269
+ case 0x94: c = 0x201D; break;
3270
+ case 0x95: c = 0x2022; break;
3271
+ case 0x96: c = 0x2013; break;
3272
+ case 0x97: c = 0x2014; break;
3273
+ case 0x98: c = 0x02DC; break;
3274
+ case 0x99: c = 0x2122; break;
3275
+ case 0x9A: c = 0x0161; break;
3276
+ case 0x9B: c = 0x203A; break;
3277
+ case 0x9C: c = 0x0153; break;
3278
+ case 0x9E: c = 0x017E; break;
3279
+ case 0x9F: c = 0x0178; break;
3280
+ }
3281
+ }
3282
+ reconsume_in_state(parser, tokenizer->_return_state);
3283
+ return flush_char_ref(parser, c, kGumboNoChar, output);
3284
+ }
3285
+
3286
+ typedef StateResult (*GumboLexerStateFunction) (
3287
+ GumboParser* parser,
3288
+ GumboTokenizerState* tokenizer,
3289
+ int c,
3290
+ GumboToken* output
3291
+ );
3292
+
3293
+ static GumboLexerStateFunction dispatch_table[] = {
3294
+ [GUMBO_LEX_DATA] = handle_data_state,
3295
+ [GUMBO_LEX_RCDATA] = handle_rcdata_state,
3296
+ [GUMBO_LEX_RAWTEXT] = handle_rawtext_state,
3297
+ [GUMBO_LEX_SCRIPT_DATA] = handle_script_data_state,
3298
+ [GUMBO_LEX_PLAINTEXT] = handle_plaintext_state,
3299
+ [GUMBO_LEX_TAG_OPEN] = handle_tag_open_state,
3300
+ [GUMBO_LEX_END_TAG_OPEN] = handle_end_tag_open_state,
3301
+ [GUMBO_LEX_TAG_NAME] = handle_tag_name_state,
3302
+ [GUMBO_LEX_RCDATA_LT] = handle_rcdata_lt_state,
3303
+ [GUMBO_LEX_RCDATA_END_TAG_OPEN] = handle_rcdata_end_tag_open_state,
3304
+ [GUMBO_LEX_RCDATA_END_TAG_NAME] = handle_rcdata_end_tag_name_state,
3305
+ [GUMBO_LEX_RAWTEXT_LT] = handle_rawtext_lt_state,
3306
+ [GUMBO_LEX_RAWTEXT_END_TAG_OPEN] = handle_rawtext_end_tag_open_state,
3307
+ [GUMBO_LEX_RAWTEXT_END_TAG_NAME] = handle_rawtext_end_tag_name_state,
3308
+ [GUMBO_LEX_SCRIPT_DATA_LT] = handle_script_data_lt_state,
3309
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_OPEN] = handle_script_data_end_tag_open_state,
3310
+ [GUMBO_LEX_SCRIPT_DATA_END_TAG_NAME] = handle_script_data_end_tag_name_state,
3311
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START] = handle_script_data_escaped_start_state,
3312
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_START_DASH] = handle_script_data_escaped_start_dash_state,
3313
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED] = handle_script_data_escaped_state,
3314
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH] = handle_script_data_escaped_dash_state,
3315
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_DASH_DASH] = handle_script_data_escaped_dash_dash_state,
3316
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_LT] = handle_script_data_escaped_lt_state,
3317
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_OPEN] = handle_script_data_escaped_end_tag_open_state,
3318
+ [GUMBO_LEX_SCRIPT_DATA_ESCAPED_END_TAG_NAME] = handle_script_data_escaped_end_tag_name_state,
3319
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_START] = handle_script_data_double_escaped_start_state,
3320
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED] = handle_script_data_double_escaped_state,
3321
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH] = handle_script_data_double_escaped_dash_state,
3322
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH] = handle_script_data_double_escaped_dash_dash_state,
3323
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_LT] = handle_script_data_double_escaped_lt_state,
3324
+ [GUMBO_LEX_SCRIPT_DATA_DOUBLE_ESCAPED_END] = handle_script_data_double_escaped_end_state,
3325
+ [GUMBO_LEX_BEFORE_ATTR_NAME] = handle_before_attr_name_state,
3326
+ [GUMBO_LEX_ATTR_NAME] = handle_attr_name_state,
3327
+ [GUMBO_LEX_AFTER_ATTR_NAME] = handle_after_attr_name_state,
3328
+ [GUMBO_LEX_BEFORE_ATTR_VALUE] = handle_before_attr_value_state,
3329
+ [GUMBO_LEX_ATTR_VALUE_DOUBLE_QUOTED] = handle_attr_value_double_quoted_state,
3330
+ [GUMBO_LEX_ATTR_VALUE_SINGLE_QUOTED] = handle_attr_value_single_quoted_state,
3331
+ [GUMBO_LEX_ATTR_VALUE_UNQUOTED] = handle_attr_value_unquoted_state,
3332
+ [GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED] = handle_after_attr_value_quoted_state,
3333
+ [GUMBO_LEX_SELF_CLOSING_START_TAG] = handle_self_closing_start_tag_state,
3334
+ [GUMBO_LEX_BOGUS_COMMENT] = handle_bogus_comment_state,
3335
+ [GUMBO_LEX_MARKUP_DECLARATION_OPEN] = handle_markup_declaration_open_state,
3336
+ [GUMBO_LEX_COMMENT_START] = handle_comment_start_state,
3337
+ [GUMBO_LEX_COMMENT_START_DASH] = handle_comment_start_dash_state,
3338
+ [GUMBO_LEX_COMMENT] = handle_comment_state,
3339
+ [GUMBO_LEX_COMMENT_LT] = handle_comment_lt_state,
3340
+ [GUMBO_LEX_COMMENT_LT_BANG] = handle_comment_lt_bang_state,
3341
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH] = handle_comment_lt_bang_dash_state,
3342
+ [GUMBO_LEX_COMMENT_LT_BANG_DASH_DASH] = handle_comment_lt_bang_dash_dash_state,
3343
+ [GUMBO_LEX_COMMENT_END_DASH] = handle_comment_end_dash_state,
3344
+ [GUMBO_LEX_COMMENT_END] = handle_comment_end_state,
3345
+ [GUMBO_LEX_COMMENT_END_BANG] = handle_comment_end_bang_state,
3346
+ [GUMBO_LEX_DOCTYPE] = handle_doctype_state,
3347
+ [GUMBO_LEX_BEFORE_DOCTYPE_NAME] = handle_before_doctype_name_state,
3348
+ [GUMBO_LEX_DOCTYPE_NAME] = handle_doctype_name_state,
3349
+ [GUMBO_LEX_AFTER_DOCTYPE_NAME] = handle_after_doctype_name_state,
3350
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD] = handle_after_doctype_public_keyword_state,
3351
+ [GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID] = handle_before_doctype_public_id_state,
3352
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_DOUBLE_QUOTED] = handle_doctype_public_id_double_quoted_state,
3353
+ [GUMBO_LEX_DOCTYPE_PUBLIC_ID_SINGLE_QUOTED] = handle_doctype_public_id_single_quoted_state,
3354
+ [GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID] = handle_after_doctype_public_id_state,
3355
+ [GUMBO_LEX_BETWEEN_DOCTYPE_PUBLIC_SYSTEM_ID] = handle_between_doctype_public_system_id_state,
3356
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD] = handle_after_doctype_system_keyword_state,
3357
+ [GUMBO_LEX_BEFORE_DOCTYPE_SYSTEM_ID] = handle_before_doctype_system_id_state,
3358
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_DOUBLE_QUOTED] = handle_doctype_system_id_double_quoted_state,
3359
+ [GUMBO_LEX_DOCTYPE_SYSTEM_ID_SINGLE_QUOTED] = handle_doctype_system_id_single_quoted_state,
3360
+ [GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID] = handle_after_doctype_system_id_state,
3361
+ [GUMBO_LEX_BOGUS_DOCTYPE] = handle_bogus_doctype_state,
3362
+ [GUMBO_LEX_CDATA_SECTION] = handle_cdata_section_state,
3363
+ [GUMBO_LEX_CDATA_SECTION_BRACKET] = handle_cdata_section_bracket_state,
3364
+ [GUMBO_LEX_CDATA_SECTION_END] = handle_cdata_section_end_state,
3365
+ [GUMBO_LEX_CHARACTER_REFERENCE] = handle_character_reference_state,
3366
+ [GUMBO_LEX_NAMED_CHARACTER_REFERENCE] = handle_named_character_reference_state,
3367
+ [GUMBO_LEX_AMBIGUOUS_AMPERSAND] = handle_ambiguous_ampersand_state,
3368
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE] = handle_numeric_character_reference_state,
3369
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE_START] = handle_hexadecimal_character_reference_start_state,
3370
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE_START] = handle_decimal_character_reference_start_state,
3371
+ [GUMBO_LEX_HEXADECIMAL_CHARACTER_REFERENCE] = handle_hexadecimal_character_reference_state,
3372
+ [GUMBO_LEX_DECIMAL_CHARACTER_REFERENCE] = handle_decimal_character_reference_state,
3373
+ [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
3374
+ };
3375
+
3376
+ void gumbo_lex(GumboParser* parser, GumboToken* output) {
3377
+ // Because of the spec requirements that...
3378
+ //
3379
+ // 1. Tokens be handled immediately by the parser upon emission.
3380
+ // 2. Some states (eg. CDATA, or various error conditions) require the
3381
+ // emission of multiple tokens in the same states.
3382
+ // 3. The tokenizer often has to reconsume the same character in a different
3383
+ // state.
3384
+ //
3385
+ // ...all state must be held in the GumboTokenizer struct instead of in local
3386
+ // variables in this function. That allows us to return from this method with
3387
+ // a token, and then immediately jump back to the same state with the same
3388
+ // input if we need to return a different token. The various emit_* functions
3389
+ // are responsible for changing state (eg. flushing the chardata buffer,
3390
+ // reading the next input character) to avoid an infinite loop.
3391
+ GumboTokenizerState* tokenizer = parser->_tokenizer_state;
3392
+
3393
+ if (tokenizer->_buffered_emit_char != kGumboNoChar) {
3394
+ tokenizer->_reconsume_current_input = true;
3395
+ emit_char(parser, tokenizer->_buffered_emit_char, output);
3396
+ // And now that we've avoided advancing the input, make sure we set
3397
+ // _reconsume_current_input back to false to make sure the *next* character
3398
+ // isn't consumed twice.
3399
+ tokenizer->_reconsume_current_input = false;
3400
+ tokenizer->_buffered_emit_char = kGumboNoChar;
3401
+ return;
3402
+ }
3403
+
3404
+ if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
3405
+ return;
3406
+ }
3407
+
3408
+ while (1) {
3409
+ assert(!tokenizer->_resume_pos);
3410
+ assert(tokenizer->_buffered_emit_char == kGumboNoChar);
3411
+ int c = utf8iterator_current(&tokenizer->_input);
3412
+ GumboTokenizerEnum state = tokenizer->_state;
3413
+ gumbo_debug("Lexing character '%c' (%d) in state %u.\n", c, c, state);
3414
+ StateResult result = dispatch_table[state](parser, tokenizer, c, output);
3415
+ // We need to clear reconsume_current_input before returning to prevent
3416
+ // certain infinite loop states.
3417
+ bool should_advance = !tokenizer->_reconsume_current_input;
3418
+ tokenizer->_reconsume_current_input = false;
3419
+
3420
+ if (result == EMIT_TOKEN)
3421
+ return;
3422
+
3423
+ if (should_advance) {
3424
+ utf8iterator_next(&tokenizer->_input);
3425
+ }
3426
+ }
3427
+ }
3428
+
3429
+ void gumbo_token_destroy(GumboToken* token) {
3430
+ if (!token) return;
3431
+
3432
+ switch (token->type) {
3433
+ case GUMBO_TOKEN_DOCTYPE:
3434
+ gumbo_free((void*) token->v.doc_type.name);
3435
+ gumbo_free((void*) token->v.doc_type.public_identifier);
3436
+ gumbo_free((void*) token->v.doc_type.system_identifier);
3437
+ return;
3438
+ case GUMBO_TOKEN_START_TAG:
3439
+ for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
3440
+ GumboAttribute* attr = token->v.start_tag.attributes.data[i];
3441
+ if (attr) {
3442
+ // May have been nulled out if this token was merged with another.
3443
+ gumbo_destroy_attribute(attr);
3444
+ }
3445
+ }
3446
+ gumbo_free((void*) token->v.start_tag.attributes.data);
3447
+ if (token->v.start_tag.tag == GUMBO_TAG_UNKNOWN) {
3448
+ gumbo_free(token->v.start_tag.name);
3449
+ token->v.start_tag.name = NULL;
3450
+ }
3451
+ return;
3452
+ case GUMBO_TOKEN_END_TAG:
3453
+ if (token->v.end_tag.tag == GUMBO_TAG_UNKNOWN) {
3454
+ gumbo_free(token->v.end_tag.name);
3455
+ token->v.end_tag.name = NULL;
3456
+ }
3457
+ break;
3458
+ case GUMBO_TOKEN_COMMENT:
3459
+ gumbo_free((void*) token->v.text);
3460
+ return;
3461
+ default:
3462
+ return;
3463
+ }
3464
+ }