nokogumbo 1.4.8 → 1.4.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,57 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_INSERTION_MODE_H_
18
+ #define GUMBO_INSERTION_MODE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
25
+ // If new enum values are added, be sure to update the kTokenHandlers dispatch
26
+ // table in parser.c.
27
+ typedef enum {
28
+ GUMBO_INSERTION_MODE_INITIAL,
29
+ GUMBO_INSERTION_MODE_BEFORE_HTML,
30
+ GUMBO_INSERTION_MODE_BEFORE_HEAD,
31
+ GUMBO_INSERTION_MODE_IN_HEAD,
32
+ GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
33
+ GUMBO_INSERTION_MODE_AFTER_HEAD,
34
+ GUMBO_INSERTION_MODE_IN_BODY,
35
+ GUMBO_INSERTION_MODE_TEXT,
36
+ GUMBO_INSERTION_MODE_IN_TABLE,
37
+ GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
38
+ GUMBO_INSERTION_MODE_IN_CAPTION,
39
+ GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
40
+ GUMBO_INSERTION_MODE_IN_TABLE_BODY,
41
+ GUMBO_INSERTION_MODE_IN_ROW,
42
+ GUMBO_INSERTION_MODE_IN_CELL,
43
+ GUMBO_INSERTION_MODE_IN_SELECT,
44
+ GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
45
+ GUMBO_INSERTION_MODE_IN_TEMPLATE,
46
+ GUMBO_INSERTION_MODE_AFTER_BODY,
47
+ GUMBO_INSERTION_MODE_IN_FRAMESET,
48
+ GUMBO_INSERTION_MODE_AFTER_FRAMESET,
49
+ GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
50
+ GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
51
+ } GumboInsertionMode;
52
+
53
+ #ifdef __cplusplus
54
+ } // extern C
55
+ #endif
56
+
57
+ #endif // GUMBO_INSERTION_MODE_H_
@@ -0,0 +1,4188 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include <assert.h>
18
+ #include <ctype.h>
19
+ #include <stdarg.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "attribute.h"
25
+ #include "error.h"
26
+ #include "gumbo.h"
27
+ #include "insertion_mode.h"
28
+ #include "parser.h"
29
+ #include "tokenizer.h"
30
+ #include "tokenizer_states.h"
31
+ #include "utf8.h"
32
+ #include "util.h"
33
+ #include "vector.h"
34
+
35
+ #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
36
+
37
+ #define GUMBO_STRING(literal) \
38
+ { literal, sizeof(literal) - 1 }
39
+ #define TERMINATOR \
40
+ { "", 0 }
41
+
42
+ typedef char gumbo_tagset[GUMBO_TAG_LAST];
43
+ #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
44
+ #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
45
+ #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
46
+
47
+ #define TAGSET_INCLUDES(tagset, namespace, tag) \
48
+ (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace))
49
+
50
+ // selected forward declarations as it is getting hard to find
51
+ // an appropriate order
52
+ static bool node_html_tag_is(const GumboNode*, GumboTag);
53
+ static GumboInsertionMode get_current_template_insertion_mode(
54
+ const GumboParser*);
55
+ static bool handle_in_template(GumboParser*, GumboToken*);
56
+ static void destroy_node(GumboParser*, GumboNode*);
57
+
58
// Default allocation hook: the user-data pointer is ignored and the request is
// forwarded to the C library's malloc.
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  return malloc(size);
}
59
+
60
// Default deallocation hook: the user-data pointer is ignored and the block is
// handed to the C library's free.
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
61
+
62
+ const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL,
63
+ 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML};
64
+
65
+ static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
66
+ static const GumboStringPiece kPublicIdHtml4_0 =
67
+ GUMBO_STRING("-//W3C//DTD HTML 4.0//EN");
68
+ static const GumboStringPiece kPublicIdHtml4_01 =
69
+ GUMBO_STRING("-//W3C//DTD HTML 4.01//EN");
70
+ static const GumboStringPiece kPublicIdXhtml1_0 =
71
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
72
+ static const GumboStringPiece kPublicIdXhtml1_1 =
73
+ GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN");
74
+ static const GumboStringPiece kSystemIdRecHtml4_0 =
75
+ GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
76
+ static const GumboStringPiece kSystemIdHtml4 =
77
+ GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd");
78
+ static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
79
+ GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
80
+ static const GumboStringPiece kSystemIdXhtml1_1 =
81
+ GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
82
+ static const GumboStringPiece kSystemIdLegacyCompat =
83
+ GUMBO_STRING("about:legacy-compat");
84
+
85
+ // The doctype arrays have an explicit terminator because we want to pass them
86
+ // to a helper function, and passing them as a pointer discards sizeof
87
+ // information. The SVG arrays are used only by one-off functions, and so loops
88
+ // over them use sizeof directly instead of a terminator.
89
+
90
+ static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
91
+ GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
92
+ GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
93
+ GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
94
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
95
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
96
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
97
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
98
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
99
+ GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
100
+ GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
101
+ GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
102
+ GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
103
+ GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
104
+ GUMBO_STRING("-//IETF//DTD HTML 3//"),
105
+ GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
106
+ GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
107
+ GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
108
+ GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
109
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
110
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
111
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
112
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
113
+ GUMBO_STRING("-//IETF//DTD HTML Strict//"),
114
+ GUMBO_STRING("-//IETF//DTD HTML//"),
115
+ GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
116
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
117
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
118
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
119
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
120
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
121
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
122
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
123
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
124
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
125
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
126
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
127
+ GUMBO_STRING(
128
+ "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
129
+ "extensions to HTML 4.0//"),
130
+ GUMBO_STRING(
131
+ "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
132
+ "extensions to HTML 4.0//"),
133
+ GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
134
+ GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
135
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
136
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
137
+ GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
138
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
139
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
140
+ GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
141
+ GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
142
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
143
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
144
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
145
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
146
+ GUMBO_STRING("-//W3C//DTD W3 HTML//"),
147
+ GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
148
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
149
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR};
150
+
151
+ static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
152
+ GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
153
+ GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"),
154
+ TERMINATOR};
155
+
156
+ static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
157
+ GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
158
+ TERMINATOR};
159
+
160
+ static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
161
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
162
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR};
163
+
164
+ static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] =
165
+ {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
166
+ GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR};
167
+
168
+ // Indexed by GumboNamespaceEnum; keep in sync with that.
169
+ static const char* kLegalXmlns[] = {"http://www.w3.org/1999/xhtml",
170
+ "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML"};
171
+
172
+ typedef struct _ReplacementEntry {
173
+ const GumboStringPiece from;
174
+ const GumboStringPiece to;
175
+ } ReplacementEntry;
176
+
177
+ #define REPLACEMENT_ENTRY(from, to) \
178
+ { GUMBO_STRING(from), GUMBO_STRING(to) }
179
+
180
+ // Static data for SVG attribute replacements.
181
+ // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
182
+ static const ReplacementEntry kSvgAttributeReplacements[] = {
183
+ REPLACEMENT_ENTRY("attributename", "attributeName"),
184
+ REPLACEMENT_ENTRY("attributetype", "attributeType"),
185
+ REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
186
+ REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
187
+ REPLACEMENT_ENTRY("calcmode", "calcMode"),
188
+ REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
189
+ // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
190
+ // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
191
+ REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
192
+ REPLACEMENT_ENTRY("edgemode", "edgeMode"),
193
+ // REPLACEMENT_ENTRY("externalresourcesrequired",
194
+ // "externalResourcesRequired"),
195
+ // REPLACEMENT_ENTRY("filterres", "filterRes"),
196
+ REPLACEMENT_ENTRY("filterunits", "filterUnits"),
197
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
198
+ REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
199
+ REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
200
+ REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
201
+ REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
202
+ REPLACEMENT_ENTRY("keypoints", "keyPoints"),
203
+ REPLACEMENT_ENTRY("keysplines", "keySplines"),
204
+ REPLACEMENT_ENTRY("keytimes", "keyTimes"),
205
+ REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
206
+ REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
207
+ REPLACEMENT_ENTRY("markerheight", "markerHeight"),
208
+ REPLACEMENT_ENTRY("markerunits", "markerUnits"),
209
+ REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
210
+ REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
211
+ REPLACEMENT_ENTRY("maskunits", "maskUnits"),
212
+ REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
213
+ REPLACEMENT_ENTRY("pathlength", "pathLength"),
214
+ REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
215
+ REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
216
+ REPLACEMENT_ENTRY("patternunits", "patternUnits"),
217
+ REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
218
+ REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
219
+ REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
220
+ REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
221
+ REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
222
+ REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
223
+ REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"),
224
+ REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
225
+ REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
226
+ REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
227
+ REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
228
+ REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
229
+ REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
230
+ REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
231
+ REPLACEMENT_ENTRY("startoffset", "startOffset"),
232
+ REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
233
+ REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
234
+ REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
235
+ REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
236
+ REPLACEMENT_ENTRY("tablevalues", "tableValues"),
237
+ REPLACEMENT_ENTRY("targetx", "targetX"),
238
+ REPLACEMENT_ENTRY("targety", "targetY"),
239
+ REPLACEMENT_ENTRY("textlength", "textLength"),
240
+ REPLACEMENT_ENTRY("viewbox", "viewBox"),
241
+ REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
242
+ REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
243
+ REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
244
+ REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
245
+ };
246
+
247
+ static const ReplacementEntry kSvgTagReplacements[] = {
248
+ REPLACEMENT_ENTRY("altglyph", "altGlyph"),
249
+ REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
250
+ REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
251
+ REPLACEMENT_ENTRY("animatecolor", "animateColor"),
252
+ REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
253
+ REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
254
+ REPLACEMENT_ENTRY("clippath", "clipPath"),
255
+ REPLACEMENT_ENTRY("feblend", "feBlend"),
256
+ REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
257
+ REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
258
+ REPLACEMENT_ENTRY("fecomposite", "feComposite"),
259
+ REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
260
+ REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
261
+ REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
262
+ REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
263
+ REPLACEMENT_ENTRY("feflood", "feFlood"),
264
+ REPLACEMENT_ENTRY("fefunca", "feFuncA"),
265
+ REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
266
+ REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
267
+ REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
268
+ REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
269
+ REPLACEMENT_ENTRY("feimage", "feImage"),
270
+ REPLACEMENT_ENTRY("femerge", "feMerge"),
271
+ REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
272
+ REPLACEMENT_ENTRY("femorphology", "feMorphology"),
273
+ REPLACEMENT_ENTRY("feoffset", "feOffset"),
274
+ REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
275
+ REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
276
+ REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
277
+ REPLACEMENT_ENTRY("fetile", "feTile"),
278
+ REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
279
+ REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
280
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
281
+ REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
282
+ REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
283
+ REPLACEMENT_ENTRY("textpath", "textPath"),
284
+ };
285
+
286
+ typedef struct _NamespacedAttributeReplacement {
287
+ const char* from;
288
+ const char* local_name;
289
+ const GumboAttributeNamespaceEnum attr_namespace;
290
+ } NamespacedAttributeReplacement;
291
+
292
+ static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
293
+ {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
294
+ {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
295
+ {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
296
+ {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
297
+ {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
298
+ {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
299
+ {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
300
+ {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
301
+ {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
302
+ {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
303
+ {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
304
+ {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
305
+ };
306
+
307
+ // The "scope marker" for the list of active formatting elements. We use a
308
+ // pointer to this as a generic marker element, since the particular element
309
+ // scope doesn't matter.
310
+ static const GumboNode kActiveFormattingScopeMarker;
311
+
312
+ // The tag_is and tag_in function use true & false to denote start & end tags,
313
+ // but for readability, we define constants for them here.
314
+ static const bool kStartTag = true;
315
+ static const bool kEndTag = false;
316
+
317
+ // Because GumboStringPieces are immutable, we can't insert a character directly
318
+ // into a text node. Instead, we accumulate all pending characters here and
319
+ // flush them out to a text node whenever a new element is inserted.
320
+ //
321
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
322
+ typedef struct _TextNodeBufferState {
323
+ // The accumulated text to be inserted into the current text node.
324
+ GumboStringBuffer _buffer;
325
+
326
+ // A pointer to the original text represented by this text node. Note that
327
+ // because of foster parenting and other strange DOM manipulations, this may
328
+ // include other non-text HTML tags in it; it is defined as the span of
329
+ // original text from the first character in this text node to the last
330
+ // character in this text node.
331
+ const char* _start_original_text;
332
+
333
+ // The source position of the start of this text node.
334
+ GumboSourcePosition _start_position;
335
+
336
+ // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
337
+ GumboNodeType _type;
338
+ } TextNodeBufferState;
339
+
340
+ typedef struct GumboInternalParserState {
341
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
342
+ GumboInsertionMode _insertion_mode;
343
+
344
+ // Used for run_generic_parsing_algorithm, which needs to switch back to the
345
+ // original insertion mode at its conclusion.
346
+ GumboInsertionMode _original_insertion_mode;
347
+
348
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
349
+ GumboVector /*GumboNode*/ _open_elements;
350
+
351
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
352
+ GumboVector /*GumboNode*/ _active_formatting_elements;
353
+
354
+ // The stack of template insertion modes.
355
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
356
+ GumboVector /*InsertionMode*/ _template_insertion_modes;
357
+
358
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
359
+ GumboNode* _head_element;
360
+ GumboNode* _form_element;
361
+
362
+ // The element used as fragment context when parsing in fragment mode
363
+ GumboNode* _fragment_ctx;
364
+
365
+ // The flag for when the spec says "Reprocess the current token in..."
366
+ bool _reprocess_current_token;
367
+
368
+ // The flag for "acknowledge the token's self-closing flag".
369
+ bool _self_closing_flag_acknowledged;
370
+
371
+ // The "frameset-ok" flag from the spec.
372
+ bool _frameset_ok;
373
+
374
+ // The flag for "If the next token is a LINE FEED, ignore that token...".
375
+ bool _ignore_next_linefeed;
376
+
377
+ // The flag for "whenever a node would be inserted into the current node, it
378
+ // must instead be foster parented". This is used for misnested table
379
+ // content, which needs to be handled according to "in body" rules yet foster
380
+ // parented outside of the table.
381
+ // It would perhaps be more explicit to have this as a parameter to
382
+ // handle_in_body and insert_element, but given how special-purpose this is
383
+ // and the number of call-sites that would need to take the extra parameter,
384
+ // it's easier just to have a state flag.
385
+ bool _foster_parent_insertions;
386
+
387
+ // The accumulated text node buffer state.
388
+ TextNodeBufferState _text_node;
389
+
390
+ // The current token.
391
+ GumboToken* _current_token;
392
+
393
+ // The way that the spec is written, the </body> and </html> tags are *always*
394
+ // implicit, because encountering one of those tokens merely switches the
395
+ // insertion mode out of "in body". So we have individual state flags for
396
+ // those end tags that are then inspected by pop_current_node when the <body>
397
+ // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
398
+ // flag appropriately.
399
+ bool _closed_body_tag;
400
+ bool _closed_html_tag;
401
+ } GumboParserState;
402
+
403
+ static bool token_has_attribute(const GumboToken* token, const char* name) {
404
+ assert(token->type == GUMBO_TOKEN_START_TAG);
405
+ return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
406
+ }
407
+
408
+ // Checks if the value of the specified attribute is a case-insensitive match
409
+ // for the specified string.
410
+ static bool attribute_matches(
411
+ const GumboVector* attributes, const char* name, const char* value) {
412
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
413
+ return attr ? strcasecmp(value, attr->value) == 0 : false;
414
+ }
415
+
416
+ // Checks if the value of the specified attribute is a case-sensitive match
417
+ // for the specified string.
418
+ static bool attribute_matches_case_sensitive(
419
+ const GumboVector* attributes, const char* name, const char* value) {
420
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
421
+ return attr ? strcmp(value, attr->value) == 0 : false;
422
+ }
423
+
424
+ // Checks if the specified attribute vectors are identical.
425
+ static bool all_attributes_match(
426
+ const GumboVector* attr1, const GumboVector* attr2) {
427
+ unsigned int num_unmatched_attr2_elements = attr2->length;
428
+ for (unsigned int i = 0; i < attr1->length; ++i) {
429
+ const GumboAttribute* attr = attr1->data[i];
430
+ if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
431
+ --num_unmatched_attr2_elements;
432
+ } else {
433
+ return false;
434
+ }
435
+ }
436
+ return num_unmatched_attr2_elements == 0;
437
+ }
438
+
439
+ static void set_frameset_not_ok(GumboParser* parser) {
440
+ gumbo_debug("Setting frameset_ok to false.\n");
441
+ parser->_parser_state->_frameset_ok = false;
442
+ }
443
+
444
+ static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
445
+ GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
446
+ node->parent = NULL;
447
+ node->index_within_parent = -1;
448
+ node->type = type;
449
+ node->parse_flags = GUMBO_INSERTION_NORMAL;
450
+ return node;
451
+ }
452
+
453
+ static GumboNode* new_document_node(GumboParser* parser) {
454
+ GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
455
+ document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
456
+ gumbo_vector_init(parser, 1, &document_node->v.document.children);
457
+
458
+ // Must be initialized explicitly, as there's no guarantee that we'll see a
459
+ // doc type token.
460
+ GumboDocument* document = &document_node->v.document;
461
+ document->has_doctype = false;
462
+ document->name = NULL;
463
+ document->public_identifier = NULL;
464
+ document->system_identifier = NULL;
465
+ return document_node;
466
+ }
467
+
468
+ static void output_init(GumboParser* parser) {
469
+ GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
470
+ output->root = NULL;
471
+ output->document = new_document_node(parser);
472
+ parser->_output = output;
473
+ gumbo_init_errors(parser);
474
+ }
475
+
476
+ static void parser_state_init(GumboParser* parser) {
477
+ GumboParserState* parser_state =
478
+ gumbo_parser_allocate(parser, sizeof(GumboParserState));
479
+ parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
480
+ parser_state->_reprocess_current_token = false;
481
+ parser_state->_frameset_ok = true;
482
+ parser_state->_ignore_next_linefeed = false;
483
+ parser_state->_foster_parent_insertions = false;
484
+ parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
485
+ gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
486
+ gumbo_vector_init(parser, 10, &parser_state->_open_elements);
487
+ gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
488
+ gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
489
+ parser_state->_head_element = NULL;
490
+ parser_state->_form_element = NULL;
491
+ parser_state->_fragment_ctx = NULL;
492
+ parser_state->_current_token = NULL;
493
+ parser_state->_closed_body_tag = false;
494
+ parser_state->_closed_html_tag = false;
495
+ parser->_parser_state = parser_state;
496
+ }
497
+
498
+ static void parser_state_destroy(GumboParser* parser) {
499
+ GumboParserState* state = parser->_parser_state;
500
+ if (state->_fragment_ctx) {
501
+ destroy_node(parser, state->_fragment_ctx);
502
+ }
503
+ gumbo_vector_destroy(parser, &state->_active_formatting_elements);
504
+ gumbo_vector_destroy(parser, &state->_open_elements);
505
+ gumbo_vector_destroy(parser, &state->_template_insertion_modes);
506
+ gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
507
+ gumbo_parser_deallocate(parser, state);
508
+ }
509
+
510
+ static GumboNode* get_document_node(GumboParser* parser) {
511
+ return parser->_output->document;
512
+ }
513
+
514
+ static bool is_fragment_parser(const GumboParser* parser) {
515
+ return !!parser->_parser_state->_fragment_ctx;
516
+ }
517
+
518
+ // Returns the node at the bottom of the stack of open elements, or NULL if no
519
+ // elements have been added yet.
520
+ static GumboNode* get_current_node(GumboParser* parser) {
521
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
522
+ if (open_elements->length == 0) {
523
+ assert(!parser->_output->root);
524
+ return NULL;
525
+ }
526
+ assert(open_elements->length > 0);
527
+ assert(open_elements->data != NULL);
528
+ return open_elements->data[open_elements->length - 1];
529
+ }
530
+
531
+ static GumboNode* get_adjusted_current_node(GumboParser* parser) {
532
+ GumboParserState* state = parser->_parser_state;
533
+ if (state->_open_elements.length == 1 && state->_fragment_ctx) {
534
+ return state->_fragment_ctx;
535
+ }
536
+ return get_current_node(parser);
537
+ }
538
+
539
+ // Returns true if the given needle is in the given array of literal
540
+ // GumboStringPieces. If exact_match is true, this requires that they match
541
+ // exactly; otherwise, this performs a prefix match to check if any of the
542
+ // elements in haystack start with needle. This always performs a
543
+ // case-insensitive match.
544
+ static bool is_in_static_list(
545
+ const char* needle, const GumboStringPiece* haystack, bool exact_match) {
546
+ for (unsigned int i = 0; haystack[i].length > 0; ++i) {
547
+ if ((exact_match && !strcmp(needle, haystack[i].data)) ||
548
+ (!exact_match && !strcasecmp(needle, haystack[i].data))) {
549
+ return true;
550
+ }
551
+ }
552
+ return false;
553
+ }
554
+
555
+ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
556
+ parser->_parser_state->_insertion_mode = mode;
557
+ }
558
+
559
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
560
+ // This is a helper function that returns the appropriate insertion mode instead
561
+ // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
562
+ // indicate that there is no appropriate insertion mode, and the loop should
563
+ // continue.
564
+ static GumboInsertionMode get_appropriate_insertion_mode(
565
+ const GumboParser* parser, int index) {
566
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
567
+ const GumboNode* node = open_elements->data[index];
568
+ const bool is_last = index == 0;
569
+
570
+ if (is_last && is_fragment_parser(parser)) {
571
+ node = parser->_parser_state->_fragment_ctx;
572
+ }
573
+
574
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
575
+ switch (node->v.element.tag) {
576
+ case GUMBO_TAG_SELECT: {
577
+ if (is_last) {
578
+ return GUMBO_INSERTION_MODE_IN_SELECT;
579
+ }
580
+ for (int i = index; i > 0; --i) {
581
+ const GumboNode* ancestor = open_elements->data[i];
582
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
583
+ return GUMBO_INSERTION_MODE_IN_SELECT;
584
+ }
585
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
586
+ return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
587
+ }
588
+ }
589
+ return GUMBO_INSERTION_MODE_IN_SELECT;
590
+ }
591
+ case GUMBO_TAG_TD:
592
+ case GUMBO_TAG_TH:
593
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
594
+ break;
595
+ case GUMBO_TAG_TR:
596
+ return GUMBO_INSERTION_MODE_IN_ROW;
597
+ case GUMBO_TAG_TBODY:
598
+ case GUMBO_TAG_THEAD:
599
+ case GUMBO_TAG_TFOOT:
600
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
601
+ case GUMBO_TAG_CAPTION:
602
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
603
+ case GUMBO_TAG_COLGROUP:
604
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
605
+ case GUMBO_TAG_TABLE:
606
+ return GUMBO_INSERTION_MODE_IN_TABLE;
607
+ case GUMBO_TAG_TEMPLATE:
608
+ return get_current_template_insertion_mode(parser);
609
+ case GUMBO_TAG_HEAD:
610
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
611
+ break;
612
+ case GUMBO_TAG_BODY:
613
+ return GUMBO_INSERTION_MODE_IN_BODY;
614
+ case GUMBO_TAG_FRAMESET:
615
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
616
+ case GUMBO_TAG_HTML:
617
+ return parser->_parser_state->_head_element
618
+ ? GUMBO_INSERTION_MODE_AFTER_HEAD
619
+ : GUMBO_INSERTION_MODE_BEFORE_HEAD;
620
+ default:
621
+ break;
622
+ }
623
+ return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
624
+ }
625
+
626
+ // This performs the actual "reset the insertion mode" loop.
627
+ static void reset_insertion_mode_appropriately(GumboParser* parser) {
628
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
629
+ for (int i = open_elements->length; --i >= 0;) {
630
+ GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
631
+ if (mode != GUMBO_INSERTION_MODE_INITIAL) {
632
+ set_insertion_mode(parser, mode);
633
+ return;
634
+ }
635
+ }
636
+ // Should never get here, because is_last will be set on the last iteration
637
+ // and will force GUMBO_INSERTION_MODE_IN_BODY.
638
+ assert(0);
639
+ }
640
+
641
+ static GumboError* parser_add_parse_error(
642
+ GumboParser* parser, const GumboToken* token) {
643
+ gumbo_debug("Adding parse error.\n");
644
+ GumboError* error = gumbo_add_error(parser);
645
+ if (!error) {
646
+ return NULL;
647
+ }
648
+ error->type = GUMBO_ERR_PARSER;
649
+ error->position = token->position;
650
+ error->original_text = token->original_text.data;
651
+ GumboParserError* extra_data = &error->v.parser;
652
+ extra_data->input_type = token->type;
653
+ extra_data->input_tag = GUMBO_TAG_UNKNOWN;
654
+ if (token->type == GUMBO_TOKEN_START_TAG) {
655
+ extra_data->input_tag = token->v.start_tag.tag;
656
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
657
+ extra_data->input_tag = token->v.end_tag;
658
+ }
659
+ GumboParserState* state = parser->_parser_state;
660
+ extra_data->parser_state = state->_insertion_mode;
661
+ gumbo_vector_init(
662
+ parser, state->_open_elements.length, &extra_data->tag_stack);
663
+ for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
664
+ const GumboNode* node = state->_open_elements.data[i];
665
+ assert(
666
+ node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667
+ gumbo_vector_add(
668
+ parser, (void*) node->v.element.tag, &extra_data->tag_stack);
669
+ }
670
+ return error;
671
+ }
672
+
673
+ // Returns true if the specified token is either a start or end tag (specified
674
+ // by is_start) with one of the tag types in the varargs list. Terminate the
675
+ // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
676
+ // the spec references tags that are not in the spec.
677
+ static bool tag_in(
678
+ const GumboToken* token, bool is_start, const gumbo_tagset tags) {
679
+ GumboTag token_tag;
680
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
681
+ token_tag = token->v.start_tag.tag;
682
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
683
+ token_tag = token->v.end_tag;
684
+ } else {
685
+ return false;
686
+ }
687
+ return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0);
688
+ }
689
+
690
+ // Like tag_in, but for the single-tag case.
691
+ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
692
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
693
+ return token->v.start_tag.tag == tag;
694
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
695
+ return token->v.end_tag == tag;
696
+ } else {
697
+ return false;
698
+ }
699
+ }
700
+
701
+ // Like tag_in, but checks for the tag of a node, rather than a token.
702
+ static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
703
+ assert(node != NULL);
704
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
705
+ return false;
706
+ }
707
+ return TAGSET_INCLUDES(
708
+ tags, node->v.element.tag_namespace, node->v.element.tag);
709
+ }
710
+
711
+ // Like node_tag_in, but for the single-tag case.
712
+ static bool node_qualified_tag_is(
713
+ const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
714
+ assert(node);
715
+ return (node->type == GUMBO_NODE_ELEMENT ||
716
+ node->type == GUMBO_NODE_TEMPLATE) &&
717
+ node->v.element.tag == tag && node->v.element.tag_namespace == ns;
718
+ }
719
+
720
+ // Like node_tag_in, but for the single-tag case in the HTML namespace
721
+ static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
722
+ return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
723
+ }
724
+
725
+ static void push_template_insertion_mode(
726
+ GumboParser* parser, GumboInsertionMode mode) {
727
+ gumbo_vector_add(
728
+ parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
729
+ }
730
+
731
+ static void pop_template_insertion_mode(GumboParser* parser) {
732
+ gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
733
+ }
734
+
735
+ // Returns the current template insertion mode. If the stack of template
736
+ // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
737
+ static GumboInsertionMode get_current_template_insertion_mode(
738
+ const GumboParser* parser) {
739
+ GumboVector* template_insertion_modes =
740
+ &parser->_parser_state->_template_insertion_modes;
741
+ if (template_insertion_modes->length == 0) {
742
+ return GUMBO_INSERTION_MODE_INITIAL;
743
+ }
744
+ return (GumboInsertionMode)
745
+ template_insertion_modes->data[(template_insertion_modes->length - 1)];
746
+ }
747
+
748
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
749
+ static bool is_mathml_integration_point(const GumboNode* node) {
750
+ return node_tag_in_set(
751
+ node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
752
+ TAG_MATHML(MS), TAG_MATHML(MTEXT)});
753
+ }
754
+
755
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
756
+ static bool is_html_integration_point(const GumboNode* node) {
757
+ return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT),
758
+ TAG_SVG(DESC), TAG_SVG(TITLE)}) ||
759
+ (node_qualified_tag_is(
760
+ node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
761
+ (attribute_matches(
762
+ &node->v.element.attributes, "encoding", "text/html") ||
763
+ attribute_matches(&node->v.element.attributes, "encoding",
764
+ "application/xhtml+xml")));
765
+ }
766
+
767
+ // This represents a place to insert a node, consisting of a target parent and a
768
+ // child index within that parent. If the node should be inserted at the end of
769
+ // the parent's child, index will be -1.
770
+ typedef struct {
771
+ GumboNode* target;
772
+ int index;
773
+ } InsertionLocation;
774
+
775
+ InsertionLocation get_appropriate_insertion_location(
776
+ GumboParser* parser, GumboNode* override_target) {
777
+ InsertionLocation retval = {override_target, -1};
778
+ if (retval.target == NULL) {
779
+ // No override target; default to the current node, but special-case the
780
+ // root node since get_current_node() assumes the stack of open elements is
781
+ // non-empty.
782
+ retval.target = parser->_output->root != NULL ? get_current_node(parser)
783
+ : get_document_node(parser);
784
+ }
785
+ if (!parser->_parser_state->_foster_parent_insertions ||
786
+ !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
787
+ TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
788
+ return retval;
789
+ }
790
+
791
+ // Foster-parenting case.
792
+ int last_template_index = -1;
793
+ int last_table_index = -1;
794
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
795
+ for (unsigned int i = 0; i < open_elements->length; ++i) {
796
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
797
+ last_template_index = i;
798
+ }
799
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
800
+ last_table_index = i;
801
+ }
802
+ }
803
+ if (last_template_index != -1 &&
804
+ (last_table_index == -1 || last_template_index > last_table_index)) {
805
+ retval.target = open_elements->data[last_template_index];
806
+ return retval;
807
+ }
808
+ if (last_table_index == -1) {
809
+ retval.target = open_elements->data[0];
810
+ return retval;
811
+ }
812
+ GumboNode* last_table = open_elements->data[last_table_index];
813
+ if (last_table->parent != NULL) {
814
+ retval.target = last_table->parent;
815
+ retval.index = last_table->index_within_parent;
816
+ return retval;
817
+ }
818
+
819
+ retval.target = open_elements->data[last_table_index - 1];
820
+ return retval;
821
+ }
822
+
823
+ // Appends a node to the end of its parent, setting the "parent" and
824
+ // "index_within_parent" fields appropriately.
825
+ static void append_node(
826
+ GumboParser* parser, GumboNode* parent, GumboNode* node) {
827
+ assert(node->parent == NULL);
828
+ assert(node->index_within_parent == -1);
829
+ GumboVector* children;
830
+ if (parent->type == GUMBO_NODE_ELEMENT ||
831
+ parent->type == GUMBO_NODE_TEMPLATE) {
832
+ children = &parent->v.element.children;
833
+ } else {
834
+ assert(parent->type == GUMBO_NODE_DOCUMENT);
835
+ children = &parent->v.document.children;
836
+ }
837
+ node->parent = parent;
838
+ node->index_within_parent = children->length;
839
+ gumbo_vector_add(parser, (void*) node, children);
840
+ assert(node->index_within_parent < children->length);
841
+ }
842
+
843
+ // Inserts a node at the specified InsertionLocation, updating the
844
+ // "parent" and "index_within_parent" fields of it and all its siblings.
845
+ // If the index of the location is -1, this calls append_node.
846
+ static void insert_node(
847
+ GumboParser* parser, GumboNode* node, InsertionLocation location) {
848
+ assert(node->parent == NULL);
849
+ assert(node->index_within_parent == -1);
850
+ GumboNode* parent = location.target;
851
+ int index = location.index;
852
+ if (index != -1) {
853
+ GumboVector* children = NULL;
854
+ if (parent->type == GUMBO_NODE_ELEMENT ||
855
+ parent->type == GUMBO_NODE_TEMPLATE) {
856
+ children = &parent->v.element.children;
857
+ } else if (parent->type == GUMBO_NODE_DOCUMENT) {
858
+ children = &parent->v.document.children;
859
+ assert(children->length == 0);
860
+ } else {
861
+ assert(0);
862
+ }
863
+
864
+ assert(index >= 0);
865
+ assert((unsigned int) index < children->length);
866
+ node->parent = parent;
867
+ node->index_within_parent = index;
868
+ gumbo_vector_insert_at(parser, (void*) node, index, children);
869
+ assert(node->index_within_parent < children->length);
870
+ for (unsigned int i = index + 1; i < children->length; ++i) {
871
+ GumboNode* sibling = children->data[i];
872
+ sibling->index_within_parent = i;
873
+ assert(sibling->index_within_parent < children->length);
874
+ }
875
+ } else {
876
+ append_node(parser, parent, node);
877
+ }
878
+ }
879
+
880
+ static void maybe_flush_text_node_buffer(GumboParser* parser) {
881
+ GumboParserState* state = parser->_parser_state;
882
+ TextNodeBufferState* buffer_state = &state->_text_node;
883
+ if (buffer_state->_buffer.length == 0) {
884
+ return;
885
+ }
886
+
887
+ assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
888
+ buffer_state->_type == GUMBO_NODE_TEXT ||
889
+ buffer_state->_type == GUMBO_NODE_CDATA);
890
+ GumboNode* text_node = create_node(parser, buffer_state->_type);
891
+ GumboText* text_node_data = &text_node->v.text;
892
+ text_node_data->text =
893
+ gumbo_string_buffer_to_string(parser, &buffer_state->_buffer);
894
+ text_node_data->original_text.data = buffer_state->_start_original_text;
895
+ text_node_data->original_text.length =
896
+ state->_current_token->original_text.data -
897
+ buffer_state->_start_original_text;
898
+ text_node_data->start_pos = buffer_state->_start_position;
899
+
900
+ gumbo_debug("Flushing text node buffer of %.*s.\n",
901
+ (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
902
+
903
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
904
+ if (location.target->type == GUMBO_NODE_DOCUMENT) {
905
+ // The DOM does not allow Document nodes to have Text children, so per the
906
+ // spec, they are dropped on the floor.
907
+ destroy_node(parser, text_node);
908
+ } else {
909
+ insert_node(parser, text_node, location);
910
+ }
911
+
912
+ gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
913
+ buffer_state->_type = GUMBO_NODE_WHITESPACE;
914
+ assert(buffer_state->_buffer.length == 0);
915
+ }
916
+
917
+ static void record_end_of_element(
918
+ GumboToken* current_token, GumboElement* element) {
919
+ element->end_pos = current_token->position;
920
+ element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG
921
+ ? current_token->original_text
922
+ : kGumboEmptyString;
923
+ }
924
+
925
+ static GumboNode* pop_current_node(GumboParser* parser) {
926
+ GumboParserState* state = parser->_parser_state;
927
+ maybe_flush_text_node_buffer(parser);
928
+ if (state->_open_elements.length > 0) {
929
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
930
+ gumbo_debug("Popping %s node.\n",
931
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
932
+ }
933
+ GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
934
+ if (!current_node) {
935
+ assert(state->_open_elements.length == 0);
936
+ return NULL;
937
+ }
938
+ assert(current_node->type == GUMBO_NODE_ELEMENT ||
939
+ current_node->type == GUMBO_NODE_TEMPLATE);
940
+ bool is_closed_body_or_html_tag =
941
+ (node_html_tag_is(current_node, GUMBO_TAG_BODY) &&
942
+ state->_closed_body_tag) ||
943
+ (node_html_tag_is(current_node, GUMBO_TAG_HTML) &&
944
+ state->_closed_html_tag);
945
+ if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
946
+ !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
947
+ !is_closed_body_or_html_tag) {
948
+ current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
949
+ }
950
+ if (!is_closed_body_or_html_tag) {
951
+ record_end_of_element(state->_current_token, &current_node->v.element);
952
+ }
953
+ return current_node;
954
+ }
955
+
956
+ static void append_comment_node(
957
+ GumboParser* parser, GumboNode* node, const GumboToken* token) {
958
+ maybe_flush_text_node_buffer(parser);
959
+ GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
960
+ comment->type = GUMBO_NODE_COMMENT;
961
+ comment->parse_flags = GUMBO_INSERTION_NORMAL;
962
+ comment->v.text.text = token->v.text;
963
+ comment->v.text.original_text = token->original_text;
964
+ comment->v.text.start_pos = token->position;
965
+ append_node(parser, node, comment);
966
+ }
967
+
968
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
969
+ static void clear_stack_to_table_row_context(GumboParser* parser) {
970
+ while (!node_tag_in_set(get_current_node(parser),
971
+ (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
972
+ pop_current_node(parser);
973
+ }
974
+ }
975
+
976
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
977
+ static void clear_stack_to_table_context(GumboParser* parser) {
978
+ while (!node_tag_in_set(get_current_node(parser),
979
+ (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) {
980
+ pop_current_node(parser);
981
+ }
982
+ }
983
+
984
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
985
+ void clear_stack_to_table_body_context(GumboParser* parser) {
986
+ while (!node_tag_in_set(get_current_node(parser),
987
+ (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
988
+ TAG(TEMPLATE)})) {
989
+ pop_current_node(parser);
990
+ }
991
+ }
992
+
993
+ // Creates a parser-inserted element in the HTML namespace and returns it.
994
+ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
995
+ GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
996
+ GumboElement* element = &node->v.element;
997
+ gumbo_vector_init(parser, 1, &element->children);
998
+ gumbo_vector_init(parser, 0, &element->attributes);
999
+ element->tag = tag;
1000
+ element->tag_namespace = GUMBO_NAMESPACE_HTML;
1001
+ element->original_tag = kGumboEmptyString;
1002
+ element->original_end_tag = kGumboEmptyString;
1003
+ element->start_pos = (parser->_parser_state->_current_token)
1004
+ ? parser->_parser_state->_current_token->position
1005
+ : kGumboEmptySourcePosition;
1006
+ element->end_pos = kGumboEmptySourcePosition;
1007
+ return node;
1008
+ }
1009
+
1010
+ // Constructs an element from the given start tag token.
1011
+ static GumboNode* create_element_from_token(
1012
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1013
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1014
+ GumboTokenStartTag* start_tag = &token->v.start_tag;
1015
+
1016
+ GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML &&
1017
+ start_tag->tag == GUMBO_TAG_TEMPLATE)
1018
+ ? GUMBO_NODE_TEMPLATE
1019
+ : GUMBO_NODE_ELEMENT;
1020
+
1021
+ GumboNode* node = create_node(parser, type);
1022
+ GumboElement* element = &node->v.element;
1023
+ gumbo_vector_init(parser, 1, &element->children);
1024
+ element->attributes = start_tag->attributes;
1025
+ element->tag = start_tag->tag;
1026
+ element->tag_namespace = tag_namespace;
1027
+
1028
+ assert(token->original_text.length >= 2);
1029
+ assert(token->original_text.data[0] == '<');
1030
+ assert(token->original_text.data[token->original_text.length - 1] == '>');
1031
+ element->original_tag = token->original_text;
1032
+ element->start_pos = token->position;
1033
+ element->original_end_tag = kGumboEmptyString;
1034
+ element->end_pos = kGumboEmptySourcePosition;
1035
+
1036
+ // The element takes ownership of the attributes from the token, so any
1037
+ // allocated-memory fields should be nulled out.
1038
+ start_tag->attributes = kGumboEmptyVector;
1039
+ return node;
1040
+ }
1041
+
1042
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
1043
+ static void insert_element(GumboParser* parser, GumboNode* node,
1044
+ bool is_reconstructing_formatting_elements) {
1045
+ GumboParserState* state = parser->_parser_state;
1046
+ // NOTE(jdtang): The text node buffer must always be flushed before inserting
1047
+ // a node, otherwise we're handling nodes in a different order than the spec
1048
+ // mandated. However, one clause of the spec (character tokens in the body)
1049
+ // requires that we reconstruct the active formatting elements *before* adding
1050
+ // the character, and reconstructing the active formatting elements may itself
1051
+ // result in the insertion of new elements (which should be pushed onto the
1052
+ // stack of open elements before the buffer is flushed). We solve this (for
1053
+ // the time being, the spec has been rewritten for <template> and the new
1054
+ // version may be simpler here) with a boolean flag to this method.
1055
+ if (!is_reconstructing_formatting_elements) {
1056
+ maybe_flush_text_node_buffer(parser);
1057
+ }
1058
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1059
+ insert_node(parser, node, location);
1060
+ gumbo_vector_add(parser, (void*) node, &state->_open_elements);
1061
+ }
1062
+
1063
+ // Convenience method that combines create_element_from_token and
1064
+ // insert_element, inserting the generated element directly into the current
1065
+ // node. Returns the node inserted.
1066
+ static GumboNode* insert_element_from_token(
1067
+ GumboParser* parser, GumboToken* token) {
1068
+ GumboNode* element =
1069
+ create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
1070
+ insert_element(parser, element, false);
1071
+ gumbo_debug("Inserting <%s> element (@%x) from token.\n",
1072
+ gumbo_normalized_tagname(element->v.element.tag), element);
1073
+ return element;
1074
+ }
1075
+
1076
+ // Convenience method that combines create_element and insert_element, inserting
1077
+ // a parser-generated element of a specific tag type. Returns the node
1078
+ // inserted.
1079
+ static GumboNode* insert_element_of_tag_type(
1080
+ GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
1081
+ GumboNode* element = create_element(parser, tag);
1082
+ element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1083
+ insert_element(parser, element, false);
1084
+ gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1085
+ gumbo_normalized_tagname(tag), element);
1086
+ return element;
1087
+ }
1088
+
1089
+ // Convenience method for creating foreign namespaced element. Returns the node
1090
+ // inserted.
1091
+ static GumboNode* insert_foreign_element(
1092
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1093
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1094
+ GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1095
+ insert_element(parser, element, false);
1096
+ if (token_has_attribute(token, "xmlns") &&
1097
+ !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns",
1098
+ kLegalXmlns[tag_namespace])) {
1099
+ // TODO(jdtang): Since there're multiple possible error codes here, we
1100
+ // eventually need reason codes to differentiate them.
1101
+ parser_add_parse_error(parser, token);
1102
+ }
1103
+ if (token_has_attribute(token, "xmlns:xlink") &&
1104
+ !attribute_matches_case_sensitive(&token->v.start_tag.attributes,
1105
+ "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1106
+ parser_add_parse_error(parser, token);
1107
+ }
1108
+ return element;
1109
+ }
1110
+
1111
+ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1112
+ assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1113
+ token->type == GUMBO_TOKEN_CHARACTER ||
1114
+ token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA);
1115
+ TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1116
+ if (buffer_state->_buffer.length == 0) {
1117
+ // Initialize position fields.
1118
+ buffer_state->_start_original_text = token->original_text.data;
1119
+ buffer_state->_start_position = token->position;
1120
+ }
1121
+ gumbo_string_buffer_append_codepoint(
1122
+ parser, token->v.character, &buffer_state->_buffer);
1123
+ if (token->type == GUMBO_TOKEN_CHARACTER) {
1124
+ buffer_state->_type = GUMBO_NODE_TEXT;
1125
+ } else if (token->type == GUMBO_TOKEN_CDATA) {
1126
+ buffer_state->_type = GUMBO_NODE_CDATA;
1127
+ }
1128
+ gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1129
+ }
1130
+
1131
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
1132
+ static void run_generic_parsing_algorithm(
1133
+ GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1134
+ insert_element_from_token(parser, token);
1135
+ gumbo_tokenizer_set_state(parser, lexer_state);
1136
+ parser->_parser_state->_original_insertion_mode =
1137
+ parser->_parser_state->_insertion_mode;
1138
+ parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1139
+ }
1140
+
1141
+ static void acknowledge_self_closing_tag(GumboParser* parser) {
1142
+ parser->_parser_state->_self_closing_flag_acknowledged = true;
1143
+ }
1144
+
1145
+ // Returns true if there's an anchor tag in the list of active formatting
1146
+ // elements, and fills in its index if so.
1147
+ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1148
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1149
+ for (int i = elements->length; --i >= 0;) {
1150
+ GumboNode* node = elements->data[i];
1151
+ if (node == &kActiveFormattingScopeMarker) {
1152
+ return false;
1153
+ }
1154
+ if (node_html_tag_is(node, GUMBO_TAG_A)) {
1155
+ *anchor_index = i;
1156
+ return true;
1157
+ }
1158
+ }
1159
+ return false;
1160
+ }
1161
+
1162
+ // Counts the number of open formatting elements in the list of active
1163
+ // formatting elements (after the last active scope marker) that have a specific
1164
+ // tag. If this is > 0, then earliest_matching_index will be filled in with the
1165
+ // index of the first such element.
1166
+ static int count_formatting_elements_of_tag(GumboParser* parser,
1167
+ const GumboNode* desired_node, int* earliest_matching_index) {
1168
+ const GumboElement* desired_element = &desired_node->v.element;
1169
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1170
+ int num_identical_elements = 0;
1171
+ for (int i = elements->length; --i >= 0;) {
1172
+ GumboNode* node = elements->data[i];
1173
+ if (node == &kActiveFormattingScopeMarker) {
1174
+ break;
1175
+ }
1176
+ assert(node->type == GUMBO_NODE_ELEMENT);
1177
+ if (node_qualified_tag_is(
1178
+ node, desired_element->tag_namespace, desired_element->tag) &&
1179
+ all_attributes_match(
1180
+ &node->v.element.attributes, &desired_element->attributes)) {
1181
+ num_identical_elements++;
1182
+ *earliest_matching_index = i;
1183
+ }
1184
+ }
1185
+ return num_identical_elements;
1186
+ }
1187
+
1188
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
1189
+ static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1190
+ assert(node == &kActiveFormattingScopeMarker ||
1191
+ node->type == GUMBO_NODE_ELEMENT);
1192
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1193
+ if (node == &kActiveFormattingScopeMarker) {
1194
+ gumbo_debug("Adding a scope marker.\n");
1195
+ } else {
1196
+ gumbo_debug("Adding a formatting element.\n");
1197
+ }
1198
+
1199
+ // Hunt for identical elements.
1200
+ int earliest_identical_element = elements->length;
1201
+ int num_identical_elements = count_formatting_elements_of_tag(
1202
+ parser, node, &earliest_identical_element);
1203
+
1204
+ // Noah's Ark clause: if there're at least 3, remove the earliest.
1205
+ if (num_identical_elements >= 3) {
1206
+ gumbo_debug("Noah's ark clause: removing element at %d.\n",
1207
+ earliest_identical_element);
1208
+ gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1209
+ }
1210
+
1211
+ gumbo_vector_add(parser, (void*) node, elements);
1212
+ }
1213
+
1214
+ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1215
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1216
+ for (unsigned int i = 0; i < open_elements->length; ++i) {
1217
+ if (open_elements->data[i] == node) {
1218
+ return true;
1219
+ }
1220
+ }
1221
+ return false;
1222
+ }
1223
+
1224
+ // Clones attributes, tags, etc. of a node, but does not copy the content. The
1225
+ // clone shares no structure with the original node: all owned strings and
1226
+ // values are fresh copies.
1227
+ GumboNode* clone_node(
1228
+ GumboParser* parser, GumboNode* node, GumboParseFlags reason) {
1229
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1230
+ GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1231
+ *new_node = *node;
1232
+ new_node->parent = NULL;
1233
+ new_node->index_within_parent = -1;
1234
+ // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1235
+ // have a separate end tag.
1236
+ new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1237
+ new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1238
+ GumboElement* element = &new_node->v.element;
1239
+ gumbo_vector_init(parser, 1, &element->children);
1240
+
1241
+ const GumboVector* old_attributes = &node->v.element.attributes;
1242
+ gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1243
+ for (unsigned int i = 0; i < old_attributes->length; ++i) {
1244
+ const GumboAttribute* old_attr = old_attributes->data[i];
1245
+ GumboAttribute* attr =
1246
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1247
+ *attr = *old_attr;
1248
+ attr->name = gumbo_copy_stringz(parser, old_attr->name);
1249
+ attr->value = gumbo_copy_stringz(parser, old_attr->value);
1250
+ gumbo_vector_add(parser, attr, &element->attributes);
1251
+ }
1252
+ return new_node;
1253
+ }
1254
+
1255
+ // "Reconstruct active formatting elements" part of the spec.
1256
+ // This implementation is based on the html5lib translation from the mess of
1257
+ // GOTOs in the spec to reasonably structured programming.
1258
+ // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
1259
+ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1260
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1261
+ // Step 1
1262
+ if (elements->length == 0) {
1263
+ return;
1264
+ }
1265
+
1266
+ // Step 2 & 3
1267
+ unsigned int i = elements->length - 1;
1268
+ GumboNode* element = elements->data[i];
1269
+ if (element == &kActiveFormattingScopeMarker ||
1270
+ is_open_element(parser, element)) {
1271
+ return;
1272
+ }
1273
+
1274
+ // Step 6
1275
+ do {
1276
+ if (i == 0) {
1277
+ // Step 4
1278
+ i = -1; // Incremented to 0 below.
1279
+ break;
1280
+ }
1281
+ // Step 5
1282
+ element = elements->data[--i];
1283
+ } while (element != &kActiveFormattingScopeMarker &&
1284
+ !is_open_element(parser, element));
1285
+
1286
+ ++i;
1287
+ gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1288
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
1289
+ for (; i < elements->length; ++i) {
1290
+ // Step 7 & 8.
1291
+ assert(elements->length > 0);
1292
+ assert(i < elements->length);
1293
+ element = elements->data[i];
1294
+ assert(element != &kActiveFormattingScopeMarker);
1295
+ GumboNode* clone = clone_node(
1296
+ parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1297
+ // Step 9.
1298
+ InsertionLocation location =
1299
+ get_appropriate_insertion_location(parser, NULL);
1300
+ insert_node(parser, clone, location);
1301
+ gumbo_vector_add(
1302
+ parser, (void*) clone, &parser->_parser_state->_open_elements);
1303
+
1304
+ // Step 10.
1305
+ elements->data[i] = clone;
1306
+ gumbo_debug("Reconstructed %s element at %d.\n",
1307
+ gumbo_normalized_tagname(clone->v.element.tag), i);
1308
+ }
1309
+ }
1310
+
1311
+ static void clear_active_formatting_elements(GumboParser* parser) {
1312
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1313
+ int num_elements_cleared = 0;
1314
+ const GumboNode* node;
1315
+ do {
1316
+ node = gumbo_vector_pop(parser, elements);
1317
+ ++num_elements_cleared;
1318
+ } while (node && node != &kActiveFormattingScopeMarker);
1319
+ gumbo_debug("Cleared %d elements from active formatting list.\n",
1320
+ num_elements_cleared);
1321
+ }
1322
+
1323
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
1324
+ static GumboQuirksModeEnum compute_quirks_mode(
1325
+ const GumboTokenDocType* doctype) {
1326
+ if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) ||
1327
+ is_in_static_list(
1328
+ doctype->public_identifier, kQuirksModePublicIdPrefixes, false) ||
1329
+ is_in_static_list(
1330
+ doctype->public_identifier, kQuirksModePublicIdExactMatches, true) ||
1331
+ is_in_static_list(
1332
+ doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) ||
1333
+ (is_in_static_list(doctype->public_identifier,
1334
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1335
+ !doctype->has_system_identifier)) {
1336
+ return GUMBO_DOCTYPE_QUIRKS;
1337
+ } else if (is_in_static_list(doctype->public_identifier,
1338
+ kLimitedQuirksPublicIdPrefixes, false) ||
1339
+ (is_in_static_list(doctype->public_identifier,
1340
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1341
+ doctype->has_system_identifier)) {
1342
+ return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1343
+ }
1344
+ return GUMBO_DOCTYPE_NO_QUIRKS;
1345
+ }
1346
+
1347
+ // The following functions are all defined by the "has an element in __ scope"
1348
+ // sections of the HTML5 spec:
1349
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1350
+ // The basic idea behind them is that they check for an element of the given
1351
+ // qualified name, contained within a scope formed by a set of other qualified
1352
+ // names. For example, "has an element in list scope" looks for an element of
1353
+ // the given qualified name within the nearest enclosing <ol> or <ul>, along
1354
+ // with a bunch of generic element types that serve to "firewall" their content
1355
+ // from the rest of the document. Note that because of the way the spec is
1356
+ // written,
1357
+ // all elements are expected to be in the HTML namespace
1358
+ static bool has_an_element_in_specific_scope(GumboParser* parser,
1359
+ int expected_size, const GumboTag* expected, bool negate,
1360
+ const gumbo_tagset tags) {
1361
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1362
+ for (int i = open_elements->length; --i >= 0;) {
1363
+ const GumboNode* node = open_elements->data[i];
1364
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1365
+ continue;
1366
+
1367
+ GumboTag node_tag = node->v.element.tag;
1368
+ GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1369
+ for (int j = 0; j < expected_size; ++j) {
1370
+ if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1371
+ return true;
1372
+ }
1373
+
1374
+ bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1375
+ if (negate != found) return false;
1376
+ }
1377
+ return false;
1378
+ }
1379
+
1380
+ // Checks for the presence of an open element of the specified tag type.
1381
+ static bool has_open_element(GumboParser* parser, GumboTag tag) {
1382
+ return has_an_element_in_specific_scope(
1383
+ parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)});
1384
+ }
1385
+
1386
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1387
+ static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1388
+ return has_an_element_in_specific_scope(parser, 1, &tag, false,
1389
+ (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1390
+ TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1391
+ TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1392
+ TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1393
+ TAG_SVG(TITLE)});
1394
+ }
1395
+
1396
+ // Like "has an element in scope", but for the specific case of looking for a
1397
+ // unique target node, not for any node with a given tag name. This duplicates
1398
+ // much of the algorithm from has_an_element_in_specific_scope because the
1399
+ // predicate is different when checking for an exact node, and it's easier &
1400
+ // faster just to duplicate the code for this one case than to try and
1401
+ // parameterize it.
1402
+ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1403
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1404
+ for (int i = open_elements->length; --i >= 0;) {
1405
+ const GumboNode* current = open_elements->data[i];
1406
+ if (current == node) {
1407
+ return true;
1408
+ }
1409
+ if (current->type != GUMBO_NODE_ELEMENT &&
1410
+ current->type != GUMBO_NODE_TEMPLATE) {
1411
+ continue;
1412
+ }
1413
+ if (node_tag_in_set(current,
1414
+ (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE),
1415
+ TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1416
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1417
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) {
1419
+ return false;
1420
+ }
1421
+ }
1422
+ assert(false);
1423
+ return false;
1424
+ }
1425
+
1426
+ // Like has_an_element_in_scope, but restricts the expected qualified name to a
1427
+ // range of possible qualified names instead of just a single one.
1428
+ static bool has_an_element_in_scope_with_tagname(
1429
+ GumboParser* parser, int expected_len, const GumboTag expected[]) {
1430
+ return has_an_element_in_specific_scope(parser, expected_len, expected, false,
1431
+ (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1432
+ TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1433
+ TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1434
+ TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1435
+ TAG_SVG(TITLE)});
1436
+ }
1437
+
1438
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1439
+ static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1440
+ return has_an_element_in_specific_scope(parser, 1, &tag, false,
1441
+ (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1442
+ TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1443
+ TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1444
+ TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1445
+ TAG_SVG(TITLE), TAG(OL), TAG(UL)});
1446
+ }
1447
+
1448
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1449
+ static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1450
+ return has_an_element_in_specific_scope(parser, 1, &tag, false,
1451
+ (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1452
+ TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1453
+ TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1454
+ TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1455
+ TAG_SVG(TITLE), TAG(BUTTON)});
1456
+ }
1457
+
1458
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1459
+ static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1460
+ return has_an_element_in_specific_scope(parser, 1, &tag, false,
1461
+ (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
1462
+ }
1463
+
1464
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1465
+ static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1466
+ return has_an_element_in_specific_scope(
1467
+ parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
1468
+ }
1469
+
1470
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1471
+ // "exception" is the "element to exclude from the process" listed in the spec.
1472
+ // Pass GUMBO_TAG_LAST to not exclude any of them.
1473
+ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1474
+ for (; node_tag_in_set(get_current_node(parser),
1475
+ (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1476
+ TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) &&
1477
+ !node_html_tag_is(get_current_node(parser), exception);
1478
+ pop_current_node(parser))
1479
+ ;
1480
+ }
1481
+
1482
+ // This is the "generate all implied end tags thoroughly" clause of the spec.
1483
+ // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
1484
+ static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1485
+ for (
1486
+ ; node_tag_in_set(get_current_node(parser),
1487
+ (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI),
1488
+ TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC),
1489
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)});
1490
+ pop_current_node(parser))
1491
+ ;
1492
+ }
1493
+
1494
+ // This factors out the clauses relating to "act as if an end tag token with tag
1495
+ // name "table" had been seen. Returns true if there's a table element in table
1496
+ // scope which was successfully closed, false if not and the token should be
1497
+ // ignored. Does not add parse errors; callers should handle that.
1498
+ static bool close_table(GumboParser* parser) {
1499
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1500
+ return false;
1501
+ }
1502
+
1503
+ GumboNode* node = pop_current_node(parser);
1504
+ while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1505
+ node = pop_current_node(parser);
1506
+ }
1507
+ reset_insertion_mode_appropriately(parser);
1508
+ return true;
1509
+ }
1510
+
1511
+ // This factors out the clauses relating to "act as if an end tag token with tag
1512
+ // name `cell_tag` had been seen".
1513
+ static bool close_table_cell(
1514
+ GumboParser* parser, const GumboToken* token, GumboTag cell_tag) {
1515
+ bool result = true;
1516
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1517
+ const GumboNode* node = get_current_node(parser);
1518
+ if (!node_html_tag_is(node, cell_tag)) {
1519
+ parser_add_parse_error(parser, token);
1520
+ result = false;
1521
+ }
1522
+ do {
1523
+ node = pop_current_node(parser);
1524
+ } while (!node_html_tag_is(node, cell_tag));
1525
+
1526
+ clear_active_formatting_elements(parser);
1527
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1528
+ return result;
1529
+ }
1530
+
1531
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1532
+ // This holds the logic to determine whether we should close a <td> or a <th>.
1533
+ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1534
+ if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1535
+ assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1536
+ return close_table_cell(parser, token, GUMBO_TAG_TD);
1537
+ } else {
1538
+ assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1539
+ return close_table_cell(parser, token, GUMBO_TAG_TH);
1540
+ }
1541
+ }
1542
+
1543
+ // This factors out the "act as if an end tag of tag name 'select' had been
1544
+ // seen" clause of the spec, since it's referenced in several places. It pops
1545
+ // all nodes from the stack until the current <select> has been closed, then
1546
+ // resets the insertion mode appropriately.
1547
+ static void close_current_select(GumboParser* parser) {
1548
+ GumboNode* node = pop_current_node(parser);
1549
+ while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1550
+ node = pop_current_node(parser);
1551
+ }
1552
+ reset_insertion_mode_appropriately(parser);
1553
+ }
1554
+
1555
+ // The list of nodes in the "special" category:
1556
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1557
+ static bool is_special_node(const GumboNode* node) {
1558
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1559
+ return node_tag_in_set(node,
1560
+ (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1561
+ TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1562
+ TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1563
+ TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
1564
+ TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1565
+ TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1566
+ TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1567
+ TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1568
+ TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
1569
+ TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1570
+ TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1571
+ TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1572
+ TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1573
+ TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1574
+ TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1575
+
1576
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1577
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1578
+
1579
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)});
1580
+ }
1581
+
1582
+ // Implicitly closes currently open elements until it reaches an element with
1583
+ // the
1584
+ // specified qualified name. If the elements closed are in the set handled by
1585
+ // generate_implied_end_tags, this is normal operation and this function returns
1586
+ // true. Otherwise, a parse error is recorded and this function returns false.
1587
+ static bool implicitly_close_tags(GumboParser* parser, GumboToken* token,
1588
+ GumboNamespaceEnum target_ns, GumboTag target) {
1589
+ bool result = true;
1590
+ generate_implied_end_tags(parser, target);
1591
+ if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1592
+ parser_add_parse_error(parser, token);
1593
+ while (
1594
+ !node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1595
+ pop_current_node(parser);
1596
+ }
1597
+ result = false;
1598
+ }
1599
+ assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1600
+ pop_current_node(parser);
1601
+ return result;
1602
+ }
1603
+
1604
+ // If the stack of open elements has a <p> tag in button scope, this acts as if
1605
+ // a </p> tag was encountered, implicitly closing tags. Returns false if a
1606
+ // parse error occurs. This is a convenience function because this particular
1607
+ // clause appears several times in the spec.
1608
+ static bool maybe_implicitly_close_p_tag(
1609
+ GumboParser* parser, GumboToken* token) {
1610
+ if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1611
+ return implicitly_close_tags(
1612
+ parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1613
+ }
1614
+ return true;
1615
+ }
1616
+
1617
+ // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1618
+ // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1619
+ static void maybe_implicitly_close_list_tag(
1620
+ GumboParser* parser, GumboToken* token, bool is_li) {
1621
+ GumboParserState* state = parser->_parser_state;
1622
+ state->_frameset_ok = false;
1623
+ for (int i = state->_open_elements.length; --i >= 0;) {
1624
+ const GumboNode* node = state->_open_elements.data[i];
1625
+ bool is_list_tag =
1626
+ is_li ? node_html_tag_is(node, GUMBO_TAG_LI)
1627
+ : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)});
1628
+ if (is_list_tag) {
1629
+ implicitly_close_tags(
1630
+ parser, token, node->v.element.tag_namespace, node->v.element.tag);
1631
+ return;
1632
+ }
1633
+ if (is_special_node(node) &&
1634
+ !node_tag_in_set(
1635
+ node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) {
1636
+ return;
1637
+ }
1638
+ }
1639
+ }
1640
+
1641
+ static void merge_attributes(
1642
+ GumboParser* parser, GumboToken* token, GumboNode* node) {
1643
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1644
+ assert(node->type == GUMBO_NODE_ELEMENT);
1645
+ const GumboVector* token_attr = &token->v.start_tag.attributes;
1646
+ GumboVector* node_attr = &node->v.element.attributes;
1647
+
1648
+ for (unsigned int i = 0; i < token_attr->length; ++i) {
1649
+ GumboAttribute* attr = token_attr->data[i];
1650
+ if (!gumbo_get_attribute(node_attr, attr->name)) {
1651
+ // Ownership of the attribute is transferred by this gumbo_vector_add,
1652
+ // so it has to be nulled out of the original token so it doesn't get
1653
+ // double-deleted.
1654
+ gumbo_vector_add(parser, attr, node_attr);
1655
+ token_attr->data[i] = NULL;
1656
+ }
1657
+ }
1658
+ // When attributes are merged, it means the token has been ignored and merged
1659
+ // with another token, so we need to free its memory. The attributes that are
1660
+ // transferred need to be nulled-out in the vector above so that they aren't
1661
+ // double-deleted.
1662
+ gumbo_token_destroy(parser, token);
1663
+
1664
+ #ifndef NDEBUG
1665
+ // Mark this sentinel so the assertion in the main loop knows it's been
1666
+ // destroyed.
1667
+ token->v.start_tag.attributes = kGumboEmptyVector;
1668
+ #endif
1669
+ }
1670
+
1671
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1672
+ for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
1673
+ ++i) {
1674
+ const ReplacementEntry* entry = &kSvgTagReplacements[i];
1675
+ if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1676
+ return entry->to.data;
1677
+ }
1678
+ }
1679
+ return NULL;
1680
+ }
1681
+
1682
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1683
+ // This destructively modifies any matching attributes on the token and sets the
1684
+ // namespace appropriately.
1685
+ static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1686
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1687
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1688
+ for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) /
1689
+ sizeof(NamespacedAttributeReplacement);
1690
+ ++i) {
1691
+ const NamespacedAttributeReplacement* entry =
1692
+ &kForeignAttributeReplacements[i];
1693
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1694
+ if (!attr) {
1695
+ continue;
1696
+ }
1697
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1698
+ attr->attr_namespace = entry->attr_namespace;
1699
+ attr->name = gumbo_copy_stringz(parser, entry->local_name);
1700
+ }
1701
+ }
1702
+
1703
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1704
+ // This destructively modifies any matching attributes on the token.
1705
+ static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1706
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1707
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1708
+ for (size_t i = 0;
1709
+ i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1710
+ const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1711
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1712
+ if (!attr) {
1713
+ continue;
1714
+ }
1715
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1716
+ attr->name = gumbo_copy_stringz(parser, entry->to.data);
1717
+ }
1718
+ }
1719
+
1720
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1721
+ // Note that this may destructively modify the token with the new attribute
1722
+ // value.
1723
+ static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1724
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1725
+ GumboAttribute* attr =
1726
+ gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl");
1727
+ if (!attr) {
1728
+ return;
1729
+ }
1730
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1731
+ attr->name = gumbo_copy_stringz(parser, "definitionURL");
1732
+ }
1733
+
1734
+ static bool doctype_matches(const GumboTokenDocType* doctype,
1735
+ const GumboStringPiece* public_id, const GumboStringPiece* system_id,
1736
+ bool allow_missing_system_id) {
1737
+ return !strcmp(doctype->public_identifier, public_id->data) &&
1738
+ (allow_missing_system_id || doctype->has_system_identifier) &&
1739
+ !strcmp(doctype->system_identifier, system_id->data);
1740
+ }
1741
+
1742
+ static bool maybe_add_doctype_error(
1743
+ GumboParser* parser, const GumboToken* token) {
1744
+ const GumboTokenDocType* doctype = &token->v.doc_type;
1745
+ bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
1746
+ if ((!html_doctype || doctype->has_public_identifier ||
1747
+ (doctype->has_system_identifier &&
1748
+ !strcmp(
1749
+ doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
1750
+ !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
1751
+ &kSystemIdRecHtml4_0, true) ||
1752
+ doctype_matches(doctype, &kPublicIdHtml4_01,
1753
+ &kSystemIdHtml4, true) ||
1754
+ doctype_matches(doctype, &kPublicIdXhtml1_0,
1755
+ &kSystemIdXhtmlStrict1_1, false) ||
1756
+ doctype_matches(doctype, &kPublicIdXhtml1_1,
1757
+ &kSystemIdXhtml1_1, false)))) {
1758
+ parser_add_parse_error(parser, token);
1759
+ return false;
1760
+ }
1761
+ return true;
1762
+ }
1763
+
1764
+ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1765
+ if (!node->parent) {
1766
+ // The node may not have a parent if, for example, it is a newly-cloned copy
1767
+ // of an active formatting element. DOM manipulations continue with the
1768
+ // orphaned fragment of the DOM tree until it's appended/foster-parented to
1769
+ // the common ancestor at the end of the adoption agency algorithm.
1770
+ return;
1771
+ }
1772
+ assert(node->parent->type == GUMBO_NODE_ELEMENT);
1773
+ GumboVector* children = &node->parent->v.element.children;
1774
+ int index = gumbo_vector_index_of(children, node);
1775
+ assert(index != -1);
1776
+
1777
+ gumbo_vector_remove_at(parser, index, children);
1778
+ node->parent = NULL;
1779
+ node->index_within_parent = -1;
1780
+ for (unsigned int i = index; i < children->length; ++i) {
1781
+ GumboNode* child = children->data[i];
1782
+ child->index_within_parent = i;
1783
+ }
1784
+ }
1785
+
1786
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1787
+ // Also described in the "in body" handling for end formatting tags. Every early exit below returns false; true is returned only after the outer loop has completed all 8 iterations.
1788
+ static bool adoption_agency_algorithm(
1789
+ GumboParser* parser, GumboToken* token, GumboTag subject) {
1790
+ GumboParserState* state = parser->_parser_state;
1791
+ gumbo_debug("Entering adoption agency algorithm.\n");
1792
+ // Step 1. If the current node is an HTML-namespace element with the subject tag and is not in the list of active formatting elements, pop it and stop.
1793
+ GumboNode* current_node = get_current_node(parser);
1794
+ if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
1795
+ current_node->v.element.tag == subject &&
1796
+ gumbo_vector_index_of(
1797
+ &state->_active_formatting_elements, current_node) == -1) {
1798
+ pop_current_node(parser);
1799
+ return false;
1800
+ }
1801
+ // Steps 2-4 & 20: the outer loop runs at most 8 times.
1802
+ for (unsigned int i = 0; i < 8; ++i) {
1803
+ // Step 5. Search the active formatting elements from the end for the subject tag; note that the current_node declared in this loop shadows the one from step 1.
1804
+ GumboNode* formatting_node = NULL;
1805
+ int formatting_node_in_open_elements = -1;
1806
+ for (int j = state->_active_formatting_elements.length; --j >= 0;) {
1807
+ GumboNode* current_node = state->_active_formatting_elements.data[j];
1808
+ if (current_node == &kActiveFormattingScopeMarker) {
1809
+ gumbo_debug("Broke on scope marker; aborting.\n");
1810
+ // Last scope marker; abort the algorithm.
1811
+ return false;
1812
+ }
1813
+ if (node_html_tag_is(current_node, subject)) {
1814
+ // Found it.
1815
+ formatting_node = current_node;
1816
+ formatting_node_in_open_elements =
1817
+ gumbo_vector_index_of(&state->_open_elements, formatting_node);
1818
+ gumbo_debug("Formatting element of tag %s at %d.\n",
1819
+ gumbo_normalized_tagname(subject),
1820
+ formatting_node_in_open_elements);
1821
+ break;
1822
+ }
1823
+ }
1824
+ if (!formatting_node) {
1825
+ // No matching tag; not a parse error outright, but fall through to the
1826
+ // "any other end tag" clause (which may potentially add a parse error,
1827
+ // but not always).
1828
+ gumbo_debug("No active formatting elements; aborting.\n");
1829
+ return false;
1830
+ }
1831
+
1832
+ // Step 6
1833
+ if (formatting_node_in_open_elements == -1) {
1834
+ gumbo_debug("Formatting node not on stack of open elements.\n");
1835
+ parser_add_parse_error(parser, token);
1836
+ gumbo_vector_remove(
1837
+ parser, formatting_node, &state->_active_formatting_elements);
1838
+ return false;
1839
+ }
1840
+
1841
+ // Step 7
1842
+ if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1843
+ parser_add_parse_error(parser, token);
1844
+ gumbo_debug("Element not in scope.\n");
1845
+ return false;
1846
+ }
1847
+
1848
+ // Step 8
1849
+ if (formatting_node != get_current_node(parser)) {
1850
+ parser_add_parse_error(parser, token); // But continue onwards.
1851
+ }
1852
+ assert(formatting_node);
1853
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
1854
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
1855
+
1856
+ // Step 9 & 10. Scan the open elements from the formatting node's index towards the top of the stack for the first special node.
1857
+ GumboNode* furthest_block = NULL;
1858
+ for (unsigned int j = formatting_node_in_open_elements;
1859
+ j < state->_open_elements.length; ++j) {
1860
+ assert(j > 0);
1861
+ GumboNode* current = state->_open_elements.data[j];
1862
+ if (is_special_node(current)) {
1863
+ // Step 9.
1864
+ furthest_block = current;
1865
+ break;
1866
+ }
1867
+ }
1868
+ if (!furthest_block) {
1869
+ // Step 10.
1870
+ while (get_current_node(parser) != formatting_node) {
1871
+ pop_current_node(parser);
1872
+ }
1873
+ // And the formatting element itself.
1874
+ pop_current_node(parser);
1875
+ gumbo_vector_remove(
1876
+ parser, formatting_node, &state->_active_formatting_elements);
1877
+ return false;
1878
+ }
1879
+ assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
1880
+ assert(furthest_block);
1881
+
1882
+ // Step 11.
1883
+ // Elements may be moved and reparented by this algorithm, so
1884
+ // common_ancestor is not necessarily the same as formatting_node->parent.
1885
+ GumboNode* common_ancestor =
1886
+ state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements,
1887
+ formatting_node) -
1888
+ 1];
1889
+ gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1890
+ gumbo_normalized_tagname(common_ancestor->v.element.tag),
1891
+ gumbo_normalized_tagname(furthest_block->v.element.tag));
1892
+
1893
+ // Step 12.
1894
+ int bookmark = gumbo_vector_index_of(
1895
+ &state->_active_formatting_elements, formatting_node) +
1896
+ 1;
1897
+ gumbo_debug("Bookmark at %d.\n", bookmark);
1898
+ // Step 13.
1899
+ GumboNode* node = furthest_block;
1900
+ GumboNode* last_node = furthest_block;
1901
+ // Must be stored explicitly, in case node is removed from the stack of open
1902
+ // elements, to handle step 13.3.
1903
+ int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1904
+ assert(saved_node_index > 0);
1905
+ // Step 13.1.
1906
+ for (int j = 0;;) {
1907
+ // Step 13.2.
1908
+ ++j;
1909
+ // Step 13.3.
1910
+ int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1911
+ gumbo_debug(
1912
+ "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1913
+ if (node_index == -1) {
1914
+ node_index = saved_node_index;
1915
+ }
1916
+ saved_node_index = --node_index;
1917
+ assert(node_index > 0);
1918
+ assert((unsigned int) node_index < state->_open_elements.capacity);
1919
+ node = state->_open_elements.data[node_index];
1920
+ assert(node->parent);
1921
+ if (node == formatting_node) {
1922
+ // Step 13.4.
1923
+ break;
1924
+ }
1925
+ int formatting_index =
1926
+ gumbo_vector_index_of(&state->_active_formatting_elements, node);
1927
+ if (j > 3 && formatting_index != -1) {
1928
+ // Step 13.5.
1929
+ gumbo_debug("Removing formatting element at %d.\n", formatting_index);
1930
+ gumbo_vector_remove_at(
1931
+ parser, formatting_index, &state->_active_formatting_elements);
1932
+ // Removing the element shifts all indices over by one, so we may need
1933
+ // to move the bookmark.
1934
+ if (formatting_index < bookmark) {
1935
+ --bookmark;
1936
+ gumbo_debug("Moving bookmark to %d.\n", bookmark);
1937
+ }
1938
+ continue;
1939
+ }
1940
+ if (formatting_index == -1) {
1941
+ // Step 13.6.
1942
+ gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1943
+ continue;
1944
+ }
1945
+ // Step 13.7.
1946
+ // "common ancestor as the intended parent" doesn't actually mean insert
1947
+ // it into the common ancestor; that happens below.
1948
+ node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1949
+ assert(formatting_index >= 0);
1950
+ state->_active_formatting_elements.data[formatting_index] = node;
1951
+ assert(node_index >= 0);
1952
+ state->_open_elements.data[node_index] = node;
1953
+ // Step 13.8.
1954
+ if (last_node == furthest_block) {
1955
+ bookmark = formatting_index + 1;
1956
+ gumbo_debug("Bookmark moved to %d.\n", bookmark);
1957
+ assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
1958
+ }
1959
+ // Step 13.9.
1960
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1961
+ remove_from_parent(parser, last_node);
1962
+ append_node(parser, node, last_node);
1963
+ // Step 13.10.
1964
+ last_node = node;
1965
+ } // Step 13.11.
1966
+
1967
+ // Step 14.
1968
+ gumbo_debug("Removing %s node from parent ",
1969
+ gumbo_normalized_tagname(last_node->v.element.tag));
1970
+ remove_from_parent(parser, last_node);
1971
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1972
+ InsertionLocation location =
1973
+ get_appropriate_insertion_location(parser, common_ancestor);
1974
+ gumbo_debug("and inserting it into %s.\n",
1975
+ gumbo_normalized_tagname(location.target->v.element.tag));
1976
+ insert_node(parser, last_node, location);
1977
+
1978
+ // Step 15.
1979
+ GumboNode* new_formatting_node = clone_node(
1980
+ parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1981
+ formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1982
+
1983
+ // Step 16. Instead of appending nodes one-by-one, we swap the children
1984
+ // vector of furthest_block with the empty children of new_formatting_node,
1985
+ // reducing memory traffic and allocations. We still have to reset their
1986
+ // parent pointers, though.
1987
+ GumboVector temp = new_formatting_node->v.element.children;
1988
+ new_formatting_node->v.element.children =
1989
+ furthest_block->v.element.children;
1990
+ furthest_block->v.element.children = temp;
1991
+
1992
+ temp = new_formatting_node->v.element.children;
1993
+ for (unsigned int i = 0; i < temp.length; ++i) {
1994
+ GumboNode* child = temp.data[i];
1995
+ child->parent = new_formatting_node;
1996
+ }
1997
+
1998
+ // Step 17.
1999
+ append_node(parser, furthest_block, new_formatting_node);
2000
+
2001
+ // Step 18.
2002
+ // If the formatting node was before the bookmark, it may shift over all
2003
+ // indices after it, so we need to explicitly find the index and possibly
2004
+ // adjust the bookmark.
2005
+ int formatting_node_index = gumbo_vector_index_of(
2006
+ &state->_active_formatting_elements, formatting_node);
2007
+ assert(formatting_node_index != -1);
2008
+ if (formatting_node_index < bookmark) {
2009
+ gumbo_debug(
2010
+ "Formatting node at %d is before bookmark at %d; decrementing.\n",
2011
+ formatting_node_index, bookmark);
2012
+ --bookmark;
2013
+ }
2014
+ gumbo_vector_remove_at(
2015
+ parser, formatting_node_index, &state->_active_formatting_elements);
2016
+ assert(bookmark >= 0);
2017
+ assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2018
+ gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
2019
+ &state->_active_formatting_elements);
2020
+
2021
+ // Step 19.
2022
+ gumbo_vector_remove(parser, formatting_node, &state->_open_elements);
2023
+ int insert_at =
2024
+ gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1;
2025
+ assert(insert_at >= 0);
2026
+ assert((unsigned int) insert_at <= state->_open_elements.length);
2027
+ gumbo_vector_insert_at(
2028
+ parser, new_formatting_node, insert_at, &state->_open_elements);
2029
+ } // Step 20.
2030
+ return true;
2031
+ }
2032
+
2033
+ // This is here to clean up memory when the spec says "Ignore current token."
2034
+ static void ignore_token(GumboParser* parser) {
2035
+ GumboToken* token = parser->_parser_state->_current_token;
2036
+ // Ownership of the token's internal buffers are normally transferred to the
2037
+ // element, but if no element is emitted (as happens in non-verbatim-mode
2038
+ // when a token is ignored), we need to free it here to prevent a memory
2039
+ // leak.
2040
+ gumbo_token_destroy(parser, token);
2041
+ #ifndef NDEBUG
2042
+ if (token->type == GUMBO_TOKEN_START_TAG) {
2043
+ // Mark this sentinel so the assertion in the main loop knows it's been
2044
+ // destroyed.
2045
+ token->v.start_tag.attributes = kGumboEmptyVector;
2046
+ }
2047
+ #endif
2048
+ }
2049
+
2050
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
2051
+ static void finish_parsing(GumboParser* parser) {
2052
+ gumbo_debug("Finishing parsing");
2053
+ maybe_flush_text_node_buffer(parser);
2054
+ GumboParserState* state = parser->_parser_state;
2055
+ for (GumboNode* node = pop_current_node(parser); node;
2056
+ node = pop_current_node(parser)) {
2057
+ if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2058
+ (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2059
+ continue;
2060
+ }
2061
+ node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2062
+ }
2063
+ while (pop_current_node(parser))
2064
+ ; // Pop them all.
2065
+ }
2066
+
2067
+ static bool handle_initial(GumboParser* parser, GumboToken* token) {
2068
+ GumboDocument* document = &get_document_node(parser)->v.document;
2069
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2070
+ ignore_token(parser);
2071
+ return true;
2072
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2073
+ append_comment_node(parser, get_document_node(parser), token);
2074
+ return true;
2075
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2076
+ document->has_doctype = true;
2077
+ document->name = token->v.doc_type.name;
2078
+ document->public_identifier = token->v.doc_type.public_identifier;
2079
+ document->system_identifier = token->v.doc_type.system_identifier;
2080
+ document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2081
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2082
+ return maybe_add_doctype_error(parser, token);
2083
+ }
2084
+ parser_add_parse_error(parser, token);
2085
+ document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2086
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2087
+ parser->_parser_state->_reprocess_current_token = true;
2088
+ return true;
2089
+ }
2090
+
2091
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
2092
+ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2093
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2094
+ parser_add_parse_error(parser, token);
2095
+ ignore_token(parser);
2096
+ return false;
2097
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2098
+ append_comment_node(parser, get_document_node(parser), token);
2099
+ return true;
2100
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2101
+ ignore_token(parser);
2102
+ return true;
2103
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2104
+ GumboNode* html_node = insert_element_from_token(parser, token);
2105
+ parser->_output->root = html_node;
2106
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2107
+ return true;
2108
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2109
+ !tag_in(token, false,
2110
+ (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2111
+ parser_add_parse_error(parser, token);
2112
+ ignore_token(parser);
2113
+ return false;
2114
+ } else {
2115
+ GumboNode* html_node = insert_element_of_tag_type(
2116
+ parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2117
+ assert(html_node);
2118
+ parser->_output->root = html_node;
2119
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2120
+ parser->_parser_state->_reprocess_current_token = true;
2121
+ return true;
2122
+ }
2123
+ }
2124
+
2125
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
2126
+ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2127
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2128
+ parser_add_parse_error(parser, token);
2129
+ ignore_token(parser);
2130
+ return false;
2131
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2132
+ append_comment_node(parser, get_current_node(parser), token);
2133
+ return true;
2134
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2135
+ ignore_token(parser);
2136
+ return true;
2137
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2138
+ GumboNode* node = insert_element_from_token(parser, token);
2139
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2140
+ parser->_parser_state->_head_element = node;
2141
+ return true;
2142
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2143
+ !tag_in(token, false,
2144
+ (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2145
+ parser_add_parse_error(parser, token);
2146
+ ignore_token(parser);
2147
+ return false;
2148
+ } else {
2149
+ GumboNode* node = insert_element_of_tag_type(
2150
+ parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2151
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2152
+ parser->_parser_state->_head_element = node;
2153
+ parser->_parser_state->_reprocess_current_token = true;
2154
+ return true;
2155
+ }
2156
+ }
2157
+
2158
+ // Forward declarations because of mutual dependencies.
2159
+ static bool handle_token(GumboParser* parser, GumboToken* token);
2160
+ static bool handle_in_body(GumboParser* parser, GumboToken* token);
2161
+
2162
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
2163
+ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2164
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2165
+ insert_text_token(parser, token);
2166
+ return true;
2167
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168
+ parser_add_parse_error(parser, token);
2169
+ ignore_token(parser);
2170
+ return false;
2171
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2172
+ append_comment_node(parser, get_current_node(parser), token);
2173
+ return true;
2174
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2175
+ return handle_in_body(parser, token);
2176
+ } else if (tag_in(token, kStartTag,
2177
+ (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2178
+ TAG(MENUITEM), TAG(LINK)})) {
2179
+ insert_element_from_token(parser, token);
2180
+ pop_current_node(parser);
2181
+ acknowledge_self_closing_tag(parser);
2182
+ return true;
2183
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2184
+ insert_element_from_token(parser, token);
2185
+ pop_current_node(parser);
2186
+ acknowledge_self_closing_tag(parser);
2187
+ // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2188
+ // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2189
+ // should specifically look for that string in the document and re-encode it
2190
+ // before passing to Gumbo.
2191
+ return true;
2192
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2193
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2194
+ return true;
2195
+ } else if (tag_in(
2196
+ token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) {
2197
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2198
+ return true;
2199
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2200
+ insert_element_from_token(parser, token);
2201
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2202
+ return true;
2203
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2204
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2205
+ return true;
2206
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2207
+ GumboNode* head = pop_current_node(parser);
2208
+ AVOID_UNUSED_VARIABLE_WARNING(head);
2209
+ assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2210
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2211
+ return true;
2212
+ } else if (tag_in(token, kEndTag,
2213
+ (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) {
2214
+ pop_current_node(parser);
2215
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2216
+ parser->_parser_state->_reprocess_current_token = true;
2217
+ return true;
2218
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2219
+ insert_element_from_token(parser, token);
2220
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2221
+ parser->_parser_state->_frameset_ok = false;
2222
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2223
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2224
+ return true;
2225
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2226
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2227
+ parser_add_parse_error(parser, token);
2228
+ ignore_token(parser);
2229
+ return false;
2230
+ }
2231
+ generate_all_implied_end_tags_thoroughly(parser);
2232
+ bool success = true;
2233
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2234
+ parser_add_parse_error(parser, token);
2235
+ success = false;
2236
+ }
2237
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2238
+ ;
2239
+ clear_active_formatting_elements(parser);
2240
+ pop_template_insertion_mode(parser);
2241
+ reset_insertion_mode_appropriately(parser);
2242
+ return success;
2243
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2244
+ (token->type == GUMBO_TOKEN_END_TAG)) {
2245
+ parser_add_parse_error(parser, token);
2246
+ ignore_token(parser);
2247
+ return false;
2248
+ } else {
2249
+ pop_current_node(parser);
2250
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2251
+ parser->_parser_state->_reprocess_current_token = true;
2252
+ return true;
2253
+ }
2254
+ return true;
2255
+ }
2256
+
2257
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
2258
+ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2259
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2260
+ parser_add_parse_error(parser, token);
2261
+ return false;
2262
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2263
+ return handle_in_body(parser, token);
2264
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2265
+ const GumboNode* node = pop_current_node(parser);
2266
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2267
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2268
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2269
+ return true;
2270
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2271
+ token->type == GUMBO_TOKEN_COMMENT ||
2272
+ tag_in(token, kStartTag,
2273
+ (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2274
+ TAG(META), TAG(NOFRAMES), TAG(STYLE)})) {
2275
+ return handle_in_head(parser, token);
2276
+ } else if (tag_in(
2277
+ token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) ||
2278
+ (token->type == GUMBO_TOKEN_END_TAG &&
2279
+ !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2280
+ parser_add_parse_error(parser, token);
2281
+ ignore_token(parser);
2282
+ return false;
2283
+ } else {
2284
+ parser_add_parse_error(parser, token);
2285
+ const GumboNode* node = pop_current_node(parser);
2286
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2287
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2288
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2289
+ parser->_parser_state->_reprocess_current_token = true;
2290
+ return false;
2291
+ }
2292
+ }
2293
+
2294
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
2295
+ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2296
+ GumboParserState* state = parser->_parser_state;
2297
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2298
+ insert_text_token(parser, token);
2299
+ return true;
2300
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2301
+ parser_add_parse_error(parser, token);
2302
+ ignore_token(parser);
2303
+ return false;
2304
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2305
+ append_comment_node(parser, get_current_node(parser), token);
2306
+ return true;
2307
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2308
+ return handle_in_body(parser, token);
2309
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2310
+ insert_element_from_token(parser, token);
2311
+ state->_frameset_ok = false;
2312
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2313
+ return true;
2314
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2315
+ insert_element_from_token(parser, token);
2316
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2317
+ return true;
2318
+ } else if (tag_in(token, kStartTag,
2319
+ (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2320
+ TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2321
+ TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) {
2322
+ parser_add_parse_error(parser, token);
2323
+ assert(state->_head_element != NULL);
2324
+ // This must be flushed before we push the head element on, as there may be
2325
+ // pending character tokens that should be attached to the root.
2326
+ maybe_flush_text_node_buffer(parser);
2327
+ gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2328
+ bool result = handle_in_head(parser, token);
2329
+ gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2330
+ return result;
2331
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332
+ return handle_in_head(parser, token);
2333
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2334
+ (token->type == GUMBO_TOKEN_END_TAG &&
2335
+ !tag_in(token, kEndTag,
2336
+ (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) {
2337
+ parser_add_parse_error(parser, token);
2338
+ ignore_token(parser);
2339
+ return false;
2340
+ } else {
2341
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2342
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2343
+ state->_reprocess_current_token = true;
2344
+ return true;
2345
+ }
2346
+ }
2347
+
2348
+ static void destroy_node(GumboParser* parser, GumboNode* node) {
2349
+ switch (node->type) {
2350
+ case GUMBO_NODE_DOCUMENT: {
2351
+ GumboDocument* doc = &node->v.document;
2352
+ for (unsigned int i = 0; i < doc->children.length; ++i) {
2353
+ destroy_node(parser, doc->children.data[i]);
2354
+ }
2355
+ gumbo_parser_deallocate(parser, (void*) doc->children.data);
2356
+ gumbo_parser_deallocate(parser, (void*) doc->name);
2357
+ gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2358
+ gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2359
+ } break;
2360
+ case GUMBO_NODE_TEMPLATE:
2361
+ case GUMBO_NODE_ELEMENT:
2362
+ for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) {
2363
+ gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2364
+ }
2365
+ gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2366
+ for (unsigned int i = 0; i < node->v.element.children.length; ++i) {
2367
+ destroy_node(parser, node->v.element.children.data[i]);
2368
+ }
2369
+ gumbo_parser_deallocate(parser, node->v.element.children.data);
2370
+ break;
2371
+ case GUMBO_NODE_TEXT:
2372
+ case GUMBO_NODE_CDATA:
2373
+ case GUMBO_NODE_COMMENT:
2374
+ case GUMBO_NODE_WHITESPACE:
2375
+ gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2376
+ break;
2377
+ }
2378
+ gumbo_parser_deallocate(parser, node);
2379
+ }
2380
+
2381
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
2382
+ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2383
+ GumboParserState* state = parser->_parser_state;
2384
+ assert(state->_open_elements.length > 0);
2385
+ if (token->type == GUMBO_TOKEN_NULL) {
2386
+ parser_add_parse_error(parser, token);
2387
+ ignore_token(parser);
2388
+ return false;
2389
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2390
+ reconstruct_active_formatting_elements(parser);
2391
+ insert_text_token(parser, token);
2392
+ return true;
2393
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2394
+ token->type == GUMBO_TOKEN_CDATA) {
2395
+ reconstruct_active_formatting_elements(parser);
2396
+ insert_text_token(parser, token);
2397
+ set_frameset_not_ok(parser);
2398
+ return true;
2399
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2400
+ append_comment_node(parser, get_current_node(parser), token);
2401
+ return true;
2402
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403
+ parser_add_parse_error(parser, token);
2404
+ ignore_token(parser);
2405
+ return false;
2406
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2407
+ parser_add_parse_error(parser, token);
2408
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2409
+ ignore_token(parser);
2410
+ return false;
2411
+ }
2412
+ assert(parser->_output->root != NULL);
2413
+ assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2414
+ merge_attributes(parser, token, parser->_output->root);
2415
+ return false;
2416
+ } else if (tag_in(token, kStartTag,
2417
+ (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2418
+ TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
2419
+ TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
2420
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2421
+ return handle_in_head(parser, token);
2422
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2423
+ parser_add_parse_error(parser, token);
2424
+ if (state->_open_elements.length < 2 ||
2425
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2426
+ has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2427
+ ignore_token(parser);
2428
+ return false;
2429
+ }
2430
+ state->_frameset_ok = false;
2431
+ merge_attributes(parser, token, state->_open_elements.data[1]);
2432
+ return false;
2433
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2434
+ parser_add_parse_error(parser, token);
2435
+ if (state->_open_elements.length < 2 ||
2436
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2437
+ !state->_frameset_ok) {
2438
+ ignore_token(parser);
2439
+ return false;
2440
+ }
2441
+ // Save the body node for later removal.
2442
+ GumboNode* body_node = state->_open_elements.data[1];
2443
+
2444
+ // Pop all nodes except root HTML element.
2445
+ GumboNode* node;
2446
+ do {
2447
+ node = pop_current_node(parser);
2448
+ } while (node != state->_open_elements.data[1]);
2449
+
2450
+ // Removing & destroying the body node is going to kill any nodes that have
2451
+ // been added to the list of active formatting elements, and so we should
2452
+ // clear it to prevent a use-after-free if the list of active formatting
2453
+ // elements is reconstructed afterwards. This may happen if whitespace
2454
+ // follows the </frameset>.
2455
+ clear_active_formatting_elements(parser);
2456
+
2457
+ // Remove the body node. We may want to factor this out into a generic
2458
+ // helper, but right now this is the only code that needs to do this.
2459
+ GumboVector* children = &parser->_output->root->v.element.children;
2460
+ for (unsigned int i = 0; i < children->length; ++i) {
2461
+ if (children->data[i] == body_node) {
2462
+ gumbo_vector_remove_at(parser, i, children);
2463
+ break;
2464
+ }
2465
+ }
2466
+ destroy_node(parser, body_node);
2467
+
2468
+ // Insert the <frameset>, and switch the insertion mode.
2469
+ insert_element_from_token(parser, token);
2470
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2471
+ return true;
2472
+ } else if (token->type == GUMBO_TOKEN_EOF) {
2473
+ for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2474
+ if (!node_tag_in_set(state->_open_elements.data[i],
2475
+ (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY),
2476
+ TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY),
2477
+ TAG(HTML)})) {
2478
+ parser_add_parse_error(parser, token);
2479
+ }
2480
+ }
2481
+ if (get_current_template_insertion_mode(parser) !=
2482
+ GUMBO_INSERTION_MODE_INITIAL) {
2483
+ return handle_in_template(parser, token);
2484
+ }
2485
+ return true;
2486
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) {
2487
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2488
+ parser_add_parse_error(parser, token);
2489
+ ignore_token(parser);
2490
+ return false;
2491
+ }
2492
+ bool success = true;
2493
+ for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2494
+ if (!node_tag_in_set(state->_open_elements.data[i],
2495
+ (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
2496
+ TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC),
2497
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2498
+ TAG(BODY), TAG(HTML)})) {
2499
+ parser_add_parse_error(parser, token);
2500
+ success = false;
2501
+ break;
2502
+ }
2503
+ }
2504
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2505
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2506
+ parser->_parser_state->_reprocess_current_token = true;
2507
+ } else {
2508
+ GumboNode* body = state->_open_elements.data[1];
2509
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2510
+ record_end_of_element(state->_current_token, &body->v.element);
2511
+ }
2512
+ return success;
2513
+ } else if (tag_in(token, kStartTag,
2514
+ (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2515
+ TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR),
2516
+ TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2517
+ TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2518
+ TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
2519
+ TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2520
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2521
+ insert_element_from_token(parser, token);
2522
+ return result;
2523
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2524
+ TAG(H4), TAG(H5), TAG(H6)})) {
2525
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2526
+ if (node_tag_in_set(
2527
+ get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2528
+ TAG(H4), TAG(H5), TAG(H6)})) {
2529
+ parser_add_parse_error(parser, token);
2530
+ pop_current_node(parser);
2531
+ result = false;
2532
+ }
2533
+ insert_element_from_token(parser, token);
2534
+ return result;
2535
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) {
2536
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2537
+ insert_element_from_token(parser, token);
2538
+ state->_ignore_next_linefeed = true;
2539
+ state->_frameset_ok = false;
2540
+ return result;
2541
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2542
+ if (state->_form_element != NULL &&
2543
+ !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2544
+ gumbo_debug("Ignoring nested form.\n");
2545
+ parser_add_parse_error(parser, token);
2546
+ ignore_token(parser);
2547
+ return false;
2548
+ }
2549
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2550
+ GumboNode* form_element = insert_element_from_token(parser, token);
2551
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2552
+ state->_form_element = form_element;
2553
+ }
2554
+ return result;
2555
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2556
+ maybe_implicitly_close_list_tag(parser, token, true);
2557
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2558
+ insert_element_from_token(parser, token);
2559
+ return result;
2560
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2561
+ maybe_implicitly_close_list_tag(parser, token, false);
2562
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2563
+ insert_element_from_token(parser, token);
2564
+ return result;
2565
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2566
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2567
+ insert_element_from_token(parser, token);
2568
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2569
+ return result;
2570
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2571
+ if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2572
+ parser_add_parse_error(parser, token);
2573
+ implicitly_close_tags(
2574
+ parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2575
+ state->_reprocess_current_token = true;
2576
+ return false;
2577
+ }
2578
+ reconstruct_active_formatting_elements(parser);
2579
+ insert_element_from_token(parser, token);
2580
+ state->_frameset_ok = false;
2581
+ return true;
2582
+ } else if (tag_in(token, kEndTag,
2583
+ (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2584
+ TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2585
+ TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2586
+ TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
2587
+ TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
2588
+ TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2589
+ GumboTag tag = token->v.end_tag;
2590
+ if (!has_an_element_in_scope(parser, tag)) {
2591
+ parser_add_parse_error(parser, token);
2592
+ ignore_token(parser);
2593
+ return false;
2594
+ }
2595
+ implicitly_close_tags(
2596
+ parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2597
+ return true;
2598
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2599
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601
+ parser_add_parse_error(parser, token);
2602
+ ignore_token(parser);
2603
+ return false;
2604
+ }
2605
+ bool success = true;
2606
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608
+ parser_add_parse_error(parser, token);
2609
+ return false;
2610
+ }
2611
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2612
+ ;
2613
+ return success;
2614
+ } else {
2615
+ bool result = true;
2616
+ const GumboNode* node = state->_form_element;
2617
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
2618
+ state->_form_element = NULL;
2619
+ if (!node || !has_node_in_scope(parser, node)) {
2620
+ gumbo_debug("Closing an unopened form.\n");
2621
+ parser_add_parse_error(parser, token);
2622
+ ignore_token(parser);
2623
+ return false;
2624
+ }
2625
+ // This differs from implicitly_close_tags because we remove *only* the
2626
+ // <form> element; other nodes are left in scope.
2627
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2628
+ if (get_current_node(parser) != node) {
2629
+ parser_add_parse_error(parser, token);
2630
+ result = false;
2631
+ }
2632
+
2633
+ GumboVector* open_elements = &state->_open_elements;
2634
+ int index = gumbo_vector_index_of(open_elements, node);
2635
+ assert(index >= 0);
2636
+ gumbo_vector_remove_at(parser, index, open_elements);
2637
+ return result;
2638
+ }
2639
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2640
+ if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2641
+ parser_add_parse_error(parser, token);
2642
+ // reconstruct_active_formatting_elements(parser);
2643
+ insert_element_of_tag_type(
2644
+ parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2645
+ state->_reprocess_current_token = true;
2646
+ return false;
2647
+ }
2648
+ return implicitly_close_tags(
2649
+ parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2650
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2651
+ if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2652
+ parser_add_parse_error(parser, token);
2653
+ ignore_token(parser);
2654
+ return false;
2655
+ }
2656
+ return implicitly_close_tags(
2657
+ parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2658
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2659
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2660
+ GumboTag token_tag = token->v.end_tag;
2661
+ if (!has_an_element_in_scope(parser, token_tag)) {
2662
+ parser_add_parse_error(parser, token);
2663
+ ignore_token(parser);
2664
+ return false;
2665
+ }
2666
+ return implicitly_close_tags(
2667
+ parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2668
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2669
+ TAG(H4), TAG(H5), TAG(H6)})) {
2670
+ if (!has_an_element_in_scope_with_tagname(
2671
+ parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2672
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2673
+ // No heading open; ignore the token entirely.
2674
+ parser_add_parse_error(parser, token);
2675
+ ignore_token(parser);
2676
+ return false;
2677
+ } else {
2678
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2679
+ const GumboNode* current_node = get_current_node(parser);
2680
+ bool success = node_html_tag_is(current_node, token->v.end_tag);
2681
+ if (!success) {
2682
+ // There're children of the heading currently open; close them below and
2683
+ // record a parse error.
2684
+ // TODO(jdtang): Add a way to distinguish this error case from the one
2685
+ // above.
2686
+ parser_add_parse_error(parser, token);
2687
+ }
2688
+ do {
2689
+ current_node = pop_current_node(parser);
2690
+ } while (!node_tag_in_set(
2691
+ current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2692
+ TAG(H4), TAG(H5), TAG(H6)}));
2693
+ return success;
2694
+ }
2695
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2696
+ bool success = true;
2697
+ int last_a;
2698
+ int has_matching_a = find_last_anchor_index(parser, &last_a);
2699
+ if (has_matching_a) {
2700
+ assert(has_matching_a == 1);
2701
+ parser_add_parse_error(parser, token);
2702
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2703
+ // The adoption agency algorithm usually removes all instances of <a>
2704
+ // from the list of active formatting elements, but in case it doesn't,
2705
+ // we're supposed to do this. (The conditions where it might not are
2706
+ // listed in the spec.)
2707
+ if (find_last_anchor_index(parser, &last_a)) {
2708
+ void* last_element = gumbo_vector_remove_at(
2709
+ parser, last_a, &state->_active_formatting_elements);
2710
+ gumbo_vector_remove(parser, last_element, &state->_open_elements);
2711
+ }
2712
+ success = false;
2713
+ }
2714
+ reconstruct_active_formatting_elements(parser);
2715
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2716
+ return success;
2717
+ } else if (tag_in(token, kStartTag,
2718
+ (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT),
2719
+ TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG),
2720
+ TAG(TT), TAG(U)})) {
2721
+ reconstruct_active_formatting_elements(parser);
2722
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2723
+ return true;
2724
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2725
+ bool result = true;
2726
+ reconstruct_active_formatting_elements(parser);
2727
+ if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2728
+ result = false;
2729
+ parser_add_parse_error(parser, token);
2730
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2731
+ reconstruct_active_formatting_elements(parser);
2732
+ }
2733
+ insert_element_from_token(parser, token);
2734
+ add_formatting_element(parser, get_current_node(parser));
2735
+ return result;
2736
+ } else if (tag_in(token, kEndTag,
2737
+ (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM),
2738
+ TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL),
2739
+ TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) {
2740
+ return adoption_agency_algorithm(parser, token, token->v.end_tag);
2741
+ } else if (tag_in(token, kStartTag,
2742
+ (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2743
+ reconstruct_active_formatting_elements(parser);
2744
+ insert_element_from_token(parser, token);
2745
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2746
+ set_frameset_not_ok(parser);
2747
+ return true;
2748
+ } else if (tag_in(token, kEndTag,
2749
+ (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2750
+ GumboTag token_tag = token->v.end_tag;
2751
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
2752
+ parser_add_parse_error(parser, token);
2753
+ ignore_token(parser);
2754
+ return false;
2755
+ }
2756
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2757
+ clear_active_formatting_elements(parser);
2758
+ return true;
2759
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2760
+ if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2761
+ GUMBO_DOCTYPE_QUIRKS) {
2762
+ maybe_implicitly_close_p_tag(parser, token);
2763
+ }
2764
+ insert_element_from_token(parser, token);
2765
+ set_frameset_not_ok(parser);
2766
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2767
+ return true;
2768
+ } else if (tag_in(token, kStartTag,
2769
+ (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG),
2770
+ TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) {
2771
+ bool success = true;
2772
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2773
+ success = false;
2774
+ parser_add_parse_error(parser, token);
2775
+ token->v.start_tag.tag = GUMBO_TAG_IMG;
2776
+ }
2777
+ reconstruct_active_formatting_elements(parser);
2778
+ GumboNode* node = insert_element_from_token(parser, token);
2779
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2780
+ success = false;
2781
+ parser_add_parse_error(parser, token);
2782
+ node->v.element.tag = GUMBO_TAG_IMG;
2783
+ node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2784
+ }
2785
+ pop_current_node(parser);
2786
+ acknowledge_self_closing_tag(parser);
2787
+ set_frameset_not_ok(parser);
2788
+ return success;
2789
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2790
+ if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2791
+ // Must be before the element is inserted, as that takes ownership of the
2792
+ // token's attribute vector.
2793
+ set_frameset_not_ok(parser);
2794
+ }
2795
+ reconstruct_active_formatting_elements(parser);
2796
+ insert_element_from_token(parser, token);
2797
+ pop_current_node(parser);
2798
+ acknowledge_self_closing_tag(parser);
2799
+ return true;
2800
+ } else if (tag_in(token, kStartTag,
2801
+ (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) {
2802
+ insert_element_from_token(parser, token);
2803
+ pop_current_node(parser);
2804
+ acknowledge_self_closing_tag(parser);
2805
+ return true;
2806
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2807
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2808
+ insert_element_from_token(parser, token);
2809
+ pop_current_node(parser);
2810
+ acknowledge_self_closing_tag(parser);
2811
+ set_frameset_not_ok(parser);
2812
+ return result;
2813
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2814
+ parser_add_parse_error(parser, token);
2815
+ if (parser->_parser_state->_form_element != NULL &&
2816
+ !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2817
+ ignore_token(parser);
2818
+ return false;
2819
+ }
2820
+ acknowledge_self_closing_tag(parser);
2821
+ maybe_implicitly_close_p_tag(parser, token);
2822
+ set_frameset_not_ok(parser);
2823
+
2824
+ GumboVector* token_attrs = &token->v.start_tag.attributes;
2825
+ GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2826
+ GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2827
+ GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
2828
+
2829
+ GumboNode* form = insert_element_of_tag_type(
2830
+ parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2831
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2832
+ parser->_parser_state->_form_element = form;
2833
+ }
2834
+ if (action_attr) {
2835
+ gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2836
+ }
2837
+ insert_element_of_tag_type(
2838
+ parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2839
+ pop_current_node(parser); // <hr>
2840
+
2841
+ insert_element_of_tag_type(
2842
+ parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
2843
+ TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2844
+ text_state->_start_original_text = token->original_text.data;
2845
+ text_state->_start_position = token->position;
2846
+ text_state->_type = GUMBO_NODE_TEXT;
2847
+ if (prompt_attr) {
2848
+ int prompt_attr_length = strlen(prompt_attr->value);
2849
+ gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2850
+ text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2851
+ text_state->_buffer.length = prompt_attr_length;
2852
+ text_state->_buffer.capacity = prompt_attr_length + 1;
2853
+ gumbo_destroy_attribute(parser, prompt_attr);
2854
+ } else {
2855
+ GumboStringPiece prompt_text =
2856
+ GUMBO_STRING("This is a searchable index. Enter search keywords: ");
2857
+ gumbo_string_buffer_append_string(
2858
+ parser, &prompt_text, &text_state->_buffer);
2859
+ }
2860
+
2861
+ GumboNode* input = insert_element_of_tag_type(
2862
+ parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2863
+ for (unsigned int i = 0; i < token_attrs->length; ++i) {
2864
+ GumboAttribute* attr = token_attrs->data[i];
2865
+ if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2866
+ gumbo_vector_add(parser, attr, &input->v.element.attributes);
2867
+ }
2868
+ token_attrs->data[i] = NULL;
2869
+ }
2870
+
2871
+ // All attributes have been successfully transferred and nulled out at this
2872
+ // point, so the call to ignore_token will free the memory for it without
2873
+ // touching the attributes.
2874
+ ignore_token(parser);
2875
+
2876
+ // The name attribute, if present, should be destroyed since it's ignored
2877
+ // when copying over. The action attribute should be kept since it's moved
2878
+ // to the form.
2879
+ if (name_attr) {
2880
+ gumbo_destroy_attribute(parser, name_attr);
2881
+ }
2882
+
2883
+ GumboAttribute* name =
2884
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2885
+ GumboStringPiece name_str = GUMBO_STRING("name");
2886
+ GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2887
+ name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2888
+ name->name = gumbo_copy_stringz(parser, "name");
2889
+ name->value = gumbo_copy_stringz(parser, "isindex");
2890
+ name->original_name = name_str;
2891
+ name->original_value = isindex_str;
2892
+ name->name_start = kGumboEmptySourcePosition;
2893
+ name->name_end = kGumboEmptySourcePosition;
2894
+ name->value_start = kGumboEmptySourcePosition;
2895
+ name->value_end = kGumboEmptySourcePosition;
2896
+ gumbo_vector_add(parser, name, &input->v.element.attributes);
2897
+
2898
+ pop_current_node(parser); // <input>
2899
+ pop_current_node(parser); // <label>
2900
+ insert_element_of_tag_type(
2901
+ parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2902
+ pop_current_node(parser); // <hr>
2903
+ pop_current_node(parser); // <form>
2904
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2905
+ parser->_parser_state->_form_element = NULL;
2906
+ }
2907
+ return false;
2908
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2909
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2910
+ parser->_parser_state->_ignore_next_linefeed = true;
2911
+ set_frameset_not_ok(parser);
2912
+ return true;
2913
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2914
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2915
+ reconstruct_active_formatting_elements(parser);
2916
+ set_frameset_not_ok(parser);
2917
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2918
+ return result;
2919
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2920
+ set_frameset_not_ok(parser);
2921
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2922
+ return true;
2923
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2924
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2925
+ return true;
2926
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2927
+ reconstruct_active_formatting_elements(parser);
2928
+ insert_element_from_token(parser, token);
2929
+ set_frameset_not_ok(parser);
2930
+ GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2931
+ if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2932
+ state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2933
+ state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2934
+ state == GUMBO_INSERTION_MODE_IN_ROW ||
2935
+ state == GUMBO_INSERTION_MODE_IN_CELL) {
2936
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2937
+ } else {
2938
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2939
+ }
2940
+ return true;
2941
+ } else if (tag_in(token, kStartTag,
2942
+ (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
2943
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2944
+ pop_current_node(parser);
2945
+ }
2946
+ reconstruct_active_formatting_elements(parser);
2947
+ insert_element_from_token(parser, token);
2948
+ return true;
2949
+ } else if (tag_in(token, kStartTag,
2950
+ (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) {
2951
+ bool success = true;
2952
+ GumboTag exception =
2953
+ tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)})
2954
+ ? GUMBO_TAG_RTC
2955
+ : GUMBO_TAG_LAST;
2956
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2957
+ generate_implied_end_tags(parser, exception);
2958
+ }
2959
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
2960
+ !(exception == GUMBO_TAG_LAST ||
2961
+ node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
2962
+ parser_add_parse_error(parser, token);
2963
+ success = false;
2964
+ }
2965
+ insert_element_from_token(parser, token);
2966
+ return success;
2967
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2968
+ parser_add_parse_error(parser, token);
2969
+ reconstruct_active_formatting_elements(parser);
2970
+ insert_element_of_tag_type(
2971
+ parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2972
+ pop_current_node(parser);
2973
+ return false;
2974
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2975
+ reconstruct_active_formatting_elements(parser);
2976
+ adjust_mathml_attributes(parser, token);
2977
+ adjust_foreign_attributes(parser, token);
2978
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2979
+ if (token->v.start_tag.is_self_closing) {
2980
+ pop_current_node(parser);
2981
+ acknowledge_self_closing_tag(parser);
2982
+ }
2983
+ return true;
2984
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2985
+ reconstruct_active_formatting_elements(parser);
2986
+ adjust_svg_attributes(parser, token);
2987
+ adjust_foreign_attributes(parser, token);
2988
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2989
+ if (token->v.start_tag.is_self_closing) {
2990
+ pop_current_node(parser);
2991
+ acknowledge_self_closing_tag(parser);
2992
+ }
2993
+ return true;
2994
+ } else if (tag_in(token, kStartTag,
2995
+ (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
2996
+ TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT),
2997
+ TAG(TH), TAG(THEAD), TAG(TR)})) {
2998
+ parser_add_parse_error(parser, token);
2999
+ ignore_token(parser);
3000
+ return false;
3001
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
3002
+ reconstruct_active_formatting_elements(parser);
3003
+ insert_element_from_token(parser, token);
3004
+ return true;
3005
+ } else {
3006
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3007
+ GumboTag end_tag = token->v.end_tag;
3008
+ assert(state->_open_elements.length > 0);
3009
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3010
+ // Walk up the stack of open elements until we find one that either:
3011
+ // a) Matches the tag name we saw
3012
+ // b) Is in the "special" category.
3013
+ // If we see a), implicitly close everything up to and including it. If we
3014
+ // see b), then record a parse error, don't close anything (except the
3015
+ // implied end tags) and ignore the end tag token.
3016
+ for (int i = state->_open_elements.length; --i >= 0;) {
3017
+ const GumboNode* node = state->_open_elements.data[i];
3018
+ if (node_html_tag_is(node, end_tag)) {
3019
+ generate_implied_end_tags(parser, end_tag);
3020
+ // TODO(jdtang): Do I need to add a parse error here? The condition in
3021
+ // the spec seems like it's the inverse of the loop condition above, and
3022
+ // so would never fire.
3023
+ while (node != pop_current_node(parser))
3024
+ ; // Pop everything.
3025
+ return true;
3026
+ } else if (is_special_node(node)) {
3027
+ parser_add_parse_error(parser, token);
3028
+ ignore_token(parser);
3029
+ return false;
3030
+ }
3031
+ }
3032
+ // <html> is in the special category, so we should never get here.
3033
+ assert(0);
3034
+ return false;
3035
+ }
3036
+ }
3037
+
3038
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
3039
+ static bool handle_text(GumboParser* parser, GumboToken* token) {
3040
+ if (token->type == GUMBO_TOKEN_CHARACTER ||
3041
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3042
+ insert_text_token(parser, token);
3043
+ } else {
3044
+ // We provide only bare-bones script handling that doesn't involve any of
3045
+ // the parser-pause/already-started/script-nesting flags or re-entrant
3046
+ // invocations of the tokenizer. Because the intended usage of this library
3047
+ // is mostly for templating, refactoring, and static-analysis libraries, we
3048
+ // provide the script body as a text-node child of the <script> element.
3049
+ // This behavior doesn't support document.write of partial HTML elements,
3050
+ // but should be adequate for almost all other scripting support.
3051
+ if (token->type == GUMBO_TOKEN_EOF) {
3052
+ parser_add_parse_error(parser, token);
3053
+ parser->_parser_state->_reprocess_current_token = true;
3054
+ }
3055
+ pop_current_node(parser);
3056
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3057
+ }
3058
+ return true;
3059
+ }
3060
+
3061
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
3062
+ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3063
+ GumboParserState* state = parser->_parser_state;
3064
+ if (token->type == GUMBO_TOKEN_CHARACTER ||
3065
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3066
+ // The "pending table character tokens" list described in the spec is
3067
+ // nothing more than the TextNodeBufferState. We accumulate text tokens as
3068
+ // normal, except that when we go to flush them in the handle_in_table_text,
3069
+ // we set _foster_parent_insertions if there're non-whitespace characters in
3070
+ // the buffer.
3071
+ assert(state->_text_node._buffer.length == 0);
3072
+ state->_original_insertion_mode = state->_insertion_mode;
3073
+ state->_reprocess_current_token = true;
3074
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3075
+ return true;
3076
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3077
+ parser_add_parse_error(parser, token);
3078
+ ignore_token(parser);
3079
+ return false;
3080
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3081
+ append_comment_node(parser, get_current_node(parser), token);
3082
+ return true;
3083
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3084
+ clear_stack_to_table_context(parser);
3085
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3086
+ insert_element_from_token(parser, token);
3087
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3088
+ return true;
3089
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3090
+ clear_stack_to_table_context(parser);
3091
+ insert_element_from_token(parser, token);
3092
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3093
+ return true;
3094
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3095
+ clear_stack_to_table_context(parser);
3096
+ insert_element_of_tag_type(
3097
+ parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
3098
+ parser->_parser_state->_reprocess_current_token = true;
3099
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3100
+ return true;
3101
+ } else if (tag_in(token, kStartTag,
3102
+ (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD),
3103
+ TAG(TH), TAG(TR)})) {
3104
+ clear_stack_to_table_context(parser);
3105
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3106
+ if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) {
3107
+ insert_element_of_tag_type(
3108
+ parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
3109
+ state->_reprocess_current_token = true;
3110
+ } else {
3111
+ insert_element_from_token(parser, token);
3112
+ }
3113
+ return true;
3114
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3115
+ parser_add_parse_error(parser, token);
3116
+ if (close_table(parser)) {
3117
+ parser->_parser_state->_reprocess_current_token = true;
3118
+ } else {
3119
+ ignore_token(parser);
3120
+ }
3121
+ return false;
3122
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3123
+ if (!close_table(parser)) {
3124
+ parser_add_parse_error(parser, token);
3125
+ return false;
3126
+ }
3127
+ return true;
3128
+ } else if (tag_in(token, kEndTag,
3129
+ (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3130
+ TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT),
3131
+ TAG(TH), TAG(THEAD), TAG(TR)})) {
3132
+ parser_add_parse_error(parser, token);
3133
+ ignore_token(parser);
3134
+ return false;
3135
+ } else if (tag_in(token, kStartTag,
3136
+ (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) ||
3137
+ (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
3138
+ return handle_in_head(parser, token);
3139
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3140
+ attribute_matches(
3141
+ &token->v.start_tag.attributes, "type", "hidden")) {
3142
+ parser_add_parse_error(parser, token);
3143
+ insert_element_from_token(parser, token);
3144
+ pop_current_node(parser);
3145
+ return false;
3146
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3147
+ parser_add_parse_error(parser, token);
3148
+ if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3149
+ ignore_token(parser);
3150
+ return false;
3151
+ }
3152
+ state->_form_element = insert_element_from_token(parser, token);
3153
+ pop_current_node(parser);
3154
+ return false;
3155
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3156
+ return handle_in_body(parser, token);
3157
+ } else {
3158
+ parser_add_parse_error(parser, token);
3159
+ state->_foster_parent_insertions = true;
3160
+ bool result = handle_in_body(parser, token);
3161
+ state->_foster_parent_insertions = false;
3162
+ return result;
3163
+ }
3164
+ }
3165
+
3166
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
3167
+ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3168
+ if (token->type == GUMBO_TOKEN_NULL) {
3169
+ parser_add_parse_error(parser, token);
3170
+ ignore_token(parser);
3171
+ return false;
3172
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3173
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3174
+ insert_text_token(parser, token);
3175
+ return true;
3176
+ } else {
3177
+ GumboParserState* state = parser->_parser_state;
3178
+ GumboStringBuffer* buffer = &state->_text_node._buffer;
3179
+ // Can't use strspn for this because GumboStringBuffers are not
3180
+ // null-terminated.
3181
+ // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3182
+ // of any one byte that is not whitespace means we flip the flag, so this
3183
+ // loop is still valid.
3184
+ for (unsigned int i = 0; i < buffer->length; ++i) {
3185
+ if (!isspace((unsigned char) buffer->data[i]) ||
3186
+ buffer->data[i] == '\v') {
3187
+ state->_foster_parent_insertions = true;
3188
+ reconstruct_active_formatting_elements(parser);
3189
+ break;
3190
+ }
3191
+ }
3192
+ maybe_flush_text_node_buffer(parser);
3193
+ state->_foster_parent_insertions = false;
3194
+ state->_reprocess_current_token = true;
3195
+ state->_insertion_mode = state->_original_insertion_mode;
3196
+ return true;
3197
+ }
3198
+ }
3199
+
3200
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3201
+ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3202
+ if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3203
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3204
+ parser_add_parse_error(parser, token);
3205
+ ignore_token(parser);
3206
+ return false;
3207
+ } else {
3208
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3209
+ bool result = true;
3210
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3211
+ parser_add_parse_error(parser, token);
3212
+ }
3213
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3214
+ ;
3215
+ clear_active_formatting_elements(parser);
3216
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3217
+ return result;
3218
+ }
3219
+ } else if (tag_in(token, kStartTag,
3220
+ (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3221
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3222
+ TAG(TR)}) ||
3223
+ (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
3224
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3225
+ parser_add_parse_error(parser, token);
3226
+ ignore_token(parser);
3227
+ return false;
3228
+ }
3229
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3230
+ ;
3231
+ clear_active_formatting_elements(parser);
3232
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3233
+ parser->_parser_state->_reprocess_current_token = true;
3234
+ return true;
3235
+ } else if (tag_in(token, kEndTag,
3236
+ (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML),
3237
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3238
+ TAG(TR)})) {
3239
+ parser_add_parse_error(parser, token);
3240
+ ignore_token(parser);
3241
+ return false;
3242
+ } else {
3243
+ return handle_in_body(parser, token);
3244
+ }
3245
+ }
3246
+
3247
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
3248
+ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3249
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3250
+ insert_text_token(parser, token);
3251
+ return true;
3252
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3253
+ parser_add_parse_error(parser, token);
3254
+ ignore_token(parser);
3255
+ return false;
3256
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3257
+ append_comment_node(parser, get_current_node(parser), token);
3258
+ return true;
3259
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3260
+ return handle_in_body(parser, token);
3261
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3262
+ insert_element_from_token(parser, token);
3263
+ pop_current_node(parser);
3264
+ acknowledge_self_closing_tag(parser);
3265
+ return true;
3266
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3267
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3268
+ parser_add_parse_error(parser, token);
3269
+ ignore_token(parser);
3270
+ return false;
3271
+ }
3272
+ pop_current_node(parser);
3273
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3274
+ return false;
3275
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3276
+ parser_add_parse_error(parser, token);
3277
+ ignore_token(parser);
3278
+ return false;
3279
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
3280
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3281
+ return handle_in_head(parser, token);
3282
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3283
+ return handle_in_body(parser, token);
3284
+ } else {
3285
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3286
+ parser_add_parse_error(parser, token);
3287
+ ignore_token(parser);
3288
+ return false;
3289
+ }
3290
+ pop_current_node(parser);
3291
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3292
+ parser->_parser_state->_reprocess_current_token = true;
3293
+ return true;
3294
+ }
3295
+ }
3296
+
3297
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
3298
+ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3299
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3300
+ clear_stack_to_table_body_context(parser);
3301
+ insert_element_from_token(parser, token);
3302
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3303
+ return true;
3304
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3305
+ parser_add_parse_error(parser, token);
3306
+ clear_stack_to_table_body_context(parser);
3307
+ insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3308
+ parser->_parser_state->_reprocess_current_token = true;
3309
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3310
+ return false;
3311
+ } else if (tag_in(token, kEndTag,
3312
+ (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3313
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3314
+ parser_add_parse_error(parser, token);
3315
+ ignore_token(parser);
3316
+ return false;
3317
+ }
3318
+ clear_stack_to_table_body_context(parser);
3319
+ pop_current_node(parser);
3320
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3321
+ return true;
3322
+ } else if (tag_in(token, kStartTag,
3323
+ (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3324
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) ||
3325
+ tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3326
+ if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3327
+ has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3328
+ has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3329
+ parser_add_parse_error(parser, token);
3330
+ ignore_token(parser);
3331
+ return false;
3332
+ }
3333
+ clear_stack_to_table_body_context(parser);
3334
+ pop_current_node(parser);
3335
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3336
+ parser->_parser_state->_reprocess_current_token = true;
3337
+ return true;
3338
+ } else if (tag_in(token, kEndTag,
3339
+ (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR),
3340
+ TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3341
+ parser_add_parse_error(parser, token);
3342
+ ignore_token(parser);
3343
+ return false;
3344
+ } else {
3345
+ return handle_in_table(parser, token);
3346
+ }
3347
+ }
3348
+
3349
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3350
+ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3351
+ if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
3352
+ clear_stack_to_table_row_context(parser);
3353
+ insert_element_from_token(parser, token);
3354
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3355
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3356
+ return true;
3357
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3358
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3359
+ parser_add_parse_error(parser, token);
3360
+ ignore_token(parser);
3361
+ return false;
3362
+ } else {
3363
+ clear_stack_to_table_row_context(parser);
3364
+ pop_current_node(parser);
3365
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3366
+ return true;
3367
+ }
3368
+ } else if (tag_in(token, kStartTag,
3369
+ (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3370
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) ||
3371
+ tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3372
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3373
+ parser_add_parse_error(parser, token);
3374
+ ignore_token(parser);
3375
+ return false;
3376
+ } else {
3377
+ clear_stack_to_table_row_context(parser);
3378
+ pop_current_node(parser);
3379
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3380
+ parser->_parser_state->_reprocess_current_token = true;
3381
+ return true;
3382
+ }
3383
+ } else if (tag_in(token, kEndTag,
3384
+ (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3385
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
3386
+ (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
3387
+ parser_add_parse_error(parser, token);
3388
+ ignore_token(parser);
3389
+ return false;
3390
+ } else {
3391
+ clear_stack_to_table_row_context(parser);
3392
+ pop_current_node(parser);
3393
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3394
+ parser->_parser_state->_reprocess_current_token = true;
3395
+ return true;
3396
+ }
3397
+ } else if (tag_in(token, kEndTag,
3398
+ (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3399
+ TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3400
+ parser_add_parse_error(parser, token);
3401
+ ignore_token(parser);
3402
+ return false;
3403
+ } else {
3404
+ return handle_in_table(parser, token);
3405
+ }
3406
+ }
3407
+
3408
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3409
+ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3410
+ if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3411
+ GumboTag token_tag = token->v.end_tag;
3412
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
3413
+ parser_add_parse_error(parser, token);
3414
+ ignore_token(parser);
3415
+ return false;
3416
+ }
3417
+ return close_table_cell(parser, token, token_tag);
3418
+ } else if (tag_in(token, kStartTag,
3419
+ (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3420
+ TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3421
+ TAG(TR)})) {
3422
+ gumbo_debug("Handling <td> in cell.\n");
3423
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3424
+ !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3425
+ gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3426
+ parser_add_parse_error(parser, token);
3427
+ ignore_token(parser);
3428
+ return false;
3429
+ }
3430
+ parser->_parser_state->_reprocess_current_token = true;
3431
+ return close_current_cell(parser, token);
3432
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION),
3433
+ TAG(COL), TAG(COLGROUP), TAG(HTML)})) {
3434
+ parser_add_parse_error(parser, token);
3435
+ ignore_token(parser);
3436
+ return false;
3437
+ } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
3438
+ TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
3439
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3440
+ parser_add_parse_error(parser, token);
3441
+ ignore_token(parser);
3442
+ return false;
3443
+ }
3444
+ parser->_parser_state->_reprocess_current_token = true;
3445
+ return close_current_cell(parser, token);
3446
+ } else {
3447
+ return handle_in_body(parser, token);
3448
+ }
3449
+ }
3450
+
3451
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3452
+ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3453
+ if (token->type == GUMBO_TOKEN_NULL) {
3454
+ parser_add_parse_error(parser, token);
3455
+ ignore_token(parser);
3456
+ return false;
3457
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3458
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3459
+ insert_text_token(parser, token);
3460
+ return true;
3461
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3462
+ parser_add_parse_error(parser, token);
3463
+ ignore_token(parser);
3464
+ return false;
3465
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3466
+ append_comment_node(parser, get_current_node(parser), token);
3467
+ return true;
3468
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3469
+ return handle_in_body(parser, token);
3470
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3471
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3472
+ pop_current_node(parser);
3473
+ }
3474
+ insert_element_from_token(parser, token);
3475
+ return true;
3476
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3477
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3478
+ pop_current_node(parser);
3479
+ }
3480
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3481
+ pop_current_node(parser);
3482
+ }
3483
+ insert_element_from_token(parser, token);
3484
+ return true;
3485
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3486
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
3487
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3488
+ node_html_tag_is(open_elements->data[open_elements->length - 2],
3489
+ GUMBO_TAG_OPTGROUP)) {
3490
+ pop_current_node(parser);
3491
+ }
3492
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3493
+ pop_current_node(parser);
3494
+ return true;
3495
+ } else {
3496
+ parser_add_parse_error(parser, token);
3497
+ ignore_token(parser);
3498
+ return false;
3499
+ }
3500
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3501
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3502
+ pop_current_node(parser);
3503
+ return true;
3504
+ } else {
3505
+ parser_add_parse_error(parser, token);
3506
+ ignore_token(parser);
3507
+ return false;
3508
+ }
3509
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3510
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3511
+ parser_add_parse_error(parser, token);
3512
+ ignore_token(parser);
3513
+ return false;
3514
+ }
3515
+ close_current_select(parser);
3516
+ return true;
3517
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3518
+ parser_add_parse_error(parser, token);
3519
+ ignore_token(parser);
3520
+ if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3521
+ close_current_select(parser);
3522
+ }
3523
+ return false;
3524
+ } else if (tag_in(token, kStartTag,
3525
+ (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) {
3526
+ parser_add_parse_error(parser, token);
3527
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3528
+ ignore_token(parser);
3529
+ } else {
3530
+ close_current_select(parser);
3531
+ parser->_parser_state->_reprocess_current_token = true;
3532
+ }
3533
+ return false;
3534
+ } else if (tag_in(token, kStartTag,
3535
+ (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) ||
3536
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3537
+ return handle_in_head(parser, token);
3538
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3539
+ return handle_in_body(parser, token);
3540
+ } else {
3541
+ parser_add_parse_error(parser, token);
3542
+ ignore_token(parser);
3543
+ return false;
3544
+ }
3545
+ }
3546
+
3547
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3548
+ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3549
+ if (tag_in(token, kStartTag,
3550
+ (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
3551
+ TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3552
+ parser_add_parse_error(parser, token);
3553
+ close_current_select(parser);
3554
+ parser->_parser_state->_reprocess_current_token = true;
3555
+ return false;
3556
+ } else if (tag_in(token, kEndTag,
3557
+ (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY),
3558
+ TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3559
+ parser_add_parse_error(parser, token);
3560
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3561
+ ignore_token(parser);
3562
+ return false;
3563
+ } else {
3564
+ close_current_select(parser);
3565
+ // close_current_select already does the
3566
+ // reset_insertion_mode_appropriately
3567
+ // reset_insertion_mode_appropriately(parser);
3568
+ parser->_parser_state->_reprocess_current_token = true;
3569
+ return false;
3570
+ }
3571
+ } else {
3572
+ return handle_in_select(parser, token);
3573
+ }
3574
+ }
3575
+
3576
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
3577
+ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3578
+ GumboParserState* state = parser->_parser_state;
3579
+ if (token->type == GUMBO_TOKEN_WHITESPACE ||
3580
+ token->type == GUMBO_TOKEN_CHARACTER ||
3581
+ token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL ||
3582
+ token->type == GUMBO_TOKEN_DOCTYPE) {
3583
+ return handle_in_body(parser, token);
3584
+ } else if (tag_in(token, kStartTag,
3585
+ (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
3586
+ TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
3587
+ TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
3588
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3589
+ return handle_in_head(parser, token);
3590
+ } else if (tag_in(
3591
+ token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP),
3592
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3593
+ pop_template_insertion_mode(parser);
3594
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3595
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3596
+ state->_reprocess_current_token = true;
3597
+ return true;
3598
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3599
+ pop_template_insertion_mode(parser);
3600
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3601
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3602
+ state->_reprocess_current_token = true;
3603
+ return true;
3604
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3605
+ pop_template_insertion_mode(parser);
3606
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3607
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3608
+ state->_reprocess_current_token = true;
3609
+ return true;
3610
+ } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3611
+ pop_template_insertion_mode(parser);
3612
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3613
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3614
+ state->_reprocess_current_token = true;
3615
+ return true;
3616
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
3617
+ pop_template_insertion_mode(parser);
3618
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3619
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3620
+ state->_reprocess_current_token = true;
3621
+ return true;
3622
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
3623
+ parser_add_parse_error(parser, token);
3624
+ ignore_token(parser);
3625
+ return false;
3626
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3627
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3628
+ // Stop parsing.
3629
+ return true;
3630
+ }
3631
+ parser_add_parse_error(parser, token);
3632
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
3633
+ ;
3634
+ clear_active_formatting_elements(parser);
3635
+ pop_template_insertion_mode(parser);
3636
+ reset_insertion_mode_appropriately(parser);
3637
+ state->_reprocess_current_token = true;
3638
+ return false;
3639
+ } else {
3640
+ assert(0);
3641
+ return false;
3642
+ }
3643
+ }
3644
+
3645
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
3646
+ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3647
+ if (token->type == GUMBO_TOKEN_WHITESPACE ||
3648
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3649
+ return handle_in_body(parser, token);
3650
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3651
+ GumboNode* html_node = parser->_output->root;
3652
+ assert(html_node != NULL);
3653
+ append_comment_node(parser, html_node, token);
3654
+ return true;
3655
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3656
+ parser_add_parse_error(parser, token);
3657
+ ignore_token(parser);
3658
+ return false;
3659
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3660
+ /* fragment case: ignore the closing HTML token */
3661
+ if (is_fragment_parser(parser)) {
3662
+ parser_add_parse_error(parser, token);
3663
+ ignore_token(parser);
3664
+ return false;
3665
+ }
3666
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3667
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
3668
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3669
+ record_end_of_element(
3670
+ parser->_parser_state->_current_token, &html->v.element);
3671
+ return true;
3672
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3673
+ return true;
3674
+ } else {
3675
+ parser_add_parse_error(parser, token);
3676
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3677
+ parser->_parser_state->_reprocess_current_token = true;
3678
+ return false;
3679
+ }
3680
+ }
3681
+
3682
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
3683
+ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3684
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3685
+ insert_text_token(parser, token);
3686
+ return true;
3687
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3688
+ append_comment_node(parser, get_current_node(parser), token);
3689
+ return true;
3690
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3691
+ parser_add_parse_error(parser, token);
3692
+ ignore_token(parser);
3693
+ return false;
3694
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3695
+ return handle_in_body(parser, token);
3696
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3697
+ insert_element_from_token(parser, token);
3698
+ return true;
3699
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3700
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3701
+ parser_add_parse_error(parser, token);
3702
+ ignore_token(parser);
3703
+ return false;
3704
+ }
3705
+ pop_current_node(parser);
3706
+ if (!is_fragment_parser(parser) &&
3707
+ !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3708
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3709
+ }
3710
+ return true;
3711
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3712
+ insert_element_from_token(parser, token);
3713
+ pop_current_node(parser);
3714
+ acknowledge_self_closing_tag(parser);
3715
+ return true;
3716
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3717
+ return handle_in_head(parser, token);
3718
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3719
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3720
+ parser_add_parse_error(parser, token);
3721
+ return false;
3722
+ }
3723
+ return true;
3724
+ } else {
3725
+ parser_add_parse_error(parser, token);
3726
+ ignore_token(parser);
3727
+ return false;
3728
+ }
3729
+ }
3730
+
3731
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
3732
+ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3733
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3734
+ insert_text_token(parser, token);
3735
+ return true;
3736
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3737
+ append_comment_node(parser, get_current_node(parser), token);
3738
+ return true;
3739
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3740
+ parser_add_parse_error(parser, token);
3741
+ ignore_token(parser);
3742
+ return false;
3743
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3744
+ return handle_in_body(parser, token);
3745
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3746
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
3747
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3748
+ record_end_of_element(
3749
+ parser->_parser_state->_current_token, &html->v.element);
3750
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3751
+ return true;
3752
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3753
+ return handle_in_head(parser, token);
3754
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3755
+ return true;
3756
+ } else {
3757
+ parser_add_parse_error(parser, token);
3758
+ ignore_token(parser);
3759
+ return false;
3760
+ }
3761
+ }
3762
+
3763
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
3764
+ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3765
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3766
+ append_comment_node(parser, get_document_node(parser), token);
3767
+ return true;
3768
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3769
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3770
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3771
+ return handle_in_body(parser, token);
3772
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3773
+ return true;
3774
+ } else {
3775
+ parser_add_parse_error(parser, token);
3776
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3777
+ parser->_parser_state->_reprocess_current_token = true;
3778
+ return false;
3779
+ }
3780
+ }
3781
+
3782
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
3783
+ static bool handle_after_after_frameset(
3784
+ GumboParser* parser, GumboToken* token) {
3785
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3786
+ append_comment_node(parser, get_document_node(parser), token);
3787
+ return true;
3788
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3789
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3790
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3791
+ return handle_in_body(parser, token);
3792
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3793
+ return true;
3794
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3795
+ return handle_in_head(parser, token);
3796
+ } else {
3797
+ parser_add_parse_error(parser, token);
3798
+ ignore_token(parser);
3799
+ return false;
3800
+ }
3801
+ }
3802
+
3803
+ // Function pointers for each insertion mode. Keep in sync with
3804
+ // insertion_mode.h.
3805
+ typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3806
+ static const TokenHandler kTokenHandlers[] = {handle_initial,
3807
+ handle_before_html, handle_before_head, handle_in_head,
3808
+ handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
3809
+ handle_in_table, handle_in_table_text, handle_in_caption,
3810
+ handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
3811
+ handle_in_select, handle_in_select_in_table, handle_in_template,
3812
+ handle_after_body, handle_in_frameset, handle_after_frameset,
3813
+ handle_after_after_body, handle_after_after_frameset};
3814
+
3815
+ static bool handle_html_content(GumboParser* parser, GumboToken* token) {
3816
+ return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
3817
+ parser, token);
3818
+ }
3819
+
3820
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
3821
+ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3822
+ gumbo_debug("Handling foreign content");
3823
+ switch (token->type) {
3824
+ case GUMBO_TOKEN_NULL:
3825
+ parser_add_parse_error(parser, token);
3826
+ token->v.character = kUtf8ReplacementChar;
3827
+ insert_text_token(parser, token);
3828
+ return false;
3829
+ case GUMBO_TOKEN_WHITESPACE:
3830
+ insert_text_token(parser, token);
3831
+ return true;
3832
+ case GUMBO_TOKEN_CDATA:
3833
+ case GUMBO_TOKEN_CHARACTER:
3834
+ insert_text_token(parser, token);
3835
+ set_frameset_not_ok(parser);
3836
+ return true;
3837
+ case GUMBO_TOKEN_COMMENT:
3838
+ append_comment_node(parser, get_current_node(parser), token);
3839
+ return true;
3840
+ case GUMBO_TOKEN_DOCTYPE:
3841
+ parser_add_parse_error(parser, token);
3842
+ ignore_token(parser);
3843
+ return false;
3844
+ default:
3845
+ // Fall through to the if-statements below.
3846
+ break;
3847
+ }
3848
+ // Order matters for these clauses.
3849
+ if (tag_in(token, kStartTag,
3850
+ (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
3851
+ TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
3852
+ TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
3853
+ TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
3854
+ TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
3855
+ TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
3856
+ TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
3857
+ TAG(UL), TAG(VAR)}) ||
3858
+ (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
3859
+ (token_has_attribute(token, "color") ||
3860
+ token_has_attribute(token, "face") ||
3861
+ token_has_attribute(token, "size")))) {
3862
+ /* Parse error */
3863
+ parser_add_parse_error(parser, token);
3864
+
3865
+ /*
3866
+ * Fragment case: If the parser was originally created for the HTML
3867
+ * fragment parsing algorithm, then act as described in the "any other
3868
+ * start tag" entry below.
3869
+ */
3870
+ if (!is_fragment_parser(parser)) {
3871
+ do {
3872
+ pop_current_node(parser);
3873
+ } while (!(is_mathml_integration_point(get_current_node(parser)) ||
3874
+ is_html_integration_point(get_current_node(parser)) ||
3875
+ get_current_node(parser)->v.element.tag_namespace ==
3876
+ GUMBO_NAMESPACE_HTML));
3877
+ parser->_parser_state->_reprocess_current_token = true;
3878
+ return false;
3879
+ }
3880
+
3881
+ assert(token->type == GUMBO_TOKEN_START_TAG);
3882
+ }
3883
+
3884
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3885
+ const GumboNamespaceEnum current_namespace =
3886
+ get_adjusted_current_node(parser)->v.element.tag_namespace;
3887
+ if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3888
+ adjust_mathml_attributes(parser, token);
3889
+ }
3890
+ if (current_namespace == GUMBO_NAMESPACE_SVG) {
3891
+ // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3892
+ // function.
3893
+ adjust_svg_attributes(parser, token);
3894
+ }
3895
+ adjust_foreign_attributes(parser, token);
3896
+ insert_foreign_element(parser, token, current_namespace);
3897
+ if (token->v.start_tag.is_self_closing) {
3898
+ pop_current_node(parser);
3899
+ acknowledge_self_closing_tag(parser);
3900
+ }
3901
+ return true;
3902
+ // </script> tags are handled like any other end tag, putting the script's
3903
+ // text into a text node child and closing the current node.
3904
+ } else {
3905
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3906
+ GumboNode* node = get_current_node(parser);
3907
+ assert(node != NULL);
3908
+ GumboStringPiece token_tagname = token->original_text;
3909
+ GumboStringPiece node_tagname = node->v.element.original_tag;
3910
+ gumbo_tag_from_original_text(&token_tagname);
3911
+ gumbo_tag_from_original_text(&node_tagname);
3912
+
3913
+ bool is_success = true;
3914
+ if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3915
+ parser_add_parse_error(parser, token);
3916
+ is_success = false;
3917
+ }
3918
+ int i = parser->_parser_state->_open_elements.length;
3919
+ for (--i; i > 0;) {
3920
+ // Here we move up the stack until we find an HTML element (in which
3921
+ // case we do nothing) or we find the element that we're about to
3922
+ // close (in which case we pop everything we've seen until that
3923
+ // point.)
3924
+ gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3925
+ node_tagname.data, i);
3926
+ if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3927
+ gumbo_debug("Matches.\n");
3928
+ while (pop_current_node(parser) != node) {
3929
+ // Pop all the nodes below the current one. Node is guaranteed to
3930
+ // be an element on the stack of open elements (set below), so
3931
+ // this loop is guaranteed to terminate.
3932
+ }
3933
+ return is_success;
3934
+ }
3935
+ --i;
3936
+ node = parser->_parser_state->_open_elements.data[i];
3937
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3938
+ // Must break before gumbo_tag_from_original_text to avoid passing
3939
+ // parser-inserted nodes through.
3940
+ break;
3941
+ }
3942
+ node_tagname = node->v.element.original_tag;
3943
+ gumbo_tag_from_original_text(&node_tagname);
3944
+ }
3945
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3946
+ // We can't call handle_token directly because the current node is still in
3947
+ // the SVG namespace, so it would re-enter this and result in infinite
3948
+ // recursion.
3949
+ return handle_html_content(parser, token) && is_success;
3950
+ }
3951
+ }
3952
+
3953
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
3954
+ static bool handle_token(GumboParser* parser, GumboToken* token) {
3955
+ if (parser->_parser_state->_ignore_next_linefeed &&
3956
+ token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3957
+ parser->_parser_state->_ignore_next_linefeed = false;
3958
+ ignore_token(parser);
3959
+ return true;
3960
+ }
3961
+ // This needs to be reset both here and in the conditional above to catch both
3962
+ // the case where the next token is not whitespace (so we don't ignore
3963
+ // whitespace in the middle of <pre> tags) and where there are multiple
3964
+ // whitespace tokens (so we don't ignore the second one).
3965
+ parser->_parser_state->_ignore_next_linefeed = false;
3966
+
3967
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3968
+ parser->_parser_state->_closed_body_tag = true;
3969
+ }
3970
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3971
+ parser->_parser_state->_closed_html_tag = true;
3972
+ }
3973
+
3974
+ const GumboNode* current_node = get_adjusted_current_node(parser);
3975
+ assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT ||
3976
+ current_node->type == GUMBO_NODE_TEMPLATE);
3977
+ if (current_node) {
3978
+ gumbo_debug("Current node: <%s>.\n",
3979
+ gumbo_normalized_tagname(current_node->v.element.tag));
3980
+ }
3981
+ if (!current_node ||
3982
+ current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3983
+ (is_mathml_integration_point(current_node) &&
3984
+ (token->type == GUMBO_TOKEN_CHARACTER ||
3985
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3986
+ token->type == GUMBO_TOKEN_NULL ||
3987
+ (token->type == GUMBO_TOKEN_START_TAG &&
3988
+ !tag_in(token, kStartTag,
3989
+ (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
3990
+ (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3991
+ node_qualified_tag_is(
3992
+ current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
3993
+ tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3994
+ (is_html_integration_point(current_node) &&
3995
+ (token->type == GUMBO_TOKEN_START_TAG ||
3996
+ token->type == GUMBO_TOKEN_CHARACTER ||
3997
+ token->type == GUMBO_TOKEN_NULL ||
3998
+ token->type == GUMBO_TOKEN_WHITESPACE)) ||
3999
+ token->type == GUMBO_TOKEN_EOF) {
4000
+ return handle_html_content(parser, token);
4001
+ } else {
4002
+ return handle_in_foreign_content(parser, token);
4003
+ }
4004
+ }
4005
+
4006
+ static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx,
4007
+ GumboNamespaceEnum fragment_namespace) {
4008
+ GumboNode* root;
4009
+ assert(fragment_ctx != GUMBO_TAG_LAST);
4010
+
4011
+ // 3
4012
+ parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
4013
+ parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
4014
+ fragment_namespace;
4015
+
4016
+ // 4
4017
+ if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
4018
+ // Non-HTML namespaces always start in the DATA state.
4019
+ switch (fragment_ctx) {
4020
+ case GUMBO_TAG_TITLE:
4021
+ case GUMBO_TAG_TEXTAREA:
4022
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
4023
+ break;
4024
+
4025
+ case GUMBO_TAG_STYLE:
4026
+ case GUMBO_TAG_XMP:
4027
+ case GUMBO_TAG_IFRAME:
4028
+ case GUMBO_TAG_NOEMBED:
4029
+ case GUMBO_TAG_NOFRAMES:
4030
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4031
+ break;
4032
+
4033
+ case GUMBO_TAG_SCRIPT:
4034
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4035
+ break;
4036
+
4037
+ case GUMBO_TAG_NOSCRIPT:
4038
+ /* scripting is disabled in Gumbo, so leave the tokenizer
4039
+ * in the default data state */
4040
+ break;
4041
+
4042
+ case GUMBO_TAG_PLAINTEXT:
4043
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
4044
+ break;
4045
+
4046
+ default:
4047
+ /* default data state */
4048
+ break;
4049
+ }
4050
+ }
4051
+
4052
+ // 5. 6. 7.
4053
+ root = insert_element_of_tag_type(
4054
+ parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
4055
+ parser->_output->root = root;
4056
+
4057
+ // 8.
4058
+ if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
4059
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
4060
+ }
4061
+
4062
+ // 10.
4063
+ reset_insertion_mode_appropriately(parser);
4064
+ }
4065
+
4066
+ GumboOutput* gumbo_parse(const char* buffer) {
4067
+ return gumbo_parse_with_options(
4068
+ &kGumboDefaultOptions, buffer, strlen(buffer));
4069
+ }
4070
+
4071
+ GumboOutput* gumbo_parse_with_options(
4072
+ const GumboOptions* options, const char* buffer, size_t length) {
4073
+ GumboParser parser;
4074
+ parser._options = options;
4075
+ output_init(&parser);
4076
+ gumbo_tokenizer_state_init(&parser, buffer, length);
4077
+ parser_state_init(&parser);
4078
+
4079
+ if (options->fragment_context != GUMBO_TAG_LAST) {
4080
+ fragment_parser_init(
4081
+ &parser, options->fragment_context, options->fragment_namespace);
4082
+ }
4083
+
4084
+ GumboParserState* state = parser._parser_state;
4085
+ gumbo_debug("Parsing %.*s.\n", length, buffer);
4086
+
4087
+ // Sanity check so that infinite loops die with an assertion failure instead
4088
+ // of hanging the process before we ever get an error.
4089
+ int loop_count = 0;
4090
+
4091
+ GumboToken token;
4092
+ bool has_error = false;
4093
+
4094
+ do {
4095
+ if (state->_reprocess_current_token) {
4096
+ state->_reprocess_current_token = false;
4097
+ } else {
4098
+ GumboNode* current_node = get_current_node(&parser);
4099
+ gumbo_tokenizer_set_is_current_node_foreign(&parser,
4100
+ current_node &&
4101
+ current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
4102
+ has_error = !gumbo_lex(&parser, &token) || has_error;
4103
+ }
4104
+ const char* token_type = "text";
4105
+ switch (token.type) {
4106
+ case GUMBO_TOKEN_DOCTYPE:
4107
+ token_type = "doctype";
4108
+ break;
4109
+ case GUMBO_TOKEN_START_TAG:
4110
+ token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4111
+ break;
4112
+ case GUMBO_TOKEN_END_TAG:
4113
+ token_type = gumbo_normalized_tagname(token.v.end_tag);
4114
+ break;
4115
+ case GUMBO_TOKEN_COMMENT:
4116
+ token_type = "comment";
4117
+ break;
4118
+ default:
4119
+ break;
4120
+ }
4121
+ gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type,
4122
+ token.position.line, token.position.column, state->_insertion_mode);
4123
+
4124
+ state->_current_token = &token;
4125
+ state->_self_closing_flag_acknowledged =
4126
+ !(token.type == GUMBO_TOKEN_START_TAG &&
4127
+ token.v.start_tag.is_self_closing);
4128
+
4129
+ has_error = !handle_token(&parser, &token) || has_error;
4130
+
4131
+ // Check for memory leaks when ownership is transferred from start tag
4132
+ // tokens to nodes.
4133
+ assert(state->_reprocess_current_token ||
4134
+ token.type != GUMBO_TOKEN_START_TAG ||
4135
+ token.v.start_tag.attributes.data == NULL);
4136
+
4137
+ if (!state->_self_closing_flag_acknowledged) {
4138
+ GumboError* error = parser_add_parse_error(&parser, &token);
4139
+ if (error) {
4140
+ error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4141
+ }
4142
+ }
4143
+
4144
+ ++loop_count;
4145
+ assert(loop_count < 1000000000);
4146
+
4147
+ } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
4148
+ !(options->stop_on_first_error && has_error));
4149
+
4150
+ finish_parsing(&parser);
4151
+ // For API uniformity reasons, if the doctype still has nulls, convert them to
4152
+ // empty strings.
4153
+ GumboDocument* doc_type = &parser._output->document->v.document;
4154
+ if (doc_type->name == NULL) {
4155
+ doc_type->name = gumbo_copy_stringz(&parser, "");
4156
+ }
4157
+ if (doc_type->public_identifier == NULL) {
4158
+ doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
4159
+ }
4160
+ if (doc_type->system_identifier == NULL) {
4161
+ doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
4162
+ }
4163
+
4164
+ parser_state_destroy(&parser);
4165
+ gumbo_tokenizer_state_destroy(&parser);
4166
+ return parser._output;
4167
+ }
4168
+
4169
+ void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
4170
+ // Need a dummy GumboParser because the allocator comes along with the
4171
+ // options object.
4172
+ GumboParser parser;
4173
+ parser._options = options;
4174
+ destroy_node(&parser, node);
4175
+ }
4176
+
4177
+ void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
4178
+ // Need a dummy GumboParser because the allocator comes along with the
4179
+ // options object.
4180
+ GumboParser parser;
4181
+ parser._options = options;
4182
+ destroy_node(&parser, output->document);
4183
+ for (unsigned int i = 0; i < output->errors.length; ++i) {
4184
+ gumbo_error_destroy(&parser, output->errors.data[i]);
4185
+ }
4186
+ gumbo_vector_destroy(&parser, &output->errors);
4187
+ gumbo_parser_deallocate(&parser, output);
4188
+ }