nokogumbo 0.5 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
data/work/parser.c ADDED
@@ -0,0 +1,3893 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include <assert.h>
18
+ #include <ctype.h>
19
+ #include <stdarg.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "attribute.h"
25
+ #include "error.h"
26
+ #include "gumbo.h"
27
+ #include "insertion_mode.h"
28
+ #include "parser.h"
29
+ #include "tokenizer.h"
30
+ #include "tokenizer_states.h"
31
+ #include "utf8.h"
32
+ #include "util.h"
33
+ #include "vector.h"
34
+
35
+
36
// Evaluates its argument in a void context; going by the name, this is meant
// to silence unused-variable warnings.
#define AVOID_UNUSED_VARIABLE_WARNING(i) ((void) (i))

// Expands to a two-element brace initializer for a string literal: the literal
// itself followed by its length without the trailing NUL.
#define GUMBO_STRING(literal) { (literal), sizeof(literal) - 1 }

// Zero-length entry that closes a static list of string pieces.
#define TERMINATOR { "", 0 }
40
+
41
// Allocation callback that forwards to the C library's malloc().  The first
// argument exists only to fit the callback signature and is never read.
static void* malloc_wrapper(void* unused, size_t size) {
  void* block = malloc(size);
  return block;
}
44
+
45
// Deallocation callback that forwards to the C library's free().  The first
// argument exists only to fit the callback signature and is never read.
//
// Note: a void function must not `return` an expression (even a void one) in
// ISO C, so free() is called as a plain statement.
static void free_wrapper(void* unused, void* ptr) {
  free(ptr);
}
48
+
49
+ const GumboOptions kGumboDefaultOptions = {
50
+ &malloc_wrapper,
51
+ &free_wrapper,
52
+ NULL,
53
+ 8,
54
+ false,
55
+ -1,
56
+ };
57
+
58
+ static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
59
+ static const GumboStringPiece kPublicIdHtml4_0 = GUMBO_STRING(
60
+ "-//W3C//DTD HTML 4.0//EN");
61
+ static const GumboStringPiece kPublicIdHtml4_01 = GUMBO_STRING(
62
+ "-//W3C//DTD HTML 4.01//EN");
63
+ static const GumboStringPiece kPublicIdXhtml1_0 = GUMBO_STRING(
64
+ "-//W3C//DTD XHTML 1.0 Strict//EN");
65
+ static const GumboStringPiece kPublicIdXhtml1_1 = GUMBO_STRING(
66
+ "-//W3C//DTD XHTML 1.1//EN");
67
+ static const GumboStringPiece kSystemIdRecHtml4_0 = GUMBO_STRING(
68
+ "http://www.w3.org/TR/REC-html40/strict.dtd");
69
+ static const GumboStringPiece kSystemIdHtml4 = GUMBO_STRING(
70
+ "http://www.w3.org/TR/html4/strict.dtd");
71
+ static const GumboStringPiece kSystemIdXhtmlStrict1_1 = GUMBO_STRING(
72
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
73
+ static const GumboStringPiece kSystemIdXhtml1_1 = GUMBO_STRING(
74
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
75
+ static const GumboStringPiece kSystemIdLegacyCompat = GUMBO_STRING(
76
+ "about:legacy-compat");
77
+
78
+ // The doctype arrays have an explicit terminator because we want to pass them
79
+ // to a helper function, and passing them as a pointer discards sizeof
80
+ // information. The SVG arrays are used only by one-off functions, and so loops
81
+ // over them use sizeof directly instead of a terminator.
82
+
83
+ static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
84
+ GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
85
+ GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
86
+ GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
87
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
88
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
89
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
90
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
91
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
92
+ GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
93
+ GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
94
+ GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
95
+ GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
96
+ GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
97
+ GUMBO_STRING("-//IETF//DTD HTML 3//"),
98
+ GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
99
+ GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
100
+ GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
101
+ GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
102
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
103
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
104
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
105
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
106
+ GUMBO_STRING("-//IETF//DTD HTML Strict//"),
107
+ GUMBO_STRING("-//IETF//DTD HTML//"),
108
+ GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
109
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
110
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
111
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
112
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
113
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
114
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
115
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
116
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
117
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
118
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
119
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
120
+ GUMBO_STRING("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
121
+ "extensions to HTML 4.0//"),
122
+ GUMBO_STRING("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
123
+ "extensions to HTML 4.0//"),
124
+ GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
125
+ GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
126
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
127
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
128
+ GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
129
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
130
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
131
+ GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
132
+ GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
133
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
134
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
135
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
136
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
137
+ GUMBO_STRING("-//W3C//DTD W3 HTML//"),
138
+ GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
139
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
140
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"),
141
+ TERMINATOR
142
+ };
143
+
144
+ static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
145
+ GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
146
+ GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
147
+ GUMBO_STRING("HTML"),
148
+ TERMINATOR
149
+ };
150
+
151
+ static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
152
+ GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
153
+ TERMINATOR
154
+ };
155
+
156
+ static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
157
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
158
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
159
+ TERMINATOR
160
+ };
161
+
162
+ static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = {
163
+ GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
164
+ GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"),
165
+ TERMINATOR
166
+ };
167
+
168
// Namespace URIs, one per GumboNamespaceEnum value and in that enum's order.
// Any change to the enum must be mirrored here.
static const char* kLegalXmlns[] = {
  "http://www.w3.org/1999/xhtml",
  "http://www.w3.org/2000/svg",
  "http://www.w3.org/1998/Math/MathML"
};
174
+
175
+ typedef struct _ReplacementEntry {
176
+ const GumboStringPiece from;
177
+ const GumboStringPiece to;
178
+ } ReplacementEntry;
179
+
180
+ #define REPLACEMENT_ENTRY(from, to) \
181
+ { GUMBO_STRING(from), GUMBO_STRING(to) }
182
+
183
+ // Static data for SVG attribute replacements.
184
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
185
+ static const ReplacementEntry kSvgAttributeReplacements[] = {
186
+ REPLACEMENT_ENTRY("attributename", "attributeName"),
187
+ REPLACEMENT_ENTRY("attributetype", "attributeType"),
188
+ REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
189
+ REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
190
+ REPLACEMENT_ENTRY("calcmode", "calcMode"),
191
+ REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
192
+ REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
193
+ REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
194
+ REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
195
+ REPLACEMENT_ENTRY("edgemode", "edgeMode"),
196
+ REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
197
+ REPLACEMENT_ENTRY("filterres", "filterRes"),
198
+ REPLACEMENT_ENTRY("filterunits", "filterUnits"),
199
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
200
+ REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
201
+ REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
202
+ REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
203
+ REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
204
+ REPLACEMENT_ENTRY("keypoints", "keyPoints"),
205
+ REPLACEMENT_ENTRY("keysplines", "keySplines"),
206
+ REPLACEMENT_ENTRY("keytimes", "keyTimes"),
207
+ REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
208
+ REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
209
+ REPLACEMENT_ENTRY("markerheight", "markerHeight"),
210
+ REPLACEMENT_ENTRY("markerunits", "markerUnits"),
211
+ REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
212
+ REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
213
+ REPLACEMENT_ENTRY("maskunits", "maskUnits"),
214
+ REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
215
+ REPLACEMENT_ENTRY("pathlength", "pathLength"),
216
+ REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
217
+ REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
218
+ REPLACEMENT_ENTRY("patternunits", "patternUnits"),
219
+ REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
220
+ REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
221
+ REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
222
+ REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
223
+ REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
224
+ REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
225
+ REPLACEMENT_ENTRY("refx", "refX"),
226
+ REPLACEMENT_ENTRY("refy", "refY"),
227
+ REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
228
+ REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
229
+ REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
230
+ REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
231
+ REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
232
+ REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
233
+ REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
234
+ REPLACEMENT_ENTRY("startoffset", "startOffset"),
235
+ REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
236
+ REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
237
+ REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
238
+ REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
239
+ REPLACEMENT_ENTRY("tablevalues", "tableValues"),
240
+ REPLACEMENT_ENTRY("targetx", "targetX"),
241
+ REPLACEMENT_ENTRY("targety", "targetY"),
242
+ REPLACEMENT_ENTRY("textlength", "textLength"),
243
+ REPLACEMENT_ENTRY("viewbox", "viewBox"),
244
+ REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
245
+ REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
246
+ REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
247
+ REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
248
+ };
249
+
250
+ static const ReplacementEntry kSvgTagReplacements[] = {
251
+ REPLACEMENT_ENTRY("altglyph", "altGlyph"),
252
+ REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
253
+ REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
254
+ REPLACEMENT_ENTRY("animatecolor", "animateColor"),
255
+ REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
256
+ REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
257
+ REPLACEMENT_ENTRY("clippath", "clipPath"),
258
+ REPLACEMENT_ENTRY("feblend", "feBlend"),
259
+ REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
260
+ REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
261
+ REPLACEMENT_ENTRY("fecomposite", "feComposite"),
262
+ REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
263
+ REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
264
+ REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
265
+ REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
266
+ REPLACEMENT_ENTRY("feflood", "feFlood"),
267
+ REPLACEMENT_ENTRY("fefunca", "feFuncA"),
268
+ REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
269
+ REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
270
+ REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
271
+ REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
272
+ REPLACEMENT_ENTRY("feimage", "feImage"),
273
+ REPLACEMENT_ENTRY("femerge", "feMerge"),
274
+ REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
275
+ REPLACEMENT_ENTRY("femorphology", "feMorphology"),
276
+ REPLACEMENT_ENTRY("feoffset", "feOffset"),
277
+ REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
278
+ REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
279
+ REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
280
+ REPLACEMENT_ENTRY("fetile", "feTile"),
281
+ REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
282
+ REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
283
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
284
+ REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
285
+ REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
286
+ REPLACEMENT_ENTRY("textpath", "textPath"),
287
+ };
288
+
289
+ typedef struct _NamespacedAttributeReplacement {
290
+ const char* from;
291
+ const char* local_name;
292
+ const GumboAttributeNamespaceEnum attr_namespace;
293
+ } NamespacedAttributeReplacement;
294
+
295
+ static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
296
+ { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
297
+ { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
298
+ { "xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK },
299
+ { "xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK },
300
+ { "xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK },
301
+ { "xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK },
302
+ { "xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK },
303
+ { "xml:base", "base", GUMBO_ATTR_NAMESPACE_XML },
304
+ { "xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML },
305
+ { "xml:space", "space", GUMBO_ATTR_NAMESPACE_XML },
306
+ { "xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS },
307
+ { "xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS },
308
+ };
309
+
310
+ // The "scope marker" for the list of active formatting elements. We use a
311
+ // pointer to this as a generic marker element, since the particular element
312
+ // scope doesn't matter.
313
+ static const GumboNode kActiveFormattingScopeMarker;
314
+
315
+ // The tag_is and tag_in function use true & false to denote start & end tags,
316
+ // but for readability, we define constants for them here.
317
+ static const bool kStartTag = true;
318
+ static const bool kEndTag = false;
319
+
320
+ // Because GumboStringPieces are immutable, we can't insert a character directly
321
+ // into a text node. Instead, we accumulate all pending characters here and
322
+ // flush them out to a text node whenever a new element is inserted.
323
+ //
324
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
325
+ typedef struct _TextNodeBufferState {
326
+ // The accumulated text to be inserted into the current text node.
327
+ GumboStringBuffer _buffer;
328
+
329
+ // A pointer to the original text represented by this text node. Note that
330
+ // because of foster parenting and other strange DOM manipulations, this may
331
+ // include other non-text HTML tags in it; it is defined as the span of
332
+ // original text from the first character in this text node to the last
333
+ // character in this text node.
334
+ const char* _start_original_text;
335
+
336
+ // The source position of the start of this text node.
337
+ GumboSourcePosition _start_position;
338
+
339
+ // The type of node that will be inserted (TEXT or WHITESPACE).
340
+ GumboNodeType _type;
341
+ } TextNodeBufferState;
342
+
343
+ typedef struct _GumboParserState {
344
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
345
+ GumboInsertionMode _insertion_mode;
346
+
347
+ // Used for run_generic_parsing_algorithm, which needs to switch back to the
348
+ // original insertion mode at its conclusion.
349
+ GumboInsertionMode _original_insertion_mode;
350
+
351
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
352
+ GumboVector /*GumboNode*/ _open_elements;
353
+
354
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
355
+ GumboVector /*GumboNode*/ _active_formatting_elements;
356
+
357
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
358
+ GumboNode* _head_element;
359
+ GumboNode* _form_element;
360
+
361
+ // The flag for when the spec says "Reprocess the current token in..."
362
+ bool _reprocess_current_token;
363
+
364
+ // The flag for "acknowledge the token's self-closing flag".
365
+ bool _self_closing_flag_acknowledged;
366
+
367
+ // The "frameset-ok" flag from the spec.
368
+ bool _frameset_ok;
369
+
370
+ // The flag for "If the next token is a LINE FEED, ignore that token...".
371
+ bool _ignore_next_linefeed;
372
+
373
+ // The flag for "whenever a node would be inserted into the current node, it
374
+ // must instead be foster parented". This is used for misnested table
375
+ // content, which needs to be handled according to "in body" rules yet foster
376
+ // parented outside of the table.
377
+ // It would perhaps be more explicit to have this as a parameter to
378
+ // handle_in_body and insert_element, but given how special-purpose this is
379
+ // and the number of call-sites that would need to take the extra parameter,
380
+ // it's easier just to have a state flag.
381
+ bool _foster_parent_insertions;
382
+
383
+ // The accumulated text node buffer state.
384
+ TextNodeBufferState _text_node;
385
+
386
+ // The current token.
387
+ GumboToken* _current_token;
388
+
389
+ // The way that the spec is written, the </body> and </html> tags are *always*
390
+ // implicit, because encountering one of those tokens merely switches the
391
+ // insertion mode out of "in body". So we have individual state flags for
392
+ // those end tags that are then inspected by pop_current_node when the <body>
393
+ // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
394
+ // flag appropriately.
395
+ bool _closed_body_tag;
396
+ bool _closed_html_tag;
397
+ } GumboParserState;
398
+
399
+ static bool token_has_attribute(const GumboToken* token, const char* name) {
400
+ assert(token->type == GUMBO_TOKEN_START_TAG);
401
+ return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
402
+ }
403
+
404
+ // Checks if the value of the specified attribute is a case-insensitive match
405
+ // for the specified string.
406
+ static bool attribute_matches(
407
+ const GumboVector* attributes, const char* name, const char* value) {
408
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
409
+ return attr ? strcasecmp(value, attr->value) == 0 : false;
410
+ }
411
+
412
+ // Checks if the value of the specified attribute is a case-sensitive match
413
+ // for the specified string.
414
+ static bool attribute_matches_case_sensitive(
415
+ const GumboVector* attributes, const char* name, const char* value) {
416
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
417
+ return attr ? strcmp(value, attr->value) == 0 : false;
418
+ }
419
+
420
+ // Checks if the specified attribute vectors are identical.
421
+ static bool all_attributes_match(
422
+ const GumboVector* attr1, const GumboVector* attr2) {
423
+ int num_unmatched_attr2_elements = attr2->length;
424
+ for (int i = 0; i < attr1->length; ++i) {
425
+ const GumboAttribute* attr = attr1->data[i];
426
+ if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
427
+ --num_unmatched_attr2_elements;
428
+ } else {
429
+ return false;
430
+ }
431
+ }
432
+ return num_unmatched_attr2_elements == 0;
433
+ }
434
+
435
+ static void set_frameset_not_ok(GumboParser* parser) {
436
+ gumbo_debug("Setting frameset_ok to false.\n");
437
+ parser->_parser_state->_frameset_ok = false;
438
+ }
439
+
440
+ static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
441
+ GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
442
+ node->parent = NULL;
443
+ node->index_within_parent = -1;
444
+ node->type = type;
445
+ node->parse_flags = GUMBO_INSERTION_NORMAL;
446
+ return node;
447
+ }
448
+
449
+ static GumboNode* new_document_node(GumboParser* parser) {
450
+ GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
451
+ document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
452
+ gumbo_vector_init(
453
+ parser, 1, &document_node->v.document.children);
454
+
455
+ // Must be initialized explicitly, as there's no guarantee that we'll see a
456
+ // doc type token.
457
+ GumboDocument* document = &document_node->v.document;
458
+ document->has_doctype = false;
459
+ document->name = NULL;
460
+ document->public_identifier = NULL;
461
+ document->system_identifier = NULL;
462
+ return document_node;
463
+ }
464
+
465
+ static void output_init(GumboParser* parser) {
466
+ GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
467
+ output->root = NULL;
468
+ output->document = new_document_node(parser);
469
+ parser->_output = output;
470
+ gumbo_init_errors(parser);
471
+ }
472
+
473
+ static void parser_state_init(GumboParser* parser) {
474
+ GumboParserState* parser_state =
475
+ gumbo_parser_allocate(parser, sizeof(GumboParserState));
476
+ parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
477
+ parser_state->_reprocess_current_token = false;
478
+ parser_state->_frameset_ok = true;
479
+ parser_state->_ignore_next_linefeed = false;
480
+ parser_state->_foster_parent_insertions = false;
481
+ parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
482
+ gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
483
+ gumbo_vector_init(parser, 10, &parser_state->_open_elements);
484
+ gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
485
+ parser_state->_head_element = NULL;
486
+ parser_state->_form_element = NULL;
487
+ parser_state->_current_token = NULL;
488
+ parser_state->_closed_body_tag = false;
489
+ parser_state->_closed_html_tag = false;
490
+ parser->_parser_state = parser_state;
491
+ }
492
+
493
+ static void parser_state_destroy(GumboParser* parser) {
494
+ GumboParserState* state = parser->_parser_state;
495
+ gumbo_vector_destroy(parser, &state->_active_formatting_elements);
496
+ gumbo_vector_destroy(parser, &state->_open_elements);
497
+ gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
498
+ gumbo_parser_deallocate(parser, state);
499
+ }
500
+
501
+ static GumboNode* get_document_node(GumboParser* parser) {
502
+ return parser->_output->document;
503
+ }
504
+
505
+ // Returns the node at the bottom of the stack of open elements, or NULL if no
506
+ // elements have been added yet.
507
+ static GumboNode* get_current_node(GumboParser* parser) {
508
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
509
+ if (open_elements->length == 0) {
510
+ assert(!parser->_output->root);
511
+ return NULL;
512
+ }
513
+ assert(open_elements->length > 0);
514
+ assert(open_elements->data != NULL);
515
+ return open_elements->data[open_elements->length - 1];
516
+ }
517
+
518
+ // Returns true if the given needle is in the given array of literal
519
+ // GumboStringPieces. If exact_match is true, this requires that they match
520
+ // exactly; otherwise, this performs a prefix match to check if any of the
521
+ // elements in haystack start with needle. This always performs a
522
+ // case-insensitive match.
523
+ static bool is_in_static_list(
524
+ const char* needle, const GumboStringPiece* haystack, bool exact_match) {
525
+ for (int i = 0; haystack[i].length > 0; ++i) {
526
+ if ((exact_match && !strcmp(needle, haystack[i].data)) ||
527
+ (!exact_match && !strcasecmp(needle, haystack[i].data))) {
528
+ return true;
529
+ }
530
+ }
531
+ return false;
532
+ }
533
+
534
+ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
535
+ parser->_parser_state->_insertion_mode = mode;
536
+ }
537
+
538
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
539
+ // This is a helper function that returns the appropriate insertion mode instead
540
+ // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
541
+ // indicate that there is no appropriate insertion mode, and the loop should
542
+ // continue.
543
+ static GumboInsertionMode get_appropriate_insertion_mode(
544
+ const GumboNode* node, bool is_last) {
545
+ assert(node->type == GUMBO_NODE_ELEMENT);
546
+ switch (node->v.element.tag) {
547
+ case GUMBO_TAG_SELECT:
548
+ return GUMBO_INSERTION_MODE_IN_SELECT;
549
+ case GUMBO_TAG_TD:
550
+ case GUMBO_TAG_TH:
551
+ return is_last ?
552
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL;
553
+ case GUMBO_TAG_TR:
554
+ return GUMBO_INSERTION_MODE_IN_ROW;
555
+ case GUMBO_TAG_TBODY:
556
+ case GUMBO_TAG_THEAD:
557
+ case GUMBO_TAG_TFOOT:
558
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
559
+ case GUMBO_TAG_CAPTION:
560
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
561
+ case GUMBO_TAG_COLGROUP:
562
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
563
+ case GUMBO_TAG_TABLE:
564
+ return GUMBO_INSERTION_MODE_IN_TABLE;
565
+ case GUMBO_TAG_HEAD:
566
+ case GUMBO_TAG_BODY:
567
+ return GUMBO_INSERTION_MODE_IN_BODY;
568
+ case GUMBO_TAG_FRAMESET:
569
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
570
+ case GUMBO_TAG_HTML:
571
+ return GUMBO_INSERTION_MODE_BEFORE_HEAD;
572
+ default:
573
+ return is_last ?
574
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
575
+ }
576
+ }
577
+
578
+ // This performs the actual "reset the insertion mode" loop.
579
+ static void reset_insertion_mode_appropriately(GumboParser* parser) {
580
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
581
+ for (int i = open_elements->length - 1; i >= 0; --i) {
582
+ GumboInsertionMode mode =
583
+ get_appropriate_insertion_mode(open_elements->data[i], i == 0);
584
+ if (mode != GUMBO_INSERTION_MODE_INITIAL) {
585
+ set_insertion_mode(parser, mode);
586
+ return;
587
+ }
588
+ }
589
+ // Should never get here, because is_last will be set on the last iteration
590
+ // and will force GUMBO_INSERTION_MODE_IN_BODY.
591
+ assert(0);
592
+ }
593
+
594
+ static GumboError* add_parse_error(GumboParser* parser, const GumboToken* token) {
595
+ gumbo_debug("Adding parse error.\n");
596
+ GumboError* error = gumbo_add_error(parser);
597
+ if (!error) {
598
+ return NULL;
599
+ }
600
+ error->type = GUMBO_ERR_PARSER;
601
+ error->position = token->position;
602
+ error->original_text = token->original_text.data;
603
+ GumboParserError* extra_data = &error->v.parser;
604
+ extra_data->input_type = token->type;
605
+ extra_data->input_tag = GUMBO_TAG_UNKNOWN;
606
+ if (token->type == GUMBO_TOKEN_START_TAG) {
607
+ extra_data->input_tag = token->v.start_tag.tag;
608
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
609
+ extra_data->input_tag = token->v.end_tag;
610
+ }
611
+ GumboParserState* state = parser->_parser_state;
612
+ extra_data->parser_state = state->_insertion_mode;
613
+ gumbo_vector_init(parser, state->_open_elements.length,
614
+ &extra_data->tag_stack);
615
+ for (int i = 0; i < state->_open_elements.length; ++i) {
616
+ const GumboNode* node = state->_open_elements.data[i];
617
+ assert(node->type == GUMBO_NODE_ELEMENT);
618
+ gumbo_vector_add(parser, (void*) node->v.element.tag,
619
+ &extra_data->tag_stack);
620
+ }
621
+ return error;
622
+ }
623
+
624
+ // Returns true if the specified token is either a start or end tag (specified
625
+ // by is_start) with one of the tag types in the varargs list. Terminate the
626
+ // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
627
+ // the spec references tags that are not in the spec.
628
+ // TODO(jdtang): A lot of the tag lists for this function are repeated in many
629
+ // places in the code. This is how it's written in the spec (and it's done this
630
+ // way so it's easy to verify the code against the spec), but it may be worth
631
+ // coming up with a notion of a "tag set" that includes a list of tags, and
632
+ // using that in many places. It'd probably also help performance, but I want
633
+ // to profile before optimizing.
634
+ static bool tag_in(const GumboToken* token, bool is_start, ...) {
635
+ GumboTag token_tag;
636
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
637
+ token_tag = token->v.start_tag.tag;
638
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
639
+ token_tag = token->v.end_tag;
640
+ } else {
641
+ return false;
642
+ }
643
+
644
+ va_list tags;
645
+ va_start(tags, is_start);
646
+ bool result = false;
647
+ for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
648
+ tag = va_arg(tags, GumboTag)) {
649
+ if (tag == token_tag) {
650
+ result = true;
651
+ break;
652
+ }
653
+ }
654
+ va_end(tags);
655
+ return result;
656
+ }
657
+
658
+ // Like tag_in, but for the single-tag case.
659
+ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
660
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
661
+ return token->v.start_tag.tag == tag;
662
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
663
+ return token->v.end_tag == tag;
664
+ } else {
665
+ return false;
666
+ }
667
+ }
668
+
669
+ // Like tag_in, but checks for the tag of a node, rather than a token.
670
+ static bool node_tag_in(const GumboNode* node, ...) {
671
+ assert(node != NULL);
672
+ if (node->type != GUMBO_NODE_ELEMENT) {
673
+ return false;
674
+ }
675
+ GumboTag node_tag = node->v.element.tag;
676
+
677
+ va_list tags;
678
+ va_start(tags, node);
679
+ bool result = false;
680
+ for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
681
+ tag = va_arg(tags, GumboTag)) {
682
+ assert(tag <= GUMBO_TAG_LAST);
683
+ if (tag == node_tag) {
684
+ result = true;
685
+ break;
686
+ }
687
+ }
688
+ va_end(tags);
689
+ return result;
690
+ }
691
+
692
+ // Like node_tag_in, but for the single-tag case.
693
+ static bool node_tag_is(const GumboNode* node, GumboTag tag) {
694
+ return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
695
+ }
696
+
697
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
698
+ static bool is_mathml_integration_point(const GumboNode* node) {
699
+ return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
700
+ GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) &&
701
+ node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
702
+ }
703
+
704
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
705
+ static bool is_html_integration_point(const GumboNode* node) {
706
+ return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
707
+ GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
708
+ node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
709
+ (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
710
+ attribute_matches(&node->v.element.attributes,
711
+ "encoding", "text/html") ||
712
+ attribute_matches(&node->v.element.attributes,
713
+ "encoding", "application/xhtml+xml")));
714
+ }
715
+
716
+ // Appends a node to the end of its parent, setting the "parent" and
717
+ // "index_within_parent" fields appropriately.
718
+ static void append_node(
719
+ GumboParser* parser, GumboNode* parent, GumboNode* node) {
720
+ assert(node->parent == NULL);
721
+ assert(node->index_within_parent = -1);
722
+ GumboVector* children;
723
+ if (parent->type == GUMBO_NODE_ELEMENT) {
724
+ children = &parent->v.element.children;
725
+ } else {
726
+ assert(parent->type == GUMBO_NODE_DOCUMENT);
727
+ children = &parent->v.document.children;
728
+ }
729
+ node->parent = parent;
730
+ node->index_within_parent = children->length;
731
+ gumbo_vector_add(parser, (void*) node, children);
732
+ assert(node->index_within_parent < children->length);
733
+ }
734
+
735
+ // Inserts a node at the specified index within its parent, updating the
736
+ // "parent" and "index_within_parent" fields of it and all its siblings.
737
+ static void insert_node(
738
+ GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
739
+ assert(node->parent == NULL);
740
+ assert(node->index_within_parent = -1);
741
+ assert(parent->type == GUMBO_NODE_ELEMENT);
742
+ GumboVector* children = &parent->v.element.children;
743
+ assert(index >= 0);
744
+ assert(index < children->length);
745
+ node->parent = parent;
746
+ node->index_within_parent = index;
747
+ gumbo_vector_insert_at(parser, (void*) node, index, children);
748
+ assert(node->index_within_parent < children->length);
749
+ for (int i = index + 1; i < children->length; ++i) {
750
+ GumboNode* sibling = children->data[i];
751
+ sibling->index_within_parent = i;
752
+ assert(sibling->index_within_parent < children->length);
753
+ }
754
+ }
755
+
756
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
757
+ static void foster_parent_element(GumboParser* parser, GumboNode* node) {
758
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
759
+ assert(open_elements->length > 2);
760
+
761
+ node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
762
+ GumboNode* foster_parent_element = open_elements->data[0];
763
+ assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
764
+ assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
765
+ for (int i = open_elements->length - 1; i > 1; --i) {
766
+ GumboNode* table_element = open_elements->data[i];
767
+ if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
768
+ foster_parent_element = table_element->parent;
769
+ if (!foster_parent_element ||
770
+ foster_parent_element->type != GUMBO_NODE_ELEMENT) {
771
+ // Table has no parent; spec says it's possible if a script manipulated
772
+ // the DOM, although I don't think we have to worry about this case.
773
+ gumbo_debug("Table has no parent.\n");
774
+ foster_parent_element = open_elements->data[i - 1];
775
+ break;
776
+ }
777
+ assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
778
+ gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
779
+ table_element, i, gumbo_normalized_tagname(
780
+ foster_parent_element->v.element.tag),
781
+ table_element->index_within_parent);
782
+ assert(foster_parent_element->v.element.children.data[
783
+ table_element->index_within_parent] == table_element);
784
+ insert_node(parser, foster_parent_element,
785
+ table_element->index_within_parent, node);
786
+ return;
787
+ }
788
+ }
789
+ if (node->type == GUMBO_NODE_ELEMENT) {
790
+ gumbo_vector_add(parser, (void*) node, open_elements);
791
+ }
792
+ append_node(parser, foster_parent_element, node);
793
+ }
794
+
795
+ static void maybe_flush_text_node_buffer(GumboParser* parser) {
796
+ GumboParserState* state = parser->_parser_state;
797
+ TextNodeBufferState* buffer_state = &state->_text_node;
798
+ if (buffer_state->_buffer.length == 0) {
799
+ return;
800
+ }
801
+
802
+ assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
803
+ buffer_state->_type == GUMBO_NODE_TEXT);
804
+ GumboNode* text_node = create_node(parser, buffer_state->_type);
805
+ GumboText* text_node_data = &text_node->v.text;
806
+ text_node_data->text = gumbo_string_buffer_to_string(
807
+ parser, &buffer_state->_buffer);
808
+ text_node_data->original_text.data = buffer_state->_start_original_text;
809
+ text_node_data->original_text.length =
810
+ state->_current_token->original_text.data -
811
+ buffer_state->_start_original_text;
812
+ text_node_data->start_pos = buffer_state->_start_position;
813
+ if (state->_foster_parent_insertions && node_tag_in(
814
+ get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
815
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
816
+ foster_parent_element(parser, text_node);
817
+ } else {
818
+ append_node(
819
+ parser, parser->_output->root ?
820
+ get_current_node(parser) : parser->_output->document, text_node);
821
+ }
822
+ gumbo_debug("Flushing text node buffer of %.*s.\n",
823
+ (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
824
+
825
+ gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
826
+ gumbo_string_buffer_init(parser, &buffer_state->_buffer);
827
+ buffer_state->_type = GUMBO_NODE_WHITESPACE;
828
+ assert(buffer_state->_buffer.length == 0);
829
+ }
830
+
831
+ static void record_end_of_element(
832
+ GumboToken* current_token, GumboElement* element) {
833
+ element->end_pos = current_token->position;
834
+ element->original_end_tag =
835
+ current_token->type == GUMBO_TOKEN_END_TAG ?
836
+ current_token->original_text : kGumboEmptyString;
837
+ }
838
+
839
+ static GumboNode* pop_current_node(GumboParser* parser) {
840
+ GumboParserState* state = parser->_parser_state;
841
+ maybe_flush_text_node_buffer(parser);
842
+ if (state->_open_elements.length > 0) {
843
+ assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
844
+ gumbo_debug(
845
+ "Popping %s node.\n",
846
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
847
+ }
848
+ GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
849
+ if (!current_node) {
850
+ assert(state->_open_elements.length == 0);
851
+ return NULL;
852
+ }
853
+ assert(current_node->type == GUMBO_NODE_ELEMENT);
854
+ bool is_closed_body_or_html_tag =
855
+ (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
856
+ (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
857
+ if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
858
+ !node_tag_is(current_node, state->_current_token->v.end_tag)) &&
859
+ !is_closed_body_or_html_tag) {
860
+ current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
861
+ }
862
+ if (!is_closed_body_or_html_tag) {
863
+ record_end_of_element(state->_current_token, &current_node->v.element);
864
+ }
865
+ return current_node;
866
+ }
867
+
868
+ static void append_comment_node(
869
+ GumboParser* parser, GumboNode* node, const GumboToken* token) {
870
+ maybe_flush_text_node_buffer(parser);
871
+ GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
872
+ comment->type = GUMBO_NODE_COMMENT;
873
+ comment->parse_flags = GUMBO_INSERTION_NORMAL;
874
+ comment->v.text.text = token->v.text;
875
+ comment->v.text.original_text = token->original_text;
876
+ comment->v.text.start_pos = token->position;
877
+ append_node(parser, node, comment);
878
+ }
879
+
880
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
881
+ static void clear_stack_to_table_row_context(GumboParser* parser) {
882
+ while (!node_tag_in(get_current_node(parser),
883
+ GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
884
+ pop_current_node(parser);
885
+ }
886
+ }
887
+
888
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
889
+ static void clear_stack_to_table_context(GumboParser* parser) {
890
+ while (!node_tag_in(get_current_node(parser),
891
+ GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
892
+ pop_current_node(parser);
893
+ }
894
+ }
895
+
896
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
897
+ void clear_stack_to_table_body_context(GumboParser* parser) {
898
+ while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
899
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
900
+ GUMBO_TAG_LAST)) {
901
+ pop_current_node(parser);
902
+ }
903
+ }
904
+
905
+ // Creates a parser-inserted element in the HTML namespace and returns it.
906
+ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
907
+ GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
908
+ GumboElement* element = &node->v.element;
909
+ gumbo_vector_init(parser, 1, &element->children);
910
+ gumbo_vector_init(parser, 0, &element->attributes);
911
+ element->tag = tag;
912
+ element->tag_namespace = GUMBO_NAMESPACE_HTML;
913
+ element->original_tag = kGumboEmptyString;
914
+ element->original_end_tag = kGumboEmptyString;
915
+ element->start_pos = parser->_parser_state->_current_token->position;
916
+ element->end_pos = kGumboEmptySourcePosition;
917
+ return node;
918
+ }
919
+
920
+ // Constructs an element from the given start tag token.
921
+ static GumboNode* create_element_from_token(
922
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
923
+ assert(token->type == GUMBO_TOKEN_START_TAG);
924
+ GumboTokenStartTag* start_tag = &token->v.start_tag;
925
+
926
+ GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
927
+ GumboElement* element = &node->v.element;
928
+ gumbo_vector_init(parser, 1, &element->children);
929
+ element->attributes = start_tag->attributes;
930
+ element->tag = start_tag->tag;
931
+ element->tag_namespace = tag_namespace;
932
+
933
+ assert(token->original_text.length >= 2);
934
+ assert(token->original_text.data[0] == '<');
935
+ assert(token->original_text.data[token->original_text.length - 1] == '>');
936
+ element->original_tag = token->original_text;
937
+ element->start_pos = token->position;
938
+ element->original_end_tag = kGumboEmptyString;
939
+ element->end_pos = kGumboEmptySourcePosition;
940
+
941
+ // The element takes ownership of the attributes from the token, so any
942
+ // allocated-memory fields should be nulled out.
943
+ start_tag->attributes = kGumboEmptyVector;
944
+ return node;
945
+ }
946
+
947
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
948
+ static void insert_element(GumboParser* parser, GumboNode* node,
949
+ bool is_reconstructing_formatting_elements) {
950
+ GumboParserState* state = parser->_parser_state;
951
+ // NOTE(jdtang): The text node buffer must always be flushed before inserting
952
+ // a node, otherwise we're handling nodes in a different order than the spec
953
+ // mandated. However, one clause of the spec (character tokens in the body)
954
+ // requires that we reconstruct the active formatting elements *before* adding
955
+ // the character, and reconstructing the active formatting elements may itself
956
+ // result in the insertion of new elements (which should be pushed onto the
957
+ // stack of open elements before the buffer is flushed). We solve this (for
958
+ // the time being, the spec has been rewritten for <template> and the new
959
+ // version may be simpler here) with a boolean flag to this method.
960
+ if (!is_reconstructing_formatting_elements) {
961
+ maybe_flush_text_node_buffer(parser);
962
+ }
963
+ if (state->_foster_parent_insertions && node_tag_in(
964
+ get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
965
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
966
+ foster_parent_element(parser, node);
967
+ gumbo_vector_add(parser, (void*) node, &state->_open_elements);
968
+ return;
969
+ }
970
+
971
+ // This is called to insert the root HTML element, but get_current_node
972
+ // assumes the stack of open elements is non-empty, so we need special
973
+ // handling for this case.
974
+ append_node(
975
+ parser, parser->_output->root ?
976
+ get_current_node(parser) : parser->_output->document, node);
977
+ gumbo_vector_add(parser, (void*) node, &state->_open_elements);
978
+ }
979
+
980
+ // Convenience method that combines create_element_from_token and
981
+ // insert_element, inserting the generated element directly into the current
982
+ // node. Returns the node inserted.
983
+ static GumboNode* insert_element_from_token(
984
+ GumboParser* parser, GumboToken* token) {
985
+ GumboNode* element =
986
+ create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
987
+ insert_element(parser, element, false);
988
+ gumbo_debug("Inserting <%s> element (@%x) from token.\n",
989
+ gumbo_normalized_tagname(element->v.element.tag), element);
990
+ return element;
991
+ }
992
+
993
+ // Convenience method that combines create_element and insert_element, inserting
994
+ // a parser-generated element of a specific tag type. Returns the node
995
+ // inserted.
996
+ static GumboNode* insert_element_of_tag_type(
997
+ GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
998
+ GumboNode* element = create_element(parser, tag);
999
+ element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1000
+ insert_element(parser, element, false);
1001
+ gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1002
+ gumbo_normalized_tagname(tag), element);
1003
+ return element;
1004
+ }
1005
+
1006
+ // Convenience method for creating foreign namespaced element. Returns the node
1007
+ // inserted.
1008
+ static GumboNode* insert_foreign_element(
1009
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1010
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1011
+ GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1012
+ insert_element(parser, element, false);
1013
+ if (token_has_attribute(token, "xmlns") &&
1014
+ !attribute_matches_case_sensitive(
1015
+ &token->v.start_tag.attributes, "xmlns",
1016
+ kLegalXmlns[tag_namespace])) {
1017
+ // TODO(jdtang): Since there're multiple possible error codes here, we
1018
+ // eventually need reason codes to differentiate them.
1019
+ add_parse_error(parser, token);
1020
+ }
1021
+ if (token_has_attribute(token, "xmlns:xlink") &&
1022
+ !attribute_matches_case_sensitive(
1023
+ &token->v.start_tag.attributes,
1024
+ "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1025
+ add_parse_error(parser, token);
1026
+ }
1027
+ return element;
1028
+ }
1029
+
1030
+ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1031
+ assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1032
+ token->type == GUMBO_TOKEN_CHARACTER);
1033
+ TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1034
+ if (buffer_state->_buffer.length == 0) {
1035
+ // Initialize position fields.
1036
+ buffer_state->_start_original_text = token->original_text.data;
1037
+ buffer_state->_start_position = token->position;
1038
+ }
1039
+ gumbo_string_buffer_append_codepoint(
1040
+ parser, token->v.character, &buffer_state->_buffer);
1041
+ if (token->type == GUMBO_TOKEN_CHARACTER) {
1042
+ buffer_state->_type = GUMBO_NODE_TEXT;
1043
+ }
1044
+ gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1045
+ }
1046
+
1047
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
1048
+ static void run_generic_parsing_algorithm(
1049
+ GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1050
+ insert_element_from_token(parser, token);
1051
+ gumbo_tokenizer_set_state(parser, lexer_state);
1052
+ parser->_parser_state->_original_insertion_mode =
1053
+ parser->_parser_state->_insertion_mode;
1054
+ parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1055
+ }
1056
+
1057
+ static void acknowledge_self_closing_tag(GumboParser* parser) {
1058
+ parser->_parser_state->_self_closing_flag_acknowledged = true;
1059
+ }
1060
+
1061
+ // Returns true if there's an anchor tag in the list of active formatting
1062
+ // elements, and fills in its index if so.
1063
+ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1064
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1065
+ for (int i = elements->length - 1; i >= 0; --i) {
1066
+ GumboNode* node = elements->data[i];
1067
+ if (node == &kActiveFormattingScopeMarker) {
1068
+ return false;
1069
+ }
1070
+ if (node_tag_is(node, GUMBO_TAG_A)) {
1071
+ *anchor_index = i;
1072
+ return true;
1073
+ }
1074
+ }
1075
+ return false;
1076
+ }
1077
+
1078
+ // Counts the number of open formatting elements in the list of active
1079
+ // formatting elements (after the last active scope marker) that have a specific
1080
+ // tag. If this is > 0, then earliest_matching_index will be filled in with the
1081
+ // index of the first such element.
1082
+ static int count_formatting_elements_of_tag(
1083
+ GumboParser* parser, const GumboNode* desired_node,
1084
+ int* earliest_matching_index) {
1085
+ const GumboElement* desired_element = &desired_node->v.element;
1086
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1087
+ int num_identical_elements = 0;
1088
+ for (int i = elements->length - 1; i >= 0; --i) {
1089
+ GumboNode* node = elements->data[i];
1090
+ if (node == &kActiveFormattingScopeMarker) {
1091
+ break;
1092
+ }
1093
+ assert(node->type == GUMBO_NODE_ELEMENT);
1094
+ GumboElement* element = &node->v.element;
1095
+ if (node_tag_is(node, desired_element->tag) &&
1096
+ element->tag_namespace == desired_element->tag_namespace &&
1097
+ all_attributes_match(&element->attributes,
1098
+ &desired_element->attributes)) {
1099
+ num_identical_elements++;
1100
+ *earliest_matching_index = i;
1101
+ }
1102
+ }
1103
+ return num_identical_elements;
1104
+ }
1105
+
1106
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
1107
+ static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1108
+ assert(node == &kActiveFormattingScopeMarker ||
1109
+ node->type == GUMBO_NODE_ELEMENT);
1110
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1111
+ if (node == &kActiveFormattingScopeMarker) {
1112
+ gumbo_debug("Adding a scope marker.\n");
1113
+ } else {
1114
+ gumbo_debug("Adding a formatting element.\n");
1115
+ }
1116
+
1117
+ // Hunt for identical elements.
1118
+ int earliest_identical_element = elements->length;
1119
+ int num_identical_elements = count_formatting_elements_of_tag(
1120
+ parser, node, &earliest_identical_element);
1121
+
1122
+ // Noah's Ark clause: if there're at least 3, remove the earliest.
1123
+ if (num_identical_elements >= 3) {
1124
+ gumbo_debug("Noah's ark clause: removing element at %d.\n",
1125
+ earliest_identical_element);
1126
+ gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1127
+ }
1128
+
1129
+ gumbo_vector_add(parser, (void*) node, elements);
1130
+ }
1131
+
1132
+ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1133
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1134
+ for (int i = 0; i < open_elements->length; ++i) {
1135
+ if (open_elements->data[i] == node) {
1136
+ return true;
1137
+ }
1138
+ }
1139
+ return false;
1140
+ }
1141
+
1142
+ // Clones attributes, tags, etc. of a node, but does not copy the content. The
1143
+ // clone shares no structure with the original node: all owned strings and
1144
+ // values are fresh copies.
1145
+ GumboNode* clone_node(
1146
+ GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
1147
+ assert(node->type == GUMBO_NODE_ELEMENT);
1148
+ GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1149
+ *new_node = *node;
1150
+ new_node->parent = NULL;
1151
+ new_node->index_within_parent = -1;
1152
+ // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1153
+ // have a separate end tag.
1154
+ new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1155
+ new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1156
+ GumboElement* element = &new_node->v.element;
1157
+ gumbo_vector_init(parser, 1, &element->children);
1158
+
1159
+ const GumboVector* old_attributes = &node->v.element.attributes;
1160
+ gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1161
+ for (int i = 0; i < old_attributes->length; ++i) {
1162
+ const GumboAttribute* old_attr = old_attributes->data[i];
1163
+ GumboAttribute* attr =
1164
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1165
+ *attr = *old_attr;
1166
+ attr->name = gumbo_copy_stringz(parser, old_attr->name);
1167
+ attr->value = gumbo_copy_stringz(parser, old_attr->value);
1168
+ gumbo_vector_add(parser, attr, &element->attributes);
1169
+ }
1170
+ return new_node;
1171
+ }
1172
+
1173
+ // "Reconstruct active formatting elements" part of the spec.
1174
+ // This implementation is based on the html5lib translation from the mess of
1175
+ // GOTOs in the spec to reasonably structured programming.
1176
+ // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
1177
+ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1178
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1179
+ // Step 1
1180
+ if (elements->length == 0) {
1181
+ return;
1182
+ }
1183
+
1184
+ // Step 2 & 3
1185
+ int i = elements->length - 1;
1186
+ const GumboNode* element = elements->data[i];
1187
+ if (element == &kActiveFormattingScopeMarker ||
1188
+ is_open_element(parser, element)) {
1189
+ return;
1190
+ }
1191
+
1192
+ // Step 6
1193
+ do {
1194
+ if (i == 0) {
1195
+ // Step 4
1196
+ i = -1; // Incremented to 0 below.
1197
+ break;
1198
+ }
1199
+ // Step 5
1200
+ element = elements->data[--i];
1201
+ } while (element != &kActiveFormattingScopeMarker &&
1202
+ !is_open_element(parser, element));
1203
+
1204
+ ++i;
1205
+ gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1206
+ gumbo_normalized_tagname(
1207
+ get_current_node(parser)->v.element.tag));
1208
+ for(; i < elements->length; ++i) {
1209
+ // Step 7 & 8.
1210
+ assert(elements->length > 0);
1211
+ assert(i < elements->length);
1212
+ element = elements->data[i];
1213
+ assert(element != &kActiveFormattingScopeMarker);
1214
+ GumboNode* clone = clone_node(
1215
+ parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1216
+ // Step 9.
1217
+ insert_element(parser, clone, true);
1218
+ // Step 10.
1219
+ elements->data[i] = clone;
1220
+ gumbo_debug("Reconstructed %s element at %d.\n",
1221
+ gumbo_normalized_tagname(clone->v.element.tag), i);
1222
+ }
1223
+ }
1224
+
1225
+ static void clear_active_formatting_elements(GumboParser* parser) {
1226
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1227
+ int num_elements_cleared = 0;
1228
+ const GumboNode* node;
1229
+ do {
1230
+ node = gumbo_vector_pop(parser, elements);
1231
+ ++num_elements_cleared;
1232
+ } while(node && node != &kActiveFormattingScopeMarker);
1233
+ gumbo_debug("Cleared %d elements from active formatting list.\n",
1234
+ num_elements_cleared);
1235
+ }
1236
+
1237
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
1238
+ static GumboQuirksModeEnum compute_quirks_mode(
1239
+ const GumboTokenDocType* doctype) {
1240
+ if (doctype->force_quirks ||
1241
+ strcmp(doctype->name, kDoctypeHtml.data) ||
1242
+ is_in_static_list(doctype->public_identifier,
1243
+ kQuirksModePublicIdPrefixes, false) ||
1244
+ is_in_static_list(doctype->public_identifier,
1245
+ kQuirksModePublicIdExactMatches, true) ||
1246
+ is_in_static_list(doctype->system_identifier,
1247
+ kQuirksModeSystemIdExactMatches, true) ||
1248
+ (is_in_static_list(doctype->public_identifier,
1249
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1250
+ && !doctype->has_system_identifier)) {
1251
+ return GUMBO_DOCTYPE_QUIRKS;
1252
+ } else if (
1253
+ is_in_static_list(doctype->public_identifier,
1254
+ kLimitedQuirksPublicIdPrefixes, false) ||
1255
+ (is_in_static_list(doctype->public_identifier,
1256
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1257
+ && doctype->has_system_identifier)) {
1258
+ return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1259
+ }
1260
+ return GUMBO_DOCTYPE_NO_QUIRKS;
1261
+ }
1262
+
1263
+ // The following functions are all defined by the "has an element in __ scope"
1264
+ // sections of the HTML5 spec:
1265
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1266
+ // The basic idea behind them is that they check for an element of the given tag
1267
+ // name, contained within a scope formed by a set of other tag names. For
1268
+ // example, "has an element in list scope" looks for an element of the given tag
1269
+ // within the nearest enclosing <ol> or <ul>, along with a bunch of generic
1270
+ // element types that serve to "firewall" their content from the rest of the
1271
+ // document.
1272
+ static bool has_an_element_in_specific_scope(
1273
+ GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
1274
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1275
+ va_list args;
1276
+ va_start(args, negate);
1277
+ // va_arg can only run through the list once, so we copy it to an GumboVector
1278
+ // here. I wonder if it'd make more sense to make tags the GumboVector*
1279
+ // parameter and 'expected' a vararg list, but that'd require changing a lot
1280
+ // of code for unknown benefit. We may want to change the representation of
1281
+ // these tag sets anyway, to something more efficient.
1282
+ GumboVector tags;
1283
+ gumbo_vector_init(parser, 10, &tags);
1284
+ for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1285
+ tag = va_arg(args, GumboTag)) {
1286
+ // We store the tags inline instead of storing pointers to them.
1287
+ gumbo_vector_add(parser, (void*) tag, &tags);
1288
+ }
1289
+ va_end(args);
1290
+
1291
+ bool result = false;
1292
+ for (int i = open_elements->length - 1; i >= 0; --i) {
1293
+ const GumboNode* node = open_elements->data[i];
1294
+ if (node->type != GUMBO_NODE_ELEMENT) {
1295
+ continue;
1296
+ }
1297
+ GumboTag node_tag = node->v.element.tag;
1298
+ for (int j = 0; j < expected->length; ++j) {
1299
+ GumboTag expected_tag = (GumboTag) expected->data[j];
1300
+ if (node_tag == expected_tag) {
1301
+ result = true;
1302
+ goto cleanup;
1303
+ }
1304
+ }
1305
+
1306
+ bool found_tag = false;
1307
+ for (int j = 0; j < tags.length; ++j) {
1308
+ GumboTag tag = (GumboTag) tags.data[j];
1309
+ if (tag == node_tag) {
1310
+ found_tag = true;
1311
+ break;
1312
+ }
1313
+ }
1314
+ if (negate != found_tag) {
1315
+ result = false;
1316
+ goto cleanup;
1317
+ }
1318
+ }
1319
+ cleanup:
1320
+ gumbo_vector_destroy(parser, &tags);
1321
+ return result;
1322
+ }
1323
+
1324
// Declares a stack-allocated, one-element GumboVector named 'varname' whose
// single slot holds 'from_var'.  It is used by nearly all of the scope helper
// functions that follow.  The backing storage is a void* array (with a cast)
// rather than a GumboTag array so that its alignment matches what GumboVector
// expects and its data can be read like that of any other GumboVector.
//
// Expands to two declarations, so it must be used where declarations are
// allowed.  'from_var' is parenthesized so that the cast applies to the whole
// argument expression rather than only to its first operand.
#define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
    void* varname ## _tmp_array[1] = { (void*) (from_var) }; \
    GumboVector varname = { varname ## _tmp_array, 1, 1 }
1333
+
1334
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1335
+ static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1336
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1337
+ return has_an_element_in_specific_scope(
1338
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1339
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1340
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1341
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1342
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1343
+ }
1344
+
1345
+ // Like "has an element in scope", but for the specific case of looking for a
1346
+ // unique target node, not for any node with a given tag name. This duplicates
1347
+ // much of the algorithm from has_an_element_in_specific_scope because the
1348
+ // predicate is different when checking for an exact node, and it's easier &
1349
+ // faster just to duplicate the code for this one case than to try and
1350
+ // parameterize it.
1351
+ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1352
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1353
+ for (int i = open_elements->length - 1; i >= 0; --i) {
1354
+ const GumboNode* current = open_elements->data[i];
1355
+ if (current == node) {
1356
+ return true;
1357
+ }
1358
+ if (current->type != GUMBO_NODE_ELEMENT) {
1359
+ continue;
1360
+ }
1361
+ if (node_tag_in(
1362
+ current, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1363
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1364
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
1365
+ GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
1366
+ GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
1367
+ GUMBO_TAG_LAST)) {
1368
+ return false;
1369
+ }
1370
+ }
1371
+ assert(false);
1372
+ return false;
1373
+ }
1374
+
1375
+ // Like has_an_element_in_scope, but restricts the expected tag to a range of
1376
+ // possible tag names instead of just a single one.
1377
+ static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
1378
+ GumboVector tags;
1379
+ // 6 = arbitrary initial size for vector, chosen because the major use-case
1380
+ // for this method is heading tags, of which there are 6.
1381
+ gumbo_vector_init(parser, 6, &tags);
1382
+ va_list args;
1383
+ va_start(args, parser);
1384
+ for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1385
+ tag = va_arg(args, GumboTag)) {
1386
+ gumbo_vector_add(parser, (void*) tag, &tags);
1387
+ }
1388
+ bool found = has_an_element_in_specific_scope(
1389
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1390
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1391
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1392
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1393
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1394
+ gumbo_vector_destroy(parser, &tags);
1395
+ va_end(args);
1396
+ return found;
1397
+ }
1398
+
1399
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1400
+ static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1401
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1402
+ return has_an_element_in_specific_scope(
1403
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1404
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1405
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1406
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1407
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
1408
+ GUMBO_TAG_LAST);
1409
+ }
1410
+
1411
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1412
+ static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1413
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1414
+ return has_an_element_in_specific_scope(
1415
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1416
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1417
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1418
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1419
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
1420
+ }
1421
+
1422
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1423
+ static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1424
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1425
+ return has_an_element_in_specific_scope(
1426
+ parser, &tags, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
1427
+ }
1428
+
1429
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1430
+ static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1431
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1432
+ return has_an_element_in_specific_scope(
1433
+ parser, &tags, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
1434
+ GUMBO_TAG_LAST);
1435
+ }
1436
+
1437
+
1438
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1439
+ // "exception" is the "element to exclude from the process" listed in the spec.
1440
+ // Pass GUMBO_TAG_LAST to not exclude any of them.
1441
+ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1442
+ for (;
1443
+ node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
1444
+ GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
1445
+ GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
1446
+ !node_tag_is(get_current_node(parser), exception);
1447
+ pop_current_node(parser));
1448
+ }
1449
+
1450
+ // This factors out the clauses relating to "act as if an end tag token with tag
1451
+ // name "table" had been seen. Returns true if there's a table element in table
1452
+ // scope which was successfully closed, false if not and the token should be
1453
+ // ignored. Does not add parse errors; callers should handle that.
1454
+ static bool close_table(GumboParser* parser) {
1455
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1456
+ return false;
1457
+ }
1458
+
1459
+ GumboNode* node = pop_current_node(parser);
1460
+ while (!node_tag_is(node, GUMBO_TAG_TABLE)) {
1461
+ node = pop_current_node(parser);
1462
+ }
1463
+ reset_insertion_mode_appropriately(parser);
1464
+ return true;
1465
+ }
1466
+
1467
+ // This factors out the clauses relating to "act as if an end tag token with tag
1468
+ // name `cell_tag` had been seen".
1469
+ static bool close_table_cell(GumboParser* parser, const GumboToken* token,
1470
+ GumboTag cell_tag) {
1471
+ bool result = true;
1472
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1473
+ const GumboNode* node = get_current_node(parser);
1474
+ if (!node_tag_is(node, cell_tag)) {
1475
+ add_parse_error(parser, token);
1476
+ result = false;
1477
+ }
1478
+ do {
1479
+ node = pop_current_node(parser);
1480
+ } while (!node_tag_is(node, cell_tag));
1481
+
1482
+ clear_active_formatting_elements(parser);
1483
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1484
+ return result;
1485
+ }
1486
+
1487
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1488
+ // This holds the logic to determine whether we should close a <td> or a <th>.
1489
+ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1490
+ if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1491
+ assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1492
+ return close_table_cell(parser, token, GUMBO_TAG_TD);
1493
+ } else {
1494
+ assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1495
+ return close_table_cell(parser, token, GUMBO_TAG_TH);
1496
+ }
1497
+ }
1498
+
1499
+ // This factors out the "act as if an end tag of tag name 'select' had been
1500
+ // seen" clause of the spec, since it's referenced in several places. It pops
1501
+ // all nodes from the stack until the current <select> has been closed, then
1502
+ // resets the insertion mode appropriately.
1503
+ static void close_current_select(GumboParser* parser) {
1504
+ GumboNode* node = pop_current_node(parser);
1505
+ while (!node_tag_is(node, GUMBO_TAG_SELECT)) {
1506
+ node = pop_current_node(parser);
1507
+ }
1508
+ reset_insertion_mode_appropriately(parser);
1509
+ }
1510
+
1511
+ // The list of nodes in the "special" category:
1512
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1513
+ static bool is_special_node(const GumboNode* node) {
1514
+ assert(node->type == GUMBO_NODE_ELEMENT);
1515
+ switch (node->v.element.tag_namespace) {
1516
+ case GUMBO_NAMESPACE_HTML:
1517
+ return node_tag_in(node,
1518
+ GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
1519
+ GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
1520
+ GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
1521
+ GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
1522
+ GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
1523
+ GUMBO_TAG_COMMAND, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
1524
+ GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
1525
+ GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
1526
+ GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
1527
+ GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
1528
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
1529
+ GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
1530
+ GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
1531
+ GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
1532
+ GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
1533
+ GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
1534
+ GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
1535
+ GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
1536
+ GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1537
+ GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
1538
+ GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
1539
+ GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
1540
+ case GUMBO_NAMESPACE_MATHML:
1541
+ return node_tag_in(node,
1542
+ GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1543
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
1544
+ case GUMBO_NAMESPACE_SVG:
1545
+ return node_tag_in(node,
1546
+ GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
1547
+ }
1548
+ abort();
1549
+ return false; // Pacify compiler.
1550
+ }
1551
+
1552
+ // Implicitly closes currently open tags until it reaches an element with the
1553
+ // specified tag name. If the elements closed are in the set handled by
1554
+ // generate_implied_end_tags, this is normal operation and this function returns
1555
+ // true. Otherwise, a parse error is recorded and this function returns false.
1556
+ static bool implicitly_close_tags(
1557
+ GumboParser* parser, GumboToken* token, GumboTag target) {
1558
+ bool result = true;
1559
+ generate_implied_end_tags(parser, target);
1560
+ if (!node_tag_is(get_current_node(parser), target)) {
1561
+ add_parse_error(parser, token);
1562
+ while (!node_tag_is(get_current_node(parser), target)) {
1563
+ pop_current_node(parser);
1564
+ }
1565
+ result = false;
1566
+ }
1567
+ assert(node_tag_is(get_current_node(parser), target));
1568
+ pop_current_node(parser);
1569
+ return result;
1570
+ }
1571
+
1572
+ // If the stack of open elements has a <p> tag in button scope, this acts as if
1573
+ // a </p> tag was encountered, implicitly closing tags. Returns false if a
1574
+ // parse error occurs. This is a convenience function because this particular
1575
+ // clause appears several times in the spec.
1576
+ static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
1577
+ if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1578
+ return implicitly_close_tags(parser, token, GUMBO_TAG_P);
1579
+ }
1580
+ return true;
1581
+ }
1582
+
1583
+ // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1584
+ // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1585
+ static void maybe_implicitly_close_list_tag(
1586
+ GumboParser* parser, GumboToken* token, bool is_li) {
1587
+ GumboParserState* state = parser->_parser_state;
1588
+ state->_frameset_ok = false;
1589
+ for (int i = state->_open_elements.length - 1; i >= 0; --i) {
1590
+ const GumboNode* node = state->_open_elements.data[i];
1591
+ bool is_list_tag = is_li ?
1592
+ node_tag_is(node, GUMBO_TAG_LI) :
1593
+ node_tag_in(node, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
1594
+ if (is_list_tag) {
1595
+ implicitly_close_tags(parser, token, node->v.element.tag);
1596
+ return;
1597
+ }
1598
+ if (is_special_node(node) &&
1599
+ !node_tag_in(node, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
1600
+ GUMBO_TAG_LAST)) {
1601
+ return;
1602
+ }
1603
+ }
1604
+ }
1605
+
1606
+ static void merge_attributes(
1607
+ GumboParser* parser, GumboToken* token, GumboNode* node) {
1608
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1609
+ assert(node->type == GUMBO_NODE_ELEMENT);
1610
+ const GumboVector* token_attr = &token->v.start_tag.attributes;
1611
+ GumboVector* node_attr = &node->v.element.attributes;
1612
+
1613
+ for (int i = 0; i < token_attr->length; ++i) {
1614
+ GumboAttribute* attr = token_attr->data[i];
1615
+ if (!gumbo_get_attribute(node_attr, attr->name)) {
1616
+ // Ownership of the attribute is transferred by this gumbo_vector_add,
1617
+ // so it has to be nulled out of the original token so it doesn't get
1618
+ // double-deleted.
1619
+ gumbo_vector_add(parser, attr, node_attr);
1620
+ token_attr->data[i] = NULL;
1621
+ }
1622
+ }
1623
+ // When attributes are merged, it means the token has been ignored and merged
1624
+ // with another token, so we need to free its memory. The attributes that are
1625
+ // transferred need to be nulled-out in the vector above so that they aren't
1626
+ // double-deleted.
1627
+ gumbo_token_destroy(parser, token);
1628
+
1629
+ #ifndef NDEBUG
1630
+ // Mark this sentinel so the assertion in the main loop knows it's been
1631
+ // destroyed.
1632
+ token->v.start_tag.attributes = kGumboEmptyVector;
1633
+ #endif
1634
+ }
1635
+
1636
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1637
+ for (int i = 0;
1638
+ i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); ++i) {
1639
+ const ReplacementEntry* entry = &kSvgTagReplacements[i];
1640
+ if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1641
+ return entry->to.data;
1642
+ }
1643
+ }
1644
+ return NULL;
1645
+ }
1646
+
1647
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1648
+ // This destructively modifies any matching attributes on the token and sets the
1649
+ // namespace appropriately.
1650
+ static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1651
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1652
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1653
+ for (int i = 0;
1654
+ i < sizeof(kForeignAttributeReplacements) /
1655
+ sizeof(NamespacedAttributeReplacement); ++i) {
1656
+ const NamespacedAttributeReplacement* entry =
1657
+ &kForeignAttributeReplacements[i];
1658
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1659
+ if (!attr) {
1660
+ continue;
1661
+ }
1662
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1663
+ attr->attr_namespace = entry->attr_namespace;
1664
+ attr->name = gumbo_copy_stringz(parser, entry->local_name);
1665
+ }
1666
+ }
1667
+
1668
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1669
+ // This destructively modifies any matching attributes on the token.
1670
+ static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1671
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1672
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1673
+ for (int i = 0;
1674
+ i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1675
+ const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1676
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1677
+ if (!attr) {
1678
+ continue;
1679
+ }
1680
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1681
+ attr->name = gumbo_copy_stringz(parser, entry->to.data);
1682
+ }
1683
+ }
1684
+
1685
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1686
+ // Note that this may destructively modify the token with the new attribute
1687
+ // value.
1688
+ static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1689
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1690
+ GumboAttribute* attr = gumbo_get_attribute(
1691
+ &token->v.start_tag.attributes, "definitionurl");
1692
+ if (!attr) {
1693
+ return;
1694
+ }
1695
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1696
+ attr->name = gumbo_copy_stringz(parser, "definitionURL");
1697
+ }
1698
+
1699
+ static bool doctype_matches(
1700
+ const GumboTokenDocType* doctype,
1701
+ const GumboStringPiece* public_id,
1702
+ const GumboStringPiece* system_id,
1703
+ bool allow_missing_system_id) {
1704
+ return !strcmp(doctype->public_identifier, public_id->data) &&
1705
+ (allow_missing_system_id || doctype->has_system_identifier) &&
1706
+ !strcmp(doctype->system_identifier, system_id->data);
1707
+ }
1708
+
1709
+ static bool maybe_add_doctype_error(
1710
+ GumboParser* parser, const GumboToken* token) {
1711
+ const GumboTokenDocType* doctype = &token->v.doc_type;
1712
+ bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
1713
+ if (!html_doctype ||
1714
+ doctype->has_public_identifier ||
1715
+ (doctype->has_system_identifier && !strcmp(
1716
+ doctype->system_identifier, kSystemIdLegacyCompat.data)) ||
1717
+ !(html_doctype && (
1718
+ doctype_matches(doctype, &kPublicIdHtml4_0,
1719
+ &kSystemIdRecHtml4_0, true) ||
1720
+ doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
1721
+ doctype_matches(doctype, &kPublicIdXhtml1_0,
1722
+ &kSystemIdXhtmlStrict1_1, false) ||
1723
+ doctype_matches(doctype, &kPublicIdXhtml1_1,
1724
+ &kSystemIdXhtml1_1, false)))) {
1725
+ add_parse_error(parser, token);
1726
+ return false;
1727
+ }
1728
+ return true;
1729
+ }
1730
+
1731
+ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1732
+ if (!node->parent) {
1733
+ // The node may not have a parent if, for example, it is a newly-cloned copy
1734
+ // of an active formatting element. DOM manipulations continue with the
1735
+ // orphaned fragment of the DOM tree until it's appended/foster-parented to
1736
+ // the common ancestor at the end of the adoption agency algorithm.
1737
+ return;
1738
+ }
1739
+ assert(node->parent->type == GUMBO_NODE_ELEMENT);
1740
+ GumboVector* children = &node->parent->v.element.children;
1741
+ int index = gumbo_vector_index_of(children, node);
1742
+ assert(index != -1);
1743
+
1744
+ gumbo_vector_remove_at(parser, index, children);
1745
+ node->parent = NULL;
1746
+ node->index_within_parent = -1;
1747
+ for (int i = index; i < children->length; ++i) {
1748
+ GumboNode* child = children->data[i];
1749
+ child->index_within_parent = i;
1750
+ }
1751
+ }
1752
+
1753
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1754
+ // Also described in the "in body" handling for end formatting tags.
1755
+ static bool adoption_agency_algorithm(
1756
+ GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
1757
+ GumboParserState* state = parser->_parser_state;
1758
+ gumbo_debug("Entering adoption agency algorithm.\n");
1759
+ // Steps 1-3 & 16:
1760
+ for (int i = 0; i < 8; ++i) {
1761
+ // Step 4.
1762
+ GumboNode* formatting_node = NULL;
1763
+ int formatting_node_in_open_elements = -1;
1764
+ for (int j = state->_active_formatting_elements.length - 1; j >= 0; --j) {
1765
+ GumboNode* current_node = state->_active_formatting_elements.data[j];
1766
+ if (current_node == &kActiveFormattingScopeMarker) {
1767
+ gumbo_debug("Broke on scope marker; aborting.\n");
1768
+ // Last scope marker; abort the algorithm.
1769
+ return false;
1770
+ }
1771
+ if (node_tag_is(current_node, closing_tag)) {
1772
+ // Found it.
1773
+ formatting_node = current_node;
1774
+ formatting_node_in_open_elements = gumbo_vector_index_of(
1775
+ &state->_open_elements, formatting_node);
1776
+ gumbo_debug("Formatting element of tag %s at %d.\n",
1777
+ gumbo_normalized_tagname(closing_tag),
1778
+ formatting_node_in_open_elements);
1779
+ break;
1780
+ }
1781
+ }
1782
+ if (!formatting_node) {
1783
+ // No matching tag; not a parse error outright, but fall through to the
1784
+ // "any other end tag" clause (which may potentially add a parse error,
1785
+ // but not always).
1786
+ gumbo_debug("No active formatting elements; aborting.\n");
1787
+ return false;
1788
+ }
1789
+
1790
+ if (formatting_node_in_open_elements == -1) {
1791
+ gumbo_debug("Formatting node not on stack of open elements.\n");
1792
+ gumbo_vector_remove(parser, formatting_node,
1793
+ &state->_active_formatting_elements);
1794
+ return false;
1795
+ }
1796
+
1797
+ if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1798
+ add_parse_error(parser, token);
1799
+ gumbo_debug("Element not in scope.\n");
1800
+ return false;
1801
+ }
1802
+ if (formatting_node != get_current_node(parser)) {
1803
+ add_parse_error(parser, token); // But continue onwards.
1804
+ }
1805
+ assert(formatting_node);
1806
+ assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
1807
+ assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
1808
+
1809
+ // Step 5 & 6.
1810
+ GumboNode* furthest_block = NULL;
1811
+ for (int j = formatting_node_in_open_elements;
1812
+ j < state->_open_elements.length; ++j) {
1813
+ assert(j > 0);
1814
+ GumboNode* current = state->_open_elements.data[j];
1815
+ if (is_special_node(current)) {
1816
+ // Step 5.
1817
+ furthest_block = current;
1818
+ break;
1819
+ }
1820
+ }
1821
+ if (!furthest_block) {
1822
+ // Step 6.
1823
+ while (get_current_node(parser) != formatting_node) {
1824
+ pop_current_node(parser);
1825
+ }
1826
+ // And the formatting element itself.
1827
+ pop_current_node(parser);
1828
+ gumbo_vector_remove(parser, formatting_node,
1829
+ &state->_active_formatting_elements);
1830
+ return false;
1831
+ }
1832
+ assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
1833
+ assert(furthest_block);
1834
+
1835
+ // Step 7.
1836
+ // Elements may be moved and reparented by this algorithm, so
1837
+ // common_ancestor is not necessarily the same as formatting_node->parent.
1838
+ GumboNode* common_ancestor =
1839
+ state->_open_elements.data[gumbo_vector_index_of(
1840
+ &state->_open_elements, formatting_node) - 1];
1841
+ gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1842
+ gumbo_normalized_tagname(common_ancestor->v.element.tag),
1843
+ gumbo_normalized_tagname(furthest_block->v.element.tag));
1844
+
1845
+ // Step 8.
1846
+ int bookmark = gumbo_vector_index_of(
1847
+ &state->_active_formatting_elements, formatting_node);;
1848
+ // Step 9.
1849
+ GumboNode* node = furthest_block;
1850
+ GumboNode* last_node = furthest_block;
1851
+ // Must be stored explicitly, in case node is removed from the stack of open
1852
+ // elements, to handle step 9.4.
1853
+ int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1854
+ assert(saved_node_index > 0);
1855
+ // Step 9.1-9.3 & 9.11.
1856
+ for (int j = 0; j < 3; ++j) {
1857
+ // Step 9.4.
1858
+ int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1859
+ gumbo_debug(
1860
+ "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1861
+ if (node_index == -1) {
1862
+ node_index = saved_node_index;
1863
+ }
1864
+ saved_node_index = --node_index;
1865
+ assert(node_index > 0);
1866
+ assert(node_index < state->_open_elements.capacity);
1867
+ node = state->_open_elements.data[node_index];
1868
+ assert(node->parent);
1869
+ // Step 9.5.
1870
+ if (gumbo_vector_index_of(
1871
+ &state->_active_formatting_elements, node) == -1) {
1872
+ gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1873
+ continue;
1874
+ } else if (node == formatting_node) {
1875
+ // Step 9.6.
1876
+ break;
1877
+ }
1878
+ // Step 9.7.
1879
+ int formatting_index = gumbo_vector_index_of(
1880
+ &state->_active_formatting_elements, node);
1881
+ node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1882
+ state->_active_formatting_elements.data[formatting_index] = node;
1883
+ state->_open_elements.data[node_index] = node;
1884
+ // Step 9.8.
1885
+ if (last_node == furthest_block) {
1886
+ bookmark = formatting_index + 1;
1887
+ assert(bookmark <= state->_active_formatting_elements.length);
1888
+ }
1889
+ // Step 9.9.
1890
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1891
+ remove_from_parent(parser, last_node);
1892
+ append_node(parser, node, last_node);
1893
+ // Step 9.10.
1894
+ last_node = node;
1895
+ }
1896
+
1897
+ // Step 10.
1898
+ gumbo_debug("Removing %s node from parent ",
1899
+ gumbo_normalized_tagname(last_node->v.element.tag));
1900
+ remove_from_parent(parser, last_node);
1901
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1902
+ if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1903
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
1904
+ GUMBO_TAG_LAST)) {
1905
+ gumbo_debug("and foster-parenting it.\n");
1906
+ foster_parent_element(parser, last_node);
1907
+ } else {
1908
+ gumbo_debug("and inserting it into %s.\n",
1909
+ gumbo_normalized_tagname(common_ancestor->v.element.tag));
1910
+ append_node(parser, common_ancestor, last_node);
1911
+ }
1912
+
1913
+ // Step 11.
1914
+ GumboNode* new_formatting_node = clone_node(
1915
+ parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1916
+ formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1917
+
1918
+ // Step 12. Instead of appending nodes one-by-one, we swap the children
1919
+ // vector of furthest_block with the empty children of new_formatting_node,
1920
+ // reducing memory traffic and allocations. We still have to reset their
1921
+ // parent pointers, though.
1922
+ GumboVector temp = new_formatting_node->v.element.children;
1923
+ new_formatting_node->v.element.children =
1924
+ furthest_block->v.element.children;
1925
+ furthest_block->v.element.children = temp;
1926
+
1927
+ temp = new_formatting_node->v.element.children;
1928
+ for (int i = 0; i < temp.length; ++i) {
1929
+ GumboNode* child = temp.data[i];
1930
+ child->parent = new_formatting_node;
1931
+ }
1932
+
1933
+ // Step 13.
1934
+ append_node(parser, furthest_block, new_formatting_node);
1935
+
1936
+ // Step 14.
1937
+ // If the formatting node was before the bookmark, it may shift over all
1938
+ // indices after it, so we need to explicitly find the index and possibly
1939
+ // adjust the bookmark.
1940
+ int formatting_node_index = gumbo_vector_index_of(
1941
+ &state->_active_formatting_elements, formatting_node);
1942
+ assert(formatting_node_index != -1);
1943
+ if (formatting_node_index < bookmark) {
1944
+ --bookmark;
1945
+ }
1946
+ gumbo_vector_remove_at(
1947
+ parser, formatting_node_index, &state->_active_formatting_elements);
1948
+ assert(bookmark >= 0);
1949
+ assert(bookmark <= state->_active_formatting_elements.length);
1950
+ gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
1951
+ &state->_active_formatting_elements);
1952
+
1953
+ // Step 15.
1954
+ gumbo_vector_remove(
1955
+ parser, formatting_node, &state->_open_elements);
1956
+ int insert_at = gumbo_vector_index_of(
1957
+ &state->_open_elements, furthest_block) + 1;
1958
+ assert(insert_at >= 0);
1959
+ assert(insert_at <= state->_open_elements.length);
1960
+ gumbo_vector_insert_at(
1961
+ parser, new_formatting_node, insert_at, &state->_open_elements);
1962
+ }
1963
+ return true;
1964
+ }
1965
+
1966
+ // This is here to clean up memory when the spec says "Ignore current token."
1967
+ static void ignore_token(GumboParser* parser) {
1968
+ GumboToken* token = parser->_parser_state->_current_token;
1969
+ // Ownership of the token's internal buffers are normally transferred to the
1970
+ // element, but if no element is emitted (as happens in non-verbatim-mode
1971
+ // when a token is ignored), we need to free it here to prevent a memory
1972
+ // leak.
1973
+ gumbo_token_destroy(parser, token);
1974
+ #ifndef NDEBUG
1975
+ if (token->type == GUMBO_TOKEN_START_TAG) {
1976
+ // Mark this sentinel so the assertion in the main loop knows it's been
1977
+ // destroyed.
1978
+ token->v.start_tag.attributes = kGumboEmptyVector;
1979
+ }
1980
+ #endif
1981
+ }
1982
+
1983
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
1984
+ static void finish_parsing(GumboParser* parser) {
1985
+ maybe_flush_text_node_buffer(parser);
1986
+ GumboParserState* state = parser->_parser_state;
1987
+ for (GumboNode* node = pop_current_node(parser); node;
1988
+ node = pop_current_node(parser)) {
1989
+ if ((node_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
1990
+ (node_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
1991
+ continue;
1992
+ }
1993
+ node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1994
+ }
1995
+ while (pop_current_node(parser)); // Pop them all.
1996
+ }
1997
+
1998
+ static bool handle_initial(GumboParser* parser, GumboToken* token) {
1999
+ GumboDocument* document = &get_document_node(parser)->v.document;
2000
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2001
+ ignore_token(parser);
2002
+ return true;
2003
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2004
+ append_comment_node(parser, get_document_node(parser), token);
2005
+ return true;
2006
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2007
+ document->has_doctype = true;
2008
+ document->name = token->v.doc_type.name;
2009
+ document->public_identifier = token->v.doc_type.public_identifier;
2010
+ document->system_identifier = token->v.doc_type.system_identifier;
2011
+ document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2012
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2013
+ return maybe_add_doctype_error(parser, token);
2014
+ }
2015
+ add_parse_error(parser, token);
2016
+ document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2017
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2018
+ parser->_parser_state->_reprocess_current_token = true;
2019
+ return true;
2020
+ }
2021
+
2022
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
2023
+ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2024
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2025
+ add_parse_error(parser, token);
2026
+ ignore_token(parser);
2027
+ return false;
2028
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2029
+ append_comment_node(parser, get_document_node(parser), token);
2030
+ return true;
2031
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2032
+ ignore_token(parser);
2033
+ return true;
2034
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2035
+ GumboNode* html_node = insert_element_from_token(parser, token);
2036
+ parser->_output->root = html_node;
2037
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2038
+ return true;
2039
+ } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2040
+ token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2041
+ GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2042
+ add_parse_error(parser, token);
2043
+ ignore_token(parser);
2044
+ return false;
2045
+ } else {
2046
+ GumboNode* html_node = insert_element_of_tag_type(
2047
+ parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2048
+ assert(html_node);
2049
+ parser->_output->root = html_node;
2050
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2051
+ parser->_parser_state->_reprocess_current_token = true;
2052
+ return true;
2053
+ }
2054
+ }
2055
+
2056
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
2057
+ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2058
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2059
+ add_parse_error(parser, token);
2060
+ ignore_token(parser);
2061
+ return false;
2062
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2063
+ append_comment_node(parser, get_current_node(parser), token);
2064
+ return true;
2065
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2066
+ ignore_token(parser);
2067
+ return true;
2068
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2069
+ GumboNode* node = insert_element_from_token(parser, token);
2070
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2071
+ parser->_parser_state->_head_element = node;
2072
+ return true;
2073
+ } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2074
+ token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2075
+ GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2076
+ add_parse_error(parser, token);
2077
+ ignore_token(parser);
2078
+ return false;
2079
+ } else {
2080
+ GumboNode* node = insert_element_of_tag_type(
2081
+ parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2082
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2083
+ parser->_parser_state->_head_element = node;
2084
+ parser->_parser_state->_reprocess_current_token = true;
2085
+ return true;
2086
+ }
2087
+ }
2088
+
2089
+ // Forward declarations because of mutual dependencies.
2090
+ static bool handle_token(GumboParser* parser, GumboToken* token);
2091
+ static bool handle_in_body(GumboParser* parser, GumboToken* token);
2092
+
2093
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
2094
+ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2095
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2096
+ insert_text_token(parser, token);
2097
+ return true;
2098
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2099
+ add_parse_error(parser, token);
2100
+ ignore_token(parser);
2101
+ return false;
2102
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2103
+ append_comment_node(parser, get_current_node(parser), token);
2104
+ return true;
2105
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2106
+ return handle_in_body(parser, token);
2107
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2108
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
2109
+ GUMBO_TAG_LAST)) {
2110
+ insert_element_from_token(parser, token);
2111
+ pop_current_node(parser);
2112
+ acknowledge_self_closing_tag(parser);
2113
+ return true;
2114
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2115
+ insert_element_from_token(parser, token);
2116
+ pop_current_node(parser);
2117
+ acknowledge_self_closing_tag(parser);
2118
+ // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2119
+ // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2120
+ // should specifically look for that string in the document and re-encode it
2121
+ // before passing to Gumbo.
2122
+ return true;
2123
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2124
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2125
+ return true;
2126
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
2127
+ GUMBO_TAG_LAST)) {
2128
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2129
+ return true;
2130
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2131
+ insert_element_from_token(parser, token);
2132
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2133
+ return true;
2134
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2135
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2136
+ return true;
2137
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2138
+ GumboNode* head = pop_current_node(parser);
2139
+ AVOID_UNUSED_VARIABLE_WARNING(head);
2140
+ assert(node_tag_is(head, GUMBO_TAG_HEAD));
2141
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2142
+ return true;
2143
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2144
+ add_parse_error(parser, token);
2145
+ ignore_token(parser);
2146
+ return false;
2147
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2148
+ (token->type == GUMBO_TOKEN_END_TAG &&
2149
+ !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2150
+ GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2151
+ add_parse_error(parser, token);
2152
+ return false;
2153
+ } else {
2154
+ const GumboNode* node = pop_current_node(parser);
2155
+ assert(node_tag_is(node, GUMBO_TAG_HEAD));
2156
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2157
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2158
+ parser->_parser_state->_reprocess_current_token = true;
2159
+ return true;
2160
+ }
2161
+
2162
+ return true;
2163
+ }
2164
+
2165
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
2166
+ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2167
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168
+ add_parse_error(parser, token);
2169
+ return false;
2170
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2171
+ return handle_in_body(parser, token);
2172
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2173
+ const GumboNode* node = pop_current_node(parser);
2174
+ assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2175
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2176
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2177
+ return true;
2178
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2179
+ token->type == GUMBO_TOKEN_COMMENT ||
2180
+ tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
2181
+ GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
2182
+ GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
2183
+ return handle_in_head(parser, token);
2184
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
2185
+ GUMBO_TAG_LAST) ||
2186
+ (token->type == GUMBO_TOKEN_END_TAG &&
2187
+ !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2188
+ add_parse_error(parser, token);
2189
+ ignore_token(parser);
2190
+ return false;
2191
+ } else {
2192
+ add_parse_error(parser, token);
2193
+ const GumboNode* node = pop_current_node(parser);
2194
+ assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2195
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2196
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2197
+ parser->_parser_state->_reprocess_current_token = true;
2198
+ return false;
2199
+ }
2200
+ }
2201
+
2202
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
2203
+ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2204
+ GumboParserState* state = parser->_parser_state;
2205
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2206
+ insert_text_token(parser, token);
2207
+ return true;
2208
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2209
+ add_parse_error(parser, token);
2210
+ ignore_token(parser);
2211
+ return false;
2212
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2213
+ append_comment_node(parser, get_current_node(parser), token);
2214
+ return true;
2215
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2216
+ return handle_in_body(parser, token);
2217
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2218
+ insert_element_from_token(parser, token);
2219
+ state->_frameset_ok = false;
2220
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2221
+ return true;
2222
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2223
+ insert_element_from_token(parser, token);
2224
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2225
+ return true;
2226
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2227
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
2228
+ GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
2229
+ GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2230
+ add_parse_error(parser, token);
2231
+ assert(state->_head_element != NULL);
2232
+ // This must be flushed before we push the head element on, as there may be
2233
+ // pending character tokens that should be attached to the root.
2234
+ maybe_flush_text_node_buffer(parser);
2235
+ gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2236
+ bool result = handle_in_head(parser, token);
2237
+ gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2238
+ return result;
2239
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2240
+ (token->type == GUMBO_TOKEN_END_TAG &&
2241
+ !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2242
+ GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2243
+ add_parse_error(parser, token);
2244
+ ignore_token(parser);
2245
+ return false;
2246
+ } else {
2247
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2248
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2249
+ state->_reprocess_current_token = true;
2250
+ return true;
2251
+ }
2252
+ }
2253
+
2254
+ static void destroy_node(GumboParser* parser, GumboNode* node) {
2255
+ switch (node->type) {
2256
+ case GUMBO_NODE_DOCUMENT:
2257
+ {
2258
+ GumboDocument* doc = &node->v.document;
2259
+ for (int i = 0; i < doc->children.length; ++i) {
2260
+ destroy_node(parser, doc->children.data[i]);
2261
+ }
2262
+ gumbo_parser_deallocate(parser, (void*) doc->children.data);
2263
+ gumbo_parser_deallocate(parser, (void*) doc->name);
2264
+ gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2265
+ gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2266
+ }
2267
+ break;
2268
+ case GUMBO_NODE_ELEMENT:
2269
+ for (int i = 0; i < node->v.element.attributes.length; ++i) {
2270
+ gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2271
+ }
2272
+ gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2273
+ for (int i = 0; i < node->v.element.children.length; ++i) {
2274
+ destroy_node(parser, node->v.element.children.data[i]);
2275
+ }
2276
+ gumbo_parser_deallocate(parser, node->v.element.children.data);
2277
+ break;
2278
+ case GUMBO_NODE_TEXT:
2279
+ case GUMBO_NODE_CDATA:
2280
+ case GUMBO_NODE_COMMENT:
2281
+ case GUMBO_NODE_WHITESPACE:
2282
+ gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2283
+ break;
2284
+ }
2285
+ gumbo_parser_deallocate(parser, node);
2286
+ }
2287
+
2288
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
2289
+ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2290
+ GumboParserState* state = parser->_parser_state;
2291
+ assert(state->_open_elements.length > 0);
2292
+ if (token->type == GUMBO_TOKEN_NULL) {
2293
+ add_parse_error(parser, token);
2294
+ ignore_token(parser);
2295
+ return false;
2296
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2297
+ reconstruct_active_formatting_elements(parser);
2298
+ insert_text_token(parser, token);
2299
+ return true;
2300
+ } else if (token->type == GUMBO_TOKEN_CHARACTER) {
2301
+ reconstruct_active_formatting_elements(parser);
2302
+ insert_text_token(parser, token);
2303
+ set_frameset_not_ok(parser);
2304
+ return true;
2305
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2306
+ append_comment_node(parser, get_current_node(parser), token);
2307
+ return true;
2308
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2309
+ add_parse_error(parser, token);
2310
+ ignore_token(parser);
2311
+ return false;
2312
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2313
+ assert(parser->_output->root != NULL);
2314
+ assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2315
+ add_parse_error(parser, token);
2316
+ merge_attributes(parser, token, parser->_output->root);
2317
+ return false;
2318
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2319
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
2320
+ GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
2321
+ GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2322
+ return handle_in_head(parser, token);
2323
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2324
+ add_parse_error(parser, token);
2325
+ if (state->_open_elements.length < 2 ||
2326
+ !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
2327
+ ignore_token(parser);
2328
+ return false;
2329
+ }
2330
+ state->_frameset_ok = false;
2331
+ merge_attributes(parser, token, state->_open_elements.data[1]);
2332
+ return false;
2333
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2334
+ add_parse_error(parser, token);
2335
+ if (state->_open_elements.length < 2 ||
2336
+ !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2337
+ !state->_frameset_ok) {
2338
+ ignore_token(parser);
2339
+ return false;
2340
+ }
2341
+ // Save the body node for later removal.
2342
+ GumboNode* body_node = state->_open_elements.data[1];
2343
+
2344
+ // Pop all nodes except root HTML element.
2345
+ GumboNode* node;
2346
+ do {
2347
+ node = pop_current_node(parser);
2348
+ } while (node != state->_open_elements.data[1]);
2349
+
2350
+ // Remove the body node. We may want to factor this out into a generic
2351
+ // helper, but right now this is the only code that needs to do this.
2352
+ GumboVector* children = &parser->_output->root->v.element.children;
2353
+ for (int i = 0; i < children->length; ++i) {
2354
+ if (children->data[i] == body_node) {
2355
+ gumbo_vector_remove_at(parser, i, children);
2356
+ break;
2357
+ }
2358
+ }
2359
+ destroy_node(parser, body_node);
2360
+
2361
+ // Insert the <frameset>, and switch the insertion mode.
2362
+ insert_element_from_token(parser, token);
2363
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2364
+ return true;
2365
+ } else if (token->type == GUMBO_TOKEN_EOF) {
2366
+ for (int i = 0; i < state->_open_elements.length; ++i) {
2367
+ if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2368
+ GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
2369
+ GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
2370
+ GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
2371
+ GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
2372
+ add_parse_error(parser, token);
2373
+ return false;
2374
+ }
2375
+ }
2376
+ return true;
2377
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2378
+ GUMBO_TAG_LAST)) {
2379
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2380
+ add_parse_error(parser, token);
2381
+ ignore_token(parser);
2382
+ return false;
2383
+ }
2384
+ bool success = true;
2385
+ for (int i = 0; i < state->_open_elements.length; ++i) {
2386
+ if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2387
+ GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
2388
+ GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
2389
+ GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
2390
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
2391
+ GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2392
+ GUMBO_TAG_LAST)) {
2393
+ add_parse_error(parser, token);
2394
+ success = false;
2395
+ break;
2396
+ }
2397
+ }
2398
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2399
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2400
+ parser->_parser_state->_reprocess_current_token = true;
2401
+ } else {
2402
+ GumboNode* body = state->_open_elements.data[1];
2403
+ assert(node_tag_is(body, GUMBO_TAG_BODY));
2404
+ record_end_of_element(state->_current_token, &body->v.element);
2405
+ }
2406
+ return success;
2407
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2408
+ GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
2409
+ GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
2410
+ GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
2411
+ GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
2412
+ GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
2413
+ GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
2414
+ GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
2415
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2416
+ insert_element_from_token(parser, token);
2417
+ return result;
2418
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2419
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2420
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2421
+ if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
2422
+ GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
2423
+ GUMBO_TAG_LAST)) {
2424
+ add_parse_error(parser, token);
2425
+ pop_current_node(parser);
2426
+ result = false;
2427
+ }
2428
+ insert_element_from_token(parser, token);
2429
+ return result;
2430
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
2431
+ GUMBO_TAG_LAST)) {
2432
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2433
+ insert_element_from_token(parser, token);
2434
+ state->_ignore_next_linefeed = true;
2435
+ state->_frameset_ok = false;
2436
+ return result;
2437
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2438
+ if (state->_form_element != NULL) {
2439
+ gumbo_debug("Ignoring nested form.\n");
2440
+ add_parse_error(parser, token);
2441
+ ignore_token(parser);
2442
+ return false;
2443
+ }
2444
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2445
+ state->_form_element =
2446
+ insert_element_from_token(parser, token);
2447
+ return result;
2448
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2449
+ maybe_implicitly_close_list_tag(parser, token, true);
2450
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2451
+ insert_element_from_token(parser, token);
2452
+ return result;
2453
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2454
+ GUMBO_TAG_LAST)) {
2455
+ maybe_implicitly_close_list_tag(parser, token, false);
2456
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2457
+ insert_element_from_token(parser, token);
2458
+ return result;
2459
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2460
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2461
+ insert_element_from_token(parser, token);
2462
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2463
+ return result;
2464
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2465
+ if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2466
+ add_parse_error(parser, token);
2467
+ implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
2468
+ state->_reprocess_current_token = true;
2469
+ return false;
2470
+ }
2471
+ reconstruct_active_formatting_elements(parser);
2472
+ insert_element_from_token(parser, token);
2473
+ state->_frameset_ok = false;
2474
+ return true;
2475
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2476
+ GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
2477
+ GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
2478
+ GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
2479
+ GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
2480
+ GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
2481
+ GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
2482
+ GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
2483
+ GUMBO_TAG_LAST)) {
2484
+ GumboTag tag = token->v.end_tag;
2485
+ if (!has_an_element_in_scope(parser, tag)) {
2486
+ add_parse_error(parser, token);
2487
+ ignore_token(parser);
2488
+ return false;
2489
+ }
2490
+ implicitly_close_tags(parser, token, token->v.end_tag);
2491
+ return true;
2492
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2493
+ bool result = true;
2494
+ const GumboNode* node = state->_form_element;
2495
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
2496
+ state->_form_element = NULL;
2497
+ if (!node || !has_node_in_scope(parser, node)) {
2498
+ gumbo_debug("Closing an unopened form.\n");
2499
+ add_parse_error(parser, token);
2500
+ ignore_token(parser);
2501
+ return false;
2502
+ }
2503
+ // This differs from implicitly_close_tags because we remove *only* the
2504
+ // <form> element; other nodes are left in scope.
2505
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2506
+ if (get_current_node(parser) != node) {
2507
+ add_parse_error(parser, token);
2508
+ result = false;
2509
+ }
2510
+
2511
+ GumboVector* open_elements = &state->_open_elements;
2512
+ int index = open_elements->length - 1;
2513
+ for (; index >= 0 && open_elements->data[index] != node; --index);
2514
+ assert(index >= 0);
2515
+ gumbo_vector_remove_at(parser, index, open_elements);
2516
+ return result;
2517
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2518
+ if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2519
+ add_parse_error(parser, token);
2520
+ reconstruct_active_formatting_elements(parser);
2521
+ insert_element_of_tag_type(
2522
+ parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2523
+ state->_reprocess_current_token = true;
2524
+ return false;
2525
+ }
2526
+ return implicitly_close_tags(parser, token, GUMBO_TAG_P);
2527
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2528
+ if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2529
+ add_parse_error(parser, token);
2530
+ ignore_token(parser);
2531
+ return false;
2532
+ }
2533
+ return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
2534
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2535
+ GUMBO_TAG_LAST)) {
2536
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2537
+ GumboTag token_tag = token->v.end_tag;
2538
+ if (!has_an_element_in_scope(parser, token_tag)) {
2539
+ add_parse_error(parser, token);
2540
+ ignore_token(parser);
2541
+ return false;
2542
+ }
2543
+ return implicitly_close_tags(parser, token, token_tag);
2544
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2545
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2546
+ if (!has_an_element_in_scope_with_tagname(
2547
+ parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
2548
+ GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2549
+ // No heading open; ignore the token entirely.
2550
+ add_parse_error(parser, token);
2551
+ ignore_token(parser);
2552
+ return false;
2553
+ } else {
2554
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2555
+ const GumboNode* current_node = get_current_node(parser);
2556
+ bool success = node_tag_is(current_node, token->v.end_tag);
2557
+ if (!success) {
2558
+ // There're children of the heading currently open; close them below and
2559
+ // record a parse error.
2560
+ // TODO(jdtang): Add a way to distinguish this error case from the one
2561
+ // above.
2562
+ add_parse_error(parser, token);
2563
+ }
2564
+ do {
2565
+ current_node = pop_current_node(parser);
2566
+ } while (!node_tag_in(current_node, GUMBO_TAG_H1, GUMBO_TAG_H2,
2567
+ GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5,
2568
+ GUMBO_TAG_H6, GUMBO_TAG_LAST));
2569
+ return success;
2570
+ }
2571
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2572
+ bool success = true;
2573
+ int last_a;
2574
+ int has_matching_a = find_last_anchor_index(parser, &last_a);
2575
+ if (has_matching_a) {
2576
+ assert(has_matching_a == 1);
2577
+ add_parse_error(parser, token);
2578
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2579
+ // The adoption agency algorithm usually removes all instances of <a>
2580
+ // from the list of active formatting elements, but in case it doesn't,
2581
+ // we're supposed to do this. (The conditions where it might not are
2582
+ // listed in the spec.)
2583
+ if (find_last_anchor_index(parser, &last_a)) {
2584
+ void* last_element = gumbo_vector_remove_at(
2585
+ parser, last_a, &state->_active_formatting_elements);
2586
+ gumbo_vector_remove(
2587
+ parser, last_element, &state->_open_elements);
2588
+ }
2589
+ success = false;
2590
+ }
2591
+ reconstruct_active_formatting_elements(parser);
2592
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2593
+ return success;
2594
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
2595
+ GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2596
+ GUMBO_TAG_S, GUMBO_TAG_SMALL, GUMBO_TAG_STRIKE,
2597
+ GUMBO_TAG_STRONG, GUMBO_TAG_TT, GUMBO_TAG_U,
2598
+ GUMBO_TAG_LAST)) {
2599
+ reconstruct_active_formatting_elements(parser);
2600
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2601
+ return true;
2602
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2603
+ bool result = true;
2604
+ reconstruct_active_formatting_elements(parser);
2605
+ if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2606
+ result = false;
2607
+ add_parse_error(parser, token);
2608
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2609
+ reconstruct_active_formatting_elements(parser);
2610
+ }
2611
+ insert_element_from_token(parser, token);
2612
+ add_formatting_element(parser, get_current_node(parser));
2613
+ return result;
2614
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_A, GUMBO_TAG_B, GUMBO_TAG_BIG,
2615
+ GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2616
+ GUMBO_TAG_NOBR, GUMBO_TAG_S, GUMBO_TAG_SMALL,
2617
+ GUMBO_TAG_STRIKE, GUMBO_TAG_STRONG, GUMBO_TAG_TT,
2618
+ GUMBO_TAG_U, GUMBO_TAG_LAST)) {
2619
+ return adoption_agency_algorithm(parser, token, token->v.end_tag);
2620
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2621
+ GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2622
+ reconstruct_active_formatting_elements(parser);
2623
+ insert_element_from_token(parser, token);
2624
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2625
+ set_frameset_not_ok(parser);
2626
+ return true;
2627
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2628
+ GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2629
+ GumboTag token_tag = token->v.end_tag;
2630
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
2631
+ add_parse_error(parser, token);
2632
+ ignore_token(parser);
2633
+ return false;
2634
+ }
2635
+ implicitly_close_tags(parser, token, token_tag);
2636
+ clear_active_formatting_elements(parser);
2637
+ return true;
2638
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2639
+ if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2640
+ GUMBO_DOCTYPE_QUIRKS) {
2641
+ maybe_implicitly_close_p_tag(parser, token);
2642
+ }
2643
+ insert_element_from_token(parser, token);
2644
+ set_frameset_not_ok(parser);
2645
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2646
+ return true;
2647
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_AREA, GUMBO_TAG_BR,
2648
+ GUMBO_TAG_EMBED, GUMBO_TAG_IMG, GUMBO_TAG_IMAGE,
2649
+ GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
2650
+ bool success = true;
2651
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2652
+ success = false;
2653
+ add_parse_error(parser, token);
2654
+ token->v.start_tag.tag = GUMBO_TAG_IMG;
2655
+ }
2656
+ reconstruct_active_formatting_elements(parser);
2657
+ GumboNode* node = insert_element_from_token(parser, token);
2658
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2659
+ success = false;
2660
+ add_parse_error(parser, token);
2661
+ node->v.element.tag = GUMBO_TAG_IMG;
2662
+ node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2663
+ }
2664
+ pop_current_node(parser);
2665
+ acknowledge_self_closing_tag(parser);
2666
+ set_frameset_not_ok(parser);
2667
+ return success;
2668
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2669
+ if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2670
+ // Must be before the element is inserted, as that takes ownership of the
2671
+ // token's attribute vector.
2672
+ set_frameset_not_ok(parser);
2673
+ }
2674
+ reconstruct_active_formatting_elements(parser);
2675
+ insert_element_from_token(parser, token);
2676
+ pop_current_node(parser);
2677
+ acknowledge_self_closing_tag(parser);
2678
+ return true;
2679
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_PARAM, GUMBO_TAG_SOURCE,
2680
+ GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
2681
+ insert_element_from_token(parser, token);
2682
+ pop_current_node(parser);
2683
+ acknowledge_self_closing_tag(parser);
2684
+ return true;
2685
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2686
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2687
+ insert_element_from_token(parser, token);
2688
+ pop_current_node(parser);
2689
+ acknowledge_self_closing_tag(parser);
2690
+ set_frameset_not_ok(parser);
2691
+ return result;
2692
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2693
+ add_parse_error(parser, token);
2694
+ if (parser->_parser_state->_form_element != NULL) {
2695
+ ignore_token(parser);
2696
+ return false;
2697
+ }
2698
+ acknowledge_self_closing_tag(parser);
2699
+ maybe_implicitly_close_p_tag(parser, token);
2700
+ set_frameset_not_ok(parser);
2701
+
2702
+ GumboVector* token_attrs = &token->v.start_tag.attributes;
2703
+ GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2704
+ GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2705
+ GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "isindex");
2706
+
2707
+ GumboNode* form = insert_element_of_tag_type(
2708
+ parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2709
+ if (action_attr) {
2710
+ gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2711
+ }
2712
+ insert_element_of_tag_type(parser, GUMBO_TAG_HR,
2713
+ GUMBO_INSERTION_FROM_ISINDEX);
2714
+ pop_current_node(parser); // <hr>
2715
+
2716
+ insert_element_of_tag_type(parser, GUMBO_TAG_LABEL,
2717
+ GUMBO_INSERTION_FROM_ISINDEX);
2718
+ TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2719
+ text_state->_start_original_text = token->original_text.data;
2720
+ text_state->_start_position = token->position;
2721
+ text_state->_type = GUMBO_NODE_TEXT;
2722
+ if (prompt_attr) {
2723
+ int prompt_attr_length = strlen(prompt_attr->value);
2724
+ gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2725
+ text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2726
+ text_state->_buffer.length = prompt_attr_length;
2727
+ text_state->_buffer.capacity = prompt_attr_length + 1;
2728
+ gumbo_destroy_attribute(parser, prompt_attr);
2729
+ } else {
2730
+ GumboStringPiece prompt_text = GUMBO_STRING(
2731
+ "This is a searchable index. Enter search keywords: ");
2732
+ gumbo_string_buffer_append_string(
2733
+ parser, &prompt_text, &text_state->_buffer);
2734
+ }
2735
+
2736
+ GumboNode* input = insert_element_of_tag_type(
2737
+ parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2738
+ for (int i = 0; i < token_attrs->length; ++i) {
2739
+ GumboAttribute* attr = token_attrs->data[i];
2740
+ if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2741
+ gumbo_vector_add(parser, attr, &input->v.element.attributes);
2742
+ }
2743
+ token_attrs->data[i] = NULL;
2744
+ }
2745
+
2746
+ // All attributes have been successfully transferred and nulled out at this
2747
+ // point, so the call to ignore_token will free the memory for it without
2748
+ // touching the attributes.
2749
+ ignore_token(parser);
2750
+
2751
+ GumboAttribute* name =
2752
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2753
+ GumboStringPiece name_str = GUMBO_STRING("name");
2754
+ GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2755
+ name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2756
+ name->name = gumbo_copy_stringz(parser, "name");
2757
+ name->value = gumbo_copy_stringz(parser, "isindex");
2758
+ name->original_name = name_str;
2759
+ name->original_value = isindex_str;
2760
+ name->name_start = kGumboEmptySourcePosition;
2761
+ name->name_end = kGumboEmptySourcePosition;
2762
+ name->value_start = kGumboEmptySourcePosition;
2763
+ name->value_end = kGumboEmptySourcePosition;
2764
+ gumbo_vector_add(parser, name, &input->v.element.attributes);
2765
+
2766
+ pop_current_node(parser); // <input>
2767
+ pop_current_node(parser); // <label>
2768
+ insert_element_of_tag_type(
2769
+ parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2770
+ pop_current_node(parser); // <hr>
2771
+ pop_current_node(parser); // <form>
2772
+ return false;
2773
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2774
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2775
+ parser->_parser_state->_ignore_next_linefeed = true;
2776
+ set_frameset_not_ok(parser);
2777
+ return true;
2778
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2779
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2780
+ reconstruct_active_formatting_elements(parser);
2781
+ set_frameset_not_ok(parser);
2782
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2783
+ return result;
2784
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2785
+ set_frameset_not_ok(parser);
2786
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2787
+ return true;
2788
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2789
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2790
+ return true;
2791
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2792
+ reconstruct_active_formatting_elements(parser);
2793
+ insert_element_from_token(parser, token);
2794
+ set_frameset_not_ok(parser);
2795
+ GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2796
+ if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2797
+ state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2798
+ state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2799
+ state == GUMBO_INSERTION_MODE_IN_ROW ||
2800
+ state == GUMBO_INSERTION_MODE_IN_CELL) {
2801
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2802
+ } else {
2803
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2804
+ }
2805
+ return true;
2806
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
2807
+ GUMBO_TAG_LAST)) {
2808
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2809
+ pop_current_node(parser);
2810
+ }
2811
+ reconstruct_active_formatting_elements(parser);
2812
+ insert_element_from_token(parser, token);
2813
+ return true;
2814
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_RP, GUMBO_TAG_RT,
2815
+ GUMBO_TAG_LAST)) {
2816
+ bool success = true;
2817
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2818
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2819
+ }
2820
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
2821
+ add_parse_error(parser, token);
2822
+ success = false;
2823
+ }
2824
+ insert_element_from_token(parser, token);
2825
+ return success;
2826
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2827
+ add_parse_error(parser, token);
2828
+ reconstruct_active_formatting_elements(parser);
2829
+ insert_element_of_tag_type(
2830
+ parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2831
+ pop_current_node(parser);
2832
+ return false;
2833
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2834
+ reconstruct_active_formatting_elements(parser);
2835
+ adjust_mathml_attributes(parser, token);
2836
+ adjust_foreign_attributes(parser, token);
2837
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2838
+ if (token->v.start_tag.is_self_closing) {
2839
+ pop_current_node(parser);
2840
+ acknowledge_self_closing_tag(parser);
2841
+ }
2842
+ return true;
2843
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2844
+ reconstruct_active_formatting_elements(parser);
2845
+ adjust_svg_attributes(parser, token);
2846
+ adjust_foreign_attributes(parser, token);
2847
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2848
+ if (token->v.start_tag.is_self_closing) {
2849
+ pop_current_node(parser);
2850
+ acknowledge_self_closing_tag(parser);
2851
+ }
2852
+ return true;
2853
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
2854
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_FRAME, GUMBO_TAG_HEAD,
2855
+ GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2856
+ GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2857
+ GUMBO_TAG_LAST)) {
2858
+ add_parse_error(parser, token);
2859
+ ignore_token(parser);
2860
+ return false;
2861
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
2862
+ reconstruct_active_formatting_elements(parser);
2863
+ insert_element_from_token(parser, token);
2864
+ return true;
2865
+ } else {
2866
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2867
+ GumboTag end_tag = token->v.end_tag;
2868
+ assert(state->_open_elements.length > 0);
2869
+ assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2870
+ // Walk up the stack of open elements until we find one that either:
2871
+ // a) Matches the tag name we saw
2872
+ // b) Is in the "special" category.
2873
+ // If we see a), implicitly close everything up to and including it. If we
2874
+ // see b), then record a parse error, don't close anything (except the
2875
+ // implied end tags) and ignore the end tag token.
2876
+ for (int i = state->_open_elements.length - 1; ; --i) {
2877
+ const GumboNode* node = state->_open_elements.data[i];
2878
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
2879
+ node_tag_is(node, end_tag)) {
2880
+ generate_implied_end_tags(parser, end_tag);
2881
+ // TODO(jdtang): Do I need to add a parse error here? The condition in
2882
+ // the spec seems like it's the inverse of the loop condition above, and
2883
+ // so would never fire.
2884
+ while (node != pop_current_node(parser)); // Pop everything.
2885
+ return true;
2886
+ } else if (is_special_node(node)) {
2887
+ add_parse_error(parser, token);
2888
+ ignore_token(parser);
2889
+ return false;
2890
+ }
2891
+ }
2892
+ // <html> is in the special category, so we should never get here.
2893
+ assert(0);
2894
+ return false;
2895
+ }
2896
+ }
2897
+
2898
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
2899
+ static bool handle_text(GumboParser* parser, GumboToken* token) {
2900
+ if (token->type == GUMBO_TOKEN_CHARACTER || token->type == GUMBO_TOKEN_WHITESPACE) {
2901
+ insert_text_token(parser, token);
2902
+ } else {
2903
+ // We provide only bare-bones script handling that doesn't involve any of
2904
+ // the parser-pause/already-started/script-nesting flags or re-entrant
2905
+ // invocations of the tokenizer. Because the intended usage of this library
2906
+ // is mostly for templating, refactoring, and static-analysis libraries, we
2907
+ // provide the script body as a text-node child of the <script> element.
2908
+ // This behavior doesn't support document.write of partial HTML elements,
2909
+ // but should be adequate for almost all other scripting support.
2910
+ if (token->type == GUMBO_TOKEN_EOF) {
2911
+ add_parse_error(parser, token);
2912
+ parser->_parser_state->_reprocess_current_token = true;
2913
+ }
2914
+ pop_current_node(parser);
2915
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
2916
+ }
2917
+ return true;
2918
+ }
2919
+
2920
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
2921
+ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
2922
+ GumboParserState* state = parser->_parser_state;
2923
+ if (token->type == GUMBO_TOKEN_CHARACTER ||
2924
+ token->type == GUMBO_TOKEN_WHITESPACE) {
2925
+ // The "pending table character tokens" list described in the spec is
2926
+ // nothing more than the TextNodeBufferState. We accumulate text tokens as
2927
+ // normal, except that when we go to flush them in the handle_in_table_text,
2928
+ // we set _foster_parent_insertions if there're non-whitespace characters in
2929
+ // the buffer.
2930
+ assert(state->_text_node._buffer.length == 0);
2931
+ state->_original_insertion_mode = state->_insertion_mode;
2932
+ state->_reprocess_current_token = true;
2933
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
2934
+ return true;
2935
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2936
+ add_parse_error(parser, token);
2937
+ ignore_token(parser);
2938
+ return false;
2939
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2940
+ append_comment_node(parser, get_current_node(parser), token);
2941
+ return true;
2942
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
2943
+ clear_stack_to_table_context(parser);
2944
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2945
+ insert_element_from_token(parser, token);
2946
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
2947
+ return true;
2948
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
2949
+ clear_stack_to_table_context(parser);
2950
+ insert_element_from_token(parser, token);
2951
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2952
+ return true;
2953
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
2954
+ clear_stack_to_table_context(parser);
2955
+ insert_element_of_tag_type(
2956
+ parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
2957
+ parser->_parser_state->_reprocess_current_token = true;
2958
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2959
+ return true;
2960
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
2961
+ GUMBO_TAG_THEAD, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2962
+ GUMBO_TAG_LAST)) {
2963
+ clear_stack_to_table_context(parser);
2964
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
2965
+ if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2966
+ GUMBO_TAG_LAST)) {
2967
+ insert_element_of_tag_type(
2968
+ parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
2969
+ state->_reprocess_current_token = true;
2970
+ } else {
2971
+ insert_element_from_token(parser, token);
2972
+ }
2973
+ return true;
2974
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2975
+ add_parse_error(parser, token);
2976
+ if (close_table(parser)) {
2977
+ parser->_parser_state->_reprocess_current_token = true;
2978
+ } else {
2979
+ ignore_token(parser);
2980
+ }
2981
+ return false;
2982
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
2983
+ if (!close_table(parser)) {
2984
+ add_parse_error(parser, token);
2985
+ return false;
2986
+ }
2987
+ return true;
2988
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
2989
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
2990
+ GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2991
+ GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2992
+ GUMBO_TAG_LAST)) {
2993
+ add_parse_error(parser, token);
2994
+ ignore_token(parser);
2995
+ return false;
2996
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
2997
+ GUMBO_TAG_LAST)) {
2998
+ return handle_in_head(parser, token);
2999
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3000
+ attribute_matches(&token->v.start_tag.attributes,
3001
+ "type", "hidden")) {
3002
+ add_parse_error(parser, token);
3003
+ insert_element_from_token(parser, token);
3004
+ pop_current_node(parser);
3005
+ return false;
3006
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3007
+ add_parse_error(parser, token);
3008
+ if (state->_form_element) {
3009
+ ignore_token(parser);
3010
+ return false;
3011
+ }
3012
+ state->_form_element = insert_element_from_token(parser, token);
3013
+ pop_current_node(parser);
3014
+ return false;
3015
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3016
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3017
+ add_parse_error(parser, token);
3018
+ return false;
3019
+ }
3020
+ return true;
3021
+ } else {
3022
+ add_parse_error(parser, token);
3023
+ state->_foster_parent_insertions = true;
3024
+ bool result = handle_in_body(parser, token);
3025
+ state->_foster_parent_insertions = false;
3026
+ return result;
3027
+ }
3028
+ }
3029
+
3030
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
3031
+ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3032
+ if (token->type == GUMBO_TOKEN_NULL) {
3033
+ add_parse_error(parser, token);
3034
+ ignore_token(parser);
3035
+ return false;
3036
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3037
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3038
+ insert_text_token(parser, token);
3039
+ return true;
3040
+ } else {
3041
+ GumboParserState* state = parser->_parser_state;
3042
+ GumboStringBuffer* buffer = &state->_text_node._buffer;
3043
+ // Can't use strspn for this because GumboStringBuffers are not
3044
+ // null-terminated.
3045
+ // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3046
+ // of any one byte that is not whitespace means we flip the flag, so this
3047
+ // loop is still valid.
3048
+ for (int i = 0; i < buffer->length; ++i) {
3049
+ if (!isspace(buffer->data[i]) || buffer->data[i] == '\v') {
3050
+ state->_foster_parent_insertions = true;
3051
+ reconstruct_active_formatting_elements(parser);
3052
+ break;
3053
+ }
3054
+ }
3055
+ maybe_flush_text_node_buffer(parser);
3056
+ state->_foster_parent_insertions = false;
3057
+ state->_reprocess_current_token = true;
3058
+ state->_insertion_mode = state->_original_insertion_mode;
3059
+ return true;
3060
+ }
3061
+ }
3062
+
3063
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3064
+ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3065
+ if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3066
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3067
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3068
+ GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3069
+ tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3070
+ GUMBO_TAG_LAST)) {
3071
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3072
+ add_parse_error(parser, token);
3073
+ ignore_token(parser);
3074
+ return false;
3075
+ }
3076
+ if (!tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3077
+ add_parse_error(parser, token);
3078
+ parser->_parser_state->_reprocess_current_token = true;
3079
+ }
3080
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3081
+ bool result = true;
3082
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3083
+ add_parse_error(parser, token);
3084
+ while (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3085
+ pop_current_node(parser);
3086
+ }
3087
+ result = false;
3088
+ }
3089
+ pop_current_node(parser); // The <caption> itself.
3090
+ clear_active_formatting_elements(parser);
3091
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3092
+ return result;
3093
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_COL,
3094
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML, GUMBO_TAG_TBODY,
3095
+ GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
3096
+ GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3097
+ add_parse_error(parser, token);
3098
+ ignore_token(parser);
3099
+ return false;
3100
+ } else {
3101
+ return handle_in_body(parser, token);
3102
+ }
3103
+ }
3104
+
3105
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
3106
+ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3107
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3108
+ insert_text_token(parser, token);
3109
+ return true;
3110
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3111
+ add_parse_error(parser, token);
3112
+ ignore_token(parser);
3113
+ return false;
3114
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3115
+ append_comment_node(parser, get_current_node(parser), token);
3116
+ return true;
3117
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3118
+ return handle_in_body(parser, token);
3119
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3120
+ insert_element_from_token(parser, token);
3121
+ pop_current_node(parser);
3122
+ acknowledge_self_closing_tag(parser);
3123
+ return true;
3124
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3125
+ add_parse_error(parser, token);
3126
+ ignore_token(parser);
3127
+ return false;
3128
+ } else if (token->type == GUMBO_TOKEN_EOF &&
3129
+ get_current_node(parser) == parser->_output->root) {
3130
+ return true;
3131
+ } else {
3132
+ if (get_current_node(parser) == parser->_output->root) {
3133
+ add_parse_error(parser, token);
3134
+ return false;
3135
+ }
3136
+ assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
3137
+ pop_current_node(parser);
3138
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3139
+ if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3140
+ parser->_parser_state->_reprocess_current_token = true;
3141
+ }
3142
+ return true;
3143
+ }
3144
+ }
3145
+
3146
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
3147
+ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3148
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3149
+ clear_stack_to_table_body_context(parser);
3150
+ insert_element_from_token(parser, token);
3151
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3152
+ return true;
3153
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH,
3154
+ GUMBO_TAG_LAST)) {
3155
+ add_parse_error(parser, token);
3156
+ clear_stack_to_table_body_context(parser);
3157
+ insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3158
+ parser->_parser_state->_reprocess_current_token = true;
3159
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3160
+ return false;
3161
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3162
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
3163
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3164
+ add_parse_error(parser, token);
3165
+ ignore_token(parser);
3166
+ return false;
3167
+ }
3168
+ clear_stack_to_table_body_context(parser);
3169
+ pop_current_node(parser);
3170
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3171
+ return true;
3172
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3173
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3174
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
3175
+ tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3176
+ if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3177
+ has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3178
+ has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3179
+ add_parse_error(parser, token);
3180
+ ignore_token(parser);
3181
+ return false;
3182
+ }
3183
+ clear_stack_to_table_body_context(parser);
3184
+ pop_current_node(parser);
3185
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3186
+ parser->_parser_state->_reprocess_current_token = true;
3187
+ return true;
3188
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3189
+ GUMBO_TAG_COL, GUMBO_TAG_TR, GUMBO_TAG_COLGROUP,
3190
+ GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST))
3191
+ {
3192
+ add_parse_error(parser, token);
3193
+ ignore_token(parser);
3194
+ return false;
3195
+ } else {
3196
+ return handle_in_table(parser, token);
3197
+ }
3198
+ }
3199
+
3200
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3201
+ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3202
+ if (tag_in(token, kStartTag, GUMBO_TAG_TH, GUMBO_TAG_TD, GUMBO_TAG_LAST)) {
3203
+ clear_stack_to_table_row_context(parser);
3204
+ insert_element_from_token(parser, token);
3205
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3206
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3207
+ return true;
3208
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP,
3209
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3210
+ GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3211
+ tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
3212
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3213
+ GUMBO_TAG_LAST)) {
3214
+ // This case covers 4 clauses of the spec, each of which say "Otherwise, act
3215
+ // as if an end tag with the tag name "tr" had been seen." The differences
3216
+ // are in error handling and whether the current token is reprocessed.
3217
+ GumboTag desired_tag =
3218
+ tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3219
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST)
3220
+ ? token->v.end_tag : GUMBO_TAG_TR;
3221
+ if (!has_an_element_in_table_scope(parser, desired_tag)) {
3222
+ gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
3223
+ gumbo_normalized_tagname(desired_tag));
3224
+ for (int i = 0; i < parser->_parser_state->_open_elements.length; ++i) {
3225
+ const GumboNode* node = parser->_parser_state->_open_elements.data[i];
3226
+ gumbo_debug("%s\n", gumbo_normalized_tagname(node->v.element.tag));
3227
+ }
3228
+ add_parse_error(parser, token);
3229
+ ignore_token(parser);
3230
+ return false;
3231
+ }
3232
+ clear_stack_to_table_row_context(parser);
3233
+ GumboNode* last_element = pop_current_node(parser);
3234
+ assert(node_tag_is(last_element, GUMBO_TAG_TR));
3235
+ AVOID_UNUSED_VARIABLE_WARNING(last_element);
3236
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3237
+ if (!tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3238
+ parser->_parser_state->_reprocess_current_token = true;
3239
+ }
3240
+ return true;
3241
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3242
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3243
+ GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3244
+ add_parse_error(parser, token);
3245
+ ignore_token(parser);
3246
+ return false;
3247
+ } else {
3248
+ return handle_in_table(parser, token);
3249
+ }
3250
+ }
3251
+
3252
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3253
+ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3254
+ if (tag_in(token, kEndTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3255
+ GumboTag token_tag = token->v.end_tag;
3256
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
3257
+ add_parse_error(parser, token);
3258
+ return false;
3259
+ }
3260
+ return close_table_cell(parser, token, token_tag);
3261
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3262
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3263
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3264
+ GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3265
+ gumbo_debug("Handling <td> in cell.\n");
3266
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3267
+ !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3268
+ gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3269
+ add_parse_error(parser, token);
3270
+ ignore_token(parser);
3271
+ return false;
3272
+ }
3273
+ parser->_parser_state->_reprocess_current_token = true;
3274
+ return close_current_cell(parser, token);
3275
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3276
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3277
+ GUMBO_TAG_LAST)) {
3278
+ add_parse_error(parser, token);
3279
+ ignore_token(parser);
3280
+ return false;
3281
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
3282
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3283
+ GUMBO_TAG_LAST)) {
3284
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3285
+ add_parse_error(parser, token);
3286
+ ignore_token(parser);
3287
+ return false;
3288
+ }
3289
+ parser->_parser_state->_reprocess_current_token = true;
3290
+ return close_current_cell(parser, token);
3291
+ } else {
3292
+ return handle_in_body(parser, token);
3293
+ }
3294
+ }
3295
+
3296
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3297
+ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3298
+ if (token->type == GUMBO_TOKEN_NULL) {
3299
+ add_parse_error(parser, token);
3300
+ ignore_token(parser);
3301
+ return false;
3302
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3303
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3304
+ insert_text_token(parser, token);
3305
+ return true;
3306
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3307
+ add_parse_error(parser, token);
3308
+ ignore_token(parser);
3309
+ return false;
3310
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3311
+ append_comment_node(parser, get_current_node(parser), token);
3312
+ return true;
3313
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3314
+ return handle_in_body(parser, token);
3315
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3316
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3317
+ pop_current_node(parser);
3318
+ }
3319
+ insert_element_from_token(parser, token);
3320
+ return true;
3321
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3322
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3323
+ pop_current_node(parser);
3324
+ }
3325
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3326
+ pop_current_node(parser);
3327
+ }
3328
+ insert_element_from_token(parser, token);
3329
+ return true;
3330
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3331
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
3332
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3333
+ node_tag_is(open_elements->data[open_elements->length - 2],
3334
+ GUMBO_TAG_OPTGROUP)) {
3335
+ pop_current_node(parser);
3336
+ }
3337
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3338
+ pop_current_node(parser);
3339
+ return true;
3340
+ } else {
3341
+ add_parse_error(parser, token);
3342
+ ignore_token(parser);
3343
+ return false;
3344
+ }
3345
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3346
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3347
+ pop_current_node(parser);
3348
+ return true;
3349
+ } else {
3350
+ add_parse_error(parser, token);
3351
+ ignore_token(parser);
3352
+ return false;
3353
+ }
3354
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3355
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3356
+ add_parse_error(parser, token);
3357
+ ignore_token(parser);
3358
+ return false;
3359
+ }
3360
+ close_current_select(parser);
3361
+ return true;
3362
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3363
+ add_parse_error(parser, token);
3364
+ ignore_token(parser);
3365
+ close_current_select(parser);
3366
+ return false;
3367
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_INPUT, GUMBO_TAG_KEYGEN,
3368
+ GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
3369
+ add_parse_error(parser, token);
3370
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3371
+ ignore_token(parser);
3372
+ } else {
3373
+ close_current_select(parser);
3374
+ parser->_parser_state->_reprocess_current_token = true;
3375
+ }
3376
+ return false;
3377
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
3378
+ return handle_in_head(parser, token);
3379
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3380
+ if (get_current_node(parser) != parser->_output->root) {
3381
+ add_parse_error(parser, token);
3382
+ return false;
3383
+ }
3384
+ return true;
3385
+ } else {
3386
+ add_parse_error(parser, token);
3387
+ ignore_token(parser);
3388
+ return false;
3389
+ }
3390
+ }
3391
+
3392
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3393
+ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3394
+ if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3395
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3396
+ GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3397
+ add_parse_error(parser, token);
3398
+ close_current_select(parser);
3399
+ parser->_parser_state->_reprocess_current_token = true;
3400
+ return false;
3401
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3402
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3403
+ GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3404
+ add_parse_error(parser, token);
3405
+ if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
3406
+ close_current_select(parser);
3407
+ reset_insertion_mode_appropriately(parser);
3408
+ parser->_parser_state->_reprocess_current_token = true;
3409
+ } else {
3410
+ ignore_token(parser);
3411
+ }
3412
+ return false;
3413
+ } else {
3414
+ return handle_in_select(parser, token);
3415
+ }
3416
+ }
3417
+
3418
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
3419
+ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3420
+ if (token->type == GUMBO_TOKEN_WHITESPACE ||
3421
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3422
+ return handle_in_body(parser, token);
3423
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3424
+ GumboNode* html_node = parser->_output->root;
3425
+ assert(html_node != NULL);
3426
+ append_comment_node(parser, html_node, token);
3427
+ return true;
3428
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3429
+ add_parse_error(parser, token);
3430
+ ignore_token(parser);
3431
+ return false;
3432
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3433
+ // TODO(jdtang): Handle fragment parsing algorithm case.
3434
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3435
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
3436
+ assert(node_tag_is(html, GUMBO_TAG_HTML));
3437
+ record_end_of_element(
3438
+ parser->_parser_state->_current_token, &html->v.element);
3439
+ return true;
3440
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3441
+ return true;
3442
+ } else {
3443
+ add_parse_error(parser, token);
3444
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3445
+ parser->_parser_state->_reprocess_current_token = true;
3446
+ return false;
3447
+ }
3448
+ }
3449
+
3450
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
3451
+ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3452
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3453
+ insert_text_token(parser, token);
3454
+ return true;
3455
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3456
+ append_comment_node(parser, get_current_node(parser), token);
3457
+ return true;
3458
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3459
+ add_parse_error(parser, token);
3460
+ ignore_token(parser);
3461
+ return false;
3462
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3463
+ return handle_in_body(parser, token);
3464
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3465
+ insert_element_from_token(parser, token);
3466
+ return true;
3467
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3468
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3469
+ add_parse_error(parser, token);
3470
+ ignore_token(parser);
3471
+ return false;
3472
+ }
3473
+ pop_current_node(parser);
3474
+ // TODO(jdtang): Add a condition to ignore this for the fragment parsing
3475
+ // algorithm.
3476
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3477
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3478
+ }
3479
+ return true;
3480
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3481
+ insert_element_from_token(parser, token);
3482
+ pop_current_node(parser);
3483
+ acknowledge_self_closing_tag(parser);
3484
+ return true;
3485
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3486
+ return handle_in_head(parser, token);
3487
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3488
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3489
+ add_parse_error(parser, token);
3490
+ return false;
3491
+ }
3492
+ return true;
3493
+ } else {
3494
+ add_parse_error(parser, token);
3495
+ ignore_token(parser);
3496
+ return false;
3497
+ }
3498
+ }
3499
+
3500
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
3501
+ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3502
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3503
+ insert_text_token(parser, token);
3504
+ return true;
3505
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3506
+ append_comment_node(parser, get_current_node(parser), token);
3507
+ return true;
3508
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3509
+ add_parse_error(parser, token);
3510
+ ignore_token(parser);
3511
+ return false;
3512
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3513
+ return handle_in_body(parser, token);
3514
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3515
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3516
+ return true;
3517
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3518
+ return handle_in_head(parser, token);
3519
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3520
+ return true;
3521
+ } else {
3522
+ add_parse_error(parser, token);
3523
+ ignore_token(parser);
3524
+ return false;
3525
+ }
3526
+ }
3527
+
3528
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
3529
+ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3530
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3531
+ append_comment_node(parser, get_document_node(parser), token);
3532
+ return true;
3533
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3534
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3535
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3536
+ return handle_in_body(parser, token);
3537
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3538
+ return true;
3539
+ } else {
3540
+ add_parse_error(parser, token);
3541
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3542
+ parser->_parser_state->_reprocess_current_token = true;
3543
+ return false;
3544
+ }
3545
+ }
3546
+
3547
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
3548
+ static bool handle_after_after_frameset(
3549
+ GumboParser* parser, GumboToken* token) {
3550
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3551
+ append_comment_node(parser, get_document_node(parser), token);
3552
+ return true;
3553
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3554
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3555
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3556
+ return handle_in_body(parser, token);
3557
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3558
+ return true;
3559
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3560
+ return handle_in_head(parser, token);
3561
+ } else {
3562
+ add_parse_error(parser, token);
3563
+ ignore_token(parser);
3564
+ return false;
3565
+ }
3566
+ }
3567
+
3568
+ // Function pointers for each insertion mode. Keep in sync with
3569
+ // insertion_mode.h.
3570
+ typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3571
+ static const TokenHandler kTokenHandlers[] = {
3572
+ handle_initial,
3573
+ handle_before_html,
3574
+ handle_before_head,
3575
+ handle_in_head,
3576
+ handle_in_head_noscript,
3577
+ handle_after_head,
3578
+ handle_in_body,
3579
+ handle_text,
3580
+ handle_in_table,
3581
+ handle_in_table_text,
3582
+ handle_in_caption,
3583
+ handle_in_column_group,
3584
+ handle_in_table_body,
3585
+ handle_in_row,
3586
+ handle_in_cell,
3587
+ handle_in_select,
3588
+ handle_in_select_in_table,
3589
+ handle_after_body,
3590
+ handle_in_frameset,
3591
+ handle_after_frameset,
3592
+ handle_after_after_body,
3593
+ handle_after_after_frameset
3594
+ };
3595
+
3596
+ static bool handle_html_content(GumboParser* parser, GumboToken* token) {
3597
+ return kTokenHandlers[parser->_parser_state->_insertion_mode](
3598
+ parser, token);
3599
+ }
3600
+
3601
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
3602
+ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3603
+ switch (token->type) {
3604
+ case GUMBO_TOKEN_NULL:
3605
+ add_parse_error(parser, token);
3606
+ token->type = GUMBO_TOKEN_CHARACTER;
3607
+ token->v.character = kUtf8ReplacementChar;
3608
+ insert_text_token(parser, token);
3609
+ return false;
3610
+ case GUMBO_TOKEN_WHITESPACE:
3611
+ insert_text_token(parser, token);
3612
+ return true;
3613
+ case GUMBO_TOKEN_CHARACTER:
3614
+ insert_text_token(parser, token);
3615
+ set_frameset_not_ok(parser);
3616
+ return true;
3617
+ case GUMBO_TOKEN_COMMENT:
3618
+ append_comment_node(parser, get_current_node(parser), token);
3619
+ return true;
3620
+ case GUMBO_TOKEN_DOCTYPE:
3621
+ add_parse_error(parser, token);
3622
+ ignore_token(parser);
3623
+ return false;
3624
+ default:
3625
+ // Fall through to the if-statements below.
3626
+ break;
3627
+ }
3628
+ // Order matters for these clauses.
3629
+ if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
3630
+ GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BODY, GUMBO_TAG_BR,
3631
+ GUMBO_TAG_CENTER, GUMBO_TAG_CODE, GUMBO_TAG_DD, GUMBO_TAG_DIV,
3632
+ GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EM, GUMBO_TAG_EMBED,
3633
+ GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3634
+ GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD, GUMBO_TAG_HR,
3635
+ GUMBO_TAG_I, GUMBO_TAG_IMG, GUMBO_TAG_LI, GUMBO_TAG_LISTING,
3636
+ GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NOBR, GUMBO_TAG_OL,
3637
+ GUMBO_TAG_P, GUMBO_TAG_PRE, GUMBO_TAG_RUBY, GUMBO_TAG_S,
3638
+ GUMBO_TAG_SMALL, GUMBO_TAG_SPAN, GUMBO_TAG_STRONG,
3639
+ GUMBO_TAG_STRIKE, GUMBO_TAG_SUB, GUMBO_TAG_SUP,
3640
+ GUMBO_TAG_TABLE, GUMBO_TAG_TT, GUMBO_TAG_U, GUMBO_TAG_UL,
3641
+ GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
3642
+ (tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
3643
+ token_has_attribute(token, "color") ||
3644
+ token_has_attribute(token, "face") ||
3645
+ token_has_attribute(token, "size")))) {
3646
+ add_parse_error(parser, token);
3647
+ do {
3648
+ pop_current_node(parser);
3649
+ } while(!(is_mathml_integration_point(get_current_node(parser)) ||
3650
+ is_html_integration_point(get_current_node(parser)) ||
3651
+ get_current_node(parser)->v.element.tag_namespace ==
3652
+ GUMBO_NAMESPACE_HTML));
3653
+ parser->_parser_state->_reprocess_current_token = true;
3654
+ return false;
3655
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
3656
+ const GumboNamespaceEnum current_namespace =
3657
+ get_current_node(parser)->v.element.tag_namespace;
3658
+ if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3659
+ adjust_mathml_attributes(parser, token);
3660
+ }
3661
+ if (current_namespace == GUMBO_NAMESPACE_SVG) {
3662
+ // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3663
+ // function.
3664
+ adjust_svg_attributes(parser, token);
3665
+ }
3666
+ adjust_foreign_attributes(parser, token);
3667
+ insert_foreign_element(parser, token, current_namespace);
3668
+ if (token->v.start_tag.is_self_closing) {
3669
+ pop_current_node(parser);
3670
+ acknowledge_self_closing_tag(parser);
3671
+ }
3672
+ return true;
3673
+ // </script> tags are handled like any other end tag, putting the script's
3674
+ // text into a text node child and closing the current node.
3675
+ } else {
3676
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3677
+ GumboNode* node = get_current_node(parser);
3678
+ assert(node != NULL);
3679
+ GumboStringPiece token_tagname = token->original_text;
3680
+ GumboStringPiece node_tagname = node->v.element.original_tag;
3681
+ gumbo_tag_from_original_text(&token_tagname);
3682
+ gumbo_tag_from_original_text(&node_tagname);
3683
+
3684
+ bool is_success = true;
3685
+ if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3686
+ add_parse_error(parser, token);
3687
+ is_success = false;
3688
+ }
3689
+ int i = parser->_parser_state->_open_elements.length - 1;
3690
+ while (i > 0) {
3691
+ // Here we move up the stack until we find an HTML element (in which
3692
+ // case we do nothing) or we find the element that we're about to
3693
+ // close (in which case we pop everything we've seen until that
3694
+ // point.)
3695
+ gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3696
+ node_tagname.data, i);
3697
+ if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3698
+ gumbo_debug("Matches.\n");
3699
+ while (pop_current_node(parser) != node) {
3700
+ // Pop all the nodes below the current one. Node is guaranteed to
3701
+ // be an element on the stack of open elements (set below), so
3702
+ // this loop is guaranteed to terminate.
3703
+ }
3704
+ return is_success;
3705
+ }
3706
+ --i;
3707
+ node = parser->_parser_state->_open_elements.data[i];
3708
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3709
+ // Must break before gumbo_tag_from_original_text to avoid passing
3710
+ // parser-inserted nodes through.
3711
+ break;
3712
+ }
3713
+ node_tagname = node->v.element.original_tag;
3714
+ gumbo_tag_from_original_text(&node_tagname);
3715
+ }
3716
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3717
+ // We can't call handle_token directly because the current node is still in
3718
+ // the SVG namespace, so it would re-enter this and result in infinite
3719
+ // recursion.
3720
+ return handle_html_content(parser, token) && is_success;
3721
+ }
3722
+ }
3723
+
3724
+
3725
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
3726
+ static bool handle_token(GumboParser* parser, GumboToken* token) {
3727
+ if (parser->_parser_state->_ignore_next_linefeed &&
3728
+ token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3729
+ parser->_parser_state->_ignore_next_linefeed = false;
3730
+ ignore_token(parser);
3731
+ return true;
3732
+ }
3733
+ // This needs to be reset both here and in the conditional above to catch both
3734
+ // the case where the next token is not whitespace (so we don't ignore
3735
+ // whitespace in the middle of <pre> tags) and where there are multiple
3736
+ // whitespace tokens (so we don't ignore the second one).
3737
+ parser->_parser_state->_ignore_next_linefeed = false;
3738
+
3739
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3740
+ parser->_parser_state->_closed_body_tag = true;
3741
+ }
3742
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3743
+ parser->_parser_state->_closed_html_tag = true;
3744
+ }
3745
+
3746
+ const GumboNode* current_node = get_current_node(parser);
3747
+ assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT);
3748
+ if (current_node) {
3749
+ gumbo_debug("Current node: <%s>.\n",
3750
+ gumbo_normalized_tagname(current_node->v.element.tag));
3751
+ }
3752
+ if (!current_node ||
3753
+ current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3754
+ (is_mathml_integration_point(current_node) &&
3755
+ (token->type == GUMBO_TOKEN_CHARACTER ||
3756
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3757
+ token->type == GUMBO_TOKEN_NULL ||
3758
+ (token->type == GUMBO_TOKEN_START_TAG &&
3759
+ !tag_in(token, kStartTag, GUMBO_TAG_MGLYPH, GUMBO_TAG_MALIGNMARK,
3760
+ GUMBO_TAG_LAST)))) ||
3761
+ (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3762
+ node_tag_is(current_node, GUMBO_TAG_ANNOTATION_XML) &&
3763
+ tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3764
+ (is_html_integration_point(current_node) && (
3765
+ token->type == GUMBO_TOKEN_START_TAG ||
3766
+ token->type == GUMBO_TOKEN_CHARACTER ||
3767
+ token->type == GUMBO_TOKEN_NULL ||
3768
+ token->type == GUMBO_TOKEN_WHITESPACE)) ||
3769
+ token->type == GUMBO_TOKEN_EOF) {
3770
+ return handle_html_content(parser, token);
3771
+ } else {
3772
+ return handle_in_foreign_content(parser, token);
3773
+ }
3774
+ }
3775
+
3776
+ GumboOutput* gumbo_parse(const char* buffer) {
3777
+ return gumbo_parse_with_options(
3778
+ &kGumboDefaultOptions, buffer, strlen(buffer));
3779
+ }
3780
+
3781
+ GumboOutput* gumbo_parse_with_options(
3782
+ const GumboOptions* options, const char* buffer, size_t length) {
3783
+ GumboParser parser;
3784
+ parser._options = options;
3785
+ output_init(&parser);
3786
+ gumbo_tokenizer_state_init(&parser, buffer, length);
3787
+ parser_state_init(&parser);
3788
+
3789
+ GumboParserState* state = parser._parser_state;
3790
+ gumbo_debug("Parsing %.*s.\n", length, buffer);
3791
+
3792
+ // Sanity check so that infinite loops die with an assertion failure instead
3793
+ // of hanging the process before we ever get an error.
3794
+ int loop_count = 0;
3795
+
3796
+ GumboToken token;
3797
+ bool has_error = false;
3798
+ do {
3799
+ if (state->_reprocess_current_token) {
3800
+ state->_reprocess_current_token = false;
3801
+ } else {
3802
+ GumboNode* current_node = get_current_node(&parser);
3803
+ gumbo_tokenizer_set_is_current_node_foreign(
3804
+ &parser, current_node &&
3805
+ current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
3806
+ has_error = !gumbo_lex(&parser, &token) || has_error;
3807
+ }
3808
+ const char* token_type = "text";
3809
+ switch (token.type) {
3810
+ case GUMBO_TOKEN_DOCTYPE:
3811
+ token_type = "doctype";
3812
+ break;
3813
+ case GUMBO_TOKEN_START_TAG:
3814
+ token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
3815
+ break;
3816
+ case GUMBO_TOKEN_END_TAG:
3817
+ token_type = gumbo_normalized_tagname(token.v.end_tag);
3818
+ break;
3819
+ case GUMBO_TOKEN_COMMENT:
3820
+ token_type = "comment";
3821
+ break;
3822
+ default:
3823
+ break;
3824
+ }
3825
+ gumbo_debug("Handling %s token @%d:%d in state %d.\n",
3826
+ (char*) token_type, token.position.line, token.position.column,
3827
+ state->_insertion_mode);
3828
+
3829
+ state->_current_token = &token;
3830
+ state->_self_closing_flag_acknowledged =
3831
+ !(token.type == GUMBO_TOKEN_START_TAG &&
3832
+ token.v.start_tag.is_self_closing);
3833
+
3834
+ has_error = !handle_token(&parser, &token) || has_error;
3835
+
3836
+ // Check for memory leaks when ownership is transferred from start tag
3837
+ // tokens to nodes.
3838
+ assert(state->_reprocess_current_token ||
3839
+ token.type != GUMBO_TOKEN_START_TAG ||
3840
+ token.v.start_tag.attributes.data == NULL);
3841
+
3842
+ if (!state->_self_closing_flag_acknowledged) {
3843
+ GumboError* error = add_parse_error(&parser, &token);
3844
+ if (error) {
3845
+ error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
3846
+ }
3847
+ }
3848
+
3849
+ ++loop_count;
3850
+ assert(loop_count < 1000000000);
3851
+
3852
+ } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
3853
+ !(options->stop_on_first_error && has_error));
3854
+
3855
+ finish_parsing(&parser);
3856
+ // For API uniformity reasons, if the doctype still has nulls, convert them to
3857
+ // empty strings.
3858
+ GumboDocument* doc_type = &parser._output->document->v.document;
3859
+ if (doc_type->name == NULL) {
3860
+ doc_type->name = gumbo_copy_stringz(&parser, "");
3861
+ }
3862
+ if (doc_type->public_identifier == NULL) {
3863
+ doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
3864
+ }
3865
+ if (doc_type->system_identifier == NULL) {
3866
+ doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
3867
+ }
3868
+
3869
+ parser_state_destroy(&parser);
3870
+ gumbo_tokenizer_state_destroy(&parser);
3871
+ return parser._output;
3872
+ }
3873
+
3874
+ void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
3875
+ // Need a dummy GumboParser because the allocator comes along with the
3876
+ // options object.
3877
+ GumboParser parser;
3878
+ parser._options = options;
3879
+ destroy_node(&parser, node);
3880
+ }
3881
+
3882
+ void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
3883
+ // Need a dummy GumboParser because the allocator comes along with the
3884
+ // options object.
3885
+ GumboParser parser;
3886
+ parser._options = options;
3887
+ destroy_node(&parser, output->document);
3888
+ for (int i = 0; i < output->errors.length; ++i) {
3889
+ gumbo_error_destroy(&parser, output->errors.data[i]);
3890
+ }
3891
+ gumbo_vector_destroy(&parser, &output->errors);
3892
+ gumbo_parser_deallocate(&parser, output);
3893
+ }