nokogumbo 1.4.7 → 1.4.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,57 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_INSERTION_MODE_H_
18
- #define GUMBO_INSERTION_MODE_H_
19
-
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
// The insertion modes of the HTML tree-construction stage, see
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
//
// NOTE: whenever an enumerator is added here, the kTokenHandlers dispatch
// table in parser.c has to be updated as well.
typedef enum {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML,
  GUMBO_INSERTION_MODE_BEFORE_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
  GUMBO_INSERTION_MODE_AFTER_HEAD,
  GUMBO_INSERTION_MODE_IN_BODY,
  GUMBO_INSERTION_MODE_TEXT,

  // Table-related modes.
  GUMBO_INSERTION_MODE_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
  GUMBO_INSERTION_MODE_IN_CAPTION,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY,
  GUMBO_INSERTION_MODE_IN_ROW,
  GUMBO_INSERTION_MODE_IN_CELL,

  GUMBO_INSERTION_MODE_IN_SELECT,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TEMPLATE,
  GUMBO_INSERTION_MODE_AFTER_BODY,
  GUMBO_INSERTION_MODE_IN_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
} GumboInsertionMode;
52
-
53
- #ifdef __cplusplus
54
- } // extern C
55
- #endif
56
-
57
- #endif // GUMBO_INSERTION_MODE_H_
@@ -1,4188 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include <assert.h>
18
- #include <ctype.h>
19
- #include <stdarg.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "attribute.h"
25
- #include "error.h"
26
- #include "gumbo.h"
27
- #include "insertion_mode.h"
28
- #include "parser.h"
29
- #include "tokenizer.h"
30
- #include "tokenizer_states.h"
31
- #include "utf8.h"
32
- #include "util.h"
33
- #include "vector.h"
34
-
35
- #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
36
-
37
- #define GUMBO_STRING(literal) \
38
- { literal, sizeof(literal) - 1 }
39
- #define TERMINATOR \
40
- { "", 0 }
41
-
42
- typedef char gumbo_tagset[GUMBO_TAG_LAST];
43
- #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
44
- #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
45
- #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
46
-
47
- #define TAGSET_INCLUDES(tagset, namespace, tag) \
48
- (tag < GUMBO_TAG_LAST && tagset[(int) tag] == (1 << (int) namespace))
49
-
50
- // selected forward declarations as it is getting hard to find
51
- // an appropriate order
52
- static bool node_html_tag_is(const GumboNode*, GumboTag);
53
- static GumboInsertionMode get_current_template_insertion_mode(
54
- const GumboParser*);
55
- static bool handle_in_template(GumboParser*, GumboToken*);
56
- static void destroy_node(GumboParser*, GumboNode*);
57
-
58
// Allocation callback that simply forwards to malloc(); the first argument is
// ignored.
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  return malloc(size);
}

// Deallocation callback that simply forwards to free(); the first argument is
// ignored.
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
61
-
62
- const GumboOptions kGumboDefaultOptions = {&malloc_wrapper, &free_wrapper, NULL,
63
- 8, false, -1, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML};
64
-
65
- static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
66
- static const GumboStringPiece kPublicIdHtml4_0 =
67
- GUMBO_STRING("-//W3C//DTD HTML 4.0//EN");
68
- static const GumboStringPiece kPublicIdHtml4_01 =
69
- GUMBO_STRING("-//W3C//DTD HTML 4.01//EN");
70
- static const GumboStringPiece kPublicIdXhtml1_0 =
71
- GUMBO_STRING("-//W3C//DTD XHTML 1.0 Strict//EN");
72
- static const GumboStringPiece kPublicIdXhtml1_1 =
73
- GUMBO_STRING("-//W3C//DTD XHTML 1.1//EN");
74
- static const GumboStringPiece kSystemIdRecHtml4_0 =
75
- GUMBO_STRING("http://www.w3.org/TR/REC-html40/strict.dtd");
76
- static const GumboStringPiece kSystemIdHtml4 =
77
- GUMBO_STRING("http://www.w3.org/TR/html4/strict.dtd");
78
- static const GumboStringPiece kSystemIdXhtmlStrict1_1 =
79
- GUMBO_STRING("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
80
- static const GumboStringPiece kSystemIdXhtml1_1 =
81
- GUMBO_STRING("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
82
- static const GumboStringPiece kSystemIdLegacyCompat =
83
- GUMBO_STRING("about:legacy-compat");
84
-
85
- // The doctype arrays have an explicit terminator because we want to pass them
86
- // to a helper function, and passing them as a pointer discards sizeof
87
- // information. The SVG arrays are used only by one-off functions, and so loops
88
- // over them use sizeof directly instead of a terminator.
89
-
90
- static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
91
- GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
92
- GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
93
- GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
94
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
95
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
96
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
97
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
98
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
99
- GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
100
- GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
101
- GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
102
- GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
103
- GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
104
- GUMBO_STRING("-//IETF//DTD HTML 3//"),
105
- GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
106
- GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
107
- GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
108
- GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
109
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
110
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
111
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
112
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
113
- GUMBO_STRING("-//IETF//DTD HTML Strict//"),
114
- GUMBO_STRING("-//IETF//DTD HTML//"),
115
- GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
116
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
117
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
118
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
119
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
120
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
121
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
122
- GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
123
- GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
124
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
125
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
126
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
127
- GUMBO_STRING(
128
- "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
129
- "extensions to HTML 4.0//"),
130
- GUMBO_STRING(
131
- "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
132
- "extensions to HTML 4.0//"),
133
- GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
134
- GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
135
- GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
136
- GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
137
- GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
138
- GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
139
- GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
140
- GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
141
- GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
142
- GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
143
- GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
144
- GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
145
- GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
146
- GUMBO_STRING("-//W3C//DTD W3 HTML//"),
147
- GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
148
- GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
149
- GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR};
150
-
151
- static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
152
- GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
153
- GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), GUMBO_STRING("HTML"),
154
- TERMINATOR};
155
-
156
- static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
157
- GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
158
- TERMINATOR};
159
-
160
- static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
161
- GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
162
- GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR};
163
-
164
- static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] =
165
- {GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
166
- GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR};
167
-
168
// Namespace URIs, one per GumboNamespaceEnum value and indexed by it; the
// order here must stay in sync with that enum.
static const char* kLegalXmlns[] = {
    "http://www.w3.org/1999/xhtml",
    "http://www.w3.org/2000/svg",
    "http://www.w3.org/1998/Math/MathML",
};
171
-
172
- typedef struct _ReplacementEntry {
173
- const GumboStringPiece from;
174
- const GumboStringPiece to;
175
- } ReplacementEntry;
176
-
177
- #define REPLACEMENT_ENTRY(from, to) \
178
- { GUMBO_STRING(from), GUMBO_STRING(to) }
179
-
180
- // Static data for SVG attribute replacements.
181
- // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
182
- static const ReplacementEntry kSvgAttributeReplacements[] = {
183
- REPLACEMENT_ENTRY("attributename", "attributeName"),
184
- REPLACEMENT_ENTRY("attributetype", "attributeType"),
185
- REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
186
- REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
187
- REPLACEMENT_ENTRY("calcmode", "calcMode"),
188
- REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
189
- // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
190
- // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
191
- REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
192
- REPLACEMENT_ENTRY("edgemode", "edgeMode"),
193
- // REPLACEMENT_ENTRY("externalresourcesrequired",
194
- // "externalResourcesRequired"),
195
- // REPLACEMENT_ENTRY("filterres", "filterRes"),
196
- REPLACEMENT_ENTRY("filterunits", "filterUnits"),
197
- REPLACEMENT_ENTRY("glyphref", "glyphRef"),
198
- REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
199
- REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
200
- REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
201
- REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
202
- REPLACEMENT_ENTRY("keypoints", "keyPoints"),
203
- REPLACEMENT_ENTRY("keysplines", "keySplines"),
204
- REPLACEMENT_ENTRY("keytimes", "keyTimes"),
205
- REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
206
- REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
207
- REPLACEMENT_ENTRY("markerheight", "markerHeight"),
208
- REPLACEMENT_ENTRY("markerunits", "markerUnits"),
209
- REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
210
- REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
211
- REPLACEMENT_ENTRY("maskunits", "maskUnits"),
212
- REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
213
- REPLACEMENT_ENTRY("pathlength", "pathLength"),
214
- REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
215
- REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
216
- REPLACEMENT_ENTRY("patternunits", "patternUnits"),
217
- REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
218
- REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
219
- REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
220
- REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
221
- REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
222
- REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
223
- REPLACEMENT_ENTRY("refx", "refX"), REPLACEMENT_ENTRY("refy", "refY"),
224
- REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
225
- REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
226
- REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
227
- REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
228
- REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
229
- REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
230
- REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
231
- REPLACEMENT_ENTRY("startoffset", "startOffset"),
232
- REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
233
- REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
234
- REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
235
- REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
236
- REPLACEMENT_ENTRY("tablevalues", "tableValues"),
237
- REPLACEMENT_ENTRY("targetx", "targetX"),
238
- REPLACEMENT_ENTRY("targety", "targetY"),
239
- REPLACEMENT_ENTRY("textlength", "textLength"),
240
- REPLACEMENT_ENTRY("viewbox", "viewBox"),
241
- REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
242
- REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
243
- REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
244
- REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
245
- };
246
-
247
- static const ReplacementEntry kSvgTagReplacements[] = {
248
- REPLACEMENT_ENTRY("altglyph", "altGlyph"),
249
- REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
250
- REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
251
- REPLACEMENT_ENTRY("animatecolor", "animateColor"),
252
- REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
253
- REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
254
- REPLACEMENT_ENTRY("clippath", "clipPath"),
255
- REPLACEMENT_ENTRY("feblend", "feBlend"),
256
- REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
257
- REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
258
- REPLACEMENT_ENTRY("fecomposite", "feComposite"),
259
- REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
260
- REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
261
- REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
262
- REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
263
- REPLACEMENT_ENTRY("feflood", "feFlood"),
264
- REPLACEMENT_ENTRY("fefunca", "feFuncA"),
265
- REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
266
- REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
267
- REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
268
- REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
269
- REPLACEMENT_ENTRY("feimage", "feImage"),
270
- REPLACEMENT_ENTRY("femerge", "feMerge"),
271
- REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
272
- REPLACEMENT_ENTRY("femorphology", "feMorphology"),
273
- REPLACEMENT_ENTRY("feoffset", "feOffset"),
274
- REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
275
- REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
276
- REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
277
- REPLACEMENT_ENTRY("fetile", "feTile"),
278
- REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
279
- REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
280
- REPLACEMENT_ENTRY("glyphref", "glyphRef"),
281
- REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
282
- REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
283
- REPLACEMENT_ENTRY("textpath", "textPath"),
284
- };
285
-
286
- typedef struct _NamespacedAttributeReplacement {
287
- const char* from;
288
- const char* local_name;
289
- const GumboAttributeNamespaceEnum attr_namespace;
290
- } NamespacedAttributeReplacement;
291
-
292
- static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
293
- {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
294
- {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
295
- {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
296
- {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
297
- {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
298
- {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
299
- {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
300
- {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML},
301
- {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
302
- {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
303
- {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
304
- {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
305
- };
306
-
307
- // The "scope marker" for the list of active formatting elements. We use a
308
- // pointer to this as a generic marker element, since the particular element
309
- // scope doesn't matter.
310
- static const GumboNode kActiveFormattingScopeMarker;
311
-
312
- // The tag_is and tag_in function use true & false to denote start & end tags,
313
- // but for readability, we define constants for them here.
314
- static const bool kStartTag = true;
315
- static const bool kEndTag = false;
316
-
317
- // Because GumboStringPieces are immutable, we can't insert a character directly
318
- // into a text node. Instead, we accumulate all pending characters here and
319
- // flush them out to a text node whenever a new element is inserted.
320
- //
321
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
322
- typedef struct _TextNodeBufferState {
323
- // The accumulated text to be inserted into the current text node.
324
- GumboStringBuffer _buffer;
325
-
326
- // A pointer to the original text represented by this text node. Note that
327
- // because of foster parenting and other strange DOM manipulations, this may
328
- // include other non-text HTML tags in it; it is defined as the span of
329
- // original text from the first character in this text node to the last
330
- // character in this text node.
331
- const char* _start_original_text;
332
-
333
- // The source position of the start of this text node.
334
- GumboSourcePosition _start_position;
335
-
336
- // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
337
- GumboNodeType _type;
338
- } TextNodeBufferState;
339
-
340
- typedef struct GumboInternalParserState {
341
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
342
- GumboInsertionMode _insertion_mode;
343
-
344
- // Used for run_generic_parsing_algorithm, which needs to switch back to the
345
- // original insertion mode at its conclusion.
346
- GumboInsertionMode _original_insertion_mode;
347
-
348
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
349
- GumboVector /*GumboNode*/ _open_elements;
350
-
351
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
352
- GumboVector /*GumboNode*/ _active_formatting_elements;
353
-
354
- // The stack of template insertion modes.
355
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
356
- GumboVector /*InsertionMode*/ _template_insertion_modes;
357
-
358
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
359
- GumboNode* _head_element;
360
- GumboNode* _form_element;
361
-
362
- // The element used as fragment context when parsing in fragment mode
363
- GumboNode* _fragment_ctx;
364
-
365
- // The flag for when the spec says "Reprocess the current token in..."
366
- bool _reprocess_current_token;
367
-
368
- // The flag for "acknowledge the token's self-closing flag".
369
- bool _self_closing_flag_acknowledged;
370
-
371
- // The "frameset-ok" flag from the spec.
372
- bool _frameset_ok;
373
-
374
- // The flag for "If the next token is a LINE FEED, ignore that token...".
375
- bool _ignore_next_linefeed;
376
-
377
- // The flag for "whenever a node would be inserted into the current node, it
378
- // must instead be foster parented". This is used for misnested table
379
- // content, which needs to be handled according to "in body" rules yet foster
380
- // parented outside of the table.
381
- // It would perhaps be more explicit to have this as a parameter to
382
- // handle_in_body and insert_element, but given how special-purpose this is
383
- // and the number of call-sites that would need to take the extra parameter,
384
- // it's easier just to have a state flag.
385
- bool _foster_parent_insertions;
386
-
387
- // The accumulated text node buffer state.
388
- TextNodeBufferState _text_node;
389
-
390
- // The current token.
391
- GumboToken* _current_token;
392
-
393
- // The way that the spec is written, the </body> and </html> tags are *always*
394
- // implicit, because encountering one of those tokens merely switches the
395
- // insertion mode out of "in body". So we have individual state flags for
396
- // those end tags that are then inspected by pop_current_node when the <body>
397
- // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
398
- // flag appropriately.
399
- bool _closed_body_tag;
400
- bool _closed_html_tag;
401
- } GumboParserState;
402
-
403
- static bool token_has_attribute(const GumboToken* token, const char* name) {
404
- assert(token->type == GUMBO_TOKEN_START_TAG);
405
- return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
406
- }
407
-
408
- // Checks if the value of the specified attribute is a case-insensitive match
409
- // for the specified string.
410
- static bool attribute_matches(
411
- const GumboVector* attributes, const char* name, const char* value) {
412
- const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
413
- return attr ? strcasecmp(value, attr->value) == 0 : false;
414
- }
415
-
416
- // Checks if the value of the specified attribute is a case-sensitive match
417
- // for the specified string.
418
- static bool attribute_matches_case_sensitive(
419
- const GumboVector* attributes, const char* name, const char* value) {
420
- const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
421
- return attr ? strcmp(value, attr->value) == 0 : false;
422
- }
423
-
424
- // Checks if the specified attribute vectors are identical.
425
- static bool all_attributes_match(
426
- const GumboVector* attr1, const GumboVector* attr2) {
427
- unsigned int num_unmatched_attr2_elements = attr2->length;
428
- for (unsigned int i = 0; i < attr1->length; ++i) {
429
- const GumboAttribute* attr = attr1->data[i];
430
- if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
431
- --num_unmatched_attr2_elements;
432
- } else {
433
- return false;
434
- }
435
- }
436
- return num_unmatched_attr2_elements == 0;
437
- }
438
-
439
- static void set_frameset_not_ok(GumboParser* parser) {
440
- gumbo_debug("Setting frameset_ok to false.\n");
441
- parser->_parser_state->_frameset_ok = false;
442
- }
443
-
444
- static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
445
- GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
446
- node->parent = NULL;
447
- node->index_within_parent = -1;
448
- node->type = type;
449
- node->parse_flags = GUMBO_INSERTION_NORMAL;
450
- return node;
451
- }
452
-
453
- static GumboNode* new_document_node(GumboParser* parser) {
454
- GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
455
- document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
456
- gumbo_vector_init(parser, 1, &document_node->v.document.children);
457
-
458
- // Must be initialized explicitly, as there's no guarantee that we'll see a
459
- // doc type token.
460
- GumboDocument* document = &document_node->v.document;
461
- document->has_doctype = false;
462
- document->name = NULL;
463
- document->public_identifier = NULL;
464
- document->system_identifier = NULL;
465
- return document_node;
466
- }
467
-
468
- static void output_init(GumboParser* parser) {
469
- GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
470
- output->root = NULL;
471
- output->document = new_document_node(parser);
472
- parser->_output = output;
473
- gumbo_init_errors(parser);
474
- }
475
-
476
- static void parser_state_init(GumboParser* parser) {
477
- GumboParserState* parser_state =
478
- gumbo_parser_allocate(parser, sizeof(GumboParserState));
479
- parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
480
- parser_state->_reprocess_current_token = false;
481
- parser_state->_frameset_ok = true;
482
- parser_state->_ignore_next_linefeed = false;
483
- parser_state->_foster_parent_insertions = false;
484
- parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
485
- gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
486
- gumbo_vector_init(parser, 10, &parser_state->_open_elements);
487
- gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
488
- gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
489
- parser_state->_head_element = NULL;
490
- parser_state->_form_element = NULL;
491
- parser_state->_fragment_ctx = NULL;
492
- parser_state->_current_token = NULL;
493
- parser_state->_closed_body_tag = false;
494
- parser_state->_closed_html_tag = false;
495
- parser->_parser_state = parser_state;
496
- }
497
-
498
- static void parser_state_destroy(GumboParser* parser) {
499
- GumboParserState* state = parser->_parser_state;
500
- if (state->_fragment_ctx) {
501
- destroy_node(parser, state->_fragment_ctx);
502
- }
503
- gumbo_vector_destroy(parser, &state->_active_formatting_elements);
504
- gumbo_vector_destroy(parser, &state->_open_elements);
505
- gumbo_vector_destroy(parser, &state->_template_insertion_modes);
506
- gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
507
- gumbo_parser_deallocate(parser, state);
508
- }
509
-
510
- static GumboNode* get_document_node(GumboParser* parser) {
511
- return parser->_output->document;
512
- }
513
-
514
- static bool is_fragment_parser(const GumboParser* parser) {
515
- return !!parser->_parser_state->_fragment_ctx;
516
- }
517
-
518
- // Returns the node at the bottom of the stack of open elements, or NULL if no
519
- // elements have been added yet.
520
- static GumboNode* get_current_node(GumboParser* parser) {
521
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
522
- if (open_elements->length == 0) {
523
- assert(!parser->_output->root);
524
- return NULL;
525
- }
526
- assert(open_elements->length > 0);
527
- assert(open_elements->data != NULL);
528
- return open_elements->data[open_elements->length - 1];
529
- }
530
-
531
- static GumboNode* get_adjusted_current_node(GumboParser* parser) {
532
- GumboParserState* state = parser->_parser_state;
533
- if (state->_open_elements.length == 1 && state->_fragment_ctx) {
534
- return state->_fragment_ctx;
535
- }
536
- return get_current_node(parser);
537
- }
538
-
539
- // Returns true if the given needle is in the given array of literal
540
- // GumboStringPieces. If exact_match is true, this requires that they match
541
- // exactly; otherwise, this performs a prefix match to check if any of the
542
- // elements in haystack start with needle. This always performs a
543
- // case-insensitive match.
544
- static bool is_in_static_list(
545
- const char* needle, const GumboStringPiece* haystack, bool exact_match) {
546
- for (unsigned int i = 0; haystack[i].length > 0; ++i) {
547
- if ((exact_match && !strcmp(needle, haystack[i].data)) ||
548
- (!exact_match && !strcasecmp(needle, haystack[i].data))) {
549
- return true;
550
- }
551
- }
552
- return false;
553
- }
554
-
555
- static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
556
- parser->_parser_state->_insertion_mode = mode;
557
- }
558
-
559
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
560
- // This is a helper function that returns the appropriate insertion mode instead
561
- // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
562
- // indicate that there is no appropriate insertion mode, and the loop should
563
- // continue.
564
- static GumboInsertionMode get_appropriate_insertion_mode(
565
- const GumboParser* parser, int index) {
566
- const GumboVector* open_elements = &parser->_parser_state->_open_elements;
567
- const GumboNode* node = open_elements->data[index];
568
- const bool is_last = index == 0;
569
-
570
- if (is_last && is_fragment_parser(parser)) {
571
- node = parser->_parser_state->_fragment_ctx;
572
- }
573
-
574
- assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
575
- switch (node->v.element.tag) {
576
- case GUMBO_TAG_SELECT: {
577
- if (is_last) {
578
- return GUMBO_INSERTION_MODE_IN_SELECT;
579
- }
580
- for (int i = index; i > 0; --i) {
581
- const GumboNode* ancestor = open_elements->data[i];
582
- if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
583
- return GUMBO_INSERTION_MODE_IN_SELECT;
584
- }
585
- if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
586
- return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
587
- }
588
- }
589
- return GUMBO_INSERTION_MODE_IN_SELECT;
590
- }
591
- case GUMBO_TAG_TD:
592
- case GUMBO_TAG_TH:
593
- if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
594
- break;
595
- case GUMBO_TAG_TR:
596
- return GUMBO_INSERTION_MODE_IN_ROW;
597
- case GUMBO_TAG_TBODY:
598
- case GUMBO_TAG_THEAD:
599
- case GUMBO_TAG_TFOOT:
600
- return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
601
- case GUMBO_TAG_CAPTION:
602
- return GUMBO_INSERTION_MODE_IN_CAPTION;
603
- case GUMBO_TAG_COLGROUP:
604
- return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
605
- case GUMBO_TAG_TABLE:
606
- return GUMBO_INSERTION_MODE_IN_TABLE;
607
- case GUMBO_TAG_TEMPLATE:
608
- return get_current_template_insertion_mode(parser);
609
- case GUMBO_TAG_HEAD:
610
- if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
611
- break;
612
- case GUMBO_TAG_BODY:
613
- return GUMBO_INSERTION_MODE_IN_BODY;
614
- case GUMBO_TAG_FRAMESET:
615
- return GUMBO_INSERTION_MODE_IN_FRAMESET;
616
- case GUMBO_TAG_HTML:
617
- return parser->_parser_state->_head_element
618
- ? GUMBO_INSERTION_MODE_AFTER_HEAD
619
- : GUMBO_INSERTION_MODE_BEFORE_HEAD;
620
- default:
621
- break;
622
- }
623
- return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
624
- }
625
-
626
- // This performs the actual "reset the insertion mode" loop.
627
- static void reset_insertion_mode_appropriately(GumboParser* parser) {
628
- const GumboVector* open_elements = &parser->_parser_state->_open_elements;
629
- for (int i = open_elements->length; --i >= 0;) {
630
- GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i);
631
- if (mode != GUMBO_INSERTION_MODE_INITIAL) {
632
- set_insertion_mode(parser, mode);
633
- return;
634
- }
635
- }
636
- // Should never get here, because is_last will be set on the last iteration
637
- // and will force GUMBO_INSERTION_MODE_IN_BODY.
638
- assert(0);
639
- }
640
-
641
- static GumboError* parser_add_parse_error(
642
- GumboParser* parser, const GumboToken* token) {
643
- gumbo_debug("Adding parse error.\n");
644
- GumboError* error = gumbo_add_error(parser);
645
- if (!error) {
646
- return NULL;
647
- }
648
- error->type = GUMBO_ERR_PARSER;
649
- error->position = token->position;
650
- error->original_text = token->original_text.data;
651
- GumboParserError* extra_data = &error->v.parser;
652
- extra_data->input_type = token->type;
653
- extra_data->input_tag = GUMBO_TAG_UNKNOWN;
654
- if (token->type == GUMBO_TOKEN_START_TAG) {
655
- extra_data->input_tag = token->v.start_tag.tag;
656
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
657
- extra_data->input_tag = token->v.end_tag;
658
- }
659
- GumboParserState* state = parser->_parser_state;
660
- extra_data->parser_state = state->_insertion_mode;
661
- gumbo_vector_init(
662
- parser, state->_open_elements.length, &extra_data->tag_stack);
663
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
664
- const GumboNode* node = state->_open_elements.data[i];
665
- assert(
666
- node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
667
- gumbo_vector_add(
668
- parser, (void*) node->v.element.tag, &extra_data->tag_stack);
669
- }
670
- return error;
671
- }
672
-
673
- // Returns true if the specified token is either a start or end tag (specified
674
- // by is_start) with one of the tag types in the varargs list. Terminate the
675
- // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
676
- // the spec references tags that are not in the spec.
677
- static bool tag_in(
678
- const GumboToken* token, bool is_start, const gumbo_tagset tags) {
679
- GumboTag token_tag;
680
- if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
681
- token_tag = token->v.start_tag.tag;
682
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
683
- token_tag = token->v.end_tag;
684
- } else {
685
- return false;
686
- }
687
- return (token_tag < GUMBO_TAG_LAST && tags[(int) token_tag] != 0);
688
- }
689
-
690
- // Like tag_in, but for the single-tag case.
691
- static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
692
- if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
693
- return token->v.start_tag.tag == tag;
694
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
695
- return token->v.end_tag == tag;
696
- } else {
697
- return false;
698
- }
699
- }
700
-
701
- // Like tag_in, but checks for the tag of a node, rather than a token.
702
- static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
703
- assert(node != NULL);
704
- if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
705
- return false;
706
- }
707
- return TAGSET_INCLUDES(
708
- tags, node->v.element.tag_namespace, node->v.element.tag);
709
- }
710
-
711
- // Like node_tag_in, but for the single-tag case.
712
- static bool node_qualified_tag_is(
713
- const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
714
- assert(node);
715
- return (node->type == GUMBO_NODE_ELEMENT ||
716
- node->type == GUMBO_NODE_TEMPLATE) &&
717
- node->v.element.tag == tag && node->v.element.tag_namespace == ns;
718
- }
719
-
720
- // Like node_tag_in, but for the single-tag case in the HTML namespace
721
- static bool node_html_tag_is(const GumboNode* node, GumboTag tag) {
722
- return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
723
- }
724
-
725
- static void push_template_insertion_mode(
726
- GumboParser* parser, GumboInsertionMode mode) {
727
- gumbo_vector_add(
728
- parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
729
- }
730
-
731
- static void pop_template_insertion_mode(GumboParser* parser) {
732
- gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
733
- }
734
-
735
- // Returns the current template insertion mode. If the stack of template
736
- // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
737
- static GumboInsertionMode get_current_template_insertion_mode(
738
- const GumboParser* parser) {
739
- GumboVector* template_insertion_modes =
740
- &parser->_parser_state->_template_insertion_modes;
741
- if (template_insertion_modes->length == 0) {
742
- return GUMBO_INSERTION_MODE_INITIAL;
743
- }
744
- return (GumboInsertionMode)
745
- template_insertion_modes->data[(template_insertion_modes->length - 1)];
746
- }
747
-
748
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
749
- static bool is_mathml_integration_point(const GumboNode* node) {
750
- return node_tag_in_set(
751
- node, (gumbo_tagset){TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
752
- TAG_MATHML(MS), TAG_MATHML(MTEXT)});
753
- }
754
-
755
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
756
- static bool is_html_integration_point(const GumboNode* node) {
757
- return node_tag_in_set(node, (gumbo_tagset){TAG_SVG(FOREIGNOBJECT),
758
- TAG_SVG(DESC), TAG_SVG(TITLE)}) ||
759
- (node_qualified_tag_is(
760
- node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
761
- (attribute_matches(
762
- &node->v.element.attributes, "encoding", "text/html") ||
763
- attribute_matches(&node->v.element.attributes, "encoding",
764
- "application/xhtml+xml")));
765
- }
766
-
767
- // This represents a place to insert a node, consisting of a target parent and a
768
- // child index within that parent. If the node should be inserted at the end of
769
- // the parent's child, index will be -1.
770
- typedef struct {
771
- GumboNode* target;
772
- int index;
773
- } InsertionLocation;
774
-
775
- InsertionLocation get_appropriate_insertion_location(
776
- GumboParser* parser, GumboNode* override_target) {
777
- InsertionLocation retval = {override_target, -1};
778
- if (retval.target == NULL) {
779
- // No override target; default to the current node, but special-case the
780
- // root node since get_current_node() assumes the stack of open elements is
781
- // non-empty.
782
- retval.target = parser->_output->root != NULL ? get_current_node(parser)
783
- : get_document_node(parser);
784
- }
785
- if (!parser->_parser_state->_foster_parent_insertions ||
786
- !node_tag_in_set(retval.target, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
787
- TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
788
- return retval;
789
- }
790
-
791
- // Foster-parenting case.
792
- int last_template_index = -1;
793
- int last_table_index = -1;
794
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
795
- for (unsigned int i = 0; i < open_elements->length; ++i) {
796
- if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
797
- last_template_index = i;
798
- }
799
- if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
800
- last_table_index = i;
801
- }
802
- }
803
- if (last_template_index != -1 &&
804
- (last_table_index == -1 || last_template_index > last_table_index)) {
805
- retval.target = open_elements->data[last_template_index];
806
- return retval;
807
- }
808
- if (last_table_index == -1) {
809
- retval.target = open_elements->data[0];
810
- return retval;
811
- }
812
- GumboNode* last_table = open_elements->data[last_table_index];
813
- if (last_table->parent != NULL) {
814
- retval.target = last_table->parent;
815
- retval.index = last_table->index_within_parent;
816
- return retval;
817
- }
818
-
819
- retval.target = open_elements->data[last_table_index - 1];
820
- return retval;
821
- }
822
-
823
- // Appends a node to the end of its parent, setting the "parent" and
824
- // "index_within_parent" fields appropriately.
825
- static void append_node(
826
- GumboParser* parser, GumboNode* parent, GumboNode* node) {
827
- assert(node->parent == NULL);
828
- assert(node->index_within_parent == -1);
829
- GumboVector* children;
830
- if (parent->type == GUMBO_NODE_ELEMENT ||
831
- parent->type == GUMBO_NODE_TEMPLATE) {
832
- children = &parent->v.element.children;
833
- } else {
834
- assert(parent->type == GUMBO_NODE_DOCUMENT);
835
- children = &parent->v.document.children;
836
- }
837
- node->parent = parent;
838
- node->index_within_parent = children->length;
839
- gumbo_vector_add(parser, (void*) node, children);
840
- assert(node->index_within_parent < children->length);
841
- }
842
-
843
- // Inserts a node at the specified InsertionLocation, updating the
844
- // "parent" and "index_within_parent" fields of it and all its siblings.
845
- // If the index of the location is -1, this calls append_node.
846
- static void insert_node(
847
- GumboParser* parser, GumboNode* node, InsertionLocation location) {
848
- assert(node->parent == NULL);
849
- assert(node->index_within_parent == -1);
850
- GumboNode* parent = location.target;
851
- int index = location.index;
852
- if (index != -1) {
853
- GumboVector* children = NULL;
854
- if (parent->type == GUMBO_NODE_ELEMENT ||
855
- parent->type == GUMBO_NODE_TEMPLATE) {
856
- children = &parent->v.element.children;
857
- } else if (parent->type == GUMBO_NODE_DOCUMENT) {
858
- children = &parent->v.document.children;
859
- assert(children->length == 0);
860
- } else {
861
- assert(0);
862
- }
863
-
864
- assert(index >= 0);
865
- assert((unsigned int) index < children->length);
866
- node->parent = parent;
867
- node->index_within_parent = index;
868
- gumbo_vector_insert_at(parser, (void*) node, index, children);
869
- assert(node->index_within_parent < children->length);
870
- for (unsigned int i = index + 1; i < children->length; ++i) {
871
- GumboNode* sibling = children->data[i];
872
- sibling->index_within_parent = i;
873
- assert(sibling->index_within_parent < children->length);
874
- }
875
- } else {
876
- append_node(parser, parent, node);
877
- }
878
- }
879
-
880
- static void maybe_flush_text_node_buffer(GumboParser* parser) {
881
- GumboParserState* state = parser->_parser_state;
882
- TextNodeBufferState* buffer_state = &state->_text_node;
883
- if (buffer_state->_buffer.length == 0) {
884
- return;
885
- }
886
-
887
- assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
888
- buffer_state->_type == GUMBO_NODE_TEXT ||
889
- buffer_state->_type == GUMBO_NODE_CDATA);
890
- GumboNode* text_node = create_node(parser, buffer_state->_type);
891
- GumboText* text_node_data = &text_node->v.text;
892
- text_node_data->text =
893
- gumbo_string_buffer_to_string(parser, &buffer_state->_buffer);
894
- text_node_data->original_text.data = buffer_state->_start_original_text;
895
- text_node_data->original_text.length =
896
- state->_current_token->original_text.data -
897
- buffer_state->_start_original_text;
898
- text_node_data->start_pos = buffer_state->_start_position;
899
-
900
- gumbo_debug("Flushing text node buffer of %.*s.\n",
901
- (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
902
-
903
- InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
904
- if (location.target->type == GUMBO_NODE_DOCUMENT) {
905
- // The DOM does not allow Document nodes to have Text children, so per the
906
- // spec, they are dropped on the floor.
907
- destroy_node(parser, text_node);
908
- } else {
909
- insert_node(parser, text_node, location);
910
- }
911
-
912
- gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
913
- buffer_state->_type = GUMBO_NODE_WHITESPACE;
914
- assert(buffer_state->_buffer.length == 0);
915
- }
916
-
917
- static void record_end_of_element(
918
- GumboToken* current_token, GumboElement* element) {
919
- element->end_pos = current_token->position;
920
- element->original_end_tag = current_token->type == GUMBO_TOKEN_END_TAG
921
- ? current_token->original_text
922
- : kGumboEmptyString;
923
- }
924
-
925
- static GumboNode* pop_current_node(GumboParser* parser) {
926
- GumboParserState* state = parser->_parser_state;
927
- maybe_flush_text_node_buffer(parser);
928
- if (state->_open_elements.length > 0) {
929
- assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
930
- gumbo_debug("Popping %s node.\n",
931
- gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
932
- }
933
- GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
934
- if (!current_node) {
935
- assert(state->_open_elements.length == 0);
936
- return NULL;
937
- }
938
- assert(current_node->type == GUMBO_NODE_ELEMENT ||
939
- current_node->type == GUMBO_NODE_TEMPLATE);
940
- bool is_closed_body_or_html_tag =
941
- (node_html_tag_is(current_node, GUMBO_TAG_BODY) &&
942
- state->_closed_body_tag) ||
943
- (node_html_tag_is(current_node, GUMBO_TAG_HTML) &&
944
- state->_closed_html_tag);
945
- if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
946
- !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
947
- !is_closed_body_or_html_tag) {
948
- current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
949
- }
950
- if (!is_closed_body_or_html_tag) {
951
- record_end_of_element(state->_current_token, &current_node->v.element);
952
- }
953
- return current_node;
954
- }
955
-
956
- static void append_comment_node(
957
- GumboParser* parser, GumboNode* node, const GumboToken* token) {
958
- maybe_flush_text_node_buffer(parser);
959
- GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
960
- comment->type = GUMBO_NODE_COMMENT;
961
- comment->parse_flags = GUMBO_INSERTION_NORMAL;
962
- comment->v.text.text = token->v.text;
963
- comment->v.text.original_text = token->original_text;
964
- comment->v.text.start_pos = token->position;
965
- append_node(parser, node, comment);
966
- }
967
-
968
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
969
- static void clear_stack_to_table_row_context(GumboParser* parser) {
970
- while (!node_tag_in_set(get_current_node(parser),
971
- (gumbo_tagset){TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
972
- pop_current_node(parser);
973
- }
974
- }
975
-
976
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
977
- static void clear_stack_to_table_context(GumboParser* parser) {
978
- while (!node_tag_in_set(get_current_node(parser),
979
- (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)})) {
980
- pop_current_node(parser);
981
- }
982
- }
983
-
984
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
985
- void clear_stack_to_table_body_context(GumboParser* parser) {
986
- while (!node_tag_in_set(get_current_node(parser),
987
- (gumbo_tagset){TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD),
988
- TAG(TEMPLATE)})) {
989
- pop_current_node(parser);
990
- }
991
- }
992
-
993
- // Creates a parser-inserted element in the HTML namespace and returns it.
994
- static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
995
- GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
996
- GumboElement* element = &node->v.element;
997
- gumbo_vector_init(parser, 1, &element->children);
998
- gumbo_vector_init(parser, 0, &element->attributes);
999
- element->tag = tag;
1000
- element->tag_namespace = GUMBO_NAMESPACE_HTML;
1001
- element->original_tag = kGumboEmptyString;
1002
- element->original_end_tag = kGumboEmptyString;
1003
- element->start_pos = (parser->_parser_state->_current_token)
1004
- ? parser->_parser_state->_current_token->position
1005
- : kGumboEmptySourcePosition;
1006
- element->end_pos = kGumboEmptySourcePosition;
1007
- return node;
1008
- }
1009
-
1010
- // Constructs an element from the given start tag token.
1011
- static GumboNode* create_element_from_token(
1012
- GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1013
- assert(token->type == GUMBO_TOKEN_START_TAG);
1014
- GumboTokenStartTag* start_tag = &token->v.start_tag;
1015
-
1016
- GumboNodeType type = (tag_namespace == GUMBO_NAMESPACE_HTML &&
1017
- start_tag->tag == GUMBO_TAG_TEMPLATE)
1018
- ? GUMBO_NODE_TEMPLATE
1019
- : GUMBO_NODE_ELEMENT;
1020
-
1021
- GumboNode* node = create_node(parser, type);
1022
- GumboElement* element = &node->v.element;
1023
- gumbo_vector_init(parser, 1, &element->children);
1024
- element->attributes = start_tag->attributes;
1025
- element->tag = start_tag->tag;
1026
- element->tag_namespace = tag_namespace;
1027
-
1028
- assert(token->original_text.length >= 2);
1029
- assert(token->original_text.data[0] == '<');
1030
- assert(token->original_text.data[token->original_text.length - 1] == '>');
1031
- element->original_tag = token->original_text;
1032
- element->start_pos = token->position;
1033
- element->original_end_tag = kGumboEmptyString;
1034
- element->end_pos = kGumboEmptySourcePosition;
1035
-
1036
- // The element takes ownership of the attributes from the token, so any
1037
- // allocated-memory fields should be nulled out.
1038
- start_tag->attributes = kGumboEmptyVector;
1039
- return node;
1040
- }
1041
-
1042
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
1043
- static void insert_element(GumboParser* parser, GumboNode* node,
1044
- bool is_reconstructing_formatting_elements) {
1045
- GumboParserState* state = parser->_parser_state;
1046
- // NOTE(jdtang): The text node buffer must always be flushed before inserting
1047
- // a node, otherwise we're handling nodes in a different order than the spec
1048
- // mandated. However, one clause of the spec (character tokens in the body)
1049
- // requires that we reconstruct the active formatting elements *before* adding
1050
- // the character, and reconstructing the active formatting elements may itself
1051
- // result in the insertion of new elements (which should be pushed onto the
1052
- // stack of open elements before the buffer is flushed). We solve this (for
1053
- // the time being, the spec has been rewritten for <template> and the new
1054
- // version may be simpler here) with a boolean flag to this method.
1055
- if (!is_reconstructing_formatting_elements) {
1056
- maybe_flush_text_node_buffer(parser);
1057
- }
1058
- InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1059
- insert_node(parser, node, location);
1060
- gumbo_vector_add(parser, (void*) node, &state->_open_elements);
1061
- }
1062
-
1063
- // Convenience method that combines create_element_from_token and
1064
- // insert_element, inserting the generated element directly into the current
1065
- // node. Returns the node inserted.
1066
- static GumboNode* insert_element_from_token(
1067
- GumboParser* parser, GumboToken* token) {
1068
- GumboNode* element =
1069
- create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
1070
- insert_element(parser, element, false);
1071
- gumbo_debug("Inserting <%s> element (@%x) from token.\n",
1072
- gumbo_normalized_tagname(element->v.element.tag), element);
1073
- return element;
1074
- }
1075
-
1076
- // Convenience method that combines create_element and insert_element, inserting
1077
- // a parser-generated element of a specific tag type. Returns the node
1078
- // inserted.
1079
- static GumboNode* insert_element_of_tag_type(
1080
- GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
1081
- GumboNode* element = create_element(parser, tag);
1082
- element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1083
- insert_element(parser, element, false);
1084
- gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1085
- gumbo_normalized_tagname(tag), element);
1086
- return element;
1087
- }
1088
-
1089
- // Convenience method for creating foreign namespaced element. Returns the node
1090
- // inserted.
1091
- static GumboNode* insert_foreign_element(
1092
- GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1093
- assert(token->type == GUMBO_TOKEN_START_TAG);
1094
- GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1095
- insert_element(parser, element, false);
1096
- if (token_has_attribute(token, "xmlns") &&
1097
- !attribute_matches_case_sensitive(&token->v.start_tag.attributes, "xmlns",
1098
- kLegalXmlns[tag_namespace])) {
1099
- // TODO(jdtang): Since there're multiple possible error codes here, we
1100
- // eventually need reason codes to differentiate them.
1101
- parser_add_parse_error(parser, token);
1102
- }
1103
- if (token_has_attribute(token, "xmlns:xlink") &&
1104
- !attribute_matches_case_sensitive(&token->v.start_tag.attributes,
1105
- "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1106
- parser_add_parse_error(parser, token);
1107
- }
1108
- return element;
1109
- }
1110
-
1111
- static void insert_text_token(GumboParser* parser, GumboToken* token) {
1112
- assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1113
- token->type == GUMBO_TOKEN_CHARACTER ||
1114
- token->type == GUMBO_TOKEN_NULL || token->type == GUMBO_TOKEN_CDATA);
1115
- TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1116
- if (buffer_state->_buffer.length == 0) {
1117
- // Initialize position fields.
1118
- buffer_state->_start_original_text = token->original_text.data;
1119
- buffer_state->_start_position = token->position;
1120
- }
1121
- gumbo_string_buffer_append_codepoint(
1122
- parser, token->v.character, &buffer_state->_buffer);
1123
- if (token->type == GUMBO_TOKEN_CHARACTER) {
1124
- buffer_state->_type = GUMBO_NODE_TEXT;
1125
- } else if (token->type == GUMBO_TOKEN_CDATA) {
1126
- buffer_state->_type = GUMBO_NODE_CDATA;
1127
- }
1128
- gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1129
- }
1130
-
1131
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
1132
- static void run_generic_parsing_algorithm(
1133
- GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1134
- insert_element_from_token(parser, token);
1135
- gumbo_tokenizer_set_state(parser, lexer_state);
1136
- parser->_parser_state->_original_insertion_mode =
1137
- parser->_parser_state->_insertion_mode;
1138
- parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1139
- }
1140
-
1141
- static void acknowledge_self_closing_tag(GumboParser* parser) {
1142
- parser->_parser_state->_self_closing_flag_acknowledged = true;
1143
- }
1144
-
1145
- // Returns true if there's an anchor tag in the list of active formatting
1146
- // elements, and fills in its index if so.
1147
- static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1148
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1149
- for (int i = elements->length; --i >= 0;) {
1150
- GumboNode* node = elements->data[i];
1151
- if (node == &kActiveFormattingScopeMarker) {
1152
- return false;
1153
- }
1154
- if (node_html_tag_is(node, GUMBO_TAG_A)) {
1155
- *anchor_index = i;
1156
- return true;
1157
- }
1158
- }
1159
- return false;
1160
- }
1161
-
1162
- // Counts the number of open formatting elements in the list of active
1163
- // formatting elements (after the last active scope marker) that have a specific
1164
- // tag. If this is > 0, then earliest_matching_index will be filled in with the
1165
- // index of the first such element.
1166
- static int count_formatting_elements_of_tag(GumboParser* parser,
1167
- const GumboNode* desired_node, int* earliest_matching_index) {
1168
- const GumboElement* desired_element = &desired_node->v.element;
1169
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1170
- int num_identical_elements = 0;
1171
- for (int i = elements->length; --i >= 0;) {
1172
- GumboNode* node = elements->data[i];
1173
- if (node == &kActiveFormattingScopeMarker) {
1174
- break;
1175
- }
1176
- assert(node->type == GUMBO_NODE_ELEMENT);
1177
- if (node_qualified_tag_is(
1178
- node, desired_element->tag_namespace, desired_element->tag) &&
1179
- all_attributes_match(
1180
- &node->v.element.attributes, &desired_element->attributes)) {
1181
- num_identical_elements++;
1182
- *earliest_matching_index = i;
1183
- }
1184
- }
1185
- return num_identical_elements;
1186
- }
1187
-
1188
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
1189
- static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1190
- assert(node == &kActiveFormattingScopeMarker ||
1191
- node->type == GUMBO_NODE_ELEMENT);
1192
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1193
- if (node == &kActiveFormattingScopeMarker) {
1194
- gumbo_debug("Adding a scope marker.\n");
1195
- } else {
1196
- gumbo_debug("Adding a formatting element.\n");
1197
- }
1198
-
1199
- // Hunt for identical elements.
1200
- int earliest_identical_element = elements->length;
1201
- int num_identical_elements = count_formatting_elements_of_tag(
1202
- parser, node, &earliest_identical_element);
1203
-
1204
- // Noah's Ark clause: if there're at least 3, remove the earliest.
1205
- if (num_identical_elements >= 3) {
1206
- gumbo_debug("Noah's ark clause: removing element at %d.\n",
1207
- earliest_identical_element);
1208
- gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1209
- }
1210
-
1211
- gumbo_vector_add(parser, (void*) node, elements);
1212
- }
1213
-
1214
- static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1215
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1216
- for (unsigned int i = 0; i < open_elements->length; ++i) {
1217
- if (open_elements->data[i] == node) {
1218
- return true;
1219
- }
1220
- }
1221
- return false;
1222
- }
1223
-
1224
- // Clones attributes, tags, etc. of a node, but does not copy the content. The
1225
- // clone shares no structure with the original node: all owned strings and
1226
- // values are fresh copies.
1227
- GumboNode* clone_node(
1228
- GumboParser* parser, GumboNode* node, GumboParseFlags reason) {
1229
- assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1230
- GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1231
- *new_node = *node;
1232
- new_node->parent = NULL;
1233
- new_node->index_within_parent = -1;
1234
- // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1235
- // have a separate end tag.
1236
- new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1237
- new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1238
- GumboElement* element = &new_node->v.element;
1239
- gumbo_vector_init(parser, 1, &element->children);
1240
-
1241
- const GumboVector* old_attributes = &node->v.element.attributes;
1242
- gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1243
- for (unsigned int i = 0; i < old_attributes->length; ++i) {
1244
- const GumboAttribute* old_attr = old_attributes->data[i];
1245
- GumboAttribute* attr =
1246
- gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1247
- *attr = *old_attr;
1248
- attr->name = gumbo_copy_stringz(parser, old_attr->name);
1249
- attr->value = gumbo_copy_stringz(parser, old_attr->value);
1250
- gumbo_vector_add(parser, attr, &element->attributes);
1251
- }
1252
- return new_node;
1253
- }
1254
-
1255
- // "Reconstruct active formatting elements" part of the spec.
1256
- // This implementation is based on the html5lib translation from the mess of
1257
- // GOTOs in the spec to reasonably structured programming.
1258
- // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
1259
- static void reconstruct_active_formatting_elements(GumboParser* parser) {
1260
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1261
- // Step 1
1262
- if (elements->length == 0) {
1263
- return;
1264
- }
1265
-
1266
- // Step 2 & 3
1267
- unsigned int i = elements->length - 1;
1268
- GumboNode* element = elements->data[i];
1269
- if (element == &kActiveFormattingScopeMarker ||
1270
- is_open_element(parser, element)) {
1271
- return;
1272
- }
1273
-
1274
- // Step 6
1275
- do {
1276
- if (i == 0) {
1277
- // Step 4
1278
- i = -1; // Incremented to 0 below.
1279
- break;
1280
- }
1281
- // Step 5
1282
- element = elements->data[--i];
1283
- } while (element != &kActiveFormattingScopeMarker &&
1284
- !is_open_element(parser, element));
1285
-
1286
- ++i;
1287
- gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1288
- gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
1289
- for (; i < elements->length; ++i) {
1290
- // Step 7 & 8.
1291
- assert(elements->length > 0);
1292
- assert(i < elements->length);
1293
- element = elements->data[i];
1294
- assert(element != &kActiveFormattingScopeMarker);
1295
- GumboNode* clone = clone_node(
1296
- parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1297
- // Step 9.
1298
- InsertionLocation location =
1299
- get_appropriate_insertion_location(parser, NULL);
1300
- insert_node(parser, clone, location);
1301
- gumbo_vector_add(
1302
- parser, (void*) clone, &parser->_parser_state->_open_elements);
1303
-
1304
- // Step 10.
1305
- elements->data[i] = clone;
1306
- gumbo_debug("Reconstructed %s element at %d.\n",
1307
- gumbo_normalized_tagname(clone->v.element.tag), i);
1308
- }
1309
- }
1310
-
1311
- static void clear_active_formatting_elements(GumboParser* parser) {
1312
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1313
- int num_elements_cleared = 0;
1314
- const GumboNode* node;
1315
- do {
1316
- node = gumbo_vector_pop(parser, elements);
1317
- ++num_elements_cleared;
1318
- } while (node && node != &kActiveFormattingScopeMarker);
1319
- gumbo_debug("Cleared %d elements from active formatting list.\n",
1320
- num_elements_cleared);
1321
- }
1322
-
1323
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
1324
- static GumboQuirksModeEnum compute_quirks_mode(
1325
- const GumboTokenDocType* doctype) {
1326
- if (doctype->force_quirks || strcmp(doctype->name, kDoctypeHtml.data) ||
1327
- is_in_static_list(
1328
- doctype->public_identifier, kQuirksModePublicIdPrefixes, false) ||
1329
- is_in_static_list(
1330
- doctype->public_identifier, kQuirksModePublicIdExactMatches, true) ||
1331
- is_in_static_list(
1332
- doctype->system_identifier, kQuirksModeSystemIdExactMatches, true) ||
1333
- (is_in_static_list(doctype->public_identifier,
1334
- kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1335
- !doctype->has_system_identifier)) {
1336
- return GUMBO_DOCTYPE_QUIRKS;
1337
- } else if (is_in_static_list(doctype->public_identifier,
1338
- kLimitedQuirksPublicIdPrefixes, false) ||
1339
- (is_in_static_list(doctype->public_identifier,
1340
- kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false) &&
1341
- doctype->has_system_identifier)) {
1342
- return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1343
- }
1344
- return GUMBO_DOCTYPE_NO_QUIRKS;
1345
- }
1346
-
1347
- // The following functions are all defined by the "has an element in __ scope"
1348
- // sections of the HTML5 spec:
1349
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1350
- // The basic idea behind them is that they check for an element of the given
1351
- // qualified name, contained within a scope formed by a set of other qualified
1352
- // names. For example, "has an element in list scope" looks for an element of
1353
- // the given qualified name within the nearest enclosing <ol> or <ul>, along
1354
- // with a bunch of generic element types that serve to "firewall" their content
1355
- // from the rest of the document. Note that because of the way the spec is
1356
- // written,
1357
- // all elements are expected to be in the HTML namespace
1358
- static bool has_an_element_in_specific_scope(GumboParser* parser,
1359
- int expected_size, const GumboTag* expected, bool negate,
1360
- const gumbo_tagset tags) {
1361
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1362
- for (int i = open_elements->length; --i >= 0;) {
1363
- const GumboNode* node = open_elements->data[i];
1364
- if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1365
- continue;
1366
-
1367
- GumboTag node_tag = node->v.element.tag;
1368
- GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1369
- for (int j = 0; j < expected_size; ++j) {
1370
- if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1371
- return true;
1372
- }
1373
-
1374
- bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1375
- if (negate != found) return false;
1376
- }
1377
- return false;
1378
- }
1379
-
1380
- // Checks for the presence of an open element of the specified tag type.
1381
- static bool has_open_element(GumboParser* parser, GumboTag tag) {
1382
- return has_an_element_in_specific_scope(
1383
- parser, 1, &tag, false, (gumbo_tagset){TAG(HTML)});
1384
- }
1385
-
1386
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1387
- static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1388
- return has_an_element_in_specific_scope(parser, 1, &tag, false,
1389
- (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1390
- TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1391
- TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1392
- TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1393
- TAG_SVG(TITLE)});
1394
- }
1395
-
1396
- // Like "has an element in scope", but for the specific case of looking for a
1397
- // unique target node, not for any node with a given tag name. This duplicates
1398
- // much of the algorithm from has_an_element_in_specific_scope because the
1399
- // predicate is different when checking for an exact node, and it's easier &
1400
- // faster just to duplicate the code for this one case than to try and
1401
- // parameterize it.
1402
- static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1403
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1404
- for (int i = open_elements->length; --i >= 0;) {
1405
- const GumboNode* current = open_elements->data[i];
1406
- if (current == node) {
1407
- return true;
1408
- }
1409
- if (current->type != GUMBO_NODE_ELEMENT &&
1410
- current->type != GUMBO_NODE_TEMPLATE) {
1411
- continue;
1412
- }
1413
- if (node_tag_in_set(current,
1414
- (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE),
1415
- TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1416
- TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1417
- TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418
- TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)})) {
1419
- return false;
1420
- }
1421
- }
1422
- assert(false);
1423
- return false;
1424
- }
1425
-
1426
- // Like has_an_element_in_scope, but restricts the expected qualified name to a
1427
- // range of possible qualified names instead of just a single one.
1428
- static bool has_an_element_in_scope_with_tagname(
1429
- GumboParser* parser, int expected_len, const GumboTag expected[]) {
1430
- return has_an_element_in_specific_scope(parser, expected_len, expected, false,
1431
- (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1432
- TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1433
- TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1434
- TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1435
- TAG_SVG(TITLE)});
1436
- }
1437
-
1438
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1439
- static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1440
- return has_an_element_in_specific_scope(parser, 1, &tag, false,
1441
- (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1442
- TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1443
- TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1444
- TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1445
- TAG_SVG(TITLE), TAG(OL), TAG(UL)});
1446
- }
1447
-
1448
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1449
- static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1450
- return has_an_element_in_specific_scope(parser, 1, &tag, false,
1451
- (gumbo_tagset){TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD),
1452
- TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI),
1453
- TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT),
1454
- TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC),
1455
- TAG_SVG(TITLE), TAG(BUTTON)});
1456
- }
1457
-
1458
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1459
- static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1460
- return has_an_element_in_specific_scope(parser, 1, &tag, false,
1461
- (gumbo_tagset){TAG(HTML), TAG(TABLE), TAG(TEMPLATE)});
1462
- }
1463
-
1464
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1465
- static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1466
- return has_an_element_in_specific_scope(
1467
- parser, 1, &tag, true, (gumbo_tagset){TAG(OPTGROUP), TAG(OPTION)});
1468
- }
1469
-
1470
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1471
- // "exception" is the "element to exclude from the process" listed in the spec.
1472
- // Pass GUMBO_TAG_LAST to not exclude any of them.
1473
- static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1474
- for (; node_tag_in_set(get_current_node(parser),
1475
- (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
1476
- TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB), TAG(RT), TAG(RTC)}) &&
1477
- !node_html_tag_is(get_current_node(parser), exception);
1478
- pop_current_node(parser))
1479
- ;
1480
- }
1481
-
1482
- // This is the "generate all implied end tags thoroughly" clause of the spec.
1483
- // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
1484
- static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1485
- for (
1486
- ; node_tag_in_set(get_current_node(parser),
1487
- (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI),
1488
- TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RT), TAG(RTC),
1489
- TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR)});
1490
- pop_current_node(parser))
1491
- ;
1492
- }
1493
-
1494
- // This factors out the clauses relating to "act as if an end tag token with tag
1495
- // name "table" had been seen. Returns true if there's a table element in table
1496
- // scope which was successfully closed, false if not and the token should be
1497
- // ignored. Does not add parse errors; callers should handle that.
1498
- static bool close_table(GumboParser* parser) {
1499
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1500
- return false;
1501
- }
1502
-
1503
- GumboNode* node = pop_current_node(parser);
1504
- while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1505
- node = pop_current_node(parser);
1506
- }
1507
- reset_insertion_mode_appropriately(parser);
1508
- return true;
1509
- }
1510
-
1511
- // This factors out the clauses relating to "act as if an end tag token with tag
1512
- // name `cell_tag` had been seen".
1513
- static bool close_table_cell(
1514
- GumboParser* parser, const GumboToken* token, GumboTag cell_tag) {
1515
- bool result = true;
1516
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1517
- const GumboNode* node = get_current_node(parser);
1518
- if (!node_html_tag_is(node, cell_tag)) {
1519
- parser_add_parse_error(parser, token);
1520
- result = false;
1521
- }
1522
- do {
1523
- node = pop_current_node(parser);
1524
- } while (!node_html_tag_is(node, cell_tag));
1525
-
1526
- clear_active_formatting_elements(parser);
1527
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1528
- return result;
1529
- }
1530
-
1531
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1532
- // This holds the logic to determine whether we should close a <td> or a <th>.
1533
- static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1534
- if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1535
- assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1536
- return close_table_cell(parser, token, GUMBO_TAG_TD);
1537
- } else {
1538
- assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1539
- return close_table_cell(parser, token, GUMBO_TAG_TH);
1540
- }
1541
- }
1542
-
1543
- // This factors out the "act as if an end tag of tag name 'select' had been
1544
- // seen" clause of the spec, since it's referenced in several places. It pops
1545
- // all nodes from the stack until the current <select> has been closed, then
1546
- // resets the insertion mode appropriately.
1547
- static void close_current_select(GumboParser* parser) {
1548
- GumboNode* node = pop_current_node(parser);
1549
- while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1550
- node = pop_current_node(parser);
1551
- }
1552
- reset_insertion_mode_appropriately(parser);
1553
- }
1554
-
1555
- // The list of nodes in the "special" category:
1556
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1557
- static bool is_special_node(const GumboNode* node) {
1558
- assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1559
- return node_tag_in_set(node,
1560
- (gumbo_tagset){TAG(ADDRESS), TAG(APPLET), TAG(AREA), TAG(ARTICLE),
1561
- TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1562
- TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1563
- TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR),
1564
- TAG(DIV), TAG(DL), TAG(DT), TAG(EMBED), TAG(FIELDSET),
1565
- TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(FORM), TAG(FRAME),
1566
- TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6),
1567
- TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML), TAG(IFRAME),
1568
- TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK), TAG(LISTING),
1569
- TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1570
- TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P),
1571
- TAG(PARAM), TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION),
1572
- TAG(SELECT), TAG(STYLE), TAG(SUMMARY), TAG(TABLE), TAG(TBODY),
1573
- TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA), TAG(TFOOT), TAG(TH),
1574
- TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1575
-
1576
- TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1577
- TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1578
-
1579
- TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC)});
1580
- }
1581
-
1582
- // Implicitly closes currently open elements until it reaches an element with
1583
- // the
1584
- // specified qualified name. If the elements closed are in the set handled by
1585
- // generate_implied_end_tags, this is normal operation and this function returns
1586
- // true. Otherwise, a parse error is recorded and this function returns false.
1587
- static bool implicitly_close_tags(GumboParser* parser, GumboToken* token,
1588
- GumboNamespaceEnum target_ns, GumboTag target) {
1589
- bool result = true;
1590
- generate_implied_end_tags(parser, target);
1591
- if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1592
- parser_add_parse_error(parser, token);
1593
- while (
1594
- !node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1595
- pop_current_node(parser);
1596
- }
1597
- result = false;
1598
- }
1599
- assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1600
- pop_current_node(parser);
1601
- return result;
1602
- }
1603
-
1604
- // If the stack of open elements has a <p> tag in button scope, this acts as if
1605
- // a </p> tag was encountered, implicitly closing tags. Returns false if a
1606
- // parse error occurs. This is a convenience function because this particular
1607
- // clause appears several times in the spec.
1608
- static bool maybe_implicitly_close_p_tag(
1609
- GumboParser* parser, GumboToken* token) {
1610
- if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1611
- return implicitly_close_tags(
1612
- parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1613
- }
1614
- return true;
1615
- }
1616
-
1617
- // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1618
- // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1619
- static void maybe_implicitly_close_list_tag(
1620
- GumboParser* parser, GumboToken* token, bool is_li) {
1621
- GumboParserState* state = parser->_parser_state;
1622
- state->_frameset_ok = false;
1623
- for (int i = state->_open_elements.length; --i >= 0;) {
1624
- const GumboNode* node = state->_open_elements.data[i];
1625
- bool is_list_tag =
1626
- is_li ? node_html_tag_is(node, GUMBO_TAG_LI)
1627
- : node_tag_in_set(node, (gumbo_tagset){TAG(DD), TAG(DT)});
1628
- if (is_list_tag) {
1629
- implicitly_close_tags(
1630
- parser, token, node->v.element.tag_namespace, node->v.element.tag);
1631
- return;
1632
- }
1633
- if (is_special_node(node) &&
1634
- !node_tag_in_set(
1635
- node, (gumbo_tagset){TAG(ADDRESS), TAG(DIV), TAG(P)})) {
1636
- return;
1637
- }
1638
- }
1639
- }
1640
-
1641
- static void merge_attributes(
1642
- GumboParser* parser, GumboToken* token, GumboNode* node) {
1643
- assert(token->type == GUMBO_TOKEN_START_TAG);
1644
- assert(node->type == GUMBO_NODE_ELEMENT);
1645
- const GumboVector* token_attr = &token->v.start_tag.attributes;
1646
- GumboVector* node_attr = &node->v.element.attributes;
1647
-
1648
- for (unsigned int i = 0; i < token_attr->length; ++i) {
1649
- GumboAttribute* attr = token_attr->data[i];
1650
- if (!gumbo_get_attribute(node_attr, attr->name)) {
1651
- // Ownership of the attribute is transferred by this gumbo_vector_add,
1652
- // so it has to be nulled out of the original token so it doesn't get
1653
- // double-deleted.
1654
- gumbo_vector_add(parser, attr, node_attr);
1655
- token_attr->data[i] = NULL;
1656
- }
1657
- }
1658
- // When attributes are merged, it means the token has been ignored and merged
1659
- // with another token, so we need to free its memory. The attributes that are
1660
- // transferred need to be nulled-out in the vector above so that they aren't
1661
- // double-deleted.
1662
- gumbo_token_destroy(parser, token);
1663
-
1664
- #ifndef NDEBUG
1665
- // Mark this sentinel so the assertion in the main loop knows it's been
1666
- // destroyed.
1667
- token->v.start_tag.attributes = kGumboEmptyVector;
1668
- #endif
1669
- }
1670
-
1671
- const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1672
- for (size_t i = 0; i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry);
1673
- ++i) {
1674
- const ReplacementEntry* entry = &kSvgTagReplacements[i];
1675
- if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1676
- return entry->to.data;
1677
- }
1678
- }
1679
- return NULL;
1680
- }
1681
-
1682
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1683
- // This destructively modifies any matching attributes on the token and sets the
1684
- // namespace appropriately.
1685
- static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1686
- assert(token->type == GUMBO_TOKEN_START_TAG);
1687
- const GumboVector* attributes = &token->v.start_tag.attributes;
1688
- for (size_t i = 0; i < sizeof(kForeignAttributeReplacements) /
1689
- sizeof(NamespacedAttributeReplacement);
1690
- ++i) {
1691
- const NamespacedAttributeReplacement* entry =
1692
- &kForeignAttributeReplacements[i];
1693
- GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1694
- if (!attr) {
1695
- continue;
1696
- }
1697
- gumbo_parser_deallocate(parser, (void*) attr->name);
1698
- attr->attr_namespace = entry->attr_namespace;
1699
- attr->name = gumbo_copy_stringz(parser, entry->local_name);
1700
- }
1701
- }
1702
-
1703
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1704
- // This destructively modifies any matching attributes on the token.
1705
- static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1706
- assert(token->type == GUMBO_TOKEN_START_TAG);
1707
- const GumboVector* attributes = &token->v.start_tag.attributes;
1708
- for (size_t i = 0;
1709
- i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1710
- const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1711
- GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1712
- if (!attr) {
1713
- continue;
1714
- }
1715
- gumbo_parser_deallocate(parser, (void*) attr->name);
1716
- attr->name = gumbo_copy_stringz(parser, entry->to.data);
1717
- }
1718
- }
1719
-
1720
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1721
- // Note that this may destructively modify the token with the new attribute
1722
- // value.
1723
- static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1724
- assert(token->type == GUMBO_TOKEN_START_TAG);
1725
- GumboAttribute* attr =
1726
- gumbo_get_attribute(&token->v.start_tag.attributes, "definitionurl");
1727
- if (!attr) {
1728
- return;
1729
- }
1730
- gumbo_parser_deallocate(parser, (void*) attr->name);
1731
- attr->name = gumbo_copy_stringz(parser, "definitionURL");
1732
- }
1733
-
1734
- static bool doctype_matches(const GumboTokenDocType* doctype,
1735
- const GumboStringPiece* public_id, const GumboStringPiece* system_id,
1736
- bool allow_missing_system_id) {
1737
- return !strcmp(doctype->public_identifier, public_id->data) &&
1738
- (allow_missing_system_id || doctype->has_system_identifier) &&
1739
- !strcmp(doctype->system_identifier, system_id->data);
1740
- }
1741
-
1742
- static bool maybe_add_doctype_error(
1743
- GumboParser* parser, const GumboToken* token) {
1744
- const GumboTokenDocType* doctype = &token->v.doc_type;
1745
- bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
1746
- if ((!html_doctype || doctype->has_public_identifier ||
1747
- (doctype->has_system_identifier &&
1748
- !strcmp(
1749
- doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
1750
- !(html_doctype && (doctype_matches(doctype, &kPublicIdHtml4_0,
1751
- &kSystemIdRecHtml4_0, true) ||
1752
- doctype_matches(doctype, &kPublicIdHtml4_01,
1753
- &kSystemIdHtml4, true) ||
1754
- doctype_matches(doctype, &kPublicIdXhtml1_0,
1755
- &kSystemIdXhtmlStrict1_1, false) ||
1756
- doctype_matches(doctype, &kPublicIdXhtml1_1,
1757
- &kSystemIdXhtml1_1, false)))) {
1758
- parser_add_parse_error(parser, token);
1759
- return false;
1760
- }
1761
- return true;
1762
- }
1763
-
1764
- static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1765
- if (!node->parent) {
1766
- // The node may not have a parent if, for example, it is a newly-cloned copy
1767
- // of an active formatting element. DOM manipulations continue with the
1768
- // orphaned fragment of the DOM tree until it's appended/foster-parented to
1769
- // the common ancestor at the end of the adoption agency algorithm.
1770
- return;
1771
- }
1772
- assert(node->parent->type == GUMBO_NODE_ELEMENT);
1773
- GumboVector* children = &node->parent->v.element.children;
1774
- int index = gumbo_vector_index_of(children, node);
1775
- assert(index != -1);
1776
-
1777
- gumbo_vector_remove_at(parser, index, children);
1778
- node->parent = NULL;
1779
- node->index_within_parent = -1;
1780
- for (unsigned int i = index; i < children->length; ++i) {
1781
- GumboNode* child = children->data[i];
1782
- child->index_within_parent = i;
1783
- }
1784
- }
1785
-
1786
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1787
- // Also described in the "in body" handling for end formatting tags. Runs the adoption agency algorithm for the formatting tag `subject`; `token` is used only for recording parse errors. Every early exit below returns false; true is returned only after the outer loop has completed all 8 iterations.
1788
- static bool adoption_agency_algorithm(
1789
- GumboParser* parser, GumboToken* token, GumboTag subject) {
1790
- GumboParserState* state = parser->_parser_state;
1791
- gumbo_debug("Entering adoption agency algorithm.\n");
1792
- // Step 1. If the current node is an HTML-namespace `subject` element that is not in the list of active formatting elements, pop it and stop.
1793
- GumboNode* current_node = get_current_node(parser);
1794
- if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
1795
- current_node->v.element.tag == subject &&
1796
- gumbo_vector_index_of(
1797
- &state->_active_formatting_elements, current_node) == -1) {
1798
- pop_current_node(parser);
1799
- return false;
1800
- }
1801
- // Steps 2-4 & 20: the outer loop runs at most 8 times.
1802
- for (unsigned int i = 0; i < 8; ++i) {
1803
- // Step 5. Search the active formatting elements from the end for the most recent `subject` element.
1804
- GumboNode* formatting_node = NULL;
1805
- int formatting_node_in_open_elements = -1;
1806
- for (int j = state->_active_formatting_elements.length; --j >= 0;) {
1807
- GumboNode* current_node = state->_active_formatting_elements.data[j];
1808
- if (current_node == &kActiveFormattingScopeMarker) {
1809
- gumbo_debug("Broke on scope marker; aborting.\n");
1810
- // Last scope marker; abort the algorithm.
1811
- return false;
1812
- }
1813
- if (node_html_tag_is(current_node, subject)) {
1814
- // Found it; also record its index in the stack of open elements (-1 if it is not there).
1815
- formatting_node = current_node;
1816
- formatting_node_in_open_elements =
1817
- gumbo_vector_index_of(&state->_open_elements, formatting_node);
1818
- gumbo_debug("Formatting element of tag %s at %d.\n",
1819
- gumbo_normalized_tagname(subject),
1820
- formatting_node_in_open_elements);
1821
- break;
1822
- }
1823
- }
1824
- if (!formatting_node) {
1825
- // No matching tag; not a parse error outright, but fall through to the
1826
- // "any other end tag" clause (which may potentially add a parse error,
1827
- // but not always).
1828
- gumbo_debug("No active formatting elements; aborting.\n");
1829
- return false;
1830
- }
1831
-
1832
- // Step 6. The formatting element is not on the stack of open elements: record a parse error, drop it from the active formatting list and stop.
1833
- if (formatting_node_in_open_elements == -1) {
1834
- gumbo_debug("Formatting node not on stack of open elements.\n");
1835
- parser_add_parse_error(parser, token);
1836
- gumbo_vector_remove(
1837
- parser, formatting_node, &state->_active_formatting_elements);
1838
- return false;
1839
- }
1840
-
1841
- // Step 7. No element with the formatting element's tag is in scope: record a parse error and stop.
1842
- if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1843
- parser_add_parse_error(parser, token);
1844
- gumbo_debug("Element not in scope.\n");
1845
- return false;
1846
- }
1847
-
1848
- // Step 8. A formatting element that is not the current node is a parse error.
1849
- if (formatting_node != get_current_node(parser)) {
1850
- parser_add_parse_error(parser, token); // But continue onwards.
1851
- }
1852
- assert(formatting_node);
1853
- assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
1854
- assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
1855
-
1856
- // Step 9 & 10. Scan the open elements from the formatting element's index towards the top of the stack for the first special node.
1857
- GumboNode* furthest_block = NULL;
1858
- for (unsigned int j = formatting_node_in_open_elements;
1859
- j < state->_open_elements.length; ++j) {
1860
- assert(j > 0);
1861
- GumboNode* current = state->_open_elements.data[j];
1862
- if (is_special_node(current)) {
1863
- // Step 9. The first special node found is the furthest block.
1864
- furthest_block = current;
1865
- break;
1866
- }
1867
- }
1868
- if (!furthest_block) {
1869
- // Step 10. No furthest block: pop nodes down to the formatting element.
1870
- while (get_current_node(parser) != formatting_node) {
1871
- pop_current_node(parser);
1872
- }
1873
- // And the formatting element itself, which is then also removed from the active formatting list.
1874
- pop_current_node(parser);
1875
- gumbo_vector_remove(
1876
- parser, formatting_node, &state->_active_formatting_elements);
1877
- return false;
1878
- }
1879
- assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
1880
- assert(furthest_block);
1881
-
1882
- // Step 11.
1883
- // Elements may be moved and reparented by this algorithm, so
1884
- // common_ancestor is not necessarily the same as formatting_node->parent. It is taken as the entry just below formatting_node in the stack of open elements.
1885
- GumboNode* common_ancestor =
1886
- state->_open_elements.data[gumbo_vector_index_of(&state->_open_elements,
1887
- formatting_node) -
1888
- 1];
1889
- gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1890
- gumbo_normalized_tagname(common_ancestor->v.element.tag),
1891
- gumbo_normalized_tagname(furthest_block->v.element.tag));
1892
-
1893
- // Step 12. The bookmark is an index into the active formatting list, initially the slot just after formatting_node.
1894
- int bookmark = gumbo_vector_index_of(
1895
- &state->_active_formatting_elements, formatting_node) +
1896
- 1;
1897
- gumbo_debug("Bookmark at %d.\n", bookmark);
1898
- // Step 13. Both node and last_node start at the furthest block.
1899
- GumboNode* node = furthest_block;
1900
- GumboNode* last_node = furthest_block;
1901
- // Must be stored explicitly, in case node is removed from the stack of open
1902
- // elements, to handle step 9.4.
1903
- int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1904
- assert(saved_node_index > 0);
1905
- // Step 13.1. j is the inner loop counter.
1906
- for (int j = 0;;) {
1907
- // Step 13.2.
1908
- ++j;
1909
- // Step 13.3. Move node to the entry just below it in the stack of open elements, falling back to the saved index when node is no longer on the stack.
1910
- int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1911
- gumbo_debug(
1912
- "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1913
- if (node_index == -1) {
1914
- node_index = saved_node_index;
1915
- }
1916
- saved_node_index = --node_index;
1917
- assert(node_index > 0);
1918
- assert((unsigned int) node_index < state->_open_elements.capacity);
1919
- node = state->_open_elements.data[node_index];
1920
- assert(node->parent);
1921
- if (node == formatting_node) {
1922
- // Step 13.4. Reaching the formatting element ends the inner loop.
1923
- break;
1924
- }
1925
- int formatting_index =
1926
- gumbo_vector_index_of(&state->_active_formatting_elements, node);
1927
- if (j > 3 && formatting_index != -1) {
1928
- // Step 13.5. After more than three inner iterations, node is dropped from the active formatting list.
1929
- gumbo_debug("Removing formatting element at %d.\n", formatting_index);
1930
- gumbo_vector_remove_at(
1931
- parser, formatting_index, &state->_active_formatting_elements);
1932
- // Removing the element shifts all indices over by one, so we may need
1933
- // to move the bookmark.
1934
- if (formatting_index < bookmark) {
1935
- --bookmark;
1936
- gumbo_debug("Moving bookmark to %d.\n", bookmark);
1937
- }
1938
- continue;
1939
- }
1940
- if (formatting_index == -1) {
1941
- // Step 13.6. node is not an active formatting element: remove it from the stack of open elements.
1942
- gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1943
- continue;
1944
- }
1945
- // Step 13.7. Replace node, in both lists, by a clone of itself.
1946
- // "common ancestor as the intended parent" doesn't actually mean insert
1947
- // it into the common ancestor; that happens below.
1948
- node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1949
- assert(formatting_index >= 0);
1950
- state->_active_formatting_elements.data[formatting_index] = node;
1951
- assert(node_index >= 0);
1952
- state->_open_elements.data[node_index] = node;
1953
- // Step 13.8. While last_node is still the furthest block, the bookmark moves to just after the clone.
1954
- if (last_node == furthest_block) {
1955
- bookmark = formatting_index + 1;
1956
- gumbo_debug("Bookmark moved to %d.\n", bookmark);
1957
- assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
1958
- }
1959
- // Step 13.9. Move last_node under the clone.
1960
- last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1961
- remove_from_parent(parser, last_node);
1962
- append_node(parser, node, last_node);
1963
- // Step 13.10.
1964
- last_node = node;
1965
- } // Step 13.11.
1966
-
1967
- // Step 14. Detach last_node and insert it at the appropriate insertion location for common_ancestor.
1968
- gumbo_debug("Removing %s node from parent ",
1969
- gumbo_normalized_tagname(last_node->v.element.tag));
1970
- remove_from_parent(parser, last_node);
1971
- last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1972
- InsertionLocation location =
1973
- get_appropriate_insertion_location(parser, common_ancestor);
1974
- gumbo_debug("and inserting it into %s.\n",
1975
- gumbo_normalized_tagname(location.target->v.element.tag));
1976
- insert_node(parser, last_node, location);
1977
-
1978
- // Step 15. Clone the formatting element; the original is flagged as having an implicit end tag.
1979
- GumboNode* new_formatting_node = clone_node(
1980
- parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1981
- formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1982
-
1983
- // Step 16. Instead of appending nodes one-by-one, we swap the children
1984
- // vector of furthest_block with the empty children of new_formatting_node,
1985
- // reducing memory traffic and allocations. We still have to reset their
1986
- // parent pointers, though.
1987
- GumboVector temp = new_formatting_node->v.element.children;
1988
- new_formatting_node->v.element.children =
1989
- furthest_block->v.element.children;
1990
- furthest_block->v.element.children = temp;
1991
-
1992
- temp = new_formatting_node->v.element.children;
1993
- for (unsigned int i = 0; i < temp.length; ++i) {
1994
- GumboNode* child = temp.data[i];
1995
- child->parent = new_formatting_node;
1996
- }
1997
-
1998
- // Step 17. The clone becomes a child of the furthest block.
1999
- append_node(parser, furthest_block, new_formatting_node);
2000
-
2001
- // Step 18. Replace formatting_node in the active formatting list by the clone, placed at the bookmark.
2002
- // If the formatting node was before the bookmark, it may shift over all
2003
- // indices after it, so we need to explicitly find the index and possibly
2004
- // adjust the bookmark.
2005
- int formatting_node_index = gumbo_vector_index_of(
2006
- &state->_active_formatting_elements, formatting_node);
2007
- assert(formatting_node_index != -1);
2008
- if (formatting_node_index < bookmark) {
2009
- gumbo_debug(
2010
- "Formatting node at %d is before bookmark at %d; decrementing.\n",
2011
- formatting_node_index, bookmark);
2012
- --bookmark;
2013
- }
2014
- gumbo_vector_remove_at(
2015
- parser, formatting_node_index, &state->_active_formatting_elements);
2016
- assert(bookmark >= 0);
2017
- assert((unsigned int) bookmark <= state->_active_formatting_elements.length);
2018
- gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
2019
- &state->_active_formatting_elements);
2020
-
2021
- // Step 19. Remove formatting_node from the stack of open elements and insert the clone immediately after furthest_block.
2022
- gumbo_vector_remove(parser, formatting_node, &state->_open_elements);
2023
- int insert_at =
2024
- gumbo_vector_index_of(&state->_open_elements, furthest_block) + 1;
2025
- assert(insert_at >= 0);
2026
- assert((unsigned int) insert_at <= state->_open_elements.length);
2027
- gumbo_vector_insert_at(
2028
- parser, new_formatting_node, insert_at, &state->_open_elements);
2029
- } // Step 20.
2030
- return true;
2031
- }
2032
-
2033
- // This is here to clean up memory when the spec says "Ignore current token."
2034
- static void ignore_token(GumboParser* parser) {
2035
- GumboToken* token = parser->_parser_state->_current_token;
2036
- // Ownership of the token's internal buffers are normally transferred to the
2037
- // element, but if no element is emitted (as happens in non-verbatim-mode
2038
- // when a token is ignored), we need to free it here to prevent a memory
2039
- // leak.
2040
- gumbo_token_destroy(parser, token);
2041
- #ifndef NDEBUG
2042
- if (token->type == GUMBO_TOKEN_START_TAG) {
2043
- // Mark this sentinel so the assertion in the main loop knows it's been
2044
- // destroyed.
2045
- token->v.start_tag.attributes = kGumboEmptyVector;
2046
- }
2047
- #endif
2048
- }
2049
-
2050
- // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
2051
- static void finish_parsing(GumboParser* parser) {
2052
- gumbo_debug("Finishing parsing");
2053
- maybe_flush_text_node_buffer(parser);
2054
- GumboParserState* state = parser->_parser_state;
2055
- for (GumboNode* node = pop_current_node(parser); node;
2056
- node = pop_current_node(parser)) {
2057
- if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2058
- (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2059
- continue;
2060
- }
2061
- node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2062
- }
2063
- while (pop_current_node(parser))
2064
- ; // Pop them all.
2065
- }
2066
-
2067
- static bool handle_initial(GumboParser* parser, GumboToken* token) {
2068
- GumboDocument* document = &get_document_node(parser)->v.document;
2069
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2070
- ignore_token(parser);
2071
- return true;
2072
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2073
- append_comment_node(parser, get_document_node(parser), token);
2074
- return true;
2075
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2076
- document->has_doctype = true;
2077
- document->name = token->v.doc_type.name;
2078
- document->public_identifier = token->v.doc_type.public_identifier;
2079
- document->system_identifier = token->v.doc_type.system_identifier;
2080
- document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2081
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2082
- return maybe_add_doctype_error(parser, token);
2083
- }
2084
- parser_add_parse_error(parser, token);
2085
- document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2086
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2087
- parser->_parser_state->_reprocess_current_token = true;
2088
- return true;
2089
- }
2090
-
2091
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
2092
- static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2093
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2094
- parser_add_parse_error(parser, token);
2095
- ignore_token(parser);
2096
- return false;
2097
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2098
- append_comment_node(parser, get_document_node(parser), token);
2099
- return true;
2100
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2101
- ignore_token(parser);
2102
- return true;
2103
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2104
- GumboNode* html_node = insert_element_from_token(parser, token);
2105
- parser->_output->root = html_node;
2106
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2107
- return true;
2108
- } else if (token->type == GUMBO_TOKEN_END_TAG &&
2109
- !tag_in(token, false,
2110
- (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2111
- parser_add_parse_error(parser, token);
2112
- ignore_token(parser);
2113
- return false;
2114
- } else {
2115
- GumboNode* html_node = insert_element_of_tag_type(
2116
- parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2117
- assert(html_node);
2118
- parser->_output->root = html_node;
2119
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2120
- parser->_parser_state->_reprocess_current_token = true;
2121
- return true;
2122
- }
2123
- }
2124
-
2125
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
2126
- static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2127
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2128
- parser_add_parse_error(parser, token);
2129
- ignore_token(parser);
2130
- return false;
2131
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2132
- append_comment_node(parser, get_current_node(parser), token);
2133
- return true;
2134
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2135
- ignore_token(parser);
2136
- return true;
2137
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2138
- GumboNode* node = insert_element_from_token(parser, token);
2139
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2140
- parser->_parser_state->_head_element = node;
2141
- return true;
2142
- } else if (token->type == GUMBO_TOKEN_END_TAG &&
2143
- !tag_in(token, false,
2144
- (gumbo_tagset){TAG(HEAD), TAG(BODY), TAG(HTML), TAG(BR)})) {
2145
- parser_add_parse_error(parser, token);
2146
- ignore_token(parser);
2147
- return false;
2148
- } else {
2149
- GumboNode* node = insert_element_of_tag_type(
2150
- parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2151
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2152
- parser->_parser_state->_head_element = node;
2153
- parser->_parser_state->_reprocess_current_token = true;
2154
- return true;
2155
- }
2156
- }
2157
-
2158
- // Forward declarations because of mutual dependencies.
2159
- static bool handle_token(GumboParser* parser, GumboToken* token);
2160
- static bool handle_in_body(GumboParser* parser, GumboToken* token);
2161
-
2162
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
2163
- static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2164
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2165
- insert_text_token(parser, token);
2166
- return true;
2167
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168
- parser_add_parse_error(parser, token);
2169
- ignore_token(parser);
2170
- return false;
2171
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2172
- append_comment_node(parser, get_current_node(parser), token);
2173
- return true;
2174
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2175
- return handle_in_body(parser, token);
2176
- } else if (tag_in(token, kStartTag,
2177
- (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2178
- TAG(MENUITEM), TAG(LINK)})) {
2179
- insert_element_from_token(parser, token);
2180
- pop_current_node(parser);
2181
- acknowledge_self_closing_tag(parser);
2182
- return true;
2183
- } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2184
- insert_element_from_token(parser, token);
2185
- pop_current_node(parser);
2186
- acknowledge_self_closing_tag(parser);
2187
- // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2188
- // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2189
- // should specifically look for that string in the document and re-encode it
2190
- // before passing to Gumbo.
2191
- return true;
2192
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2193
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2194
- return true;
2195
- } else if (tag_in(
2196
- token, kStartTag, (gumbo_tagset){TAG(NOFRAMES), TAG(STYLE)})) {
2197
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2198
- return true;
2199
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2200
- insert_element_from_token(parser, token);
2201
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2202
- return true;
2203
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2204
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2205
- return true;
2206
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2207
- GumboNode* head = pop_current_node(parser);
2208
- AVOID_UNUSED_VARIABLE_WARNING(head);
2209
- assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2210
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2211
- return true;
2212
- } else if (tag_in(token, kEndTag,
2213
- (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)})) {
2214
- pop_current_node(parser);
2215
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2216
- parser->_parser_state->_reprocess_current_token = true;
2217
- return true;
2218
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2219
- insert_element_from_token(parser, token);
2220
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
2221
- parser->_parser_state->_frameset_ok = false;
2222
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2223
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2224
- return true;
2225
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2226
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2227
- parser_add_parse_error(parser, token);
2228
- ignore_token(parser);
2229
- return false;
2230
- }
2231
- generate_all_implied_end_tags_thoroughly(parser);
2232
- bool success = true;
2233
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2234
- parser_add_parse_error(parser, token);
2235
- success = false;
2236
- }
2237
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
2238
- ;
2239
- clear_active_formatting_elements(parser);
2240
- pop_template_insertion_mode(parser);
2241
- reset_insertion_mode_appropriately(parser);
2242
- return success;
2243
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2244
- (token->type == GUMBO_TOKEN_END_TAG)) {
2245
- parser_add_parse_error(parser, token);
2246
- ignore_token(parser);
2247
- return false;
2248
- } else {
2249
- pop_current_node(parser);
2250
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2251
- parser->_parser_state->_reprocess_current_token = true;
2252
- return true;
2253
- }
2254
- return true;
2255
- }
2256
-
2257
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
2258
- static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2259
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2260
- parser_add_parse_error(parser, token);
2261
- return false;
2262
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2263
- return handle_in_body(parser, token);
2264
- } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2265
- const GumboNode* node = pop_current_node(parser);
2266
- assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2267
- AVOID_UNUSED_VARIABLE_WARNING(node);
2268
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2269
- return true;
2270
- } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2271
- token->type == GUMBO_TOKEN_COMMENT ||
2272
- tag_in(token, kStartTag,
2273
- (gumbo_tagset){TAG(BASEFONT), TAG(BGSOUND), TAG(LINK),
2274
- TAG(META), TAG(NOFRAMES), TAG(STYLE)})) {
2275
- return handle_in_head(parser, token);
2276
- } else if (tag_in(
2277
- token, kStartTag, (gumbo_tagset){TAG(HEAD), TAG(NOSCRIPT)}) ||
2278
- (token->type == GUMBO_TOKEN_END_TAG &&
2279
- !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2280
- parser_add_parse_error(parser, token);
2281
- ignore_token(parser);
2282
- return false;
2283
- } else {
2284
- parser_add_parse_error(parser, token);
2285
- const GumboNode* node = pop_current_node(parser);
2286
- assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2287
- AVOID_UNUSED_VARIABLE_WARNING(node);
2288
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2289
- parser->_parser_state->_reprocess_current_token = true;
2290
- return false;
2291
- }
2292
- }
2293
-
2294
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
2295
- static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2296
- GumboParserState* state = parser->_parser_state;
2297
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2298
- insert_text_token(parser, token);
2299
- return true;
2300
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2301
- parser_add_parse_error(parser, token);
2302
- ignore_token(parser);
2303
- return false;
2304
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2305
- append_comment_node(parser, get_current_node(parser), token);
2306
- return true;
2307
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2308
- return handle_in_body(parser, token);
2309
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2310
- insert_element_from_token(parser, token);
2311
- state->_frameset_ok = false;
2312
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2313
- return true;
2314
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2315
- insert_element_from_token(parser, token);
2316
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2317
- return true;
2318
- } else if (tag_in(token, kStartTag,
2319
- (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2320
- TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2321
- TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)})) {
2322
- parser_add_parse_error(parser, token);
2323
- assert(state->_head_element != NULL);
2324
- // This must be flushed before we push the head element on, as there may be
2325
- // pending character tokens that should be attached to the root.
2326
- maybe_flush_text_node_buffer(parser);
2327
- gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2328
- bool result = handle_in_head(parser, token);
2329
- gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2330
- return result;
2331
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332
- return handle_in_head(parser, token);
2333
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2334
- (token->type == GUMBO_TOKEN_END_TAG &&
2335
- !tag_in(token, kEndTag,
2336
- (gumbo_tagset){TAG(BODY), TAG(HTML), TAG(BR)}))) {
2337
- parser_add_parse_error(parser, token);
2338
- ignore_token(parser);
2339
- return false;
2340
- } else {
2341
- insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2342
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2343
- state->_reprocess_current_token = true;
2344
- return true;
2345
- }
2346
- }
2347
-
2348
- static void destroy_node(GumboParser* parser, GumboNode* node) {
2349
- switch (node->type) {
2350
- case GUMBO_NODE_DOCUMENT: {
2351
- GumboDocument* doc = &node->v.document;
2352
- for (unsigned int i = 0; i < doc->children.length; ++i) {
2353
- destroy_node(parser, doc->children.data[i]);
2354
- }
2355
- gumbo_parser_deallocate(parser, (void*) doc->children.data);
2356
- gumbo_parser_deallocate(parser, (void*) doc->name);
2357
- gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2358
- gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2359
- } break;
2360
- case GUMBO_NODE_TEMPLATE:
2361
- case GUMBO_NODE_ELEMENT:
2362
- for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) {
2363
- gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2364
- }
2365
- gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2366
- for (unsigned int i = 0; i < node->v.element.children.length; ++i) {
2367
- destroy_node(parser, node->v.element.children.data[i]);
2368
- }
2369
- gumbo_parser_deallocate(parser, node->v.element.children.data);
2370
- break;
2371
- case GUMBO_NODE_TEXT:
2372
- case GUMBO_NODE_CDATA:
2373
- case GUMBO_NODE_COMMENT:
2374
- case GUMBO_NODE_WHITESPACE:
2375
- gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2376
- break;
2377
- }
2378
- gumbo_parser_deallocate(parser, node);
2379
- }
2380
-
2381
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
2382
- static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2383
- GumboParserState* state = parser->_parser_state;
2384
- assert(state->_open_elements.length > 0);
2385
- if (token->type == GUMBO_TOKEN_NULL) {
2386
- parser_add_parse_error(parser, token);
2387
- ignore_token(parser);
2388
- return false;
2389
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2390
- reconstruct_active_formatting_elements(parser);
2391
- insert_text_token(parser, token);
2392
- return true;
2393
- } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2394
- token->type == GUMBO_TOKEN_CDATA) {
2395
- reconstruct_active_formatting_elements(parser);
2396
- insert_text_token(parser, token);
2397
- set_frameset_not_ok(parser);
2398
- return true;
2399
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2400
- append_comment_node(parser, get_current_node(parser), token);
2401
- return true;
2402
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2403
- parser_add_parse_error(parser, token);
2404
- ignore_token(parser);
2405
- return false;
2406
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2407
- parser_add_parse_error(parser, token);
2408
- if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2409
- ignore_token(parser);
2410
- return false;
2411
- }
2412
- assert(parser->_output->root != NULL);
2413
- assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2414
- merge_attributes(parser, token, parser->_output->root);
2415
- return false;
2416
- } else if (tag_in(token, kStartTag,
2417
- (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
2418
- TAG(MENUITEM), TAG(LINK), TAG(META), TAG(NOFRAMES),
2419
- TAG(SCRIPT), TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
2420
- tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2421
- return handle_in_head(parser, token);
2422
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2423
- parser_add_parse_error(parser, token);
2424
- if (state->_open_elements.length < 2 ||
2425
- !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2426
- has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2427
- ignore_token(parser);
2428
- return false;
2429
- }
2430
- state->_frameset_ok = false;
2431
- merge_attributes(parser, token, state->_open_elements.data[1]);
2432
- return false;
2433
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2434
- parser_add_parse_error(parser, token);
2435
- if (state->_open_elements.length < 2 ||
2436
- !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2437
- !state->_frameset_ok) {
2438
- ignore_token(parser);
2439
- return false;
2440
- }
2441
- // Save the body node for later removal.
2442
- GumboNode* body_node = state->_open_elements.data[1];
2443
-
2444
- // Pop all nodes except root HTML element.
2445
- GumboNode* node;
2446
- do {
2447
- node = pop_current_node(parser);
2448
- } while (node != state->_open_elements.data[1]);
2449
-
2450
- // Removing & destroying the body node is going to kill any nodes that have
2451
- // been added to the list of active formatting elements, and so we should
2452
- // clear it to prevent a use-after-free if the list of active formatting
2453
- // elements is reconstructed afterwards. This may happen if whitespace
2454
- // follows the </frameset>.
2455
- clear_active_formatting_elements(parser);
2456
-
2457
- // Remove the body node. We may want to factor this out into a generic
2458
- // helper, but right now this is the only code that needs to do this.
2459
- GumboVector* children = &parser->_output->root->v.element.children;
2460
- for (unsigned int i = 0; i < children->length; ++i) {
2461
- if (children->data[i] == body_node) {
2462
- gumbo_vector_remove_at(parser, i, children);
2463
- break;
2464
- }
2465
- }
2466
- destroy_node(parser, body_node);
2467
-
2468
- // Insert the <frameset>, and switch the insertion mode.
2469
- insert_element_from_token(parser, token);
2470
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2471
- return true;
2472
- } else if (token->type == GUMBO_TOKEN_EOF) {
2473
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2474
- if (!node_tag_in_set(state->_open_elements.data[i],
2475
- (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(P), TAG(TBODY),
2476
- TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR), TAG(BODY),
2477
- TAG(HTML)})) {
2478
- parser_add_parse_error(parser, token);
2479
- }
2480
- }
2481
- if (get_current_template_insertion_mode(parser) !=
2482
- GUMBO_INSERTION_MODE_INITIAL) {
2483
- return handle_in_template(parser, token);
2484
- }
2485
- return true;
2486
- } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(HTML)})) {
2487
- if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2488
- parser_add_parse_error(parser, token);
2489
- ignore_token(parser);
2490
- return false;
2491
- }
2492
- bool success = true;
2493
- for (unsigned int i = 0; i < state->_open_elements.length; ++i) {
2494
- if (!node_tag_in_set(state->_open_elements.data[i],
2495
- (gumbo_tagset){TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP),
2496
- TAG(OPTION), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC),
2497
- TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2498
- TAG(BODY), TAG(HTML)})) {
2499
- parser_add_parse_error(parser, token);
2500
- success = false;
2501
- break;
2502
- }
2503
- }
2504
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2505
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2506
- parser->_parser_state->_reprocess_current_token = true;
2507
- } else {
2508
- GumboNode* body = state->_open_elements.data[1];
2509
- assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2510
- record_end_of_element(state->_current_token, &body->v.element);
2511
- }
2512
- return success;
2513
- } else if (tag_in(token, kStartTag,
2514
- (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2515
- TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS), TAG(DIR),
2516
- TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2517
- TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
2518
- TAG(MENU), TAG(MAIN), TAG(NAV), TAG(OL), TAG(P),
2519
- TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2520
- bool result = maybe_implicitly_close_p_tag(parser, token);
2521
- insert_element_from_token(parser, token);
2522
- return result;
2523
- } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2524
- TAG(H4), TAG(H5), TAG(H6)})) {
2525
- bool result = maybe_implicitly_close_p_tag(parser, token);
2526
- if (node_tag_in_set(
2527
- get_current_node(parser), (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2528
- TAG(H4), TAG(H5), TAG(H6)})) {
2529
- parser_add_parse_error(parser, token);
2530
- pop_current_node(parser);
2531
- result = false;
2532
- }
2533
- insert_element_from_token(parser, token);
2534
- return result;
2535
- } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(PRE), TAG(LISTING)})) {
2536
- bool result = maybe_implicitly_close_p_tag(parser, token);
2537
- insert_element_from_token(parser, token);
2538
- state->_ignore_next_linefeed = true;
2539
- state->_frameset_ok = false;
2540
- return result;
2541
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2542
- if (state->_form_element != NULL &&
2543
- !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2544
- gumbo_debug("Ignoring nested form.\n");
2545
- parser_add_parse_error(parser, token);
2546
- ignore_token(parser);
2547
- return false;
2548
- }
2549
- bool result = maybe_implicitly_close_p_tag(parser, token);
2550
- GumboNode* form_element = insert_element_from_token(parser, token);
2551
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2552
- state->_form_element = form_element;
2553
- }
2554
- return result;
2555
- } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2556
- maybe_implicitly_close_list_tag(parser, token, true);
2557
- bool result = maybe_implicitly_close_p_tag(parser, token);
2558
- insert_element_from_token(parser, token);
2559
- return result;
2560
- } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2561
- maybe_implicitly_close_list_tag(parser, token, false);
2562
- bool result = maybe_implicitly_close_p_tag(parser, token);
2563
- insert_element_from_token(parser, token);
2564
- return result;
2565
- } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2566
- bool result = maybe_implicitly_close_p_tag(parser, token);
2567
- insert_element_from_token(parser, token);
2568
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2569
- return result;
2570
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2571
- if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2572
- parser_add_parse_error(parser, token);
2573
- implicitly_close_tags(
2574
- parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2575
- state->_reprocess_current_token = true;
2576
- return false;
2577
- }
2578
- reconstruct_active_formatting_elements(parser);
2579
- insert_element_from_token(parser, token);
2580
- state->_frameset_ok = false;
2581
- return true;
2582
- } else if (tag_in(token, kEndTag,
2583
- (gumbo_tagset){TAG(ADDRESS), TAG(ARTICLE), TAG(ASIDE),
2584
- TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2585
- TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
2586
- TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
2587
- TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV),
2588
- TAG(OL), TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)})) {
2589
- GumboTag tag = token->v.end_tag;
2590
- if (!has_an_element_in_scope(parser, tag)) {
2591
- parser_add_parse_error(parser, token);
2592
- ignore_token(parser);
2593
- return false;
2594
- }
2595
- implicitly_close_tags(
2596
- parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2597
- return true;
2598
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2599
- if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600
- if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601
- parser_add_parse_error(parser, token);
2602
- ignore_token(parser);
2603
- return false;
2604
- }
2605
- bool success = true;
2606
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608
- parser_add_parse_error(parser, token);
2609
- return false;
2610
- }
2611
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM))
2612
- ;
2613
- return success;
2614
- } else {
2615
- bool result = true;
2616
- const GumboNode* node = state->_form_element;
2617
- assert(!node || node->type == GUMBO_NODE_ELEMENT);
2618
- state->_form_element = NULL;
2619
- if (!node || !has_node_in_scope(parser, node)) {
2620
- gumbo_debug("Closing an unopened form.\n");
2621
- parser_add_parse_error(parser, token);
2622
- ignore_token(parser);
2623
- return false;
2624
- }
2625
- // This differs from implicitly_close_tags because we remove *only* the
2626
- // <form> element; other nodes are left in scope.
2627
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2628
- if (get_current_node(parser) != node) {
2629
- parser_add_parse_error(parser, token);
2630
- result = false;
2631
- }
2632
-
2633
- GumboVector* open_elements = &state->_open_elements;
2634
- int index = gumbo_vector_index_of(open_elements, node);
2635
- assert(index >= 0);
2636
- gumbo_vector_remove_at(parser, index, open_elements);
2637
- return result;
2638
- }
2639
- } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2640
- if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2641
- parser_add_parse_error(parser, token);
2642
- // reconstruct_active_formatting_elements(parser);
2643
- insert_element_of_tag_type(
2644
- parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2645
- state->_reprocess_current_token = true;
2646
- return false;
2647
- }
2648
- return implicitly_close_tags(
2649
- parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2650
- } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2651
- if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2652
- parser_add_parse_error(parser, token);
2653
- ignore_token(parser);
2654
- return false;
2655
- }
2656
- return implicitly_close_tags(
2657
- parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2658
- } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(DD), TAG(DT)})) {
2659
- assert(token->type == GUMBO_TOKEN_END_TAG);
2660
- GumboTag token_tag = token->v.end_tag;
2661
- if (!has_an_element_in_scope(parser, token_tag)) {
2662
- parser_add_parse_error(parser, token);
2663
- ignore_token(parser);
2664
- return false;
2665
- }
2666
- return implicitly_close_tags(
2667
- parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2668
- } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2669
- TAG(H4), TAG(H5), TAG(H6)})) {
2670
- if (!has_an_element_in_scope_with_tagname(
2671
- parser, 6, (GumboTag[]){GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2672
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2673
- // No heading open; ignore the token entirely.
2674
- parser_add_parse_error(parser, token);
2675
- ignore_token(parser);
2676
- return false;
2677
- } else {
2678
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2679
- const GumboNode* current_node = get_current_node(parser);
2680
- bool success = node_html_tag_is(current_node, token->v.end_tag);
2681
- if (!success) {
2682
- // There're children of the heading currently open; close them below and
2683
- // record a parse error.
2684
- // TODO(jdtang): Add a way to distinguish this error case from the one
2685
- // above.
2686
- parser_add_parse_error(parser, token);
2687
- }
2688
- do {
2689
- current_node = pop_current_node(parser);
2690
- } while (!node_tag_in_set(
2691
- current_node, (gumbo_tagset){TAG(H1), TAG(H2), TAG(H3),
2692
- TAG(H4), TAG(H5), TAG(H6)}));
2693
- return success;
2694
- }
2695
- } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2696
- bool success = true;
2697
- int last_a;
2698
- int has_matching_a = find_last_anchor_index(parser, &last_a);
2699
- if (has_matching_a) {
2700
- assert(has_matching_a == 1);
2701
- parser_add_parse_error(parser, token);
2702
- adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2703
- // The adoption agency algorithm usually removes all instances of <a>
2704
- // from the list of active formatting elements, but in case it doesn't,
2705
- // we're supposed to do this. (The conditions where it might not are
2706
- // listed in the spec.)
2707
- if (find_last_anchor_index(parser, &last_a)) {
2708
- void* last_element = gumbo_vector_remove_at(
2709
- parser, last_a, &state->_active_formatting_elements);
2710
- gumbo_vector_remove(parser, last_element, &state->_open_elements);
2711
- }
2712
- success = false;
2713
- }
2714
- reconstruct_active_formatting_elements(parser);
2715
- add_formatting_element(parser, insert_element_from_token(parser, token));
2716
- return success;
2717
- } else if (tag_in(token, kStartTag,
2718
- (gumbo_tagset){TAG(B), TAG(BIG), TAG(CODE), TAG(EM), TAG(FONT),
2719
- TAG(I), TAG(S), TAG(SMALL), TAG(STRIKE), TAG(STRONG),
2720
- TAG(TT), TAG(U)})) {
2721
- reconstruct_active_formatting_elements(parser);
2722
- add_formatting_element(parser, insert_element_from_token(parser, token));
2723
- return true;
2724
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2725
- bool result = true;
2726
- reconstruct_active_formatting_elements(parser);
2727
- if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2728
- result = false;
2729
- parser_add_parse_error(parser, token);
2730
- adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2731
- reconstruct_active_formatting_elements(parser);
2732
- }
2733
- insert_element_from_token(parser, token);
2734
- add_formatting_element(parser, get_current_node(parser));
2735
- return result;
2736
- } else if (tag_in(token, kEndTag,
2737
- (gumbo_tagset){TAG(A), TAG(B), TAG(BIG), TAG(CODE), TAG(EM),
2738
- TAG(FONT), TAG(I), TAG(NOBR), TAG(S), TAG(SMALL),
2739
- TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U)})) {
2740
- return adoption_agency_algorithm(parser, token, token->v.end_tag);
2741
- } else if (tag_in(token, kStartTag,
2742
- (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2743
- reconstruct_active_formatting_elements(parser);
2744
- insert_element_from_token(parser, token);
2745
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
2746
- set_frameset_not_ok(parser);
2747
- return true;
2748
- } else if (tag_in(token, kEndTag,
2749
- (gumbo_tagset){TAG(APPLET), TAG(MARQUEE), TAG(OBJECT)})) {
2750
- GumboTag token_tag = token->v.end_tag;
2751
- if (!has_an_element_in_table_scope(parser, token_tag)) {
2752
- parser_add_parse_error(parser, token);
2753
- ignore_token(parser);
2754
- return false;
2755
- }
2756
- implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2757
- clear_active_formatting_elements(parser);
2758
- return true;
2759
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2760
- if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2761
- GUMBO_DOCTYPE_QUIRKS) {
2762
- maybe_implicitly_close_p_tag(parser, token);
2763
- }
2764
- insert_element_from_token(parser, token);
2765
- set_frameset_not_ok(parser);
2766
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2767
- return true;
2768
- } else if (tag_in(token, kStartTag,
2769
- (gumbo_tagset){TAG(AREA), TAG(BR), TAG(EMBED), TAG(IMG),
2770
- TAG(IMAGE), TAG(KEYGEN), TAG(WBR)})) {
2771
- bool success = true;
2772
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2773
- success = false;
2774
- parser_add_parse_error(parser, token);
2775
- token->v.start_tag.tag = GUMBO_TAG_IMG;
2776
- }
2777
- reconstruct_active_formatting_elements(parser);
2778
- GumboNode* node = insert_element_from_token(parser, token);
2779
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2780
- success = false;
2781
- parser_add_parse_error(parser, token);
2782
- node->v.element.tag = GUMBO_TAG_IMG;
2783
- node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2784
- }
2785
- pop_current_node(parser);
2786
- acknowledge_self_closing_tag(parser);
2787
- set_frameset_not_ok(parser);
2788
- return success;
2789
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2790
- if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2791
- // Must be before the element is inserted, as that takes ownership of the
2792
- // token's attribute vector.
2793
- set_frameset_not_ok(parser);
2794
- }
2795
- reconstruct_active_formatting_elements(parser);
2796
- insert_element_from_token(parser, token);
2797
- pop_current_node(parser);
2798
- acknowledge_self_closing_tag(parser);
2799
- return true;
2800
- } else if (tag_in(token, kStartTag,
2801
- (gumbo_tagset){TAG(PARAM), TAG(SOURCE), TAG(TRACK)})) {
2802
- insert_element_from_token(parser, token);
2803
- pop_current_node(parser);
2804
- acknowledge_self_closing_tag(parser);
2805
- return true;
2806
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2807
- bool result = maybe_implicitly_close_p_tag(parser, token);
2808
- insert_element_from_token(parser, token);
2809
- pop_current_node(parser);
2810
- acknowledge_self_closing_tag(parser);
2811
- set_frameset_not_ok(parser);
2812
- return result;
2813
- } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2814
- parser_add_parse_error(parser, token);
2815
- if (parser->_parser_state->_form_element != NULL &&
2816
- !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2817
- ignore_token(parser);
2818
- return false;
2819
- }
2820
- acknowledge_self_closing_tag(parser);
2821
- maybe_implicitly_close_p_tag(parser, token);
2822
- set_frameset_not_ok(parser);
2823
-
2824
- GumboVector* token_attrs = &token->v.start_tag.attributes;
2825
- GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2826
- GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2827
- GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "name");
2828
-
2829
- GumboNode* form = insert_element_of_tag_type(
2830
- parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2831
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2832
- parser->_parser_state->_form_element = form;
2833
- }
2834
- if (action_attr) {
2835
- gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2836
- }
2837
- insert_element_of_tag_type(
2838
- parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2839
- pop_current_node(parser); // <hr>
2840
-
2841
- insert_element_of_tag_type(
2842
- parser, GUMBO_TAG_LABEL, GUMBO_INSERTION_FROM_ISINDEX);
2843
- TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2844
- text_state->_start_original_text = token->original_text.data;
2845
- text_state->_start_position = token->position;
2846
- text_state->_type = GUMBO_NODE_TEXT;
2847
- if (prompt_attr) {
2848
- int prompt_attr_length = strlen(prompt_attr->value);
2849
- gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2850
- text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2851
- text_state->_buffer.length = prompt_attr_length;
2852
- text_state->_buffer.capacity = prompt_attr_length + 1;
2853
- gumbo_destroy_attribute(parser, prompt_attr);
2854
- } else {
2855
- GumboStringPiece prompt_text =
2856
- GUMBO_STRING("This is a searchable index. Enter search keywords: ");
2857
- gumbo_string_buffer_append_string(
2858
- parser, &prompt_text, &text_state->_buffer);
2859
- }
2860
-
2861
- GumboNode* input = insert_element_of_tag_type(
2862
- parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2863
- for (unsigned int i = 0; i < token_attrs->length; ++i) {
2864
- GumboAttribute* attr = token_attrs->data[i];
2865
- if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2866
- gumbo_vector_add(parser, attr, &input->v.element.attributes);
2867
- }
2868
- token_attrs->data[i] = NULL;
2869
- }
2870
-
2871
- // All attributes have been successfully transferred and nulled out at this
2872
- // point, so the call to ignore_token will free the memory for it without
2873
- // touching the attributes.
2874
- ignore_token(parser);
2875
-
2876
- // The name attribute, if present, should be destroyed since it's ignored
2877
- // when copying over. The action attribute should be kept since it's moved
2878
- // to the form.
2879
- if (name_attr) {
2880
- gumbo_destroy_attribute(parser, name_attr);
2881
- }
2882
-
2883
- GumboAttribute* name =
2884
- gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2885
- GumboStringPiece name_str = GUMBO_STRING("name");
2886
- GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2887
- name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2888
- name->name = gumbo_copy_stringz(parser, "name");
2889
- name->value = gumbo_copy_stringz(parser, "isindex");
2890
- name->original_name = name_str;
2891
- name->original_value = isindex_str;
2892
- name->name_start = kGumboEmptySourcePosition;
2893
- name->name_end = kGumboEmptySourcePosition;
2894
- name->value_start = kGumboEmptySourcePosition;
2895
- name->value_end = kGumboEmptySourcePosition;
2896
- gumbo_vector_add(parser, name, &input->v.element.attributes);
2897
-
2898
- pop_current_node(parser); // <input>
2899
- pop_current_node(parser); // <label>
2900
- insert_element_of_tag_type(
2901
- parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2902
- pop_current_node(parser); // <hr>
2903
- pop_current_node(parser); // <form>
2904
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2905
- parser->_parser_state->_form_element = NULL;
2906
- }
2907
- return false;
2908
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2909
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2910
- parser->_parser_state->_ignore_next_linefeed = true;
2911
- set_frameset_not_ok(parser);
2912
- return true;
2913
- } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2914
- bool result = maybe_implicitly_close_p_tag(parser, token);
2915
- reconstruct_active_formatting_elements(parser);
2916
- set_frameset_not_ok(parser);
2917
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2918
- return result;
2919
- } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2920
- set_frameset_not_ok(parser);
2921
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2922
- return true;
2923
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2924
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2925
- return true;
2926
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2927
- reconstruct_active_formatting_elements(parser);
2928
- insert_element_from_token(parser, token);
2929
- set_frameset_not_ok(parser);
2930
- GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2931
- if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2932
- state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2933
- state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2934
- state == GUMBO_INSERTION_MODE_IN_ROW ||
2935
- state == GUMBO_INSERTION_MODE_IN_CELL) {
2936
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2937
- } else {
2938
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2939
- }
2940
- return true;
2941
- } else if (tag_in(token, kStartTag,
2942
- (gumbo_tagset){TAG(OPTION), TAG(OPTGROUP)})) {
2943
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2944
- pop_current_node(parser);
2945
- }
2946
- reconstruct_active_formatting_elements(parser);
2947
- insert_element_from_token(parser, token);
2948
- return true;
2949
- } else if (tag_in(token, kStartTag,
2950
- (gumbo_tagset){TAG(RB), TAG(RP), TAG(RT), TAG(RTC)})) {
2951
- bool success = true;
2952
- GumboTag exception =
2953
- tag_in(token, kStartTag, (gumbo_tagset){TAG(RT), TAG(RP)})
2954
- ? GUMBO_TAG_RTC
2955
- : GUMBO_TAG_LAST;
2956
- if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2957
- generate_implied_end_tags(parser, exception);
2958
- }
2959
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
2960
- !(exception == GUMBO_TAG_LAST ||
2961
- node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
2962
- parser_add_parse_error(parser, token);
2963
- success = false;
2964
- }
2965
- insert_element_from_token(parser, token);
2966
- return success;
2967
- } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2968
- parser_add_parse_error(parser, token);
2969
- reconstruct_active_formatting_elements(parser);
2970
- insert_element_of_tag_type(
2971
- parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2972
- pop_current_node(parser);
2973
- return false;
2974
- } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2975
- reconstruct_active_formatting_elements(parser);
2976
- adjust_mathml_attributes(parser, token);
2977
- adjust_foreign_attributes(parser, token);
2978
- insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2979
- if (token->v.start_tag.is_self_closing) {
2980
- pop_current_node(parser);
2981
- acknowledge_self_closing_tag(parser);
2982
- }
2983
- return true;
2984
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2985
- reconstruct_active_formatting_elements(parser);
2986
- adjust_svg_attributes(parser, token);
2987
- adjust_foreign_attributes(parser, token);
2988
- insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2989
- if (token->v.start_tag.is_self_closing) {
2990
- pop_current_node(parser);
2991
- acknowledge_self_closing_tag(parser);
2992
- }
2993
- return true;
2994
- } else if (tag_in(token, kStartTag,
2995
- (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
2996
- TAG(FRAME), TAG(HEAD), TAG(TBODY), TAG(TD), TAG(TFOOT),
2997
- TAG(TH), TAG(THEAD), TAG(TR)})) {
2998
- parser_add_parse_error(parser, token);
2999
- ignore_token(parser);
3000
- return false;
3001
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3002
- reconstruct_active_formatting_elements(parser);
3003
- insert_element_from_token(parser, token);
3004
- return true;
3005
- } else {
3006
- assert(token->type == GUMBO_TOKEN_END_TAG);
3007
- GumboTag end_tag = token->v.end_tag;
3008
- assert(state->_open_elements.length > 0);
3009
- assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
3010
- // Walk up the stack of open elements until we find one that either:
3011
- // a) Matches the tag name we saw
3012
- // b) Is in the "special" category.
3013
- // If we see a), implicitly close everything up to and including it. If we
3014
- // see b), then record a parse error, don't close anything (except the
3015
- // implied end tags) and ignore the end tag token.
3016
- for (int i = state->_open_elements.length; --i >= 0;) {
3017
- const GumboNode* node = state->_open_elements.data[i];
3018
- if (node_html_tag_is(node, end_tag)) {
3019
- generate_implied_end_tags(parser, end_tag);
3020
- // TODO(jdtang): Do I need to add a parse error here? The condition in
3021
- // the spec seems like it's the inverse of the loop condition above, and
3022
- // so would never fire.
3023
- while (node != pop_current_node(parser))
3024
- ; // Pop everything.
3025
- return true;
3026
- } else if (is_special_node(node)) {
3027
- parser_add_parse_error(parser, token);
3028
- ignore_token(parser);
3029
- return false;
3030
- }
3031
- }
3032
- // <html> is in the special category, so we should never get here.
3033
- assert(0);
3034
- return false;
3035
- }
3036
- }
3037
-
3038
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
3039
- static bool handle_text(GumboParser* parser, GumboToken* token) {
3040
- if (token->type == GUMBO_TOKEN_CHARACTER ||
3041
- token->type == GUMBO_TOKEN_WHITESPACE) {
3042
- insert_text_token(parser, token);
3043
- } else {
3044
- // We provide only bare-bones script handling that doesn't involve any of
3045
- // the parser-pause/already-started/script-nesting flags or re-entrant
3046
- // invocations of the tokenizer. Because the intended usage of this library
3047
- // is mostly for templating, refactoring, and static-analysis libraries, we
3048
- // provide the script body as a text-node child of the <script> element.
3049
- // This behavior doesn't support document.write of partial HTML elements,
3050
- // but should be adequate for almost all other scripting support.
3051
- if (token->type == GUMBO_TOKEN_EOF) {
3052
- parser_add_parse_error(parser, token);
3053
- parser->_parser_state->_reprocess_current_token = true;
3054
- }
3055
- pop_current_node(parser);
3056
- set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
3057
- }
3058
- return true;
3059
- }
3060
-
3061
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
3062
- static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3063
- GumboParserState* state = parser->_parser_state;
3064
- if (token->type == GUMBO_TOKEN_CHARACTER ||
3065
- token->type == GUMBO_TOKEN_WHITESPACE) {
3066
- // The "pending table character tokens" list described in the spec is
3067
- // nothing more than the TextNodeBufferState. We accumulate text tokens as
3068
- // normal, except that when we go to flush them in the handle_in_table_text,
3069
- // we set _foster_parent_insertions if there're non-whitespace characters in
3070
- // the buffer.
3071
- assert(state->_text_node._buffer.length == 0);
3072
- state->_original_insertion_mode = state->_insertion_mode;
3073
- state->_reprocess_current_token = true;
3074
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
3075
- return true;
3076
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3077
- parser_add_parse_error(parser, token);
3078
- ignore_token(parser);
3079
- return false;
3080
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3081
- append_comment_node(parser, get_current_node(parser), token);
3082
- return true;
3083
- } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
3084
- clear_stack_to_table_context(parser);
3085
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
3086
- insert_element_from_token(parser, token);
3087
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
3088
- return true;
3089
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
3090
- clear_stack_to_table_context(parser);
3091
- insert_element_from_token(parser, token);
3092
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3093
- return true;
3094
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3095
- clear_stack_to_table_context(parser);
3096
- insert_element_of_tag_type(
3097
- parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
3098
- parser->_parser_state->_reprocess_current_token = true;
3099
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3100
- return true;
3101
- } else if (tag_in(token, kStartTag,
3102
- (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TD),
3103
- TAG(TH), TAG(TR)})) {
3104
- clear_stack_to_table_context(parser);
3105
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3106
- if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH), TAG(TR)})) {
3107
- insert_element_of_tag_type(
3108
- parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
3109
- state->_reprocess_current_token = true;
3110
- } else {
3111
- insert_element_from_token(parser, token);
3112
- }
3113
- return true;
3114
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
3115
- parser_add_parse_error(parser, token);
3116
- if (close_table(parser)) {
3117
- parser->_parser_state->_reprocess_current_token = true;
3118
- } else {
3119
- ignore_token(parser);
3120
- }
3121
- return false;
3122
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3123
- if (!close_table(parser)) {
3124
- parser_add_parse_error(parser, token);
3125
- return false;
3126
- }
3127
- return true;
3128
- } else if (tag_in(token, kEndTag,
3129
- (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3130
- TAG(COLGROUP), TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT),
3131
- TAG(TH), TAG(THEAD), TAG(TR)})) {
3132
- parser_add_parse_error(parser, token);
3133
- ignore_token(parser);
3134
- return false;
3135
- } else if (tag_in(token, kStartTag,
3136
- (gumbo_tagset){TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE)}) ||
3137
- (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
3138
- return handle_in_head(parser, token);
3139
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3140
- attribute_matches(
3141
- &token->v.start_tag.attributes, "type", "hidden")) {
3142
- parser_add_parse_error(parser, token);
3143
- insert_element_from_token(parser, token);
3144
- pop_current_node(parser);
3145
- return false;
3146
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3147
- parser_add_parse_error(parser, token);
3148
- if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3149
- ignore_token(parser);
3150
- return false;
3151
- }
3152
- state->_form_element = insert_element_from_token(parser, token);
3153
- pop_current_node(parser);
3154
- return false;
3155
- } else if (token->type == GUMBO_TOKEN_EOF) {
3156
- return handle_in_body(parser, token);
3157
- } else {
3158
- parser_add_parse_error(parser, token);
3159
- state->_foster_parent_insertions = true;
3160
- bool result = handle_in_body(parser, token);
3161
- state->_foster_parent_insertions = false;
3162
- return result;
3163
- }
3164
- }
3165
-
3166
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
3167
- static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3168
- if (token->type == GUMBO_TOKEN_NULL) {
3169
- parser_add_parse_error(parser, token);
3170
- ignore_token(parser);
3171
- return false;
3172
- } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3173
- token->type == GUMBO_TOKEN_WHITESPACE) {
3174
- insert_text_token(parser, token);
3175
- return true;
3176
- } else {
3177
- GumboParserState* state = parser->_parser_state;
3178
- GumboStringBuffer* buffer = &state->_text_node._buffer;
3179
- // Can't use strspn for this because GumboStringBuffers are not
3180
- // null-terminated.
3181
- // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3182
- // of any one byte that is not whitespace means we flip the flag, so this
3183
- // loop is still valid.
3184
- for (unsigned int i = 0; i < buffer->length; ++i) {
3185
- if (!isspace((unsigned char) buffer->data[i]) ||
3186
- buffer->data[i] == '\v') {
3187
- state->_foster_parent_insertions = true;
3188
- reconstruct_active_formatting_elements(parser);
3189
- break;
3190
- }
3191
- }
3192
- maybe_flush_text_node_buffer(parser);
3193
- state->_foster_parent_insertions = false;
3194
- state->_reprocess_current_token = true;
3195
- state->_insertion_mode = state->_original_insertion_mode;
3196
- return true;
3197
- }
3198
- }
3199
-
3200
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3201
- static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3202
- if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3203
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3204
- parser_add_parse_error(parser, token);
3205
- ignore_token(parser);
3206
- return false;
3207
- } else {
3208
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3209
- bool result = true;
3210
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3211
- parser_add_parse_error(parser, token);
3212
- }
3213
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3214
- ;
3215
- clear_active_formatting_elements(parser);
3216
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3217
- return result;
3218
- }
3219
- } else if (tag_in(token, kStartTag,
3220
- (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3221
- TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3222
- TAG(TR)}) ||
3223
- (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
3224
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3225
- parser_add_parse_error(parser, token);
3226
- ignore_token(parser);
3227
- return false;
3228
- }
3229
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION))
3230
- ;
3231
- clear_active_formatting_elements(parser);
3232
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3233
- parser->_parser_state->_reprocess_current_token = true;
3234
- return true;
3235
- } else if (tag_in(token, kEndTag,
3236
- (gumbo_tagset){TAG(BODY), TAG(COL), TAG(COLGROUP), TAG(HTML),
3237
- TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3238
- TAG(TR)})) {
3239
- parser_add_parse_error(parser, token);
3240
- ignore_token(parser);
3241
- return false;
3242
- } else {
3243
- return handle_in_body(parser, token);
3244
- }
3245
- }
3246
-
3247
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
3248
- static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3249
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3250
- insert_text_token(parser, token);
3251
- return true;
3252
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3253
- parser_add_parse_error(parser, token);
3254
- ignore_token(parser);
3255
- return false;
3256
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3257
- append_comment_node(parser, get_current_node(parser), token);
3258
- return true;
3259
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3260
- return handle_in_body(parser, token);
3261
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3262
- insert_element_from_token(parser, token);
3263
- pop_current_node(parser);
3264
- acknowledge_self_closing_tag(parser);
3265
- return true;
3266
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3267
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3268
- parser_add_parse_error(parser, token);
3269
- ignore_token(parser);
3270
- return false;
3271
- }
3272
- pop_current_node(parser);
3273
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3274
- return false;
3275
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3276
- parser_add_parse_error(parser, token);
3277
- ignore_token(parser);
3278
- return false;
3279
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
3280
- tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3281
- return handle_in_head(parser, token);
3282
- } else if (token->type == GUMBO_TOKEN_EOF) {
3283
- return handle_in_body(parser, token);
3284
- } else {
3285
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3286
- parser_add_parse_error(parser, token);
3287
- ignore_token(parser);
3288
- return false;
3289
- }
3290
- pop_current_node(parser);
3291
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3292
- parser->_parser_state->_reprocess_current_token = true;
3293
- return true;
3294
- }
3295
- }
3296
-
3297
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
3298
- static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3299
- if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3300
- clear_stack_to_table_body_context(parser);
3301
- insert_element_from_token(parser, token);
3302
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3303
- return true;
3304
- } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3305
- parser_add_parse_error(parser, token);
3306
- clear_stack_to_table_body_context(parser);
3307
- insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3308
- parser->_parser_state->_reprocess_current_token = true;
3309
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3310
- return false;
3311
- } else if (tag_in(token, kEndTag,
3312
- (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3313
- if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3314
- parser_add_parse_error(parser, token);
3315
- ignore_token(parser);
3316
- return false;
3317
- }
3318
- clear_stack_to_table_body_context(parser);
3319
- pop_current_node(parser);
3320
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3321
- return true;
3322
- } else if (tag_in(token, kStartTag,
3323
- (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3324
- TAG(TBODY), TAG(TFOOT), TAG(THEAD)}) ||
3325
- tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3326
- if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3327
- has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3328
- has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3329
- parser_add_parse_error(parser, token);
3330
- ignore_token(parser);
3331
- return false;
3332
- }
3333
- clear_stack_to_table_body_context(parser);
3334
- pop_current_node(parser);
3335
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3336
- parser->_parser_state->_reprocess_current_token = true;
3337
- return true;
3338
- } else if (tag_in(token, kEndTag,
3339
- (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL), TAG(TR),
3340
- TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3341
- parser_add_parse_error(parser, token);
3342
- ignore_token(parser);
3343
- return false;
3344
- } else {
3345
- return handle_in_table(parser, token);
3346
- }
3347
- }
3348
-
3349
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3350
- static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3351
- if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TH), TAG(TD)})) {
3352
- clear_stack_to_table_row_context(parser);
3353
- insert_element_from_token(parser, token);
3354
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3355
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
3356
- return true;
3357
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3358
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3359
- parser_add_parse_error(parser, token);
3360
- ignore_token(parser);
3361
- return false;
3362
- } else {
3363
- clear_stack_to_table_row_context(parser);
3364
- pop_current_node(parser);
3365
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3366
- return true;
3367
- }
3368
- } else if (tag_in(token, kStartTag,
3369
- (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3370
- TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)}) ||
3371
- tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3372
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR)) {
3373
- parser_add_parse_error(parser, token);
3374
- ignore_token(parser);
3375
- return false;
3376
- } else {
3377
- clear_stack_to_table_row_context(parser);
3378
- pop_current_node(parser);
3379
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3380
- parser->_parser_state->_reprocess_current_token = true;
3381
- return true;
3382
- }
3383
- } else if (tag_in(token, kEndTag,
3384
- (gumbo_tagset){TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3385
- if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
3386
- (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
3387
- parser_add_parse_error(parser, token);
3388
- ignore_token(parser);
3389
- return false;
3390
- } else {
3391
- clear_stack_to_table_row_context(parser);
3392
- pop_current_node(parser);
3393
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3394
- parser->_parser_state->_reprocess_current_token = true;
3395
- return true;
3396
- }
3397
- } else if (tag_in(token, kEndTag,
3398
- (gumbo_tagset){TAG(BODY), TAG(CAPTION), TAG(COL),
3399
- TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH)})) {
3400
- parser_add_parse_error(parser, token);
3401
- ignore_token(parser);
3402
- return false;
3403
- } else {
3404
- return handle_in_table(parser, token);
3405
- }
3406
- }
3407
-
3408
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3409
- static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3410
- if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3411
- GumboTag token_tag = token->v.end_tag;
3412
- if (!has_an_element_in_table_scope(parser, token_tag)) {
3413
- parser_add_parse_error(parser, token);
3414
- ignore_token(parser);
3415
- return false;
3416
- }
3417
- return close_table_cell(parser, token, token_tag);
3418
- } else if (tag_in(token, kStartTag,
3419
- (gumbo_tagset){TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3420
- TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD),
3421
- TAG(TR)})) {
3422
- gumbo_debug("Handling <td> in cell.\n");
3423
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3424
- !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3425
- gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3426
- parser_add_parse_error(parser, token);
3427
- ignore_token(parser);
3428
- return false;
3429
- }
3430
- parser->_parser_state->_reprocess_current_token = true;
3431
- return close_current_cell(parser, token);
3432
- } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(BODY), TAG(CAPTION),
3433
- TAG(COL), TAG(COLGROUP), TAG(HTML)})) {
3434
- parser_add_parse_error(parser, token);
3435
- ignore_token(parser);
3436
- return false;
3437
- } else if (tag_in(token, kEndTag, (gumbo_tagset){TAG(TABLE), TAG(TBODY),
3438
- TAG(TFOOT), TAG(THEAD), TAG(TR)})) {
3439
- if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3440
- parser_add_parse_error(parser, token);
3441
- ignore_token(parser);
3442
- return false;
3443
- }
3444
- parser->_parser_state->_reprocess_current_token = true;
3445
- return close_current_cell(parser, token);
3446
- } else {
3447
- return handle_in_body(parser, token);
3448
- }
3449
- }
3450
-
3451
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3452
- static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3453
- if (token->type == GUMBO_TOKEN_NULL) {
3454
- parser_add_parse_error(parser, token);
3455
- ignore_token(parser);
3456
- return false;
3457
- } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3458
- token->type == GUMBO_TOKEN_WHITESPACE) {
3459
- insert_text_token(parser, token);
3460
- return true;
3461
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3462
- parser_add_parse_error(parser, token);
3463
- ignore_token(parser);
3464
- return false;
3465
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3466
- append_comment_node(parser, get_current_node(parser), token);
3467
- return true;
3468
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3469
- return handle_in_body(parser, token);
3470
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3471
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3472
- pop_current_node(parser);
3473
- }
3474
- insert_element_from_token(parser, token);
3475
- return true;
3476
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3477
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3478
- pop_current_node(parser);
3479
- }
3480
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3481
- pop_current_node(parser);
3482
- }
3483
- insert_element_from_token(parser, token);
3484
- return true;
3485
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3486
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
3487
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3488
- node_html_tag_is(open_elements->data[open_elements->length - 2],
3489
- GUMBO_TAG_OPTGROUP)) {
3490
- pop_current_node(parser);
3491
- }
3492
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3493
- pop_current_node(parser);
3494
- return true;
3495
- } else {
3496
- parser_add_parse_error(parser, token);
3497
- ignore_token(parser);
3498
- return false;
3499
- }
3500
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3501
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3502
- pop_current_node(parser);
3503
- return true;
3504
- } else {
3505
- parser_add_parse_error(parser, token);
3506
- ignore_token(parser);
3507
- return false;
3508
- }
3509
- } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3510
- if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3511
- parser_add_parse_error(parser, token);
3512
- ignore_token(parser);
3513
- return false;
3514
- }
3515
- close_current_select(parser);
3516
- return true;
3517
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3518
- parser_add_parse_error(parser, token);
3519
- ignore_token(parser);
3520
- if (has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3521
- close_current_select(parser);
3522
- }
3523
- return false;
3524
- } else if (tag_in(token, kStartTag,
3525
- (gumbo_tagset){TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA)})) {
3526
- parser_add_parse_error(parser, token);
3527
- if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3528
- ignore_token(parser);
3529
- } else {
3530
- close_current_select(parser);
3531
- parser->_parser_state->_reprocess_current_token = true;
3532
- }
3533
- return false;
3534
- } else if (tag_in(token, kStartTag,
3535
- (gumbo_tagset){TAG(SCRIPT), TAG(TEMPLATE)}) ||
3536
- tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3537
- return handle_in_head(parser, token);
3538
- } else if (token->type == GUMBO_TOKEN_EOF) {
3539
- return handle_in_body(parser, token);
3540
- } else {
3541
- parser_add_parse_error(parser, token);
3542
- ignore_token(parser);
3543
- return false;
3544
- }
3545
- }
3546
-
3547
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3548
- static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3549
- if (tag_in(token, kStartTag,
3550
- (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY), TAG(TFOOT),
3551
- TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3552
- parser_add_parse_error(parser, token);
3553
- close_current_select(parser);
3554
- parser->_parser_state->_reprocess_current_token = true;
3555
- return false;
3556
- } else if (tag_in(token, kEndTag,
3557
- (gumbo_tagset){TAG(CAPTION), TAG(TABLE), TAG(TBODY),
3558
- TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH)})) {
3559
- parser_add_parse_error(parser, token);
3560
- if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3561
- ignore_token(parser);
3562
- return false;
3563
- } else {
3564
- close_current_select(parser);
3565
- // close_current_select already does the
3566
- // reset_insertion_mode_appropriately
3567
- // reset_insertion_mode_appropriately(parser);
3568
- parser->_parser_state->_reprocess_current_token = true;
3569
- return false;
3570
- }
3571
- } else {
3572
- return handle_in_select(parser, token);
3573
- }
3574
- }
3575
-
3576
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
3577
- static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3578
- GumboParserState* state = parser->_parser_state;
3579
- if (token->type == GUMBO_TOKEN_WHITESPACE ||
3580
- token->type == GUMBO_TOKEN_CHARACTER ||
3581
- token->type == GUMBO_TOKEN_COMMENT || token->type == GUMBO_TOKEN_NULL ||
3582
- token->type == GUMBO_TOKEN_DOCTYPE) {
3583
- return handle_in_body(parser, token);
3584
- } else if (tag_in(token, kStartTag,
3585
- (gumbo_tagset){TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
3586
- TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
3587
- TAG(STYLE), TAG(TEMPLATE), TAG(TITLE)}) ||
3588
- tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3589
- return handle_in_head(parser, token);
3590
- } else if (tag_in(
3591
- token, kStartTag, (gumbo_tagset){TAG(CAPTION), TAG(COLGROUP),
3592
- TAG(TBODY), TAG(TFOOT), TAG(THEAD)})) {
3593
- pop_template_insertion_mode(parser);
3594
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3595
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3596
- state->_reprocess_current_token = true;
3597
- return true;
3598
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3599
- pop_template_insertion_mode(parser);
3600
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3601
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3602
- state->_reprocess_current_token = true;
3603
- return true;
3604
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3605
- pop_template_insertion_mode(parser);
3606
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3607
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3608
- state->_reprocess_current_token = true;
3609
- return true;
3610
- } else if (tag_in(token, kStartTag, (gumbo_tagset){TAG(TD), TAG(TH)})) {
3611
- pop_template_insertion_mode(parser);
3612
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3613
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3614
- state->_reprocess_current_token = true;
3615
- return true;
3616
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3617
- pop_template_insertion_mode(parser);
3618
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3619
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3620
- state->_reprocess_current_token = true;
3621
- return true;
3622
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
3623
- parser_add_parse_error(parser, token);
3624
- ignore_token(parser);
3625
- return false;
3626
- } else if (token->type == GUMBO_TOKEN_EOF) {
3627
- if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3628
- // Stop parsing.
3629
- return true;
3630
- }
3631
- parser_add_parse_error(parser, token);
3632
- while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE))
3633
- ;
3634
- clear_active_formatting_elements(parser);
3635
- pop_template_insertion_mode(parser);
3636
- reset_insertion_mode_appropriately(parser);
3637
- state->_reprocess_current_token = true;
3638
- return false;
3639
- } else {
3640
- assert(0);
3641
- return false;
3642
- }
3643
- }
3644
-
3645
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
3646
- static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3647
- if (token->type == GUMBO_TOKEN_WHITESPACE ||
3648
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3649
- return handle_in_body(parser, token);
3650
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3651
- GumboNode* html_node = parser->_output->root;
3652
- assert(html_node != NULL);
3653
- append_comment_node(parser, html_node, token);
3654
- return true;
3655
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3656
- parser_add_parse_error(parser, token);
3657
- ignore_token(parser);
3658
- return false;
3659
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3660
- /* fragment case: ignore the closing HTML token */
3661
- if (is_fragment_parser(parser)) {
3662
- parser_add_parse_error(parser, token);
3663
- ignore_token(parser);
3664
- return false;
3665
- }
3666
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3667
- GumboNode* html = parser->_parser_state->_open_elements.data[0];
3668
- assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3669
- record_end_of_element(
3670
- parser->_parser_state->_current_token, &html->v.element);
3671
- return true;
3672
- } else if (token->type == GUMBO_TOKEN_EOF) {
3673
- return true;
3674
- } else {
3675
- parser_add_parse_error(parser, token);
3676
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3677
- parser->_parser_state->_reprocess_current_token = true;
3678
- return false;
3679
- }
3680
- }
3681
-
3682
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
3683
- static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3684
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3685
- insert_text_token(parser, token);
3686
- return true;
3687
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3688
- append_comment_node(parser, get_current_node(parser), token);
3689
- return true;
3690
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3691
- parser_add_parse_error(parser, token);
3692
- ignore_token(parser);
3693
- return false;
3694
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3695
- return handle_in_body(parser, token);
3696
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3697
- insert_element_from_token(parser, token);
3698
- return true;
3699
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3700
- if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3701
- parser_add_parse_error(parser, token);
3702
- ignore_token(parser);
3703
- return false;
3704
- }
3705
- pop_current_node(parser);
3706
- if (!is_fragment_parser(parser) &&
3707
- !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3708
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3709
- }
3710
- return true;
3711
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3712
- insert_element_from_token(parser, token);
3713
- pop_current_node(parser);
3714
- acknowledge_self_closing_tag(parser);
3715
- return true;
3716
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3717
- return handle_in_head(parser, token);
3718
- } else if (token->type == GUMBO_TOKEN_EOF) {
3719
- if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3720
- parser_add_parse_error(parser, token);
3721
- return false;
3722
- }
3723
- return true;
3724
- } else {
3725
- parser_add_parse_error(parser, token);
3726
- ignore_token(parser);
3727
- return false;
3728
- }
3729
- }
3730
-
3731
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
3732
- static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3733
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3734
- insert_text_token(parser, token);
3735
- return true;
3736
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3737
- append_comment_node(parser, get_current_node(parser), token);
3738
- return true;
3739
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3740
- parser_add_parse_error(parser, token);
3741
- ignore_token(parser);
3742
- return false;
3743
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3744
- return handle_in_body(parser, token);
3745
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3746
- GumboNode* html = parser->_parser_state->_open_elements.data[0];
3747
- assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3748
- record_end_of_element(
3749
- parser->_parser_state->_current_token, &html->v.element);
3750
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3751
- return true;
3752
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3753
- return handle_in_head(parser, token);
3754
- } else if (token->type == GUMBO_TOKEN_EOF) {
3755
- return true;
3756
- } else {
3757
- parser_add_parse_error(parser, token);
3758
- ignore_token(parser);
3759
- return false;
3760
- }
3761
- }
3762
-
3763
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
3764
- static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3765
- if (token->type == GUMBO_TOKEN_COMMENT) {
3766
- append_comment_node(parser, get_document_node(parser), token);
3767
- return true;
3768
- } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3769
- token->type == GUMBO_TOKEN_WHITESPACE ||
3770
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3771
- return handle_in_body(parser, token);
3772
- } else if (token->type == GUMBO_TOKEN_EOF) {
3773
- return true;
3774
- } else {
3775
- parser_add_parse_error(parser, token);
3776
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3777
- parser->_parser_state->_reprocess_current_token = true;
3778
- return false;
3779
- }
3780
- }
3781
-
3782
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
3783
- static bool handle_after_after_frameset(
3784
- GumboParser* parser, GumboToken* token) {
3785
- if (token->type == GUMBO_TOKEN_COMMENT) {
3786
- append_comment_node(parser, get_document_node(parser), token);
3787
- return true;
3788
- } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3789
- token->type == GUMBO_TOKEN_WHITESPACE ||
3790
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3791
- return handle_in_body(parser, token);
3792
- } else if (token->type == GUMBO_TOKEN_EOF) {
3793
- return true;
3794
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3795
- return handle_in_head(parser, token);
3796
- } else {
3797
- parser_add_parse_error(parser, token);
3798
- ignore_token(parser);
3799
- return false;
3800
- }
3801
- }
3802
-
3803
- // Function pointers for each insertion mode. Keep in sync with
3804
- // insertion_mode.h.
3805
- typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3806
- static const TokenHandler kTokenHandlers[] = {handle_initial,
3807
- handle_before_html, handle_before_head, handle_in_head,
3808
- handle_in_head_noscript, handle_after_head, handle_in_body, handle_text,
3809
- handle_in_table, handle_in_table_text, handle_in_caption,
3810
- handle_in_column_group, handle_in_table_body, handle_in_row, handle_in_cell,
3811
- handle_in_select, handle_in_select_in_table, handle_in_template,
3812
- handle_after_body, handle_in_frameset, handle_after_frameset,
3813
- handle_after_after_body, handle_after_after_frameset};
3814
-
3815
- static bool handle_html_content(GumboParser* parser, GumboToken* token) {
3816
- return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
3817
- parser, token);
3818
- }
3819
-
3820
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
3821
- static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3822
- gumbo_debug("Handling foreign content");
3823
- switch (token->type) {
3824
- case GUMBO_TOKEN_NULL:
3825
- parser_add_parse_error(parser, token);
3826
- token->v.character = kUtf8ReplacementChar;
3827
- insert_text_token(parser, token);
3828
- return false;
3829
- case GUMBO_TOKEN_WHITESPACE:
3830
- insert_text_token(parser, token);
3831
- return true;
3832
- case GUMBO_TOKEN_CDATA:
3833
- case GUMBO_TOKEN_CHARACTER:
3834
- insert_text_token(parser, token);
3835
- set_frameset_not_ok(parser);
3836
- return true;
3837
- case GUMBO_TOKEN_COMMENT:
3838
- append_comment_node(parser, get_current_node(parser), token);
3839
- return true;
3840
- case GUMBO_TOKEN_DOCTYPE:
3841
- parser_add_parse_error(parser, token);
3842
- ignore_token(parser);
3843
- return false;
3844
- default:
3845
- // Fall through to the if-statements below.
3846
- break;
3847
- }
3848
- // Order matters for these clauses.
3849
- if (tag_in(token, kStartTag,
3850
- (gumbo_tagset){TAG(B), TAG(BIG), TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
3851
- TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV), TAG(DL), TAG(DT),
3852
- TAG(EM), TAG(EMBED), TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5),
3853
- TAG(H6), TAG(HEAD), TAG(HR), TAG(I), TAG(IMG), TAG(LI),
3854
- TAG(LISTING), TAG(MENU), TAG(META), TAG(NOBR), TAG(OL), TAG(P),
3855
- TAG(PRE), TAG(RUBY), TAG(S), TAG(SMALL), TAG(SPAN), TAG(STRONG),
3856
- TAG(STRIKE), TAG(SUB), TAG(SUP), TAG(TABLE), TAG(TT), TAG(U),
3857
- TAG(UL), TAG(VAR)}) ||
3858
- (tag_is(token, kStartTag, GUMBO_TAG_FONT) &&
3859
- (token_has_attribute(token, "color") ||
3860
- token_has_attribute(token, "face") ||
3861
- token_has_attribute(token, "size")))) {
3862
- /* Parse error */
3863
- parser_add_parse_error(parser, token);
3864
-
3865
- /*
3866
- * Fragment case: If the parser was originally created for the HTML
3867
- * fragment parsing algorithm, then act as described in the "any other
3868
- * start tag" entry below.
3869
- */
3870
- if (!is_fragment_parser(parser)) {
3871
- do {
3872
- pop_current_node(parser);
3873
- } while (!(is_mathml_integration_point(get_current_node(parser)) ||
3874
- is_html_integration_point(get_current_node(parser)) ||
3875
- get_current_node(parser)->v.element.tag_namespace ==
3876
- GUMBO_NAMESPACE_HTML));
3877
- parser->_parser_state->_reprocess_current_token = true;
3878
- return false;
3879
- }
3880
-
3881
- assert(token->type == GUMBO_TOKEN_START_TAG);
3882
- }
3883
-
3884
- if (token->type == GUMBO_TOKEN_START_TAG) {
3885
- const GumboNamespaceEnum current_namespace =
3886
- get_adjusted_current_node(parser)->v.element.tag_namespace;
3887
- if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3888
- adjust_mathml_attributes(parser, token);
3889
- }
3890
- if (current_namespace == GUMBO_NAMESPACE_SVG) {
3891
- // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3892
- // function.
3893
- adjust_svg_attributes(parser, token);
3894
- }
3895
- adjust_foreign_attributes(parser, token);
3896
- insert_foreign_element(parser, token, current_namespace);
3897
- if (token->v.start_tag.is_self_closing) {
3898
- pop_current_node(parser);
3899
- acknowledge_self_closing_tag(parser);
3900
- }
3901
- return true;
3902
- // </script> tags are handled like any other end tag, putting the script's
3903
- // text into a text node child and closing the current node.
3904
- } else {
3905
- assert(token->type == GUMBO_TOKEN_END_TAG);
3906
- GumboNode* node = get_current_node(parser);
3907
- assert(node != NULL);
3908
- GumboStringPiece token_tagname = token->original_text;
3909
- GumboStringPiece node_tagname = node->v.element.original_tag;
3910
- gumbo_tag_from_original_text(&token_tagname);
3911
- gumbo_tag_from_original_text(&node_tagname);
3912
-
3913
- bool is_success = true;
3914
- if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3915
- parser_add_parse_error(parser, token);
3916
- is_success = false;
3917
- }
3918
- int i = parser->_parser_state->_open_elements.length;
3919
- for (--i; i > 0;) {
3920
- // Here we move up the stack until we find an HTML element (in which
3921
- // case we do nothing) or we find the element that we're about to
3922
- // close (in which case we pop everything we've seen until that
3923
- // point.)
3924
- gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3925
- node_tagname.data, i);
3926
- if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3927
- gumbo_debug("Matches.\n");
3928
- while (pop_current_node(parser) != node) {
3929
- // Pop all the nodes below the current one. Node is guaranteed to
3930
- // be an element on the stack of open elements (set below), so
3931
- // this loop is guaranteed to terminate.
3932
- }
3933
- return is_success;
3934
- }
3935
- --i;
3936
- node = parser->_parser_state->_open_elements.data[i];
3937
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3938
- // Must break before gumbo_tag_from_original_text to avoid passing
3939
- // parser-inserted nodes through.
3940
- break;
3941
- }
3942
- node_tagname = node->v.element.original_tag;
3943
- gumbo_tag_from_original_text(&node_tagname);
3944
- }
3945
- assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3946
- // We can't call handle_token directly because the current node is still in
3947
- // the SVG namespace, so it would re-enter this and result in infinite
3948
- // recursion.
3949
- return handle_html_content(parser, token) && is_success;
3950
- }
3951
- }
3952
-
3953
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
3954
- static bool handle_token(GumboParser* parser, GumboToken* token) {
3955
- if (parser->_parser_state->_ignore_next_linefeed &&
3956
- token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3957
- parser->_parser_state->_ignore_next_linefeed = false;
3958
- ignore_token(parser);
3959
- return true;
3960
- }
3961
- // This needs to be reset both here and in the conditional above to catch both
3962
- // the case where the next token is not whitespace (so we don't ignore
3963
- // whitespace in the middle of <pre> tags) and where there are multiple
3964
- // whitespace tokens (so we don't ignore the second one).
3965
- parser->_parser_state->_ignore_next_linefeed = false;
3966
-
3967
- if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3968
- parser->_parser_state->_closed_body_tag = true;
3969
- }
3970
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3971
- parser->_parser_state->_closed_html_tag = true;
3972
- }
3973
-
3974
- const GumboNode* current_node = get_adjusted_current_node(parser);
3975
- assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT ||
3976
- current_node->type == GUMBO_NODE_TEMPLATE);
3977
- if (current_node) {
3978
- gumbo_debug("Current node: <%s>.\n",
3979
- gumbo_normalized_tagname(current_node->v.element.tag));
3980
- }
3981
- if (!current_node ||
3982
- current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3983
- (is_mathml_integration_point(current_node) &&
3984
- (token->type == GUMBO_TOKEN_CHARACTER ||
3985
- token->type == GUMBO_TOKEN_WHITESPACE ||
3986
- token->type == GUMBO_TOKEN_NULL ||
3987
- (token->type == GUMBO_TOKEN_START_TAG &&
3988
- !tag_in(token, kStartTag,
3989
- (gumbo_tagset){TAG(MGLYPH), TAG(MALIGNMARK)})))) ||
3990
- (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3991
- node_qualified_tag_is(
3992
- current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
3993
- tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3994
- (is_html_integration_point(current_node) &&
3995
- (token->type == GUMBO_TOKEN_START_TAG ||
3996
- token->type == GUMBO_TOKEN_CHARACTER ||
3997
- token->type == GUMBO_TOKEN_NULL ||
3998
- token->type == GUMBO_TOKEN_WHITESPACE)) ||
3999
- token->type == GUMBO_TOKEN_EOF) {
4000
- return handle_html_content(parser, token);
4001
- } else {
4002
- return handle_in_foreign_content(parser, token);
4003
- }
4004
- }
4005
-
4006
- static void fragment_parser_init(GumboParser* parser, GumboTag fragment_ctx,
4007
- GumboNamespaceEnum fragment_namespace) {
4008
- GumboNode* root;
4009
- assert(fragment_ctx != GUMBO_TAG_LAST);
4010
-
4011
- // 3
4012
- parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
4013
- parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
4014
- fragment_namespace;
4015
-
4016
- // 4
4017
- if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
4018
- // Non-HTML namespaces always start in the DATA state.
4019
- switch (fragment_ctx) {
4020
- case GUMBO_TAG_TITLE:
4021
- case GUMBO_TAG_TEXTAREA:
4022
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
4023
- break;
4024
-
4025
- case GUMBO_TAG_STYLE:
4026
- case GUMBO_TAG_XMP:
4027
- case GUMBO_TAG_IFRAME:
4028
- case GUMBO_TAG_NOEMBED:
4029
- case GUMBO_TAG_NOFRAMES:
4030
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4031
- break;
4032
-
4033
- case GUMBO_TAG_SCRIPT:
4034
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4035
- break;
4036
-
4037
- case GUMBO_TAG_NOSCRIPT:
4038
- /* scripting is disabled in Gumbo, so leave the tokenizer
4039
- * in the default data state */
4040
- break;
4041
-
4042
- case GUMBO_TAG_PLAINTEXT:
4043
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
4044
- break;
4045
-
4046
- default:
4047
- /* default data state */
4048
- break;
4049
- }
4050
- }
4051
-
4052
- // 5. 6. 7.
4053
- root = insert_element_of_tag_type(
4054
- parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
4055
- parser->_output->root = root;
4056
-
4057
- // 8.
4058
- if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
4059
- push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
4060
- }
4061
-
4062
- // 10.
4063
- reset_insertion_mode_appropriately(parser);
4064
- }
4065
-
4066
- GumboOutput* gumbo_parse(const char* buffer) {
4067
- return gumbo_parse_with_options(
4068
- &kGumboDefaultOptions, buffer, strlen(buffer));
4069
- }
4070
-
4071
- GumboOutput* gumbo_parse_with_options(
4072
- const GumboOptions* options, const char* buffer, size_t length) {
4073
- GumboParser parser;
4074
- parser._options = options;
4075
- output_init(&parser);
4076
- gumbo_tokenizer_state_init(&parser, buffer, length);
4077
- parser_state_init(&parser);
4078
-
4079
- if (options->fragment_context != GUMBO_TAG_LAST) {
4080
- fragment_parser_init(
4081
- &parser, options->fragment_context, options->fragment_namespace);
4082
- }
4083
-
4084
- GumboParserState* state = parser._parser_state;
4085
- gumbo_debug("Parsing %.*s.\n", length, buffer);
4086
-
4087
- // Sanity check so that infinite loops die with an assertion failure instead
4088
- // of hanging the process before we ever get an error.
4089
- int loop_count = 0;
4090
-
4091
- GumboToken token;
4092
- bool has_error = false;
4093
-
4094
- do {
4095
- if (state->_reprocess_current_token) {
4096
- state->_reprocess_current_token = false;
4097
- } else {
4098
- GumboNode* current_node = get_current_node(&parser);
4099
- gumbo_tokenizer_set_is_current_node_foreign(&parser,
4100
- current_node &&
4101
- current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
4102
- has_error = !gumbo_lex(&parser, &token) || has_error;
4103
- }
4104
- const char* token_type = "text";
4105
- switch (token.type) {
4106
- case GUMBO_TOKEN_DOCTYPE:
4107
- token_type = "doctype";
4108
- break;
4109
- case GUMBO_TOKEN_START_TAG:
4110
- token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
4111
- break;
4112
- case GUMBO_TOKEN_END_TAG:
4113
- token_type = gumbo_normalized_tagname(token.v.end_tag);
4114
- break;
4115
- case GUMBO_TOKEN_COMMENT:
4116
- token_type = "comment";
4117
- break;
4118
- default:
4119
- break;
4120
- }
4121
- gumbo_debug("Handling %s token @%d:%d in state %d.\n", (char*) token_type,
4122
- token.position.line, token.position.column, state->_insertion_mode);
4123
-
4124
- state->_current_token = &token;
4125
- state->_self_closing_flag_acknowledged =
4126
- !(token.type == GUMBO_TOKEN_START_TAG &&
4127
- token.v.start_tag.is_self_closing);
4128
-
4129
- has_error = !handle_token(&parser, &token) || has_error;
4130
-
4131
- // Check for memory leaks when ownership is transferred from start tag
4132
- // tokens to nodes.
4133
- assert(state->_reprocess_current_token ||
4134
- token.type != GUMBO_TOKEN_START_TAG ||
4135
- token.v.start_tag.attributes.data == NULL);
4136
-
4137
- if (!state->_self_closing_flag_acknowledged) {
4138
- GumboError* error = parser_add_parse_error(&parser, &token);
4139
- if (error) {
4140
- error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
4141
- }
4142
- }
4143
-
4144
- ++loop_count;
4145
- assert(loop_count < 1000000000);
4146
-
4147
- } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
4148
- !(options->stop_on_first_error && has_error));
4149
-
4150
- finish_parsing(&parser);
4151
- // For API uniformity reasons, if the doctype still has nulls, convert them to
4152
- // empty strings.
4153
- GumboDocument* doc_type = &parser._output->document->v.document;
4154
- if (doc_type->name == NULL) {
4155
- doc_type->name = gumbo_copy_stringz(&parser, "");
4156
- }
4157
- if (doc_type->public_identifier == NULL) {
4158
- doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
4159
- }
4160
- if (doc_type->system_identifier == NULL) {
4161
- doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
4162
- }
4163
-
4164
- parser_state_destroy(&parser);
4165
- gumbo_tokenizer_state_destroy(&parser);
4166
- return parser._output;
4167
- }
4168
-
4169
- void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
4170
- // Need a dummy GumboParser because the allocator comes along with the
4171
- // options object.
4172
- GumboParser parser;
4173
- parser._options = options;
4174
- destroy_node(&parser, node);
4175
- }
4176
-
4177
- void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
4178
- // Need a dummy GumboParser because the allocator comes along with the
4179
- // options object.
4180
- GumboParser parser;
4181
- parser._options = options;
4182
- destroy_node(&parser, output->document);
4183
- for (unsigned int i = 0; i < output->errors.length; ++i) {
4184
- gumbo_error_destroy(&parser, output->errors.data[i]);
4185
- }
4186
- gumbo_vector_destroy(&parser, &output->errors);
4187
- gumbo_parser_deallocate(&parser, output);
4188
- }