ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,57 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_INSERTION_MODE_H_
18
+ #define GUMBO_INSERTION_MODE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
// Insertion modes of the HTML5 tree-construction stage:
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
//
// Only the first enumerator carries an explicit value, so the values count up
// from zero in declaration order.  When adding a value, also update the
// kTokenHandlers dispatch table in parser.c.
typedef enum {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML,
  GUMBO_INSERTION_MODE_BEFORE_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
  GUMBO_INSERTION_MODE_AFTER_HEAD,

  GUMBO_INSERTION_MODE_IN_BODY,
  GUMBO_INSERTION_MODE_TEXT,

  GUMBO_INSERTION_MODE_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
  GUMBO_INSERTION_MODE_IN_CAPTION,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY,
  GUMBO_INSERTION_MODE_IN_ROW,
  GUMBO_INSERTION_MODE_IN_CELL,

  GUMBO_INSERTION_MODE_IN_SELECT,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TEMPLATE,

  GUMBO_INSERTION_MODE_AFTER_BODY,
  GUMBO_INSERTION_MODE_IN_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
} GumboInsertionMode;
52
+
53
+ #ifdef __cplusplus
54
+ } // extern C
55
+ #endif
56
+
57
+ #endif // GUMBO_INSERTION_MODE_H_
@@ -0,0 +1,3917 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include <assert.h>
18
+ #include <ctype.h>
19
+ #include <stdarg.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "attribute.h"
25
+ #include "error.h"
26
+ #include "gumbo.h"
27
+ #include "insertion_mode.h"
28
+ #include "parser.h"
29
+ #include "tokenizer.h"
30
+ #include "tokenizer_states.h"
31
+ #include "utf8.h"
32
+ #include "util.h"
33
+ #include "vector.h"
34
+
35
+
36
// Evaluates its argument and discards the result, so that a variable that is
// otherwise unused does not trigger a compiler warning.
#define AVOID_UNUSED_VARIABLE_WARNING(i) ((void)(i))

// Brace initializer made of a string literal and its length.  The length is
// sizeof(literal) - 1, i.e. it does not count the terminating NUL.
#define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }

// Empty-string, zero-length initializer used as the last entry of the
// terminated lookup tables.
#define TERMINATOR { "", 0 }
40
+
41
// Allocation callback installed in kGumboDefaultOptions: forwards the request
// to malloc().  The first argument is ignored.
static void* malloc_wrapper(void* unused, size_t size) {
  (void) unused;
  void* block = malloc(size);
  return block;
}
44
+
45
// Deallocation callback installed in kGumboDefaultOptions: hands ptr to
// free().  The first argument is ignored.
static void free_wrapper(void* unused, void* ptr) {
  (void) unused;
  free(ptr);
}
48
+
49
+ const GumboOptions kGumboDefaultOptions = {
50
+ &malloc_wrapper,
51
+ &free_wrapper,
52
+ NULL,
53
+ 8,
54
+ false,
55
+ -1,
56
+ };
57
+
58
+ static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");
59
+ static const GumboStringPiece kPublicIdHtml4_0 = GUMBO_STRING(
60
+ "-//W3C//DTD HTML 4.0//EN");
61
+ static const GumboStringPiece kPublicIdHtml4_01 = GUMBO_STRING(
62
+ "-//W3C//DTD HTML 4.01//EN");
63
+ static const GumboStringPiece kPublicIdXhtml1_0 = GUMBO_STRING(
64
+ "-//W3C//DTD XHTML 1.0 Strict//EN");
65
+ static const GumboStringPiece kPublicIdXhtml1_1 = GUMBO_STRING(
66
+ "-//W3C//DTD XHTML 1.1//EN");
67
+ static const GumboStringPiece kSystemIdRecHtml4_0 = GUMBO_STRING(
68
+ "http://www.w3.org/TR/REC-html40/strict.dtd");
69
+ static const GumboStringPiece kSystemIdHtml4 = GUMBO_STRING(
70
+ "http://www.w3.org/TR/html4/strict.dtd");
71
+ static const GumboStringPiece kSystemIdXhtmlStrict1_1 = GUMBO_STRING(
72
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
73
+ static const GumboStringPiece kSystemIdXhtml1_1 = GUMBO_STRING(
74
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
75
+ static const GumboStringPiece kSystemIdLegacyCompat = GUMBO_STRING(
76
+ "about:legacy-compat");
77
+
78
+ // The doctype arrays have an explicit terminator because we want to pass them
79
+ // to a helper function, and passing them as a pointer discards sizeof
80
+ // information. The SVG arrays are used only by one-off functions, and so loops
81
+ // over them use sizeof directly instead of a terminator.
82
+
83
+ static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
84
+ GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
85
+ GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
86
+ GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
87
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
88
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
89
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
90
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
91
+ GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
92
+ GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
93
+ GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
94
+ GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
95
+ GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
96
+ GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
97
+ GUMBO_STRING("-//IETF//DTD HTML 3//"),
98
+ GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
99
+ GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
100
+ GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
101
+ GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
102
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
103
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
104
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
105
+ GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
106
+ GUMBO_STRING("-//IETF//DTD HTML Strict//"),
107
+ GUMBO_STRING("-//IETF//DTD HTML//"),
108
+ GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
109
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
110
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
111
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
112
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
113
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
114
+ GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
115
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
116
+ GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
117
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
118
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
119
+ GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
120
+ GUMBO_STRING("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
121
+ "extensions to HTML 4.0//"),
122
+ GUMBO_STRING("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
123
+ "extensions to HTML 4.0//"),
124
+ GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
125
+ GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
126
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
127
+ GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
128
+ GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
129
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
130
+ GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
131
+ GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
132
+ GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
133
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
134
+ GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
135
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
136
+ GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
137
+ GUMBO_STRING("-//W3C//DTD W3 HTML//"),
138
+ GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
139
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
140
+ GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"),
141
+ TERMINATOR
142
+ };
143
+
144
+ static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {
145
+ GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
146
+ GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
147
+ GUMBO_STRING("HTML"),
148
+ TERMINATOR
149
+ };
150
+
151
+ static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {
152
+ GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
153
+ TERMINATOR
154
+ };
155
+
156
+ static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {
157
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
158
+ GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
159
+ TERMINATOR
160
+ };
161
+
162
+ static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = {
163
+ GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
164
+ GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"),
165
+ TERMINATOR
166
+ };
167
+
168
// Namespace URIs, looked up by GumboNamespaceEnum value.  The order of the
// entries must stay in sync with that enum.
static const char* kLegalXmlns[] = {
  "http://www.w3.org/1999/xhtml",        // XHTML
  "http://www.w3.org/2000/svg",          // SVG
  "http://www.w3.org/1998/Math/MathML"   // MathML
};
174
+
175
+ typedef struct _ReplacementEntry {
176
+ const GumboStringPiece from;
177
+ const GumboStringPiece to;
178
+ } ReplacementEntry;
179
+
180
+ #define REPLACEMENT_ENTRY(from, to) \
181
+ { GUMBO_STRING(from), GUMBO_STRING(to) }
182
+
183
+ // Static data for SVG attribute replacements.
184
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
185
+ static const ReplacementEntry kSvgAttributeReplacements[] = {
186
+ REPLACEMENT_ENTRY("attributename", "attributeName"),
187
+ REPLACEMENT_ENTRY("attributetype", "attributeType"),
188
+ REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
189
+ REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
190
+ REPLACEMENT_ENTRY("calcmode", "calcMode"),
191
+ REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
192
+ REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
193
+ REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
194
+ REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
195
+ REPLACEMENT_ENTRY("edgemode", "edgeMode"),
196
+ REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
197
+ REPLACEMENT_ENTRY("filterres", "filterRes"),
198
+ REPLACEMENT_ENTRY("filterunits", "filterUnits"),
199
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
200
+ REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
201
+ REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
202
+ REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
203
+ REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
204
+ REPLACEMENT_ENTRY("keypoints", "keyPoints"),
205
+ REPLACEMENT_ENTRY("keysplines", "keySplines"),
206
+ REPLACEMENT_ENTRY("keytimes", "keyTimes"),
207
+ REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
208
+ REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
209
+ REPLACEMENT_ENTRY("markerheight", "markerHeight"),
210
+ REPLACEMENT_ENTRY("markerunits", "markerUnits"),
211
+ REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
212
+ REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
213
+ REPLACEMENT_ENTRY("maskunits", "maskUnits"),
214
+ REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
215
+ REPLACEMENT_ENTRY("pathlength", "pathLength"),
216
+ REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
217
+ REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
218
+ REPLACEMENT_ENTRY("patternunits", "patternUnits"),
219
+ REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
220
+ REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
221
+ REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
222
+ REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
223
+ REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
224
+ REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
225
+ REPLACEMENT_ENTRY("refx", "refX"),
226
+ REPLACEMENT_ENTRY("refy", "refY"),
227
+ REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
228
+ REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
229
+ REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
230
+ REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
231
+ REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
232
+ REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
233
+ REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
234
+ REPLACEMENT_ENTRY("startoffset", "startOffset"),
235
+ REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
236
+ REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
237
+ REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
238
+ REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
239
+ REPLACEMENT_ENTRY("tablevalues", "tableValues"),
240
+ REPLACEMENT_ENTRY("targetx", "targetX"),
241
+ REPLACEMENT_ENTRY("targety", "targetY"),
242
+ REPLACEMENT_ENTRY("textlength", "textLength"),
243
+ REPLACEMENT_ENTRY("viewbox", "viewBox"),
244
+ REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
245
+ REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
246
+ REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
247
+ REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
248
+ };
249
+
250
+ static const ReplacementEntry kSvgTagReplacements[] = {
251
+ REPLACEMENT_ENTRY("altglyph", "altGlyph"),
252
+ REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
253
+ REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
254
+ REPLACEMENT_ENTRY("animatecolor", "animateColor"),
255
+ REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
256
+ REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
257
+ REPLACEMENT_ENTRY("clippath", "clipPath"),
258
+ REPLACEMENT_ENTRY("feblend", "feBlend"),
259
+ REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
260
+ REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
261
+ REPLACEMENT_ENTRY("fecomposite", "feComposite"),
262
+ REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
263
+ REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
264
+ REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
265
+ REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
266
+ REPLACEMENT_ENTRY("feflood", "feFlood"),
267
+ REPLACEMENT_ENTRY("fefunca", "feFuncA"),
268
+ REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
269
+ REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
270
+ REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
271
+ REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
272
+ REPLACEMENT_ENTRY("feimage", "feImage"),
273
+ REPLACEMENT_ENTRY("femerge", "feMerge"),
274
+ REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
275
+ REPLACEMENT_ENTRY("femorphology", "feMorphology"),
276
+ REPLACEMENT_ENTRY("feoffset", "feOffset"),
277
+ REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
278
+ REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
279
+ REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
280
+ REPLACEMENT_ENTRY("fetile", "feTile"),
281
+ REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
282
+ REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
283
+ REPLACEMENT_ENTRY("glyphref", "glyphRef"),
284
+ REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
285
+ REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
286
+ REPLACEMENT_ENTRY("textpath", "textPath"),
287
+ };
288
+
289
+ typedef struct _NamespacedAttributeReplacement {
290
+ const char* from;
291
+ const char* local_name;
292
+ const GumboAttributeNamespaceEnum attr_namespace;
293
+ } NamespacedAttributeReplacement;
294
+
295
+ static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
296
+ { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
297
+ { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
298
+ { "xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK },
299
+ { "xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK },
300
+ { "xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK },
301
+ { "xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK },
302
+ { "xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK },
303
+ { "xml:base", "base", GUMBO_ATTR_NAMESPACE_XML },
304
+ { "xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML },
305
+ { "xml:space", "space", GUMBO_ATTR_NAMESPACE_XML },
306
+ { "xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS },
307
+ { "xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS },
308
+ };
309
+
310
+ // The "scope marker" for the list of active formatting elements. We use a
311
+ // pointer to this as a generic marker element, since the particular element
312
+ // scope doesn't matter.
313
+ static const GumboNode kActiveFormattingScopeMarker;
314
+
315
// tag_is and tag_in take a boolean that selects between start tags (true) and
// end tags (false).  These named constants make the call sites readable.
static const bool kStartTag = true;
static const bool kEndTag = false;
319
+
320
+ // Because GumboStringPieces are immutable, we can't insert a character directly
321
+ // into a text node. Instead, we accumulate all pending characters here and
322
+ // flush them out to a text node whenever a new element is inserted.
323
+ //
324
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
325
+ typedef struct _TextNodeBufferState {
326
+ // The accumulated text to be inserted into the current text node.
327
+ GumboStringBuffer _buffer;
328
+
329
+ // A pointer to the original text represented by this text node. Note that
330
+ // because of foster parenting and other strange DOM manipulations, this may
331
+ // include other non-text HTML tags in it; it is defined as the span of
332
+ // original text from the first character in this text node to the last
333
+ // character in this text node.
334
+ const char* _start_original_text;
335
+
336
+ // The source position of the start of this text node.
337
+ GumboSourcePosition _start_position;
338
+
339
+ // The type of node that will be inserted (TEXT or WHITESPACE).
340
+ GumboNodeType _type;
341
+ } TextNodeBufferState;
342
+
343
+ typedef struct GumboInternalParserState {
344
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
345
+ GumboInsertionMode _insertion_mode;
346
+
347
+ // Used for run_generic_parsing_algorithm, which needs to switch back to the
348
+ // original insertion mode at its conclusion.
349
+ GumboInsertionMode _original_insertion_mode;
350
+
351
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
352
+ GumboVector /*GumboNode*/ _open_elements;
353
+
354
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
355
+ GumboVector /*GumboNode*/ _active_formatting_elements;
356
+
357
+ // The stack of template insertion modes.
358
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
359
+ GumboVector /*InsertionMode*/ _template_insertion_modes;
360
+
361
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
362
+ GumboNode* _head_element;
363
+ GumboNode* _form_element;
364
+
365
+ // The flag for when the spec says "Reprocess the current token in..."
366
+ bool _reprocess_current_token;
367
+
368
+ // The flag for "acknowledge the token's self-closing flag".
369
+ bool _self_closing_flag_acknowledged;
370
+
371
+ // The "frameset-ok" flag from the spec.
372
+ bool _frameset_ok;
373
+
374
+ // The flag for "If the next token is a LINE FEED, ignore that token...".
375
+ bool _ignore_next_linefeed;
376
+
377
+ // The flag for "whenever a node would be inserted into the current node, it
378
+ // must instead be foster parented". This is used for misnested table
379
+ // content, which needs to be handled according to "in body" rules yet foster
380
+ // parented outside of the table.
381
+ // It would perhaps be more explicit to have this as a parameter to
382
+ // handle_in_body and insert_element, but given how special-purpose this is
383
+ // and the number of call-sites that would need to take the extra parameter,
384
+ // it's easier just to have a state flag.
385
+ bool _foster_parent_insertions;
386
+
387
+ // The accumulated text node buffer state.
388
+ TextNodeBufferState _text_node;
389
+
390
+ // The current token.
391
+ GumboToken* _current_token;
392
+
393
+ // The way that the spec is written, the </body> and </html> tags are *always*
394
+ // implicit, because encountering one of those tokens merely switches the
395
+ // insertion mode out of "in body". So we have individual state flags for
396
+ // those end tags that are then inspected by pop_current_node when the <body>
397
+ // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
398
+ // flag appropriately.
399
+ bool _closed_body_tag;
400
+ bool _closed_html_tag;
401
+ } GumboParserState;
402
+
403
+ static bool token_has_attribute(const GumboToken* token, const char* name) {
404
+ assert(token->type == GUMBO_TOKEN_START_TAG);
405
+ return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
406
+ }
407
+
408
+ // Checks if the value of the specified attribute is a case-insensitive match
409
+ // for the specified string.
410
+ static bool attribute_matches(
411
+ const GumboVector* attributes, const char* name, const char* value) {
412
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
413
+ return attr ? strcasecmp(value, attr->value) == 0 : false;
414
+ }
415
+
416
+ // Checks if the value of the specified attribute is a case-sensitive match
417
+ // for the specified string.
418
+ static bool attribute_matches_case_sensitive(
419
+ const GumboVector* attributes, const char* name, const char* value) {
420
+ const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
421
+ return attr ? strcmp(value, attr->value) == 0 : false;
422
+ }
423
+
424
+ // Checks if the specified attribute vectors are identical.
425
+ static bool all_attributes_match(
426
+ const GumboVector* attr1, const GumboVector* attr2) {
427
+ int num_unmatched_attr2_elements = attr2->length;
428
+ for (int i = 0; i < attr1->length; ++i) {
429
+ const GumboAttribute* attr = attr1->data[i];
430
+ if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
431
+ --num_unmatched_attr2_elements;
432
+ } else {
433
+ return false;
434
+ }
435
+ }
436
+ return num_unmatched_attr2_elements == 0;
437
+ }
438
+
439
+ static void set_frameset_not_ok(GumboParser* parser) {
440
+ gumbo_debug("Setting frameset_ok to false.\n");
441
+ parser->_parser_state->_frameset_ok = false;
442
+ }
443
+
444
+ static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
445
+ GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
446
+ node->parent = NULL;
447
+ node->index_within_parent = -1;
448
+ node->type = type;
449
+ node->parse_flags = GUMBO_INSERTION_NORMAL;
450
+ return node;
451
+ }
452
+
453
+ static GumboNode* new_document_node(GumboParser* parser) {
454
+ GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
455
+ document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
456
+ gumbo_vector_init(
457
+ parser, 1, &document_node->v.document.children);
458
+
459
+ // Must be initialized explicitly, as there's no guarantee that we'll see a
460
+ // doc type token.
461
+ GumboDocument* document = &document_node->v.document;
462
+ document->has_doctype = false;
463
+ document->name = NULL;
464
+ document->public_identifier = NULL;
465
+ document->system_identifier = NULL;
466
+ return document_node;
467
+ }
468
+
469
+ static void output_init(GumboParser* parser) {
470
+ GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
471
+ output->root = NULL;
472
+ output->document = new_document_node(parser);
473
+ parser->_output = output;
474
+ gumbo_init_errors(parser);
475
+ }
476
+
477
+ static void parser_state_init(GumboParser* parser) {
478
+ GumboParserState* parser_state =
479
+ gumbo_parser_allocate(parser, sizeof(GumboParserState));
480
+ parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
481
+ parser_state->_reprocess_current_token = false;
482
+ parser_state->_frameset_ok = true;
483
+ parser_state->_ignore_next_linefeed = false;
484
+ parser_state->_foster_parent_insertions = false;
485
+ parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
486
+ gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
487
+ gumbo_vector_init(parser, 10, &parser_state->_open_elements);
488
+ gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
489
+ gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
490
+ parser_state->_head_element = NULL;
491
+ parser_state->_form_element = NULL;
492
+ parser_state->_current_token = NULL;
493
+ parser_state->_closed_body_tag = false;
494
+ parser_state->_closed_html_tag = false;
495
+ parser->_parser_state = parser_state;
496
+ }
497
+
498
+ static void parser_state_destroy(GumboParser* parser) {
499
+ GumboParserState* state = parser->_parser_state;
500
+ gumbo_vector_destroy(parser, &state->_active_formatting_elements);
501
+ gumbo_vector_destroy(parser, &state->_open_elements);
502
+ gumbo_vector_destroy(parser, &state->_template_insertion_modes);
503
+ gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
504
+ gumbo_parser_deallocate(parser, state);
505
+ }
506
+
507
+ static GumboNode* get_document_node(GumboParser* parser) {
508
+ return parser->_output->document;
509
+ }
510
+
511
+ // Returns the node at the bottom of the stack of open elements, or NULL if no
512
+ // elements have been added yet.
513
+ static GumboNode* get_current_node(GumboParser* parser) {
514
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
515
+ if (open_elements->length == 0) {
516
+ assert(!parser->_output->root);
517
+ return NULL;
518
+ }
519
+ assert(open_elements->length > 0);
520
+ assert(open_elements->data != NULL);
521
+ return open_elements->data[open_elements->length - 1];
522
+ }
523
+
524
+ // Returns true if the given needle is in the given array of literal
525
+ // GumboStringPieces. If exact_match is true, this requires that they match
526
+ // exactly; otherwise, this performs a prefix match to check if any of the
527
+ // elements in haystack start with needle. This always performs a
528
+ // case-insensitive match.
529
+ static bool is_in_static_list(
530
+ const char* needle, const GumboStringPiece* haystack, bool exact_match) {
531
+ for (int i = 0; haystack[i].length > 0; ++i) {
532
+ if ((exact_match && !strcmp(needle, haystack[i].data)) ||
533
+ (!exact_match && !strcasecmp(needle, haystack[i].data))) {
534
+ return true;
535
+ }
536
+ }
537
+ return false;
538
+ }
539
+
540
+ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
541
+ parser->_parser_state->_insertion_mode = mode;
542
+ }
543
+
544
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
545
+ // This is a helper function that returns the appropriate insertion mode instead
546
+ // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
547
+ // indicate that there is no appropriate insertion mode, and the loop should
548
+ // continue.
549
+ static GumboInsertionMode get_appropriate_insertion_mode(
550
+ const GumboNode* node, bool is_last) {
551
+ assert(node->type == GUMBO_NODE_ELEMENT);
552
+ switch (node->v.element.tag) {
553
+ case GUMBO_TAG_SELECT:
554
+ return GUMBO_INSERTION_MODE_IN_SELECT;
555
+ case GUMBO_TAG_TD:
556
+ case GUMBO_TAG_TH:
557
+ return is_last ?
558
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL;
559
+ case GUMBO_TAG_TR:
560
+ return GUMBO_INSERTION_MODE_IN_ROW;
561
+ case GUMBO_TAG_TBODY:
562
+ case GUMBO_TAG_THEAD:
563
+ case GUMBO_TAG_TFOOT:
564
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
565
+ case GUMBO_TAG_CAPTION:
566
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
567
+ case GUMBO_TAG_COLGROUP:
568
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
569
+ case GUMBO_TAG_TABLE:
570
+ return GUMBO_INSERTION_MODE_IN_TABLE;
571
+ case GUMBO_TAG_HEAD:
572
+ case GUMBO_TAG_BODY:
573
+ return GUMBO_INSERTION_MODE_IN_BODY;
574
+ case GUMBO_TAG_FRAMESET:
575
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
576
+ case GUMBO_TAG_HTML:
577
+ return GUMBO_INSERTION_MODE_BEFORE_HEAD;
578
+ default:
579
+ return is_last ?
580
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
581
+ }
582
+ }
583
+
584
+ // This performs the actual "reset the insertion mode" loop.
585
+ static void reset_insertion_mode_appropriately(GumboParser* parser) {
586
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
587
+ for (int i = open_elements->length; --i >= 0; ) {
588
+ GumboInsertionMode mode =
589
+ get_appropriate_insertion_mode(open_elements->data[i], i == 0);
590
+ if (mode != GUMBO_INSERTION_MODE_INITIAL) {
591
+ set_insertion_mode(parser, mode);
592
+ return;
593
+ }
594
+ }
595
+ // Should never get here, because is_last will be set on the last iteration
596
+ // and will force GUMBO_INSERTION_MODE_IN_BODY.
597
+ assert(0);
598
+ }
599
+
600
+ static GumboError* add_parse_error(GumboParser* parser, const GumboToken* token) {
601
+ gumbo_debug("Adding parse error.\n");
602
+ GumboError* error = gumbo_add_error(parser);
603
+ if (!error) {
604
+ return NULL;
605
+ }
606
+ error->type = GUMBO_ERR_PARSER;
607
+ error->position = token->position;
608
+ error->original_text = token->original_text.data;
609
+ GumboParserError* extra_data = &error->v.parser;
610
+ extra_data->input_type = token->type;
611
+ extra_data->input_tag = GUMBO_TAG_UNKNOWN;
612
+ if (token->type == GUMBO_TOKEN_START_TAG) {
613
+ extra_data->input_tag = token->v.start_tag.tag;
614
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
615
+ extra_data->input_tag = token->v.end_tag;
616
+ }
617
+ GumboParserState* state = parser->_parser_state;
618
+ extra_data->parser_state = state->_insertion_mode;
619
+ gumbo_vector_init(parser, state->_open_elements.length,
620
+ &extra_data->tag_stack);
621
+ for (int i = 0; i < state->_open_elements.length; ++i) {
622
+ const GumboNode* node = state->_open_elements.data[i];
623
+ assert(node->type == GUMBO_NODE_ELEMENT);
624
+ gumbo_vector_add(parser, (void*) node->v.element.tag,
625
+ &extra_data->tag_stack);
626
+ }
627
+ return error;
628
+ }
629
+
630
+ // Returns true if the specified token is either a start or end tag (specified
631
+ // by is_start) with one of the tag types in the varargs list. Terminate the
632
+ // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
633
+ // the spec references tags that are not in the spec.
634
+ // TODO(jdtang): A lot of the tag lists for this function are repeated in many
635
+ // places in the code. This is how it's written in the spec (and it's done this
636
+ // way so it's easy to verify the code against the spec), but it may be worth
637
+ // coming up with a notion of a "tag set" that includes a list of tags, and
638
+ // using that in many places. It'd probably also help performance, but I want
639
+ // to profile before optimizing.
640
+ static bool tag_in(const GumboToken* token, bool is_start, ...) {
641
+ GumboTag token_tag;
642
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
643
+ token_tag = token->v.start_tag.tag;
644
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
645
+ token_tag = token->v.end_tag;
646
+ } else {
647
+ return false;
648
+ }
649
+
650
+ va_list tags;
651
+ va_start(tags, is_start);
652
+ bool result = false;
653
+ for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
654
+ tag = va_arg(tags, GumboTag)) {
655
+ if (tag == token_tag) {
656
+ result = true;
657
+ break;
658
+ }
659
+ }
660
+ va_end(tags);
661
+ return result;
662
+ }
663
+
664
+ // Like tag_in, but for the single-tag case.
665
+ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
666
+ if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
667
+ return token->v.start_tag.tag == tag;
668
+ } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
669
+ return token->v.end_tag == tag;
670
+ } else {
671
+ return false;
672
+ }
673
+ }
674
+
675
+ // Like tag_in, but checks for the tag of a node, rather than a token.
676
+ static bool node_tag_in(const GumboNode* node, ...) {
677
+ assert(node != NULL);
678
+ if (node->type != GUMBO_NODE_ELEMENT) {
679
+ return false;
680
+ }
681
+ GumboTag node_tag = node->v.element.tag;
682
+
683
+ va_list tags;
684
+ va_start(tags, node);
685
+ bool result = false;
686
+ for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
687
+ tag = va_arg(tags, GumboTag)) {
688
+ assert(tag <= GUMBO_TAG_LAST);
689
+ if (tag == node_tag) {
690
+ result = true;
691
+ break;
692
+ }
693
+ }
694
+ va_end(tags);
695
+ return result;
696
+ }
697
+
698
+ // Like node_tag_in, but for the single-tag case.
699
+ static bool node_tag_is(const GumboNode* node, GumboTag tag) {
700
+ return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
701
+ }
702
+
703
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
704
+ static bool is_mathml_integration_point(const GumboNode* node) {
705
+ return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
706
+ GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) &&
707
+ node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
708
+ }
709
+
710
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
711
+ static bool is_html_integration_point(const GumboNode* node) {
712
+ return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
713
+ GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
714
+ node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
715
+ (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
716
+ attribute_matches(&node->v.element.attributes,
717
+ "encoding", "text/html") ||
718
+ attribute_matches(&node->v.element.attributes,
719
+ "encoding", "application/xhtml+xml")));
720
+ }
721
+
722
+ // Appends a node to the end of its parent, setting the "parent" and
723
+ // "index_within_parent" fields appropriately.
724
+ static void append_node(
725
+ GumboParser* parser, GumboNode* parent, GumboNode* node) {
726
+ assert(node->parent == NULL);
727
+ assert(node->index_within_parent == -1);
728
+ GumboVector* children;
729
+ if (parent->type == GUMBO_NODE_ELEMENT) {
730
+ children = &parent->v.element.children;
731
+ } else {
732
+ assert(parent->type == GUMBO_NODE_DOCUMENT);
733
+ children = &parent->v.document.children;
734
+ }
735
+ node->parent = parent;
736
+ node->index_within_parent = children->length;
737
+ gumbo_vector_add(parser, (void*) node, children);
738
+ assert(node->index_within_parent < children->length);
739
+ }
740
+
741
+ // Inserts a node at the specified index within its parent, updating the
742
+ // "parent" and "index_within_parent" fields of it and all its siblings.
743
+ static void insert_node(
744
+ GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
745
+ assert(node->parent == NULL);
746
+ assert(node->index_within_parent == -1);
747
+ assert(parent->type == GUMBO_NODE_ELEMENT);
748
+ GumboVector* children = &parent->v.element.children;
749
+ assert(index >= 0);
750
+ assert(index < children->length);
751
+ node->parent = parent;
752
+ node->index_within_parent = index;
753
+ gumbo_vector_insert_at(parser, (void*) node, index, children);
754
+ assert(node->index_within_parent < children->length);
755
+ for (int i = index + 1; i < children->length; ++i) {
756
+ GumboNode* sibling = children->data[i];
757
+ sibling->index_within_parent = i;
758
+ assert(sibling->index_within_parent < children->length);
759
+ }
760
+ }
761
+
762
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
763
+ static void foster_parent_element(GumboParser* parser, GumboNode* node) {
764
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
765
+ assert(open_elements->length > 2);
766
+
767
+ node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
768
+ GumboNode* foster_parent_element = open_elements->data[0];
769
+ assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
770
+ assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
771
+ for (int i = open_elements->length; --i > 1; ) {
772
+ GumboNode* table_element = open_elements->data[i];
773
+ if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
774
+ foster_parent_element = table_element->parent;
775
+ if (!foster_parent_element ||
776
+ foster_parent_element->type != GUMBO_NODE_ELEMENT) {
777
+ // Table has no parent; spec says it's possible if a script manipulated
778
+ // the DOM, although I don't think we have to worry about this case.
779
+ gumbo_debug("Table has no parent.\n");
780
+ foster_parent_element = open_elements->data[i - 1];
781
+ break;
782
+ }
783
+ assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
784
+ gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
785
+ table_element, i, gumbo_normalized_tagname(
786
+ foster_parent_element->v.element.tag),
787
+ table_element->index_within_parent);
788
+ assert(foster_parent_element->v.element.children.data[
789
+ table_element->index_within_parent] == table_element);
790
+ insert_node(parser, foster_parent_element,
791
+ table_element->index_within_parent, node);
792
+ return;
793
+ }
794
+ }
795
+ if (node->type == GUMBO_NODE_ELEMENT) {
796
+ gumbo_vector_add(parser, (void*) node, open_elements);
797
+ }
798
+ append_node(parser, foster_parent_element, node);
799
+ }
800
+
801
+ static void maybe_flush_text_node_buffer(GumboParser* parser) {
802
+ GumboParserState* state = parser->_parser_state;
803
+ TextNodeBufferState* buffer_state = &state->_text_node;
804
+ if (buffer_state->_buffer.length == 0) {
805
+ return;
806
+ }
807
+
808
+ assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
809
+ buffer_state->_type == GUMBO_NODE_TEXT);
810
+ GumboNode* text_node = create_node(parser, buffer_state->_type);
811
+ GumboText* text_node_data = &text_node->v.text;
812
+ text_node_data->text = gumbo_string_buffer_to_string(
813
+ parser, &buffer_state->_buffer);
814
+ text_node_data->original_text.data = buffer_state->_start_original_text;
815
+ text_node_data->original_text.length =
816
+ state->_current_token->original_text.data -
817
+ buffer_state->_start_original_text;
818
+ text_node_data->start_pos = buffer_state->_start_position;
819
+ if (state->_foster_parent_insertions && node_tag_in(
820
+ get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
821
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
822
+ foster_parent_element(parser, text_node);
823
+ } else {
824
+ append_node(
825
+ parser, parser->_output->root ?
826
+ get_current_node(parser) : parser->_output->document, text_node);
827
+ }
828
+ gumbo_debug("Flushing text node buffer of %.*s.\n",
829
+ (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
830
+
831
+ gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
832
+ gumbo_string_buffer_init(parser, &buffer_state->_buffer);
833
+ buffer_state->_type = GUMBO_NODE_WHITESPACE;
834
+ assert(buffer_state->_buffer.length == 0);
835
+ }
836
+
837
+ static void record_end_of_element(
838
+ GumboToken* current_token, GumboElement* element) {
839
+ element->end_pos = current_token->position;
840
+ element->original_end_tag =
841
+ current_token->type == GUMBO_TOKEN_END_TAG ?
842
+ current_token->original_text : kGumboEmptyString;
843
+ }
844
+
845
+ static GumboNode* pop_current_node(GumboParser* parser) {
846
+ GumboParserState* state = parser->_parser_state;
847
+ maybe_flush_text_node_buffer(parser);
848
+ if (state->_open_elements.length > 0) {
849
+ assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
850
+ gumbo_debug(
851
+ "Popping %s node.\n",
852
+ gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
853
+ }
854
+ GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
855
+ if (!current_node) {
856
+ assert(state->_open_elements.length == 0);
857
+ return NULL;
858
+ }
859
+ assert(current_node->type == GUMBO_NODE_ELEMENT);
860
+ bool is_closed_body_or_html_tag =
861
+ (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
862
+ (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
863
+ if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
864
+ !node_tag_is(current_node, state->_current_token->v.end_tag)) &&
865
+ !is_closed_body_or_html_tag) {
866
+ current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
867
+ }
868
+ if (!is_closed_body_or_html_tag) {
869
+ record_end_of_element(state->_current_token, &current_node->v.element);
870
+ }
871
+ return current_node;
872
+ }
873
+
874
+ static void append_comment_node(
875
+ GumboParser* parser, GumboNode* node, const GumboToken* token) {
876
+ maybe_flush_text_node_buffer(parser);
877
+ GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
878
+ comment->type = GUMBO_NODE_COMMENT;
879
+ comment->parse_flags = GUMBO_INSERTION_NORMAL;
880
+ comment->v.text.text = token->v.text;
881
+ comment->v.text.original_text = token->original_text;
882
+ comment->v.text.start_pos = token->position;
883
+ append_node(parser, node, comment);
884
+ }
885
+
886
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
887
+ static void clear_stack_to_table_row_context(GumboParser* parser) {
888
+ while (!node_tag_in(get_current_node(parser),
889
+ GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
890
+ pop_current_node(parser);
891
+ }
892
+ }
893
+
894
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
895
+ static void clear_stack_to_table_context(GumboParser* parser) {
896
+ while (!node_tag_in(get_current_node(parser),
897
+ GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
898
+ pop_current_node(parser);
899
+ }
900
+ }
901
+
902
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
903
+ void clear_stack_to_table_body_context(GumboParser* parser) {
904
+ while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
905
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
906
+ GUMBO_TAG_LAST)) {
907
+ pop_current_node(parser);
908
+ }
909
+ }
910
+
911
+ // Creates a parser-inserted element in the HTML namespace and returns it.
912
+ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
913
+ GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
914
+ GumboElement* element = &node->v.element;
915
+ gumbo_vector_init(parser, 1, &element->children);
916
+ gumbo_vector_init(parser, 0, &element->attributes);
917
+ element->tag = tag;
918
+ element->tag_namespace = GUMBO_NAMESPACE_HTML;
919
+ element->original_tag = kGumboEmptyString;
920
+ element->original_end_tag = kGumboEmptyString;
921
+ element->start_pos = parser->_parser_state->_current_token->position;
922
+ element->end_pos = kGumboEmptySourcePosition;
923
+ return node;
924
+ }
925
+
926
+ // Constructs an element from the given start tag token.
927
+ static GumboNode* create_element_from_token(
928
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
929
+ assert(token->type == GUMBO_TOKEN_START_TAG);
930
+ GumboTokenStartTag* start_tag = &token->v.start_tag;
931
+
932
+ GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
933
+ GumboElement* element = &node->v.element;
934
+ gumbo_vector_init(parser, 1, &element->children);
935
+ element->attributes = start_tag->attributes;
936
+ element->tag = start_tag->tag;
937
+ element->tag_namespace = tag_namespace;
938
+
939
+ assert(token->original_text.length >= 2);
940
+ assert(token->original_text.data[0] == '<');
941
+ assert(token->original_text.data[token->original_text.length - 1] == '>');
942
+ element->original_tag = token->original_text;
943
+ element->start_pos = token->position;
944
+ element->original_end_tag = kGumboEmptyString;
945
+ element->end_pos = kGumboEmptySourcePosition;
946
+
947
+ // The element takes ownership of the attributes from the token, so any
948
+ // allocated-memory fields should be nulled out.
949
+ start_tag->attributes = kGumboEmptyVector;
950
+ return node;
951
+ }
952
+
953
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
954
+ static void insert_element(GumboParser* parser, GumboNode* node,
955
+ bool is_reconstructing_formatting_elements) {
956
+ GumboParserState* state = parser->_parser_state;
957
+ // NOTE(jdtang): The text node buffer must always be flushed before inserting
958
+ // a node, otherwise we're handling nodes in a different order than the spec
959
+ // mandated. However, one clause of the spec (character tokens in the body)
960
+ // requires that we reconstruct the active formatting elements *before* adding
961
+ // the character, and reconstructing the active formatting elements may itself
962
+ // result in the insertion of new elements (which should be pushed onto the
963
+ // stack of open elements before the buffer is flushed). We solve this (for
964
+ // the time being, the spec has been rewritten for <template> and the new
965
+ // version may be simpler here) with a boolean flag to this method.
966
+ if (!is_reconstructing_formatting_elements) {
967
+ maybe_flush_text_node_buffer(parser);
968
+ }
969
+ if (state->_foster_parent_insertions && node_tag_in(
970
+ get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
971
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
972
+ foster_parent_element(parser, node);
973
+ gumbo_vector_add(parser, (void*) node, &state->_open_elements);
974
+ return;
975
+ }
976
+
977
+ // This is called to insert the root HTML element, but get_current_node
978
+ // assumes the stack of open elements is non-empty, so we need special
979
+ // handling for this case.
980
+ append_node(
981
+ parser, parser->_output->root ?
982
+ get_current_node(parser) : parser->_output->document, node);
983
+ gumbo_vector_add(parser, (void*) node, &state->_open_elements);
984
+ }
985
+
986
+ // Convenience method that combines create_element_from_token and
987
+ // insert_element, inserting the generated element directly into the current
988
+ // node. Returns the node inserted.
989
+ static GumboNode* insert_element_from_token(
990
+ GumboParser* parser, GumboToken* token) {
991
+ GumboNode* element =
992
+ create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
993
+ insert_element(parser, element, false);
994
+ gumbo_debug("Inserting <%s> element (@%x) from token.\n",
995
+ gumbo_normalized_tagname(element->v.element.tag), element);
996
+ return element;
997
+ }
998
+
999
+ // Convenience method that combines create_element and insert_element, inserting
1000
+ // a parser-generated element of a specific tag type. Returns the node
1001
+ // inserted.
1002
+ static GumboNode* insert_element_of_tag_type(
1003
+ GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
1004
+ GumboNode* element = create_element(parser, tag);
1005
+ element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1006
+ insert_element(parser, element, false);
1007
+ gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1008
+ gumbo_normalized_tagname(tag), element);
1009
+ return element;
1010
+ }
1011
+
1012
+ // Convenience method for creating foreign namespaced element. Returns the node
1013
+ // inserted.
1014
+ static GumboNode* insert_foreign_element(
1015
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1016
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1017
+ GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1018
+ insert_element(parser, element, false);
1019
+ if (token_has_attribute(token, "xmlns") &&
1020
+ !attribute_matches_case_sensitive(
1021
+ &token->v.start_tag.attributes, "xmlns",
1022
+ kLegalXmlns[tag_namespace])) {
1023
+ // TODO(jdtang): Since there're multiple possible error codes here, we
1024
+ // eventually need reason codes to differentiate them.
1025
+ add_parse_error(parser, token);
1026
+ }
1027
+ if (token_has_attribute(token, "xmlns:xlink") &&
1028
+ !attribute_matches_case_sensitive(
1029
+ &token->v.start_tag.attributes,
1030
+ "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1031
+ add_parse_error(parser, token);
1032
+ }
1033
+ return element;
1034
+ }
1035
+
1036
+ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1037
+ assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1038
+ token->type == GUMBO_TOKEN_CHARACTER);
1039
+ TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1040
+ if (buffer_state->_buffer.length == 0) {
1041
+ // Initialize position fields.
1042
+ buffer_state->_start_original_text = token->original_text.data;
1043
+ buffer_state->_start_position = token->position;
1044
+ }
1045
+ gumbo_string_buffer_append_codepoint(
1046
+ parser, token->v.character, &buffer_state->_buffer);
1047
+ if (token->type == GUMBO_TOKEN_CHARACTER) {
1048
+ buffer_state->_type = GUMBO_NODE_TEXT;
1049
+ }
1050
+ gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1051
+ }
1052
+
1053
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
1054
+ static void run_generic_parsing_algorithm(
1055
+ GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1056
+ insert_element_from_token(parser, token);
1057
+ gumbo_tokenizer_set_state(parser, lexer_state);
1058
+ parser->_parser_state->_original_insertion_mode =
1059
+ parser->_parser_state->_insertion_mode;
1060
+ parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1061
+ }
1062
+
1063
+ static void acknowledge_self_closing_tag(GumboParser* parser) {
1064
+ parser->_parser_state->_self_closing_flag_acknowledged = true;
1065
+ }
1066
+
1067
+ // Returns true if there's an anchor tag in the list of active formatting
1068
+ // elements, and fills in its index if so.
1069
+ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1070
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1071
+ for (int i = elements->length; --i >= 0; ) {
1072
+ GumboNode* node = elements->data[i];
1073
+ if (node == &kActiveFormattingScopeMarker) {
1074
+ return false;
1075
+ }
1076
+ if (node_tag_is(node, GUMBO_TAG_A)) {
1077
+ *anchor_index = i;
1078
+ return true;
1079
+ }
1080
+ }
1081
+ return false;
1082
+ }
1083
+
1084
+ // Counts the number of open formatting elements in the list of active
1085
+ // formatting elements (after the last active scope marker) that have a specific
1086
+ // tag. If this is > 0, then earliest_matching_index will be filled in with the
1087
+ // index of the first such element.
1088
+ static int count_formatting_elements_of_tag(
1089
+ GumboParser* parser, const GumboNode* desired_node,
1090
+ int* earliest_matching_index) {
1091
+ const GumboElement* desired_element = &desired_node->v.element;
1092
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1093
+ int num_identical_elements = 0;
1094
+ for (int i = elements->length; --i >= 0; ) {
1095
+ GumboNode* node = elements->data[i];
1096
+ if (node == &kActiveFormattingScopeMarker) {
1097
+ break;
1098
+ }
1099
+ assert(node->type == GUMBO_NODE_ELEMENT);
1100
+ GumboElement* element = &node->v.element;
1101
+ if (node_tag_is(node, desired_element->tag) &&
1102
+ element->tag_namespace == desired_element->tag_namespace &&
1103
+ all_attributes_match(&element->attributes,
1104
+ &desired_element->attributes)) {
1105
+ num_identical_elements++;
1106
+ *earliest_matching_index = i;
1107
+ }
1108
+ }
1109
+ return num_identical_elements;
1110
+ }
1111
+
1112
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
1113
+ static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1114
+ assert(node == &kActiveFormattingScopeMarker ||
1115
+ node->type == GUMBO_NODE_ELEMENT);
1116
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1117
+ if (node == &kActiveFormattingScopeMarker) {
1118
+ gumbo_debug("Adding a scope marker.\n");
1119
+ } else {
1120
+ gumbo_debug("Adding a formatting element.\n");
1121
+ }
1122
+
1123
+ // Hunt for identical elements.
1124
+ int earliest_identical_element = elements->length;
1125
+ int num_identical_elements = count_formatting_elements_of_tag(
1126
+ parser, node, &earliest_identical_element);
1127
+
1128
+ // Noah's Ark clause: if there're at least 3, remove the earliest.
1129
+ if (num_identical_elements >= 3) {
1130
+ gumbo_debug("Noah's ark clause: removing element at %d.\n",
1131
+ earliest_identical_element);
1132
+ gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1133
+ }
1134
+
1135
+ gumbo_vector_add(parser, (void*) node, elements);
1136
+ }
1137
+
1138
+ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1139
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1140
+ for (int i = 0; i < open_elements->length; ++i) {
1141
+ if (open_elements->data[i] == node) {
1142
+ return true;
1143
+ }
1144
+ }
1145
+ return false;
1146
+ }
1147
+
1148
+ // Clones attributes, tags, etc. of a node, but does not copy the content. The
1149
+ // clone shares no structure with the original node: all owned strings and
1150
+ // values are fresh copies.
1151
+ GumboNode* clone_node(
1152
+ GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
1153
+ assert(node->type == GUMBO_NODE_ELEMENT);
1154
+ GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1155
+ *new_node = *node;
1156
+ new_node->parent = NULL;
1157
+ new_node->index_within_parent = -1;
1158
+ // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1159
+ // have a separate end tag.
1160
+ new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1161
+ new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1162
+ GumboElement* element = &new_node->v.element;
1163
+ gumbo_vector_init(parser, 1, &element->children);
1164
+
1165
+ const GumboVector* old_attributes = &node->v.element.attributes;
1166
+ gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1167
+ for (int i = 0; i < old_attributes->length; ++i) {
1168
+ const GumboAttribute* old_attr = old_attributes->data[i];
1169
+ GumboAttribute* attr =
1170
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1171
+ *attr = *old_attr;
1172
+ attr->name = gumbo_copy_stringz(parser, old_attr->name);
1173
+ attr->value = gumbo_copy_stringz(parser, old_attr->value);
1174
+ gumbo_vector_add(parser, attr, &element->attributes);
1175
+ }
1176
+ return new_node;
1177
+ }
1178
+
1179
+ // "Reconstruct active formatting elements" part of the spec.
1180
+ // This implementation is based on the html5lib translation from the mess of
1181
+ // GOTOs in the spec to reasonably structured programming.
1182
+ // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
1183
+ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1184
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1185
+ // Step 1
1186
+ if (elements->length == 0) {
1187
+ return;
1188
+ }
1189
+
1190
+ // Step 2 & 3
1191
+ int i = elements->length - 1;
1192
+ const GumboNode* element = elements->data[i];
1193
+ if (element == &kActiveFormattingScopeMarker ||
1194
+ is_open_element(parser, element)) {
1195
+ return;
1196
+ }
1197
+
1198
+ // Step 6
1199
+ do {
1200
+ if (i == 0) {
1201
+ // Step 4
1202
+ i = -1; // Incremented to 0 below.
1203
+ break;
1204
+ }
1205
+ // Step 5
1206
+ element = elements->data[--i];
1207
+ } while (element != &kActiveFormattingScopeMarker &&
1208
+ !is_open_element(parser, element));
1209
+
1210
+ ++i;
1211
+ gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1212
+ gumbo_normalized_tagname(
1213
+ get_current_node(parser)->v.element.tag));
1214
+ for(; i < elements->length; ++i) {
1215
+ // Step 7 & 8.
1216
+ assert(elements->length > 0);
1217
+ assert(i < elements->length);
1218
+ element = elements->data[i];
1219
+ assert(element != &kActiveFormattingScopeMarker);
1220
+ GumboNode* clone = clone_node(
1221
+ parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1222
+ // Step 9.
1223
+ insert_element(parser, clone, true);
1224
+ // Step 10.
1225
+ elements->data[i] = clone;
1226
+ gumbo_debug("Reconstructed %s element at %d.\n",
1227
+ gumbo_normalized_tagname(clone->v.element.tag), i);
1228
+ }
1229
+ }
1230
+
1231
+ static void clear_active_formatting_elements(GumboParser* parser) {
1232
+ GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1233
+ int num_elements_cleared = 0;
1234
+ const GumboNode* node;
1235
+ do {
1236
+ node = gumbo_vector_pop(parser, elements);
1237
+ ++num_elements_cleared;
1238
+ } while(node && node != &kActiveFormattingScopeMarker);
1239
+ gumbo_debug("Cleared %d elements from active formatting list.\n",
1240
+ num_elements_cleared);
1241
+ }
1242
+
1243
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
1244
+ static GumboQuirksModeEnum compute_quirks_mode(
1245
+ const GumboTokenDocType* doctype) {
1246
+ if (doctype->force_quirks ||
1247
+ strcmp(doctype->name, kDoctypeHtml.data) ||
1248
+ is_in_static_list(doctype->public_identifier,
1249
+ kQuirksModePublicIdPrefixes, false) ||
1250
+ is_in_static_list(doctype->public_identifier,
1251
+ kQuirksModePublicIdExactMatches, true) ||
1252
+ is_in_static_list(doctype->system_identifier,
1253
+ kQuirksModeSystemIdExactMatches, true) ||
1254
+ (is_in_static_list(doctype->public_identifier,
1255
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1256
+ && !doctype->has_system_identifier)) {
1257
+ return GUMBO_DOCTYPE_QUIRKS;
1258
+ } else if (
1259
+ is_in_static_list(doctype->public_identifier,
1260
+ kLimitedQuirksPublicIdPrefixes, false) ||
1261
+ (is_in_static_list(doctype->public_identifier,
1262
+ kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1263
+ && doctype->has_system_identifier)) {
1264
+ return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1265
+ }
1266
+ return GUMBO_DOCTYPE_NO_QUIRKS;
1267
+ }
1268
+
1269
+ // The following functions are all defined by the "has an element in __ scope"
1270
+ // sections of the HTML5 spec:
1271
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1272
+ // The basic idea behind them is that they check for an element of the given tag
1273
+ // name, contained within a scope formed by a set of other tag names. For
1274
+ // example, "has an element in list scope" looks for an element of the given tag
1275
+ // within the nearest enclosing <ol> or <ul>, along with a bunch of generic
1276
+ // element types that serve to "firewall" their content from the rest of the
1277
+ // document.
1278
+ static bool has_an_element_in_specific_scope(
1279
+ GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
1280
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1281
+ va_list args;
1282
+ va_start(args, negate);
1283
+ // va_arg can only run through the list once, so we copy it to an GumboVector
1284
+ // here. I wonder if it'd make more sense to make tags the GumboVector*
1285
+ // parameter and 'expected' a vararg list, but that'd require changing a lot
1286
+ // of code for unknown benefit. We may want to change the representation of
1287
+ // these tag sets anyway, to something more efficient.
1288
+ GumboVector tags;
1289
+ gumbo_vector_init(parser, 10, &tags);
1290
+ for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1291
+ tag = va_arg(args, GumboTag)) {
1292
+ // We store the tags inline instead of storing pointers to them.
1293
+ gumbo_vector_add(parser, (void*) tag, &tags);
1294
+ }
1295
+ va_end(args);
1296
+
1297
+ bool result = false;
1298
+ for (int i = open_elements->length; --i >= 0; ) {
1299
+ const GumboNode* node = open_elements->data[i];
1300
+ if (node->type != GUMBO_NODE_ELEMENT) {
1301
+ continue;
1302
+ }
1303
+ GumboTag node_tag = node->v.element.tag;
1304
+ for (int j = 0; j < expected->length; ++j) {
1305
+ GumboTag expected_tag = (GumboTag) expected->data[j];
1306
+ if (node_tag == expected_tag) {
1307
+ result = true;
1308
+ goto cleanup;
1309
+ }
1310
+ }
1311
+
1312
+ bool found_tag = false;
1313
+ for (int j = 0; j < tags.length; ++j) {
1314
+ GumboTag tag = (GumboTag) tags.data[j];
1315
+ if (tag == node_tag) {
1316
+ found_tag = true;
1317
+ break;
1318
+ }
1319
+ }
1320
+ if (negate != found_tag) {
1321
+ result = false;
1322
+ goto cleanup;
1323
+ }
1324
+ }
1325
+ cleanup:
1326
+ gumbo_vector_destroy(parser, &tags);
1327
+ return result;
1328
+ }
1329
+
1330
// This is a bit of a hack to stack-allocate a one-element GumboVector named
// 'varname' containing the 'from_var' value, since it's used in nearly all
// the subsequent helper functions. Note the use of void* and casts instead of
// GumboTag; this is so the alignment requirements are the same as GumboVector
// and the data inside it can be freely accessed as if it were a normal
// GumboVector.
//
// Expands to two declarations (a backing array 'varname_tmp_array' and the
// vector itself), so it must be used where declarations are allowed and
// followed by a semicolon. 'from_var' is parenthesized so that the cast
// applies to the whole argument expression, not just its first operand.
#define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
  void* varname ## _tmp_array[1] = { (void*) (from_var) }; \
  GumboVector varname = { varname ## _tmp_array, 1, 1 }
1339
+
1340
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1341
+ static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1342
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1343
+ return has_an_element_in_specific_scope(
1344
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1345
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1346
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1347
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1348
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1349
+ }
1350
+
1351
+ // Like "has an element in scope", but for the specific case of looking for a
1352
+ // unique target node, not for any node with a given tag name. This duplicates
1353
+ // much of the algorithm from has_an_element_in_specific_scope because the
1354
+ // predicate is different when checking for an exact node, and it's easier &
1355
+ // faster just to duplicate the code for this one case than to try and
1356
+ // parameterize it.
1357
+ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1358
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
1359
+ for (int i = open_elements->length; --i >= 0; ) {
1360
+ const GumboNode* current = open_elements->data[i];
1361
+ if (current == node) {
1362
+ return true;
1363
+ }
1364
+ if (current->type != GUMBO_NODE_ELEMENT) {
1365
+ continue;
1366
+ }
1367
+ if (node_tag_in(
1368
+ current, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1369
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1370
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
1371
+ GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
1372
+ GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
1373
+ GUMBO_TAG_LAST)) {
1374
+ return false;
1375
+ }
1376
+ }
1377
+ assert(false);
1378
+ return false;
1379
+ }
1380
+
1381
+ // Like has_an_element_in_scope, but restricts the expected tag to a range of
1382
+ // possible tag names instead of just a single one.
1383
+ static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
1384
+ GumboVector tags;
1385
+ // 6 = arbitrary initial size for vector, chosen because the major use-case
1386
+ // for this method is heading tags, of which there are 6.
1387
+ gumbo_vector_init(parser, 6, &tags);
1388
+ va_list args;
1389
+ va_start(args, parser);
1390
+ for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1391
+ tag = va_arg(args, GumboTag)) {
1392
+ gumbo_vector_add(parser, (void*) tag, &tags);
1393
+ }
1394
+ bool found = has_an_element_in_specific_scope(
1395
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1396
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1397
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1398
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1399
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1400
+ gumbo_vector_destroy(parser, &tags);
1401
+ va_end(args);
1402
+ return found;
1403
+ }
1404
+
1405
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1406
+ static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1407
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1408
+ return has_an_element_in_specific_scope(
1409
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1410
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1411
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1412
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1413
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
1414
+ GUMBO_TAG_LAST);
1415
+ }
1416
+
1417
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1418
+ static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1419
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1420
+ return has_an_element_in_specific_scope(
1421
+ parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1422
+ GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1423
+ GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1424
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1425
+ GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
1426
+ }
1427
+
1428
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1429
+ static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1430
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1431
+ return has_an_element_in_specific_scope(
1432
+ parser, &tags, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
1433
+ }
1434
+
1435
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1436
+ static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1437
+ DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1438
+ return has_an_element_in_specific_scope(
1439
+ parser, &tags, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
1440
+ GUMBO_TAG_LAST);
1441
+ }
1442
+
1443
+
1444
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1445
+ // "exception" is the "element to exclude from the process" listed in the spec.
1446
+ // Pass GUMBO_TAG_LAST to not exclude any of them.
1447
+ static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1448
+ for (;
1449
+ node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
1450
+ GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
1451
+ GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
1452
+ !node_tag_is(get_current_node(parser), exception);
1453
+ pop_current_node(parser));
1454
+ }
1455
+
1456
+ // This factors out the clauses relating to "act as if an end tag token with tag
1457
+ // name "table" had been seen. Returns true if there's a table element in table
1458
+ // scope which was successfully closed, false if not and the token should be
1459
+ // ignored. Does not add parse errors; callers should handle that.
1460
+ static bool close_table(GumboParser* parser) {
1461
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1462
+ return false;
1463
+ }
1464
+
1465
+ GumboNode* node = pop_current_node(parser);
1466
+ while (!node_tag_is(node, GUMBO_TAG_TABLE)) {
1467
+ node = pop_current_node(parser);
1468
+ }
1469
+ reset_insertion_mode_appropriately(parser);
1470
+ return true;
1471
+ }
1472
+
1473
+ // This factors out the clauses relating to "act as if an end tag token with tag
1474
+ // name `cell_tag` had been seen".
1475
+ static bool close_table_cell(GumboParser* parser, const GumboToken* token,
1476
+ GumboTag cell_tag) {
1477
+ bool result = true;
1478
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1479
+ const GumboNode* node = get_current_node(parser);
1480
+ if (!node_tag_is(node, cell_tag)) {
1481
+ add_parse_error(parser, token);
1482
+ result = false;
1483
+ }
1484
+ do {
1485
+ node = pop_current_node(parser);
1486
+ } while (!node_tag_is(node, cell_tag));
1487
+
1488
+ clear_active_formatting_elements(parser);
1489
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1490
+ return result;
1491
+ }
1492
+
1493
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1494
+ // This holds the logic to determine whether we should close a <td> or a <th>.
1495
+ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1496
+ if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1497
+ assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1498
+ return close_table_cell(parser, token, GUMBO_TAG_TD);
1499
+ } else {
1500
+ assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1501
+ return close_table_cell(parser, token, GUMBO_TAG_TH);
1502
+ }
1503
+ }
1504
+
1505
+ // This factors out the "act as if an end tag of tag name 'select' had been
1506
+ // seen" clause of the spec, since it's referenced in several places. It pops
1507
+ // all nodes from the stack until the current <select> has been closed, then
1508
+ // resets the insertion mode appropriately.
1509
+ static void close_current_select(GumboParser* parser) {
1510
+ GumboNode* node = pop_current_node(parser);
1511
+ while (!node_tag_is(node, GUMBO_TAG_SELECT)) {
1512
+ node = pop_current_node(parser);
1513
+ }
1514
+ reset_insertion_mode_appropriately(parser);
1515
+ }
1516
+
1517
+ // The list of nodes in the "special" category:
1518
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1519
+ static bool is_special_node(const GumboNode* node) {
1520
+ assert(node->type == GUMBO_NODE_ELEMENT);
1521
+ switch (node->v.element.tag_namespace) {
1522
+ case GUMBO_NAMESPACE_HTML:
1523
+ return node_tag_in(node,
1524
+ GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
1525
+ GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
1526
+ GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
1527
+ GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
1528
+ GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
1529
+ GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
1530
+ GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
1531
+ GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
1532
+ GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
1533
+ GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
1534
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
1535
+ GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
1536
+ GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
1537
+ GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
1538
+ GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
1539
+ GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
1540
+ GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
1541
+ GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
1542
+ GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1543
+ GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
1544
+ GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
1545
+ GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
1546
+ case GUMBO_NAMESPACE_MATHML:
1547
+ return node_tag_in(node,
1548
+ GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1549
+ GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
1550
+ case GUMBO_NAMESPACE_SVG:
1551
+ return node_tag_in(node,
1552
+ GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
1553
+ }
1554
+ abort();
1555
+ return false; // Pacify compiler.
1556
+ }
1557
+
1558
+ // Implicitly closes currently open tags until it reaches an element with the
1559
+ // specified tag name. If the elements closed are in the set handled by
1560
+ // generate_implied_end_tags, this is normal operation and this function returns
1561
+ // true. Otherwise, a parse error is recorded and this function returns false.
1562
+ static bool implicitly_close_tags(
1563
+ GumboParser* parser, GumboToken* token, GumboTag target) {
1564
+ bool result = true;
1565
+ generate_implied_end_tags(parser, target);
1566
+ if (!node_tag_is(get_current_node(parser), target)) {
1567
+ add_parse_error(parser, token);
1568
+ while (!node_tag_is(get_current_node(parser), target)) {
1569
+ pop_current_node(parser);
1570
+ }
1571
+ result = false;
1572
+ }
1573
+ assert(node_tag_is(get_current_node(parser), target));
1574
+ pop_current_node(parser);
1575
+ return result;
1576
+ }
1577
+
1578
+ // If the stack of open elements has a <p> tag in button scope, this acts as if
1579
+ // a </p> tag was encountered, implicitly closing tags. Returns false if a
1580
+ // parse error occurs. This is a convenience function because this particular
1581
+ // clause appears several times in the spec.
1582
+ static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
1583
+ if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1584
+ return implicitly_close_tags(parser, token, GUMBO_TAG_P);
1585
+ }
1586
+ return true;
1587
+ }
1588
+
1589
+ // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1590
+ // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1591
+ static void maybe_implicitly_close_list_tag(
1592
+ GumboParser* parser, GumboToken* token, bool is_li) {
1593
+ GumboParserState* state = parser->_parser_state;
1594
+ state->_frameset_ok = false;
1595
+ for (int i = state->_open_elements.length; --i >= 0; ) {
1596
+ const GumboNode* node = state->_open_elements.data[i];
1597
+ bool is_list_tag = is_li ?
1598
+ node_tag_is(node, GUMBO_TAG_LI) :
1599
+ node_tag_in(node, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
1600
+ if (is_list_tag) {
1601
+ implicitly_close_tags(parser, token, node->v.element.tag);
1602
+ return;
1603
+ }
1604
+ if (is_special_node(node) &&
1605
+ !node_tag_in(node, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
1606
+ GUMBO_TAG_LAST)) {
1607
+ return;
1608
+ }
1609
+ }
1610
+ }
1611
+
1612
+ static void merge_attributes(
1613
+ GumboParser* parser, GumboToken* token, GumboNode* node) {
1614
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1615
+ assert(node->type == GUMBO_NODE_ELEMENT);
1616
+ const GumboVector* token_attr = &token->v.start_tag.attributes;
1617
+ GumboVector* node_attr = &node->v.element.attributes;
1618
+
1619
+ for (int i = 0; i < token_attr->length; ++i) {
1620
+ GumboAttribute* attr = token_attr->data[i];
1621
+ if (!gumbo_get_attribute(node_attr, attr->name)) {
1622
+ // Ownership of the attribute is transferred by this gumbo_vector_add,
1623
+ // so it has to be nulled out of the original token so it doesn't get
1624
+ // double-deleted.
1625
+ gumbo_vector_add(parser, attr, node_attr);
1626
+ token_attr->data[i] = NULL;
1627
+ }
1628
+ }
1629
+ // When attributes are merged, it means the token has been ignored and merged
1630
+ // with another token, so we need to free its memory. The attributes that are
1631
+ // transferred need to be nulled-out in the vector above so that they aren't
1632
+ // double-deleted.
1633
+ gumbo_token_destroy(parser, token);
1634
+
1635
+ #ifndef NDEBUG
1636
+ // Mark this sentinel so the assertion in the main loop knows it's been
1637
+ // destroyed.
1638
+ token->v.start_tag.attributes = kGumboEmptyVector;
1639
+ #endif
1640
+ }
1641
+
1642
+ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1643
+ for (int i = 0;
1644
+ i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); ++i) {
1645
+ const ReplacementEntry* entry = &kSvgTagReplacements[i];
1646
+ if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1647
+ return entry->to.data;
1648
+ }
1649
+ }
1650
+ return NULL;
1651
+ }
1652
+
1653
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1654
+ // This destructively modifies any matching attributes on the token and sets the
1655
+ // namespace appropriately.
1656
+ static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1657
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1658
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1659
+ for (int i = 0;
1660
+ i < sizeof(kForeignAttributeReplacements) /
1661
+ sizeof(NamespacedAttributeReplacement); ++i) {
1662
+ const NamespacedAttributeReplacement* entry =
1663
+ &kForeignAttributeReplacements[i];
1664
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1665
+ if (!attr) {
1666
+ continue;
1667
+ }
1668
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1669
+ attr->attr_namespace = entry->attr_namespace;
1670
+ attr->name = gumbo_copy_stringz(parser, entry->local_name);
1671
+ }
1672
+ }
1673
+
1674
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1675
+ // This destructively modifies any matching attributes on the token.
1676
+ static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1677
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1678
+ const GumboVector* attributes = &token->v.start_tag.attributes;
1679
+ for (int i = 0;
1680
+ i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1681
+ const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1682
+ GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1683
+ if (!attr) {
1684
+ continue;
1685
+ }
1686
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1687
+ attr->name = gumbo_copy_stringz(parser, entry->to.data);
1688
+ }
1689
+ }
1690
+
1691
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1692
+ // Note that this may destructively modify the token with the new attribute
1693
+ // value.
1694
+ static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1695
+ assert(token->type == GUMBO_TOKEN_START_TAG);
1696
+ GumboAttribute* attr = gumbo_get_attribute(
1697
+ &token->v.start_tag.attributes, "definitionurl");
1698
+ if (!attr) {
1699
+ return;
1700
+ }
1701
+ gumbo_parser_deallocate(parser, (void*) attr->name);
1702
+ attr->name = gumbo_copy_stringz(parser, "definitionURL");
1703
+ }
1704
+
1705
+ static bool doctype_matches(
1706
+ const GumboTokenDocType* doctype,
1707
+ const GumboStringPiece* public_id,
1708
+ const GumboStringPiece* system_id,
1709
+ bool allow_missing_system_id) {
1710
+ return !strcmp(doctype->public_identifier, public_id->data) &&
1711
+ (allow_missing_system_id || doctype->has_system_identifier) &&
1712
+ !strcmp(doctype->system_identifier, system_id->data);
1713
+ }
1714
+
1715
+ static bool maybe_add_doctype_error(
1716
+ GumboParser* parser, const GumboToken* token) {
1717
+ const GumboTokenDocType* doctype = &token->v.doc_type;
1718
+ bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
1719
+ if ((!html_doctype ||
1720
+ doctype->has_public_identifier ||
1721
+ (doctype->has_system_identifier && !strcmp(
1722
+ doctype->system_identifier, kSystemIdLegacyCompat.data))) &&
1723
+ !(html_doctype && (
1724
+ doctype_matches(doctype, &kPublicIdHtml4_0,
1725
+ &kSystemIdRecHtml4_0, true) ||
1726
+ doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
1727
+ doctype_matches(doctype, &kPublicIdXhtml1_0,
1728
+ &kSystemIdXhtmlStrict1_1, false) ||
1729
+ doctype_matches(doctype, &kPublicIdXhtml1_1,
1730
+ &kSystemIdXhtml1_1, false)))) {
1731
+ add_parse_error(parser, token);
1732
+ return false;
1733
+ }
1734
+ return true;
1735
+ }
1736
+
1737
+ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1738
+ if (!node->parent) {
1739
+ // The node may not have a parent if, for example, it is a newly-cloned copy
1740
+ // of an active formatting element. DOM manipulations continue with the
1741
+ // orphaned fragment of the DOM tree until it's appended/foster-parented to
1742
+ // the common ancestor at the end of the adoption agency algorithm.
1743
+ return;
1744
+ }
1745
+ assert(node->parent->type == GUMBO_NODE_ELEMENT);
1746
+ GumboVector* children = &node->parent->v.element.children;
1747
+ int index = gumbo_vector_index_of(children, node);
1748
+ assert(index != -1);
1749
+
1750
+ gumbo_vector_remove_at(parser, index, children);
1751
+ node->parent = NULL;
1752
+ node->index_within_parent = -1;
1753
+ for (int i = index; i < children->length; ++i) {
1754
+ GumboNode* child = children->data[i];
1755
+ child->index_within_parent = i;
1756
+ }
1757
+ }
1758
+
1759
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1760
+ // Also described in the "in body" handling for end formatting tags.
1761
+ static bool adoption_agency_algorithm(
1762
+ GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
1763
+ GumboParserState* state = parser->_parser_state;
1764
+ gumbo_debug("Entering adoption agency algorithm.\n");
1765
+ // Steps 1-3 & 16:
1766
+ for (int i = 0; i < 8; ++i) {
1767
+ // Step 4.
1768
+ GumboNode* formatting_node = NULL;
1769
+ int formatting_node_in_open_elements = -1;
1770
+ for (int j = state->_active_formatting_elements.length; --j >= 0; ) {
1771
+ GumboNode* current_node = state->_active_formatting_elements.data[j];
1772
+ if (current_node == &kActiveFormattingScopeMarker) {
1773
+ gumbo_debug("Broke on scope marker; aborting.\n");
1774
+ // Last scope marker; abort the algorithm.
1775
+ return false;
1776
+ }
1777
+ if (node_tag_is(current_node, closing_tag)) {
1778
+ // Found it.
1779
+ formatting_node = current_node;
1780
+ formatting_node_in_open_elements = gumbo_vector_index_of(
1781
+ &state->_open_elements, formatting_node);
1782
+ gumbo_debug("Formatting element of tag %s at %d.\n",
1783
+ gumbo_normalized_tagname(closing_tag),
1784
+ formatting_node_in_open_elements);
1785
+ break;
1786
+ }
1787
+ }
1788
+ if (!formatting_node) {
1789
+ // No matching tag; not a parse error outright, but fall through to the
1790
+ // "any other end tag" clause (which may potentially add a parse error,
1791
+ // but not always).
1792
+ gumbo_debug("No active formatting elements; aborting.\n");
1793
+ return false;
1794
+ }
1795
+
1796
+ if (formatting_node_in_open_elements == -1) {
1797
+ gumbo_debug("Formatting node not on stack of open elements.\n");
1798
+ gumbo_vector_remove(parser, formatting_node,
1799
+ &state->_active_formatting_elements);
1800
+ return false;
1801
+ }
1802
+
1803
+ if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1804
+ add_parse_error(parser, token);
1805
+ gumbo_debug("Element not in scope.\n");
1806
+ return false;
1807
+ }
1808
+ if (formatting_node != get_current_node(parser)) {
1809
+ add_parse_error(parser, token); // But continue onwards.
1810
+ }
1811
+ assert(formatting_node);
1812
+ assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
1813
+ assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
1814
+
1815
+ // Step 5 & 6.
1816
+ GumboNode* furthest_block = NULL;
1817
+ for (int j = formatting_node_in_open_elements;
1818
+ j < state->_open_elements.length; ++j) {
1819
+ assert(j > 0);
1820
+ GumboNode* current = state->_open_elements.data[j];
1821
+ if (is_special_node(current)) {
1822
+ // Step 5.
1823
+ furthest_block = current;
1824
+ break;
1825
+ }
1826
+ }
1827
+ if (!furthest_block) {
1828
+ // Step 6.
1829
+ while (get_current_node(parser) != formatting_node) {
1830
+ pop_current_node(parser);
1831
+ }
1832
+ // And the formatting element itself.
1833
+ pop_current_node(parser);
1834
+ gumbo_vector_remove(parser, formatting_node,
1835
+ &state->_active_formatting_elements);
1836
+ return false;
1837
+ }
1838
+ assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
1839
+ assert(furthest_block);
1840
+
1841
+ // Step 7.
1842
+ // Elements may be moved and reparented by this algorithm, so
1843
+ // common_ancestor is not necessarily the same as formatting_node->parent.
1844
+ GumboNode* common_ancestor =
1845
+ state->_open_elements.data[gumbo_vector_index_of(
1846
+ &state->_open_elements, formatting_node) - 1];
1847
+ gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1848
+ gumbo_normalized_tagname(common_ancestor->v.element.tag),
1849
+ gumbo_normalized_tagname(furthest_block->v.element.tag));
1850
+
1851
+ // Step 8.
1852
+ int bookmark = gumbo_vector_index_of(
1853
+ &state->_active_formatting_elements, formatting_node);;
1854
+ // Step 9.
1855
+ GumboNode* node = furthest_block;
1856
+ GumboNode* last_node = furthest_block;
1857
+ // Must be stored explicitly, in case node is removed from the stack of open
1858
+ // elements, to handle step 9.4.
1859
+ int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1860
+ assert(saved_node_index > 0);
1861
+ // Step 9.1-9.3 & 9.11.
1862
+ for (int j = 0; j < 3; ++j) {
1863
+ // Step 9.4.
1864
+ int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1865
+ gumbo_debug(
1866
+ "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1867
+ if (node_index == -1) {
1868
+ node_index = saved_node_index;
1869
+ }
1870
+ saved_node_index = --node_index;
1871
+ assert(node_index > 0);
1872
+ assert(node_index < state->_open_elements.capacity);
1873
+ node = state->_open_elements.data[node_index];
1874
+ assert(node->parent);
1875
+ // Step 9.5.
1876
+ if (gumbo_vector_index_of(
1877
+ &state->_active_formatting_elements, node) == -1) {
1878
+ gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1879
+ continue;
1880
+ } else if (node == formatting_node) {
1881
+ // Step 9.6.
1882
+ break;
1883
+ }
1884
+ // Step 9.7.
1885
+ int formatting_index = gumbo_vector_index_of(
1886
+ &state->_active_formatting_elements, node);
1887
+ node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1888
+ state->_active_formatting_elements.data[formatting_index] = node;
1889
+ state->_open_elements.data[node_index] = node;
1890
+ // Step 9.8.
1891
+ if (last_node == furthest_block) {
1892
+ bookmark = formatting_index + 1;
1893
+ assert(bookmark <= state->_active_formatting_elements.length);
1894
+ }
1895
+ // Step 9.9.
1896
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1897
+ remove_from_parent(parser, last_node);
1898
+ append_node(parser, node, last_node);
1899
+ // Step 9.10.
1900
+ last_node = node;
1901
+ }
1902
+
1903
+ // Step 10.
1904
+ gumbo_debug("Removing %s node from parent ",
1905
+ gumbo_normalized_tagname(last_node->v.element.tag));
1906
+ remove_from_parent(parser, last_node);
1907
+ last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1908
+ if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1909
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
1910
+ GUMBO_TAG_LAST)) {
1911
+ gumbo_debug("and foster-parenting it.\n");
1912
+ foster_parent_element(parser, last_node);
1913
+ } else {
1914
+ gumbo_debug("and inserting it into %s.\n",
1915
+ gumbo_normalized_tagname(common_ancestor->v.element.tag));
1916
+ append_node(parser, common_ancestor, last_node);
1917
+ }
1918
+
1919
+ // Step 11.
1920
+ GumboNode* new_formatting_node = clone_node(
1921
+ parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1922
+ formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1923
+
1924
+ // Step 12. Instead of appending nodes one-by-one, we swap the children
1925
+ // vector of furthest_block with the empty children of new_formatting_node,
1926
+ // reducing memory traffic and allocations. We still have to reset their
1927
+ // parent pointers, though.
1928
+ GumboVector temp = new_formatting_node->v.element.children;
1929
+ new_formatting_node->v.element.children =
1930
+ furthest_block->v.element.children;
1931
+ furthest_block->v.element.children = temp;
1932
+
1933
+ temp = new_formatting_node->v.element.children;
1934
+ for (int i = 0; i < temp.length; ++i) {
1935
+ GumboNode* child = temp.data[i];
1936
+ child->parent = new_formatting_node;
1937
+ }
1938
+
1939
+ // Step 13.
1940
+ append_node(parser, furthest_block, new_formatting_node);
1941
+
1942
+ // Step 14.
1943
+ // If the formatting node was before the bookmark, it may shift over all
1944
+ // indices after it, so we need to explicitly find the index and possibly
1945
+ // adjust the bookmark.
1946
+ int formatting_node_index = gumbo_vector_index_of(
1947
+ &state->_active_formatting_elements, formatting_node);
1948
+ assert(formatting_node_index != -1);
1949
+ if (formatting_node_index < bookmark) {
1950
+ --bookmark;
1951
+ }
1952
+ gumbo_vector_remove_at(
1953
+ parser, formatting_node_index, &state->_active_formatting_elements);
1954
+ assert(bookmark >= 0);
1955
+ assert(bookmark <= state->_active_formatting_elements.length);
1956
+ gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
1957
+ &state->_active_formatting_elements);
1958
+
1959
+ // Step 15.
1960
+ gumbo_vector_remove(
1961
+ parser, formatting_node, &state->_open_elements);
1962
+ int insert_at = gumbo_vector_index_of(
1963
+ &state->_open_elements, furthest_block) + 1;
1964
+ assert(insert_at >= 0);
1965
+ assert(insert_at <= state->_open_elements.length);
1966
+ gumbo_vector_insert_at(
1967
+ parser, new_formatting_node, insert_at, &state->_open_elements);
1968
+ }
1969
+ return true;
1970
+ }
1971
+
1972
+ // This is here to clean up memory when the spec says "Ignore current token."
1973
+ static void ignore_token(GumboParser* parser) {
1974
+ GumboToken* token = parser->_parser_state->_current_token;
1975
+ // Ownership of the token's internal buffers are normally transferred to the
1976
+ // element, but if no element is emitted (as happens in non-verbatim-mode
1977
+ // when a token is ignored), we need to free it here to prevent a memory
1978
+ // leak.
1979
+ gumbo_token_destroy(parser, token);
1980
+ #ifndef NDEBUG
1981
+ if (token->type == GUMBO_TOKEN_START_TAG) {
1982
+ // Mark this sentinel so the assertion in the main loop knows it's been
1983
+ // destroyed.
1984
+ token->v.start_tag.attributes = kGumboEmptyVector;
1985
+ }
1986
+ #endif
1987
+ }
1988
+
1989
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
1990
+ static void finish_parsing(GumboParser* parser) {
1991
+ maybe_flush_text_node_buffer(parser);
1992
+ GumboParserState* state = parser->_parser_state;
1993
+ for (GumboNode* node = pop_current_node(parser); node;
1994
+ node = pop_current_node(parser)) {
1995
+ if ((node_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
1996
+ (node_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
1997
+ continue;
1998
+ }
1999
+ node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
2000
+ }
2001
+ while (pop_current_node(parser)); // Pop them all.
2002
+ }
2003
+
2004
+ static bool handle_initial(GumboParser* parser, GumboToken* token) {
2005
+ GumboDocument* document = &get_document_node(parser)->v.document;
2006
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2007
+ ignore_token(parser);
2008
+ return true;
2009
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2010
+ append_comment_node(parser, get_document_node(parser), token);
2011
+ return true;
2012
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2013
+ document->has_doctype = true;
2014
+ document->name = token->v.doc_type.name;
2015
+ document->public_identifier = token->v.doc_type.public_identifier;
2016
+ document->system_identifier = token->v.doc_type.system_identifier;
2017
+ document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2018
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2019
+ return maybe_add_doctype_error(parser, token);
2020
+ }
2021
+ add_parse_error(parser, token);
2022
+ document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2023
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2024
+ parser->_parser_state->_reprocess_current_token = true;
2025
+ return true;
2026
+ }
2027
+
2028
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
2029
+ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2030
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2031
+ add_parse_error(parser, token);
2032
+ ignore_token(parser);
2033
+ return false;
2034
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2035
+ append_comment_node(parser, get_document_node(parser), token);
2036
+ return true;
2037
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2038
+ ignore_token(parser);
2039
+ return true;
2040
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2041
+ GumboNode* html_node = insert_element_from_token(parser, token);
2042
+ parser->_output->root = html_node;
2043
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2044
+ return true;
2045
+ } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2046
+ token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2047
+ GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2048
+ add_parse_error(parser, token);
2049
+ ignore_token(parser);
2050
+ return false;
2051
+ } else {
2052
+ GumboNode* html_node = insert_element_of_tag_type(
2053
+ parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2054
+ assert(html_node);
2055
+ parser->_output->root = html_node;
2056
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2057
+ parser->_parser_state->_reprocess_current_token = true;
2058
+ return true;
2059
+ }
2060
+ }
2061
+
2062
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
2063
+ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2064
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2065
+ add_parse_error(parser, token);
2066
+ ignore_token(parser);
2067
+ return false;
2068
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2069
+ append_comment_node(parser, get_current_node(parser), token);
2070
+ return true;
2071
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2072
+ ignore_token(parser);
2073
+ return true;
2074
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2075
+ GumboNode* node = insert_element_from_token(parser, token);
2076
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2077
+ parser->_parser_state->_head_element = node;
2078
+ return true;
2079
+ } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2080
+ token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2081
+ GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2082
+ add_parse_error(parser, token);
2083
+ ignore_token(parser);
2084
+ return false;
2085
+ } else {
2086
+ GumboNode* node = insert_element_of_tag_type(
2087
+ parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2088
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2089
+ parser->_parser_state->_head_element = node;
2090
+ parser->_parser_state->_reprocess_current_token = true;
2091
+ return true;
2092
+ }
2093
+ }
2094
+
2095
+ // Forward declarations because of mutual dependencies: handle_in_head,
+ // handle_in_head_noscript and handle_after_head below all call
+ // handle_in_body, which is defined further down and itself calls back into
+ // handle_in_head.
2096
+ static bool handle_token(GumboParser* parser, GumboToken* token);
2097
+ static bool handle_in_body(GumboParser* parser, GumboToken* token);
2098
+
2099
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
2100
+ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2101
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2102
+ insert_text_token(parser, token);
2103
+ return true;
2104
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2105
+ add_parse_error(parser, token);
2106
+ ignore_token(parser);
2107
+ return false;
2108
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2109
+ append_comment_node(parser, get_current_node(parser), token);
2110
+ return true;
2111
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2112
+ return handle_in_body(parser, token);
2113
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2114
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2115
+ GUMBO_TAG_LAST)) {
2116
+ insert_element_from_token(parser, token);
2117
+ pop_current_node(parser);
2118
+ acknowledge_self_closing_tag(parser);
2119
+ return true;
2120
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2121
+ insert_element_from_token(parser, token);
2122
+ pop_current_node(parser);
2123
+ acknowledge_self_closing_tag(parser);
2124
+ // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2125
+ // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2126
+ // should specifically look for that string in the document and re-encode it
2127
+ // before passing to Gumbo.
2128
+ return true;
2129
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2130
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2131
+ return true;
2132
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
2133
+ GUMBO_TAG_LAST)) {
2134
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2135
+ return true;
2136
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2137
+ insert_element_from_token(parser, token);
2138
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2139
+ return true;
2140
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2141
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2142
+ return true;
2143
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2144
+ GumboNode* head = pop_current_node(parser);
2145
+ AVOID_UNUSED_VARIABLE_WARNING(head);
2146
+ assert(node_tag_is(head, GUMBO_TAG_HEAD));
2147
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2148
+ return true;
2149
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2150
+ add_parse_error(parser, token);
2151
+ ignore_token(parser);
2152
+ return false;
2153
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2154
+ (token->type == GUMBO_TOKEN_END_TAG &&
2155
+ !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2156
+ GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2157
+ add_parse_error(parser, token);
2158
+ return false;
2159
+ } else {
2160
+ const GumboNode* node = pop_current_node(parser);
2161
+ assert(node_tag_is(node, GUMBO_TAG_HEAD));
2162
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2163
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2164
+ parser->_parser_state->_reprocess_current_token = true;
2165
+ return true;
2166
+ }
2167
+
2168
+ return true;
2169
+ }
2170
+
2171
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
2172
+ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2173
+ if (token->type == GUMBO_TOKEN_DOCTYPE) {
2174
+ add_parse_error(parser, token);
2175
+ return false;
2176
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2177
+ return handle_in_body(parser, token);
2178
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2179
+ const GumboNode* node = pop_current_node(parser);
2180
+ assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2181
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2182
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2183
+ return true;
2184
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2185
+ token->type == GUMBO_TOKEN_COMMENT ||
2186
+ tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
2187
+ GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
2188
+ GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
2189
+ return handle_in_head(parser, token);
2190
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
2191
+ GUMBO_TAG_LAST) ||
2192
+ (token->type == GUMBO_TOKEN_END_TAG &&
2193
+ !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2194
+ add_parse_error(parser, token);
2195
+ ignore_token(parser);
2196
+ return false;
2197
+ } else {
2198
+ add_parse_error(parser, token);
2199
+ const GumboNode* node = pop_current_node(parser);
2200
+ assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2201
+ AVOID_UNUSED_VARIABLE_WARNING(node);
2202
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2203
+ parser->_parser_state->_reprocess_current_token = true;
2204
+ return false;
2205
+ }
2206
+ }
2207
+
2208
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
2209
+ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2210
+ GumboParserState* state = parser->_parser_state;
2211
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
2212
+ insert_text_token(parser, token);
2213
+ return true;
2214
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2215
+ add_parse_error(parser, token);
2216
+ ignore_token(parser);
2217
+ return false;
2218
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2219
+ append_comment_node(parser, get_current_node(parser), token);
2220
+ return true;
2221
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2222
+ return handle_in_body(parser, token);
2223
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2224
+ insert_element_from_token(parser, token);
2225
+ state->_frameset_ok = false;
2226
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2227
+ return true;
2228
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2229
+ insert_element_from_token(parser, token);
2230
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2231
+ return true;
2232
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2233
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
2234
+ GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
2235
+ GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2236
+ add_parse_error(parser, token);
2237
+ assert(state->_head_element != NULL);
2238
+ // This must be flushed before we push the head element on, as there may be
2239
+ // pending character tokens that should be attached to the root.
2240
+ maybe_flush_text_node_buffer(parser);
2241
+ gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2242
+ bool result = handle_in_head(parser, token);
2243
+ gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2244
+ return result;
2245
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2246
+ (token->type == GUMBO_TOKEN_END_TAG &&
2247
+ !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2248
+ GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2249
+ add_parse_error(parser, token);
2250
+ ignore_token(parser);
2251
+ return false;
2252
+ } else {
2253
+ insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2254
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2255
+ state->_reprocess_current_token = true;
2256
+ return true;
2257
+ }
2258
+ }
2259
+
2260
+ static void destroy_node(GumboParser* parser, GumboNode* node) {
2261
+ switch (node->type) {
2262
+ case GUMBO_NODE_DOCUMENT:
2263
+ {
2264
+ GumboDocument* doc = &node->v.document;
2265
+ for (int i = 0; i < doc->children.length; ++i) {
2266
+ destroy_node(parser, doc->children.data[i]);
2267
+ }
2268
+ gumbo_parser_deallocate(parser, (void*) doc->children.data);
2269
+ gumbo_parser_deallocate(parser, (void*) doc->name);
2270
+ gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2271
+ gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2272
+ }
2273
+ break;
2274
+ case GUMBO_NODE_ELEMENT:
2275
+ for (int i = 0; i < node->v.element.attributes.length; ++i) {
2276
+ gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2277
+ }
2278
+ gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2279
+ for (int i = 0; i < node->v.element.children.length; ++i) {
2280
+ destroy_node(parser, node->v.element.children.data[i]);
2281
+ }
2282
+ gumbo_parser_deallocate(parser, node->v.element.children.data);
2283
+ break;
2284
+ case GUMBO_NODE_TEXT:
2285
+ case GUMBO_NODE_CDATA:
2286
+ case GUMBO_NODE_COMMENT:
2287
+ case GUMBO_NODE_WHITESPACE:
2288
+ gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2289
+ break;
2290
+ }
2291
+ gumbo_parser_deallocate(parser, node);
2292
+ }
2293
+
2294
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
2295
+ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2296
+ GumboParserState* state = parser->_parser_state;
2297
+ assert(state->_open_elements.length > 0);
2298
+ if (token->type == GUMBO_TOKEN_NULL) {
2299
+ add_parse_error(parser, token);
2300
+ ignore_token(parser);
2301
+ return false;
2302
+ } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2303
+ reconstruct_active_formatting_elements(parser);
2304
+ insert_text_token(parser, token);
2305
+ return true;
2306
+ } else if (token->type == GUMBO_TOKEN_CHARACTER) {
2307
+ reconstruct_active_formatting_elements(parser);
2308
+ insert_text_token(parser, token);
2309
+ set_frameset_not_ok(parser);
2310
+ return true;
2311
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2312
+ append_comment_node(parser, get_current_node(parser), token);
2313
+ return true;
2314
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2315
+ add_parse_error(parser, token);
2316
+ ignore_token(parser);
2317
+ return false;
2318
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2319
+ assert(parser->_output->root != NULL);
2320
+ assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2321
+ add_parse_error(parser, token);
2322
+ merge_attributes(parser, token, parser->_output->root);
2323
+ return false;
2324
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2325
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2326
+ GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
2327
+ GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2328
+ return handle_in_head(parser, token);
2329
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2330
+ add_parse_error(parser, token);
2331
+ if (state->_open_elements.length < 2 ||
2332
+ !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
2333
+ ignore_token(parser);
2334
+ return false;
2335
+ }
2336
+ state->_frameset_ok = false;
2337
+ merge_attributes(parser, token, state->_open_elements.data[1]);
2338
+ return false;
2339
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2340
+ add_parse_error(parser, token);
2341
+ if (state->_open_elements.length < 2 ||
2342
+ !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2343
+ !state->_frameset_ok) {
2344
+ ignore_token(parser);
2345
+ return false;
2346
+ }
2347
+ // Save the body node for later removal.
2348
+ GumboNode* body_node = state->_open_elements.data[1];
2349
+
2350
+ // Pop all nodes except root HTML element.
2351
+ GumboNode* node;
2352
+ do {
2353
+ node = pop_current_node(parser);
2354
+ } while (node != state->_open_elements.data[1]);
2355
+
2356
+ // Removing & destroying the body node is going to kill any nodes that have
2357
+ // been added to the list of active formatting elements, and so we should
2358
+ // clear it to prevent a use-after-free if the list of active formatting
2359
+ // elements is reconstructed afterwards. This may happen if whitespace
2360
+ // follows the </frameset>.
2361
+ clear_active_formatting_elements(parser);
2362
+
2363
+ // Remove the body node. We may want to factor this out into a generic
2364
+ // helper, but right now this is the only code that needs to do this.
2365
+ GumboVector* children = &parser->_output->root->v.element.children;
2366
+ for (int i = 0; i < children->length; ++i) {
2367
+ if (children->data[i] == body_node) {
2368
+ gumbo_vector_remove_at(parser, i, children);
2369
+ break;
2370
+ }
2371
+ }
2372
+ destroy_node(parser, body_node);
2373
+
2374
+ // Insert the <frameset>, and switch the insertion mode.
2375
+ insert_element_from_token(parser, token);
2376
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2377
+ return true;
2378
+ } else if (token->type == GUMBO_TOKEN_EOF) {
2379
+ for (int i = 0; i < state->_open_elements.length; ++i) {
2380
+ if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2381
+ GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
2382
+ GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
2383
+ GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
2384
+ GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
2385
+ add_parse_error(parser, token);
2386
+ return false;
2387
+ }
2388
+ }
2389
+ return true;
2390
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2391
+ GUMBO_TAG_LAST)) {
2392
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2393
+ add_parse_error(parser, token);
2394
+ ignore_token(parser);
2395
+ return false;
2396
+ }
2397
+ bool success = true;
2398
+ for (int i = 0; i < state->_open_elements.length; ++i) {
2399
+ if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2400
+ GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
2401
+ GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
2402
+ GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
2403
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
2404
+ GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2405
+ GUMBO_TAG_LAST)) {
2406
+ add_parse_error(parser, token);
2407
+ success = false;
2408
+ break;
2409
+ }
2410
+ }
2411
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2412
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2413
+ parser->_parser_state->_reprocess_current_token = true;
2414
+ } else {
2415
+ GumboNode* body = state->_open_elements.data[1];
2416
+ assert(node_tag_is(body, GUMBO_TAG_BODY));
2417
+ record_end_of_element(state->_current_token, &body->v.element);
2418
+ }
2419
+ return success;
2420
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2421
+ GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
2422
+ GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
2423
+ GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
2424
+ GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
2425
+ GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
2426
+ GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
2427
+ GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
2428
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2429
+ insert_element_from_token(parser, token);
2430
+ return result;
2431
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2432
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2433
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2434
+ if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
2435
+ GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
2436
+ GUMBO_TAG_LAST)) {
2437
+ add_parse_error(parser, token);
2438
+ pop_current_node(parser);
2439
+ result = false;
2440
+ }
2441
+ insert_element_from_token(parser, token);
2442
+ return result;
2443
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
2444
+ GUMBO_TAG_LAST)) {
2445
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2446
+ insert_element_from_token(parser, token);
2447
+ state->_ignore_next_linefeed = true;
2448
+ state->_frameset_ok = false;
2449
+ return result;
2450
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2451
+ if (state->_form_element != NULL) {
2452
+ gumbo_debug("Ignoring nested form.\n");
2453
+ add_parse_error(parser, token);
2454
+ ignore_token(parser);
2455
+ return false;
2456
+ }
2457
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2458
+ state->_form_element =
2459
+ insert_element_from_token(parser, token);
2460
+ return result;
2461
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2462
+ maybe_implicitly_close_list_tag(parser, token, true);
2463
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2464
+ insert_element_from_token(parser, token);
2465
+ return result;
2466
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2467
+ GUMBO_TAG_LAST)) {
2468
+ maybe_implicitly_close_list_tag(parser, token, false);
2469
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2470
+ insert_element_from_token(parser, token);
2471
+ return result;
2472
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2473
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2474
+ insert_element_from_token(parser, token);
2475
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2476
+ return result;
2477
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2478
+ if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2479
+ add_parse_error(parser, token);
2480
+ implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
2481
+ state->_reprocess_current_token = true;
2482
+ return false;
2483
+ }
2484
+ reconstruct_active_formatting_elements(parser);
2485
+ insert_element_from_token(parser, token);
2486
+ state->_frameset_ok = false;
2487
+ return true;
2488
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2489
+ GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
2490
+ GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
2491
+ GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
2492
+ GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
2493
+ GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
2494
+ GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
2495
+ GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
2496
+ GUMBO_TAG_LAST)) {
2497
+ GumboTag tag = token->v.end_tag;
2498
+ if (!has_an_element_in_scope(parser, tag)) {
2499
+ add_parse_error(parser, token);
2500
+ ignore_token(parser);
2501
+ return false;
2502
+ }
2503
+ implicitly_close_tags(parser, token, token->v.end_tag);
2504
+ return true;
2505
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2506
+ bool result = true;
2507
+ const GumboNode* node = state->_form_element;
2508
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
2509
+ state->_form_element = NULL;
2510
+ if (!node || !has_node_in_scope(parser, node)) {
2511
+ gumbo_debug("Closing an unopened form.\n");
2512
+ add_parse_error(parser, token);
2513
+ ignore_token(parser);
2514
+ return false;
2515
+ }
2516
+ // This differs from implicitly_close_tags because we remove *only* the
2517
+ // <form> element; other nodes are left in scope.
2518
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2519
+ if (get_current_node(parser) != node) {
2520
+ add_parse_error(parser, token);
2521
+ result = false;
2522
+ }
2523
+
2524
+ GumboVector* open_elements = &state->_open_elements;
2525
+ int index = open_elements->length - 1;
2526
+ for (; index >= 0 && open_elements->data[index] != node; --index);
2527
+ assert(index >= 0);
2528
+ gumbo_vector_remove_at(parser, index, open_elements);
2529
+ return result;
2530
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2531
+ if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2532
+ add_parse_error(parser, token);
2533
+ reconstruct_active_formatting_elements(parser);
2534
+ insert_element_of_tag_type(
2535
+ parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2536
+ state->_reprocess_current_token = true;
2537
+ return false;
2538
+ }
2539
+ return implicitly_close_tags(parser, token, GUMBO_TAG_P);
2540
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2541
+ if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2542
+ add_parse_error(parser, token);
2543
+ ignore_token(parser);
2544
+ return false;
2545
+ }
2546
+ return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
2547
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2548
+ GUMBO_TAG_LAST)) {
2549
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2550
+ GumboTag token_tag = token->v.end_tag;
2551
+ if (!has_an_element_in_scope(parser, token_tag)) {
2552
+ add_parse_error(parser, token);
2553
+ ignore_token(parser);
2554
+ return false;
2555
+ }
2556
+ return implicitly_close_tags(parser, token, token_tag);
2557
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2558
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2559
+ if (!has_an_element_in_scope_with_tagname(
2560
+ parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
2561
+ GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2562
+ // No heading open; ignore the token entirely.
2563
+ add_parse_error(parser, token);
2564
+ ignore_token(parser);
2565
+ return false;
2566
+ } else {
2567
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2568
+ const GumboNode* current_node = get_current_node(parser);
2569
+ bool success = node_tag_is(current_node, token->v.end_tag);
2570
+ if (!success) {
2571
+ // There're children of the heading currently open; close them below and
2572
+ // record a parse error.
2573
+ // TODO(jdtang): Add a way to distinguish this error case from the one
2574
+ // above.
2575
+ add_parse_error(parser, token);
2576
+ }
2577
+ do {
2578
+ current_node = pop_current_node(parser);
2579
+ } while (!node_tag_in(current_node, GUMBO_TAG_H1, GUMBO_TAG_H2,
2580
+ GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5,
2581
+ GUMBO_TAG_H6, GUMBO_TAG_LAST));
2582
+ return success;
2583
+ }
2584
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2585
+ bool success = true;
2586
+ int last_a;
2587
+ int has_matching_a = find_last_anchor_index(parser, &last_a);
2588
+ if (has_matching_a) {
2589
+ assert(has_matching_a == 1);
2590
+ add_parse_error(parser, token);
2591
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2592
+ // The adoption agency algorithm usually removes all instances of <a>
2593
+ // from the list of active formatting elements, but in case it doesn't,
2594
+ // we're supposed to do this. (The conditions where it might not are
2595
+ // listed in the spec.)
2596
+ if (find_last_anchor_index(parser, &last_a)) {
2597
+ void* last_element = gumbo_vector_remove_at(
2598
+ parser, last_a, &state->_active_formatting_elements);
2599
+ gumbo_vector_remove(
2600
+ parser, last_element, &state->_open_elements);
2601
+ }
2602
+ success = false;
2603
+ }
2604
+ reconstruct_active_formatting_elements(parser);
2605
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2606
+ return success;
2607
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
2608
+ GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2609
+ GUMBO_TAG_S, GUMBO_TAG_SMALL, GUMBO_TAG_STRIKE,
2610
+ GUMBO_TAG_STRONG, GUMBO_TAG_TT, GUMBO_TAG_U,
2611
+ GUMBO_TAG_LAST)) {
2612
+ reconstruct_active_formatting_elements(parser);
2613
+ add_formatting_element(parser, insert_element_from_token(parser, token));
2614
+ return true;
2615
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2616
+ bool result = true;
2617
+ reconstruct_active_formatting_elements(parser);
2618
+ if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2619
+ result = false;
2620
+ add_parse_error(parser, token);
2621
+ adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2622
+ reconstruct_active_formatting_elements(parser);
2623
+ }
2624
+ insert_element_from_token(parser, token);
2625
+ add_formatting_element(parser, get_current_node(parser));
2626
+ return result;
2627
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_A, GUMBO_TAG_B, GUMBO_TAG_BIG,
2628
+ GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2629
+ GUMBO_TAG_NOBR, GUMBO_TAG_S, GUMBO_TAG_SMALL,
2630
+ GUMBO_TAG_STRIKE, GUMBO_TAG_STRONG, GUMBO_TAG_TT,
2631
+ GUMBO_TAG_U, GUMBO_TAG_LAST)) {
2632
+ return adoption_agency_algorithm(parser, token, token->v.end_tag);
2633
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2634
+ GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2635
+ reconstruct_active_formatting_elements(parser);
2636
+ insert_element_from_token(parser, token);
2637
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2638
+ set_frameset_not_ok(parser);
2639
+ return true;
2640
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2641
+ GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2642
+ GumboTag token_tag = token->v.end_tag;
2643
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
2644
+ add_parse_error(parser, token);
2645
+ ignore_token(parser);
2646
+ return false;
2647
+ }
2648
+ implicitly_close_tags(parser, token, token_tag);
2649
+ clear_active_formatting_elements(parser);
2650
+ return true;
2651
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2652
+ if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2653
+ GUMBO_DOCTYPE_QUIRKS) {
2654
+ maybe_implicitly_close_p_tag(parser, token);
2655
+ }
2656
+ insert_element_from_token(parser, token);
2657
+ set_frameset_not_ok(parser);
2658
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2659
+ return true;
2660
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_AREA, GUMBO_TAG_BR,
2661
+ GUMBO_TAG_EMBED, GUMBO_TAG_IMG, GUMBO_TAG_IMAGE,
2662
+ GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
2663
+ bool success = true;
2664
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2665
+ success = false;
2666
+ add_parse_error(parser, token);
2667
+ token->v.start_tag.tag = GUMBO_TAG_IMG;
2668
+ }
2669
+ reconstruct_active_formatting_elements(parser);
2670
+ GumboNode* node = insert_element_from_token(parser, token);
2671
+ if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2672
+ success = false;
2673
+ add_parse_error(parser, token);
2674
+ node->v.element.tag = GUMBO_TAG_IMG;
2675
+ node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2676
+ }
2677
+ pop_current_node(parser);
2678
+ acknowledge_self_closing_tag(parser);
2679
+ set_frameset_not_ok(parser);
2680
+ return success;
2681
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2682
+ if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2683
+ // Must be before the element is inserted, as that takes ownership of the
2684
+ // token's attribute vector.
2685
+ set_frameset_not_ok(parser);
2686
+ }
2687
+ reconstruct_active_formatting_elements(parser);
2688
+ insert_element_from_token(parser, token);
2689
+ pop_current_node(parser);
2690
+ acknowledge_self_closing_tag(parser);
2691
+ return true;
2692
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_PARAM, GUMBO_TAG_SOURCE,
2693
+ GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
2694
+ insert_element_from_token(parser, token);
2695
+ pop_current_node(parser);
2696
+ acknowledge_self_closing_tag(parser);
2697
+ return true;
2698
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2699
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2700
+ insert_element_from_token(parser, token);
2701
+ pop_current_node(parser);
2702
+ acknowledge_self_closing_tag(parser);
2703
+ set_frameset_not_ok(parser);
2704
+ return result;
2705
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2706
+ add_parse_error(parser, token);
2707
+ if (parser->_parser_state->_form_element != NULL) {
2708
+ ignore_token(parser);
2709
+ return false;
2710
+ }
2711
+ acknowledge_self_closing_tag(parser);
2712
+ maybe_implicitly_close_p_tag(parser, token);
2713
+ set_frameset_not_ok(parser);
2714
+
2715
+ GumboVector* token_attrs = &token->v.start_tag.attributes;
2716
+ GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2717
+ GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2718
+ GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "isindex");
2719
+
2720
+ GumboNode* form = insert_element_of_tag_type(
2721
+ parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2722
+ if (action_attr) {
2723
+ gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2724
+ }
2725
+ insert_element_of_tag_type(parser, GUMBO_TAG_HR,
2726
+ GUMBO_INSERTION_FROM_ISINDEX);
2727
+ pop_current_node(parser); // <hr>
2728
+
2729
+ insert_element_of_tag_type(parser, GUMBO_TAG_LABEL,
2730
+ GUMBO_INSERTION_FROM_ISINDEX);
2731
+ TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2732
+ text_state->_start_original_text = token->original_text.data;
2733
+ text_state->_start_position = token->position;
2734
+ text_state->_type = GUMBO_NODE_TEXT;
2735
+ if (prompt_attr) {
2736
+ int prompt_attr_length = strlen(prompt_attr->value);
2737
+ gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2738
+ text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2739
+ text_state->_buffer.length = prompt_attr_length;
2740
+ text_state->_buffer.capacity = prompt_attr_length + 1;
2741
+ gumbo_destroy_attribute(parser, prompt_attr);
2742
+ } else {
2743
+ GumboStringPiece prompt_text = GUMBO_STRING(
2744
+ "This is a searchable index. Enter search keywords: ");
2745
+ gumbo_string_buffer_append_string(
2746
+ parser, &prompt_text, &text_state->_buffer);
2747
+ }
2748
+
2749
+ GumboNode* input = insert_element_of_tag_type(
2750
+ parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2751
+ for (int i = 0; i < token_attrs->length; ++i) {
2752
+ GumboAttribute* attr = token_attrs->data[i];
2753
+ if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2754
+ gumbo_vector_add(parser, attr, &input->v.element.attributes);
2755
+ }
2756
+ token_attrs->data[i] = NULL;
2757
+ }
2758
+
2759
+ // All attributes have been successfully transferred and nulled out at this
2760
+ // point, so the call to ignore_token will free the memory for it without
2761
+ // touching the attributes.
2762
+ ignore_token(parser);
2763
+
2764
+ GumboAttribute* name =
2765
+ gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2766
+ GumboStringPiece name_str = GUMBO_STRING("name");
2767
+ GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2768
+ name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2769
+ name->name = gumbo_copy_stringz(parser, "name");
2770
+ name->value = gumbo_copy_stringz(parser, "isindex");
2771
+ name->original_name = name_str;
2772
+ name->original_value = isindex_str;
2773
+ name->name_start = kGumboEmptySourcePosition;
2774
+ name->name_end = kGumboEmptySourcePosition;
2775
+ name->value_start = kGumboEmptySourcePosition;
2776
+ name->value_end = kGumboEmptySourcePosition;
2777
+ gumbo_vector_add(parser, name, &input->v.element.attributes);
2778
+
2779
+ pop_current_node(parser); // <input>
2780
+ pop_current_node(parser); // <label>
2781
+ insert_element_of_tag_type(
2782
+ parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2783
+ pop_current_node(parser); // <hr>
2784
+ pop_current_node(parser); // <form>
2785
+ return false;
2786
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2787
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2788
+ parser->_parser_state->_ignore_next_linefeed = true;
2789
+ set_frameset_not_ok(parser);
2790
+ return true;
2791
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2792
+ bool result = maybe_implicitly_close_p_tag(parser, token);
2793
+ reconstruct_active_formatting_elements(parser);
2794
+ set_frameset_not_ok(parser);
2795
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2796
+ return result;
2797
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2798
+ set_frameset_not_ok(parser);
2799
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2800
+ return true;
2801
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2802
+ run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2803
+ return true;
2804
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2805
+ reconstruct_active_formatting_elements(parser);
2806
+ insert_element_from_token(parser, token);
2807
+ set_frameset_not_ok(parser);
2808
+ GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2809
+ if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2810
+ state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2811
+ state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2812
+ state == GUMBO_INSERTION_MODE_IN_ROW ||
2813
+ state == GUMBO_INSERTION_MODE_IN_CELL) {
2814
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2815
+ } else {
2816
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2817
+ }
2818
+ return true;
2819
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
2820
+ GUMBO_TAG_LAST)) {
2821
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2822
+ pop_current_node(parser);
2823
+ }
2824
+ reconstruct_active_formatting_elements(parser);
2825
+ insert_element_from_token(parser, token);
2826
+ return true;
2827
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_RP, GUMBO_TAG_RT,
2828
+ GUMBO_TAG_LAST)) {
2829
+ bool success = true;
2830
+ if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2831
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2832
+ }
2833
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
2834
+ add_parse_error(parser, token);
2835
+ success = false;
2836
+ }
2837
+ insert_element_from_token(parser, token);
2838
+ return success;
2839
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2840
+ add_parse_error(parser, token);
2841
+ reconstruct_active_formatting_elements(parser);
2842
+ insert_element_of_tag_type(
2843
+ parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2844
+ pop_current_node(parser);
2845
+ return false;
2846
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2847
+ reconstruct_active_formatting_elements(parser);
2848
+ adjust_mathml_attributes(parser, token);
2849
+ adjust_foreign_attributes(parser, token);
2850
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2851
+ if (token->v.start_tag.is_self_closing) {
2852
+ pop_current_node(parser);
2853
+ acknowledge_self_closing_tag(parser);
2854
+ }
2855
+ return true;
2856
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2857
+ reconstruct_active_formatting_elements(parser);
2858
+ adjust_svg_attributes(parser, token);
2859
+ adjust_foreign_attributes(parser, token);
2860
+ insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2861
+ if (token->v.start_tag.is_self_closing) {
2862
+ pop_current_node(parser);
2863
+ acknowledge_self_closing_tag(parser);
2864
+ }
2865
+ return true;
2866
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
2867
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_FRAME, GUMBO_TAG_HEAD,
2868
+ GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2869
+ GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2870
+ GUMBO_TAG_LAST)) {
2871
+ add_parse_error(parser, token);
2872
+ ignore_token(parser);
2873
+ return false;
2874
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
2875
+ reconstruct_active_formatting_elements(parser);
2876
+ insert_element_from_token(parser, token);
2877
+ return true;
2878
+ } else {
2879
+ assert(token->type == GUMBO_TOKEN_END_TAG);
2880
+ GumboTag end_tag = token->v.end_tag;
2881
+ assert(state->_open_elements.length > 0);
2882
+ assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2883
+ // Walk up the stack of open elements until we find one that either:
2884
+ // a) Matches the tag name we saw
2885
+ // b) Is in the "special" category.
2886
+ // If we see a), implicitly close everything up to and including it. If we
2887
+ // see b), then record a parse error, don't close anything (except the
2888
+ // implied end tags) and ignore the end tag token.
2889
+ for (int i = state->_open_elements.length; --i >= 0; ) {
2890
+ const GumboNode* node = state->_open_elements.data[i];
2891
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
2892
+ node_tag_is(node, end_tag)) {
2893
+ generate_implied_end_tags(parser, end_tag);
2894
+ // TODO(jdtang): Do I need to add a parse error here? The condition in
2895
+ // the spec seems like it's the inverse of the loop condition above, and
2896
+ // so would never fire.
2897
+ while (node != pop_current_node(parser)); // Pop everything.
2898
+ return true;
2899
+ } else if (is_special_node(node)) {
2900
+ add_parse_error(parser, token);
2901
+ ignore_token(parser);
2902
+ return false;
2903
+ }
2904
+ }
2905
+ // <html> is in the special category, so we should never get here.
2906
+ assert(0);
2907
+ return false;
2908
+ }
2909
+ }
2910
+
2911
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
2912
+ static bool handle_text(GumboParser* parser, GumboToken* token) {
2913
+ if (token->type == GUMBO_TOKEN_CHARACTER || token->type == GUMBO_TOKEN_WHITESPACE) {
2914
+ insert_text_token(parser, token);
2915
+ } else {
2916
+ // We provide only bare-bones script handling that doesn't involve any of
2917
+ // the parser-pause/already-started/script-nesting flags or re-entrant
2918
+ // invocations of the tokenizer. Because the intended usage of this library
2919
+ // is mostly for templating, refactoring, and static-analysis libraries, we
2920
+ // provide the script body as a text-node child of the <script> element.
2921
+ // This behavior doesn't support document.write of partial HTML elements,
2922
+ // but should be adequate for almost all other scripting support.
2923
+ if (token->type == GUMBO_TOKEN_EOF) {
2924
+ add_parse_error(parser, token);
2925
+ parser->_parser_state->_reprocess_current_token = true;
2926
+ }
2927
+ pop_current_node(parser);
2928
+ set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
2929
+ }
2930
+ return true;
2931
+ }
2932
+
2933
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
2934
+ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
2935
+ GumboParserState* state = parser->_parser_state;
2936
+ if (token->type == GUMBO_TOKEN_CHARACTER ||
2937
+ token->type == GUMBO_TOKEN_WHITESPACE) {
2938
+ // The "pending table character tokens" list described in the spec is
2939
+ // nothing more than the TextNodeBufferState. We accumulate text tokens as
2940
+ // normal, except that when we go to flush them in the handle_in_table_text,
2941
+ // we set _foster_parent_insertions if there're non-whitespace characters in
2942
+ // the buffer.
2943
+ assert(state->_text_node._buffer.length == 0);
2944
+ state->_original_insertion_mode = state->_insertion_mode;
2945
+ state->_reprocess_current_token = true;
2946
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
2947
+ return true;
2948
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2949
+ add_parse_error(parser, token);
2950
+ ignore_token(parser);
2951
+ return false;
2952
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
2953
+ append_comment_node(parser, get_current_node(parser), token);
2954
+ return true;
2955
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
2956
+ clear_stack_to_table_context(parser);
2957
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2958
+ insert_element_from_token(parser, token);
2959
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
2960
+ return true;
2961
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
2962
+ clear_stack_to_table_context(parser);
2963
+ insert_element_from_token(parser, token);
2964
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2965
+ return true;
2966
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
2967
+ clear_stack_to_table_context(parser);
2968
+ insert_element_of_tag_type(
2969
+ parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
2970
+ parser->_parser_state->_reprocess_current_token = true;
2971
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2972
+ return true;
2973
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
2974
+ GUMBO_TAG_THEAD, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2975
+ GUMBO_TAG_LAST)) {
2976
+ clear_stack_to_table_context(parser);
2977
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
2978
+ if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2979
+ GUMBO_TAG_LAST)) {
2980
+ insert_element_of_tag_type(
2981
+ parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
2982
+ state->_reprocess_current_token = true;
2983
+ } else {
2984
+ insert_element_from_token(parser, token);
2985
+ }
2986
+ return true;
2987
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2988
+ add_parse_error(parser, token);
2989
+ if (close_table(parser)) {
2990
+ parser->_parser_state->_reprocess_current_token = true;
2991
+ } else {
2992
+ ignore_token(parser);
2993
+ }
2994
+ return false;
2995
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
2996
+ if (!close_table(parser)) {
2997
+ add_parse_error(parser, token);
2998
+ return false;
2999
+ }
3000
+ return true;
3001
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3002
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3003
+ GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
3004
+ GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3005
+ GUMBO_TAG_LAST)) {
3006
+ add_parse_error(parser, token);
3007
+ ignore_token(parser);
3008
+ return false;
3009
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
3010
+ GUMBO_TAG_LAST)) {
3011
+ return handle_in_head(parser, token);
3012
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3013
+ attribute_matches(&token->v.start_tag.attributes,
3014
+ "type", "hidden")) {
3015
+ add_parse_error(parser, token);
3016
+ insert_element_from_token(parser, token);
3017
+ pop_current_node(parser);
3018
+ return false;
3019
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3020
+ add_parse_error(parser, token);
3021
+ if (state->_form_element) {
3022
+ ignore_token(parser);
3023
+ return false;
3024
+ }
3025
+ state->_form_element = insert_element_from_token(parser, token);
3026
+ pop_current_node(parser);
3027
+ return false;
3028
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3029
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3030
+ add_parse_error(parser, token);
3031
+ return false;
3032
+ }
3033
+ return true;
3034
+ } else {
3035
+ add_parse_error(parser, token);
3036
+ state->_foster_parent_insertions = true;
3037
+ bool result = handle_in_body(parser, token);
3038
+ state->_foster_parent_insertions = false;
3039
+ return result;
3040
+ }
3041
+ }
3042
+
3043
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
3044
+ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3045
+ if (token->type == GUMBO_TOKEN_NULL) {
3046
+ add_parse_error(parser, token);
3047
+ ignore_token(parser);
3048
+ return false;
3049
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3050
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3051
+ insert_text_token(parser, token);
3052
+ return true;
3053
+ } else {
3054
+ GumboParserState* state = parser->_parser_state;
3055
+ GumboStringBuffer* buffer = &state->_text_node._buffer;
3056
+ // Can't use strspn for this because GumboStringBuffers are not
3057
+ // null-terminated.
3058
+ // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3059
+ // of any one byte that is not whitespace means we flip the flag, so this
3060
+ // loop is still valid.
3061
+ for (int i = 0; i < buffer->length; ++i) {
3062
+ if (!isspace(buffer->data[i]) || buffer->data[i] == '\v') {
3063
+ state->_foster_parent_insertions = true;
3064
+ reconstruct_active_formatting_elements(parser);
3065
+ break;
3066
+ }
3067
+ }
3068
+ maybe_flush_text_node_buffer(parser);
3069
+ state->_foster_parent_insertions = false;
3070
+ state->_reprocess_current_token = true;
3071
+ state->_insertion_mode = state->_original_insertion_mode;
3072
+ return true;
3073
+ }
3074
+ }
3075
+
3076
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3077
+ static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3078
+ if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3079
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3080
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3081
+ GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3082
+ tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3083
+ GUMBO_TAG_LAST)) {
3084
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3085
+ add_parse_error(parser, token);
3086
+ ignore_token(parser);
3087
+ return false;
3088
+ }
3089
+ if (!tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3090
+ add_parse_error(parser, token);
3091
+ parser->_parser_state->_reprocess_current_token = true;
3092
+ }
3093
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3094
+ bool result = true;
3095
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3096
+ add_parse_error(parser, token);
3097
+ while (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3098
+ pop_current_node(parser);
3099
+ }
3100
+ result = false;
3101
+ }
3102
+ pop_current_node(parser); // The <caption> itself.
3103
+ clear_active_formatting_elements(parser);
3104
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3105
+ return result;
3106
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_COL,
3107
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML, GUMBO_TAG_TBODY,
3108
+ GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
3109
+ GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3110
+ add_parse_error(parser, token);
3111
+ ignore_token(parser);
3112
+ return false;
3113
+ } else {
3114
+ return handle_in_body(parser, token);
3115
+ }
3116
+ }
3117
+
3118
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
3119
+ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3120
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3121
+ insert_text_token(parser, token);
3122
+ return true;
3123
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3124
+ add_parse_error(parser, token);
3125
+ ignore_token(parser);
3126
+ return false;
3127
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3128
+ append_comment_node(parser, get_current_node(parser), token);
3129
+ return true;
3130
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3131
+ return handle_in_body(parser, token);
3132
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3133
+ insert_element_from_token(parser, token);
3134
+ pop_current_node(parser);
3135
+ acknowledge_self_closing_tag(parser);
3136
+ return true;
3137
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3138
+ add_parse_error(parser, token);
3139
+ ignore_token(parser);
3140
+ return false;
3141
+ } else if (token->type == GUMBO_TOKEN_EOF &&
3142
+ get_current_node(parser) == parser->_output->root) {
3143
+ return true;
3144
+ } else {
3145
+ if (get_current_node(parser) == parser->_output->root) {
3146
+ add_parse_error(parser, token);
3147
+ return false;
3148
+ }
3149
+ assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
3150
+ pop_current_node(parser);
3151
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3152
+ if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3153
+ parser->_parser_state->_reprocess_current_token = true;
3154
+ }
3155
+ return true;
3156
+ }
3157
+ }
3158
+
3159
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
3160
+ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3161
+ if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3162
+ clear_stack_to_table_body_context(parser);
3163
+ insert_element_from_token(parser, token);
3164
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3165
+ return true;
3166
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH,
3167
+ GUMBO_TAG_LAST)) {
3168
+ add_parse_error(parser, token);
3169
+ clear_stack_to_table_body_context(parser);
3170
+ insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3171
+ parser->_parser_state->_reprocess_current_token = true;
3172
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3173
+ return false;
3174
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3175
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
3176
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3177
+ add_parse_error(parser, token);
3178
+ ignore_token(parser);
3179
+ return false;
3180
+ }
3181
+ clear_stack_to_table_body_context(parser);
3182
+ pop_current_node(parser);
3183
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3184
+ return true;
3185
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3186
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3187
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
3188
+ tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3189
+ if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3190
+ has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3191
+ has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3192
+ add_parse_error(parser, token);
3193
+ ignore_token(parser);
3194
+ return false;
3195
+ }
3196
+ clear_stack_to_table_body_context(parser);
3197
+ pop_current_node(parser);
3198
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3199
+ parser->_parser_state->_reprocess_current_token = true;
3200
+ return true;
3201
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3202
+ GUMBO_TAG_COL, GUMBO_TAG_TR, GUMBO_TAG_COLGROUP,
3203
+ GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST))
3204
+ {
3205
+ add_parse_error(parser, token);
3206
+ ignore_token(parser);
3207
+ return false;
3208
+ } else {
3209
+ return handle_in_table(parser, token);
3210
+ }
3211
+ }
3212
+
3213
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3214
+ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3215
+ if (tag_in(token, kStartTag, GUMBO_TAG_TH, GUMBO_TAG_TD, GUMBO_TAG_LAST)) {
3216
+ clear_stack_to_table_row_context(parser);
3217
+ insert_element_from_token(parser, token);
3218
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3219
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
3220
+ return true;
3221
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP,
3222
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3223
+ GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3224
+ tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
3225
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3226
+ GUMBO_TAG_LAST)) {
3227
+ // This case covers 4 clauses of the spec, each of which say "Otherwise, act
3228
+ // as if an end tag with the tag name "tr" had been seen." The differences
3229
+ // are in error handling and whether the current token is reprocessed.
3230
+ GumboTag desired_tag =
3231
+ tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3232
+ GUMBO_TAG_THEAD, GUMBO_TAG_LAST)
3233
+ ? token->v.end_tag : GUMBO_TAG_TR;
3234
+ if (!has_an_element_in_table_scope(parser, desired_tag)) {
3235
+ gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
3236
+ gumbo_normalized_tagname(desired_tag));
3237
+ for (int i = 0; i < parser->_parser_state->_open_elements.length; ++i) {
3238
+ const GumboNode* node = parser->_parser_state->_open_elements.data[i];
3239
+ gumbo_debug("%s\n", gumbo_normalized_tagname(node->v.element.tag));
3240
+ }
3241
+ add_parse_error(parser, token);
3242
+ ignore_token(parser);
3243
+ return false;
3244
+ }
3245
+ clear_stack_to_table_row_context(parser);
3246
+ GumboNode* last_element = pop_current_node(parser);
3247
+ assert(node_tag_is(last_element, GUMBO_TAG_TR));
3248
+ AVOID_UNUSED_VARIABLE_WARNING(last_element);
3249
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3250
+ if (!tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3251
+ parser->_parser_state->_reprocess_current_token = true;
3252
+ }
3253
+ return true;
3254
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3255
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3256
+ GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3257
+ add_parse_error(parser, token);
3258
+ ignore_token(parser);
3259
+ return false;
3260
+ } else {
3261
+ return handle_in_table(parser, token);
3262
+ }
3263
+ }
3264
+
3265
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3266
+ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3267
+ if (tag_in(token, kEndTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3268
+ GumboTag token_tag = token->v.end_tag;
3269
+ if (!has_an_element_in_table_scope(parser, token_tag)) {
3270
+ add_parse_error(parser, token);
3271
+ return false;
3272
+ }
3273
+ return close_table_cell(parser, token, token_tag);
3274
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3275
+ GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3276
+ GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3277
+ GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3278
+ gumbo_debug("Handling <td> in cell.\n");
3279
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3280
+ !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3281
+ gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3282
+ add_parse_error(parser, token);
3283
+ ignore_token(parser);
3284
+ return false;
3285
+ }
3286
+ parser->_parser_state->_reprocess_current_token = true;
3287
+ return close_current_cell(parser, token);
3288
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3289
+ GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3290
+ GUMBO_TAG_LAST)) {
3291
+ add_parse_error(parser, token);
3292
+ ignore_token(parser);
3293
+ return false;
3294
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
3295
+ GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3296
+ GUMBO_TAG_LAST)) {
3297
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3298
+ add_parse_error(parser, token);
3299
+ ignore_token(parser);
3300
+ return false;
3301
+ }
3302
+ parser->_parser_state->_reprocess_current_token = true;
3303
+ return close_current_cell(parser, token);
3304
+ } else {
3305
+ return handle_in_body(parser, token);
3306
+ }
3307
+ }
3308
+
3309
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3310
+ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3311
+ if (token->type == GUMBO_TOKEN_NULL) {
3312
+ add_parse_error(parser, token);
3313
+ ignore_token(parser);
3314
+ return false;
3315
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3316
+ token->type == GUMBO_TOKEN_WHITESPACE) {
3317
+ insert_text_token(parser, token);
3318
+ return true;
3319
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3320
+ add_parse_error(parser, token);
3321
+ ignore_token(parser);
3322
+ return false;
3323
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3324
+ append_comment_node(parser, get_current_node(parser), token);
3325
+ return true;
3326
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3327
+ return handle_in_body(parser, token);
3328
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3329
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3330
+ pop_current_node(parser);
3331
+ }
3332
+ insert_element_from_token(parser, token);
3333
+ return true;
3334
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3335
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3336
+ pop_current_node(parser);
3337
+ }
3338
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3339
+ pop_current_node(parser);
3340
+ }
3341
+ insert_element_from_token(parser, token);
3342
+ return true;
3343
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3344
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
3345
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3346
+ node_tag_is(open_elements->data[open_elements->length - 2],
3347
+ GUMBO_TAG_OPTGROUP)) {
3348
+ pop_current_node(parser);
3349
+ }
3350
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3351
+ pop_current_node(parser);
3352
+ return true;
3353
+ } else {
3354
+ add_parse_error(parser, token);
3355
+ ignore_token(parser);
3356
+ return false;
3357
+ }
3358
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3359
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3360
+ pop_current_node(parser);
3361
+ return true;
3362
+ } else {
3363
+ add_parse_error(parser, token);
3364
+ ignore_token(parser);
3365
+ return false;
3366
+ }
3367
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3368
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3369
+ add_parse_error(parser, token);
3370
+ ignore_token(parser);
3371
+ return false;
3372
+ }
3373
+ close_current_select(parser);
3374
+ return true;
3375
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3376
+ add_parse_error(parser, token);
3377
+ ignore_token(parser);
3378
+ close_current_select(parser);
3379
+ return false;
3380
+ } else if (tag_in(token, kStartTag, GUMBO_TAG_INPUT, GUMBO_TAG_KEYGEN,
3381
+ GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
3382
+ add_parse_error(parser, token);
3383
+ if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3384
+ ignore_token(parser);
3385
+ } else {
3386
+ close_current_select(parser);
3387
+ parser->_parser_state->_reprocess_current_token = true;
3388
+ }
3389
+ return false;
3390
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
3391
+ return handle_in_head(parser, token);
3392
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3393
+ if (get_current_node(parser) != parser->_output->root) {
3394
+ add_parse_error(parser, token);
3395
+ return false;
3396
+ }
3397
+ return true;
3398
+ } else {
3399
+ add_parse_error(parser, token);
3400
+ ignore_token(parser);
3401
+ return false;
3402
+ }
3403
+ }
3404
+
3405
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3406
+ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3407
+ if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3408
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3409
+ GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3410
+ add_parse_error(parser, token);
3411
+ close_current_select(parser);
3412
+ parser->_parser_state->_reprocess_current_token = true;
3413
+ return false;
3414
+ } else if (tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3415
+ GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3416
+ GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3417
+ add_parse_error(parser, token);
3418
+ if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
3419
+ close_current_select(parser);
3420
+ reset_insertion_mode_appropriately(parser);
3421
+ parser->_parser_state->_reprocess_current_token = true;
3422
+ } else {
3423
+ ignore_token(parser);
3424
+ }
3425
+ return false;
3426
+ } else {
3427
+ return handle_in_select(parser, token);
3428
+ }
3429
+ }
3430
+
3431
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
3432
+ static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3433
+ // TODO(jdtang): Implement this.
3434
+ return true;
3435
+ }
3436
+
3437
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
3438
+ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3439
+ if (token->type == GUMBO_TOKEN_WHITESPACE ||
3440
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3441
+ return handle_in_body(parser, token);
3442
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3443
+ GumboNode* html_node = parser->_output->root;
3444
+ assert(html_node != NULL);
3445
+ append_comment_node(parser, html_node, token);
3446
+ return true;
3447
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3448
+ add_parse_error(parser, token);
3449
+ ignore_token(parser);
3450
+ return false;
3451
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3452
+ // TODO(jdtang): Handle fragment parsing algorithm case.
3453
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3454
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
3455
+ assert(node_tag_is(html, GUMBO_TAG_HTML));
3456
+ record_end_of_element(
3457
+ parser->_parser_state->_current_token, &html->v.element);
3458
+ return true;
3459
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3460
+ return true;
3461
+ } else {
3462
+ add_parse_error(parser, token);
3463
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3464
+ parser->_parser_state->_reprocess_current_token = true;
3465
+ return false;
3466
+ }
3467
+ }
3468
+
3469
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
3470
+ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3471
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3472
+ insert_text_token(parser, token);
3473
+ return true;
3474
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3475
+ append_comment_node(parser, get_current_node(parser), token);
3476
+ return true;
3477
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3478
+ add_parse_error(parser, token);
3479
+ ignore_token(parser);
3480
+ return false;
3481
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3482
+ return handle_in_body(parser, token);
3483
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3484
+ insert_element_from_token(parser, token);
3485
+ return true;
3486
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3487
+ if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3488
+ add_parse_error(parser, token);
3489
+ ignore_token(parser);
3490
+ return false;
3491
+ }
3492
+ pop_current_node(parser);
3493
+ // TODO(jdtang): Add a condition to ignore this for the fragment parsing
3494
+ // algorithm.
3495
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3496
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3497
+ }
3498
+ return true;
3499
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3500
+ insert_element_from_token(parser, token);
3501
+ pop_current_node(parser);
3502
+ acknowledge_self_closing_tag(parser);
3503
+ return true;
3504
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3505
+ return handle_in_head(parser, token);
3506
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3507
+ if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3508
+ add_parse_error(parser, token);
3509
+ return false;
3510
+ }
3511
+ return true;
3512
+ } else {
3513
+ add_parse_error(parser, token);
3514
+ ignore_token(parser);
3515
+ return false;
3516
+ }
3517
+ }
3518
+
3519
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
3520
+ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3521
+ if (token->type == GUMBO_TOKEN_WHITESPACE) {
3522
+ insert_text_token(parser, token);
3523
+ return true;
3524
+ } else if (token->type == GUMBO_TOKEN_COMMENT) {
3525
+ append_comment_node(parser, get_current_node(parser), token);
3526
+ return true;
3527
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3528
+ add_parse_error(parser, token);
3529
+ ignore_token(parser);
3530
+ return false;
3531
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3532
+ return handle_in_body(parser, token);
3533
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3534
+ GumboNode* html = parser->_parser_state->_open_elements.data[0];
3535
+ assert(node_tag_is(html, GUMBO_TAG_HTML));
3536
+ record_end_of_element(
3537
+ parser->_parser_state->_current_token, &html->v.element);
3538
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3539
+ return true;
3540
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3541
+ return handle_in_head(parser, token);
3542
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3543
+ return true;
3544
+ } else {
3545
+ add_parse_error(parser, token);
3546
+ ignore_token(parser);
3547
+ return false;
3548
+ }
3549
+ }
3550
+
3551
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
3552
+ static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3553
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3554
+ append_comment_node(parser, get_document_node(parser), token);
3555
+ return true;
3556
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3557
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3558
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3559
+ return handle_in_body(parser, token);
3560
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3561
+ return true;
3562
+ } else {
3563
+ add_parse_error(parser, token);
3564
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3565
+ parser->_parser_state->_reprocess_current_token = true;
3566
+ return false;
3567
+ }
3568
+ }
3569
+
3570
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
3571
+ static bool handle_after_after_frameset(
3572
+ GumboParser* parser, GumboToken* token) {
3573
+ if (token->type == GUMBO_TOKEN_COMMENT) {
3574
+ append_comment_node(parser, get_document_node(parser), token);
3575
+ return true;
3576
+ } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3577
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3578
+ tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3579
+ return handle_in_body(parser, token);
3580
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3581
+ return true;
3582
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3583
+ return handle_in_head(parser, token);
3584
+ } else {
3585
+ add_parse_error(parser, token);
3586
+ ignore_token(parser);
3587
+ return false;
3588
+ }
3589
+ }
3590
+
3591
+ // Function pointers for each insertion mode. Keep in sync with
3592
+ // insertion_mode.h.
3593
+ typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3594
+ static const TokenHandler kTokenHandlers[] = {
3595
+ handle_initial,
3596
+ handle_before_html,
3597
+ handle_before_head,
3598
+ handle_in_head,
3599
+ handle_in_head_noscript,
3600
+ handle_after_head,
3601
+ handle_in_body,
3602
+ handle_text,
3603
+ handle_in_table,
3604
+ handle_in_table_text,
3605
+ handle_in_caption,
3606
+ handle_in_column_group,
3607
+ handle_in_table_body,
3608
+ handle_in_row,
3609
+ handle_in_cell,
3610
+ handle_in_select,
3611
+ handle_in_select_in_table,
3612
+ handle_in_template,
3613
+ handle_after_body,
3614
+ handle_in_frameset,
3615
+ handle_after_frameset,
3616
+ handle_after_after_body,
3617
+ handle_after_after_frameset
3618
+ };
3619
+
3620
+ static bool handle_html_content(GumboParser* parser, GumboToken* token) {
3621
+ return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
3622
+ parser, token);
3623
+ }
3624
+
3625
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
3626
+ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3627
+ switch (token->type) {
3628
+ case GUMBO_TOKEN_NULL:
3629
+ add_parse_error(parser, token);
3630
+ token->type = GUMBO_TOKEN_CHARACTER;
3631
+ token->v.character = kUtf8ReplacementChar;
3632
+ insert_text_token(parser, token);
3633
+ return false;
3634
+ case GUMBO_TOKEN_WHITESPACE:
3635
+ insert_text_token(parser, token);
3636
+ return true;
3637
+ case GUMBO_TOKEN_CHARACTER:
3638
+ insert_text_token(parser, token);
3639
+ set_frameset_not_ok(parser);
3640
+ return true;
3641
+ case GUMBO_TOKEN_COMMENT:
3642
+ append_comment_node(parser, get_current_node(parser), token);
3643
+ return true;
3644
+ case GUMBO_TOKEN_DOCTYPE:
3645
+ add_parse_error(parser, token);
3646
+ ignore_token(parser);
3647
+ return false;
3648
+ default:
3649
+ // Fall through to the if-statements below.
3650
+ break;
3651
+ }
3652
+ // Order matters for these clauses.
3653
+ if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
3654
+ GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BODY, GUMBO_TAG_BR,
3655
+ GUMBO_TAG_CENTER, GUMBO_TAG_CODE, GUMBO_TAG_DD, GUMBO_TAG_DIV,
3656
+ GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EM, GUMBO_TAG_EMBED,
3657
+ GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3658
+ GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD, GUMBO_TAG_HR,
3659
+ GUMBO_TAG_I, GUMBO_TAG_IMG, GUMBO_TAG_LI, GUMBO_TAG_LISTING,
3660
+ GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NOBR, GUMBO_TAG_OL,
3661
+ GUMBO_TAG_P, GUMBO_TAG_PRE, GUMBO_TAG_RUBY, GUMBO_TAG_S,
3662
+ GUMBO_TAG_SMALL, GUMBO_TAG_SPAN, GUMBO_TAG_STRONG,
3663
+ GUMBO_TAG_STRIKE, GUMBO_TAG_SUB, GUMBO_TAG_SUP,
3664
+ GUMBO_TAG_TABLE, GUMBO_TAG_TT, GUMBO_TAG_U, GUMBO_TAG_UL,
3665
+ GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
3666
+ (tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
3667
+ token_has_attribute(token, "color") ||
3668
+ token_has_attribute(token, "face") ||
3669
+ token_has_attribute(token, "size")))) {
3670
+ add_parse_error(parser, token);
3671
+ do {
3672
+ pop_current_node(parser);
3673
+ } while(!(is_mathml_integration_point(get_current_node(parser)) ||
3674
+ is_html_integration_point(get_current_node(parser)) ||
3675
+ get_current_node(parser)->v.element.tag_namespace ==
3676
+ GUMBO_NAMESPACE_HTML));
3677
+ parser->_parser_state->_reprocess_current_token = true;
3678
+ return false;
3679
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
3680
+ const GumboNamespaceEnum current_namespace =
3681
+ get_current_node(parser)->v.element.tag_namespace;
3682
+ if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3683
+ adjust_mathml_attributes(parser, token);
3684
+ }
3685
+ if (current_namespace == GUMBO_NAMESPACE_SVG) {
3686
+ // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3687
+ // function.
3688
+ adjust_svg_attributes(parser, token);
3689
+ }
3690
+ adjust_foreign_attributes(parser, token);
3691
+ insert_foreign_element(parser, token, current_namespace);
3692
+ if (token->v.start_tag.is_self_closing) {
3693
+ pop_current_node(parser);
3694
+ acknowledge_self_closing_tag(parser);
3695
+ }
3696
+ return true;
3697
+ // </script> tags are handled like any other end tag, putting the script's
3698
+ // text into a text node child and closing the current node.
3699
+ } else {
3700
+ assert(token->type == GUMBO_TOKEN_END_TAG);
3701
+ GumboNode* node = get_current_node(parser);
3702
+ assert(node != NULL);
3703
+ GumboStringPiece token_tagname = token->original_text;
3704
+ GumboStringPiece node_tagname = node->v.element.original_tag;
3705
+ gumbo_tag_from_original_text(&token_tagname);
3706
+ gumbo_tag_from_original_text(&node_tagname);
3707
+
3708
+ bool is_success = true;
3709
+ if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3710
+ add_parse_error(parser, token);
3711
+ is_success = false;
3712
+ }
3713
+ int i = parser->_parser_state->_open_elements.length;
3714
+ for( --i; i > 0; ) {
3715
+ // Here we move up the stack until we find an HTML element (in which
3716
+ // case we do nothing) or we find the element that we're about to
3717
+ // close (in which case we pop everything we've seen until that
3718
+ // point.)
3719
+ gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3720
+ node_tagname.data, i);
3721
+ if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3722
+ gumbo_debug("Matches.\n");
3723
+ while (pop_current_node(parser) != node) {
3724
+ // Pop all the nodes below the current one. Node is guaranteed to
3725
+ // be an element on the stack of open elements (set below), so
3726
+ // this loop is guaranteed to terminate.
3727
+ }
3728
+ return is_success;
3729
+ }
3730
+ --i;
3731
+ node = parser->_parser_state->_open_elements.data[i];
3732
+ if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3733
+ // Must break before gumbo_tag_from_original_text to avoid passing
3734
+ // parser-inserted nodes through.
3735
+ break;
3736
+ }
3737
+ node_tagname = node->v.element.original_tag;
3738
+ gumbo_tag_from_original_text(&node_tagname);
3739
+ }
3740
+ assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3741
+ // We can't call handle_token directly because the current node is still in
3742
+ // the SVG namespace, so it would re-enter this and result in infinite
3743
+ // recursion.
3744
+ return handle_html_content(parser, token) && is_success;
3745
+ }
3746
+ }
3747
+
3748
+
3749
+ // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
3750
+ static bool handle_token(GumboParser* parser, GumboToken* token) {
3751
+ if (parser->_parser_state->_ignore_next_linefeed &&
3752
+ token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3753
+ parser->_parser_state->_ignore_next_linefeed = false;
3754
+ ignore_token(parser);
3755
+ return true;
3756
+ }
3757
+ // This needs to be reset both here and in the conditional above to catch both
3758
+ // the case where the next token is not whitespace (so we don't ignore
3759
+ // whitespace in the middle of <pre> tags) and where there are multiple
3760
+ // whitespace tokens (so we don't ignore the second one).
3761
+ parser->_parser_state->_ignore_next_linefeed = false;
3762
+
3763
+ if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3764
+ parser->_parser_state->_closed_body_tag = true;
3765
+ }
3766
+ if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3767
+ parser->_parser_state->_closed_html_tag = true;
3768
+ }
3769
+
3770
+ const GumboNode* current_node = get_current_node(parser);
3771
+ assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT);
3772
+ if (current_node) {
3773
+ gumbo_debug("Current node: <%s>.\n",
3774
+ gumbo_normalized_tagname(current_node->v.element.tag));
3775
+ }
3776
+ if (!current_node ||
3777
+ current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3778
+ (is_mathml_integration_point(current_node) &&
3779
+ (token->type == GUMBO_TOKEN_CHARACTER ||
3780
+ token->type == GUMBO_TOKEN_WHITESPACE ||
3781
+ token->type == GUMBO_TOKEN_NULL ||
3782
+ (token->type == GUMBO_TOKEN_START_TAG &&
3783
+ !tag_in(token, kStartTag, GUMBO_TAG_MGLYPH, GUMBO_TAG_MALIGNMARK,
3784
+ GUMBO_TAG_LAST)))) ||
3785
+ (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3786
+ node_tag_is(current_node, GUMBO_TAG_ANNOTATION_XML) &&
3787
+ tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3788
+ (is_html_integration_point(current_node) && (
3789
+ token->type == GUMBO_TOKEN_START_TAG ||
3790
+ token->type == GUMBO_TOKEN_CHARACTER ||
3791
+ token->type == GUMBO_TOKEN_NULL ||
3792
+ token->type == GUMBO_TOKEN_WHITESPACE)) ||
3793
+ token->type == GUMBO_TOKEN_EOF) {
3794
+ return handle_html_content(parser, token);
3795
+ } else {
3796
+ return handle_in_foreign_content(parser, token);
3797
+ }
3798
+ }
3799
+
3800
+ GumboOutput* gumbo_parse(const char* buffer) {
3801
+ return gumbo_parse_with_options(
3802
+ &kGumboDefaultOptions, buffer, strlen(buffer));
3803
+ }
3804
+
3805
+ GumboOutput* gumbo_parse_with_options(
3806
+ const GumboOptions* options, const char* buffer, size_t length) {
3807
+ GumboParser parser;
3808
+ parser._options = options;
3809
+ output_init(&parser);
3810
+ gumbo_tokenizer_state_init(&parser, buffer, length);
3811
+ parser_state_init(&parser);
3812
+
3813
+ GumboParserState* state = parser._parser_state;
3814
+ gumbo_debug("Parsing %.*s.\n", length, buffer);
3815
+
3816
+ // Sanity check so that infinite loops die with an assertion failure instead
3817
+ // of hanging the process before we ever get an error.
3818
+ int loop_count = 0;
3819
+
3820
+ GumboToken token;
3821
+ bool has_error = false;
3822
+ do {
3823
+ if (state->_reprocess_current_token) {
3824
+ state->_reprocess_current_token = false;
3825
+ } else {
3826
+ GumboNode* current_node = get_current_node(&parser);
3827
+ gumbo_tokenizer_set_is_current_node_foreign(
3828
+ &parser, current_node &&
3829
+ current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
3830
+ has_error = !gumbo_lex(&parser, &token) || has_error;
3831
+ }
3832
+ const char* token_type = "text";
3833
+ switch (token.type) {
3834
+ case GUMBO_TOKEN_DOCTYPE:
3835
+ token_type = "doctype";
3836
+ break;
3837
+ case GUMBO_TOKEN_START_TAG:
3838
+ token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
3839
+ break;
3840
+ case GUMBO_TOKEN_END_TAG:
3841
+ token_type = gumbo_normalized_tagname(token.v.end_tag);
3842
+ break;
3843
+ case GUMBO_TOKEN_COMMENT:
3844
+ token_type = "comment";
3845
+ break;
3846
+ default:
3847
+ break;
3848
+ }
3849
+ gumbo_debug("Handling %s token @%d:%d in state %d.\n",
3850
+ (char*) token_type, token.position.line, token.position.column,
3851
+ state->_insertion_mode);
3852
+
3853
+ state->_current_token = &token;
3854
+ state->_self_closing_flag_acknowledged =
3855
+ !(token.type == GUMBO_TOKEN_START_TAG &&
3856
+ token.v.start_tag.is_self_closing);
3857
+
3858
+ has_error = !handle_token(&parser, &token) || has_error;
3859
+
3860
+ // Check for memory leaks when ownership is transferred from start tag
3861
+ // tokens to nodes.
3862
+ assert(state->_reprocess_current_token ||
3863
+ token.type != GUMBO_TOKEN_START_TAG ||
3864
+ token.v.start_tag.attributes.data == NULL);
3865
+
3866
+ if (!state->_self_closing_flag_acknowledged) {
3867
+ GumboError* error = add_parse_error(&parser, &token);
3868
+ if (error) {
3869
+ error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
3870
+ }
3871
+ }
3872
+
3873
+ ++loop_count;
3874
+ assert(loop_count < 1000000000);
3875
+
3876
+ } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
3877
+ !(options->stop_on_first_error && has_error));
3878
+
3879
+ finish_parsing(&parser);
3880
+ // For API uniformity reasons, if the doctype still has nulls, convert them to
3881
+ // empty strings.
3882
+ GumboDocument* doc_type = &parser._output->document->v.document;
3883
+ if (doc_type->name == NULL) {
3884
+ doc_type->name = gumbo_copy_stringz(&parser, "");
3885
+ }
3886
+ if (doc_type->public_identifier == NULL) {
3887
+ doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
3888
+ }
3889
+ if (doc_type->system_identifier == NULL) {
3890
+ doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
3891
+ }
3892
+
3893
+ parser_state_destroy(&parser);
3894
+ gumbo_tokenizer_state_destroy(&parser);
3895
+ return parser._output;
3896
+ }
3897
+
3898
+ void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
3899
+ // Need a dummy GumboParser because the allocator comes along with the
3900
+ // options object.
3901
+ GumboParser parser;
3902
+ parser._options = options;
3903
+ destroy_node(&parser, node);
3904
+ }
3905
+
3906
+ void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
3907
+ // Need a dummy GumboParser because the allocator comes along with the
3908
+ // options object.
3909
+ GumboParser parser;
3910
+ parser._options = options;
3911
+ destroy_node(&parser, output->document);
3912
+ for (int i = 0; i < output->errors.length; ++i) {
3913
+ gumbo_error_destroy(&parser, output->errors.data[i]);
3914
+ }
3915
+ gumbo_vector_destroy(&parser, &output->errors);
3916
+ gumbo_parser_deallocate(&parser, output);
3917
+ }