nokogumbo 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,54 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_INSERTION_MODE_H_
18
- #define GUMBO_INSERTION_MODE_H_
19
-
20
- #ifdef __cplusplus
21
- extern "C" {
22
- #endif
23
-
24
- // Insertion modes of the HTML tree-construction stage; see http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
25
- typedef enum _GumboInsertionMode {  // Enumerators carry no explicit values, so they number 0, 1, 2, ... in the order listed.
26
- GUMBO_INSERTION_MODE_INITIAL,  // Mode a fresh parser state starts in; also returned as the "no applicable mode" sentinel by get_appropriate_insertion_mode.
27
- GUMBO_INSERTION_MODE_BEFORE_HTML,
28
- GUMBO_INSERTION_MODE_BEFORE_HEAD,
29
- GUMBO_INSERTION_MODE_IN_HEAD,
30
- GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
31
- GUMBO_INSERTION_MODE_AFTER_HEAD,
32
- GUMBO_INSERTION_MODE_IN_BODY,
33
- GUMBO_INSERTION_MODE_TEXT,
34
- GUMBO_INSERTION_MODE_IN_TABLE,
35
- GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
36
- GUMBO_INSERTION_MODE_IN_CAPTION,
37
- GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
38
- GUMBO_INSERTION_MODE_IN_TABLE_BODY,
39
- GUMBO_INSERTION_MODE_IN_ROW,
40
- GUMBO_INSERTION_MODE_IN_CELL,
41
- GUMBO_INSERTION_MODE_IN_SELECT,
42
- GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
43
- GUMBO_INSERTION_MODE_AFTER_BODY,
44
- GUMBO_INSERTION_MODE_IN_FRAMESET,
45
- GUMBO_INSERTION_MODE_AFTER_FRAMESET,
46
- GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
47
- GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
48
- } GumboInsertionMode;
49
-
50
- #ifdef __cplusplus
51
- } // extern C
52
- #endif
53
-
54
- #endif // GUMBO_INSERTION_MODE_H_
data/work/nokogumbo.c DELETED
@@ -1,254 +0,0 @@
1
- #include "ruby.h"
2
- #include "gumbo.h"
3
- #include <nokogiri.h>
4
- #include <libxml/parser.h>
5
- #include <libxml/tree.h>
6
-
7
- // class constants
8
- static VALUE Document;
9
-
10
// Element names used when creating libxml2 nodes.  new_element() indexes this
// table with the numeric value of GumboElement.tag, so the order must stay in
// step with the GUMBO_TAG_* enumeration; do not sort or insert in the middle.
// The final entry is the placeholder for tags Gumbo does not recognise.
//
// NOTE(review): "foreignobject" is kept in Gumbo's lower-cased spelling; the
// SVG element is properly named "foreignObject" -- confirm which form callers
// expect before changing it.
static const char* TAGS[] = {
  "html", "head", "title", "base", "link", "meta", "style", "script",
  "noscript", "body", "section", "nav", "article", "aside",
  "h1", "h2", "h3", "h4", "h5", "h6", "hgroup", "header", "footer",
  "address", "p", "hr", "pre", "blockquote", "ol", "ul", "li",
  "dl", "dt", "dd", "figure", "figcaption", "div", "a", "em", "strong",
  "small", "s", "cite", "q", "dfn", "abbr", "time", "code", "var", "samp",
  "kbd", "sub", "sup", "i", "b", "mark", "ruby", "rt", "rp", "bdi", "bdo",
  "span", "br", "wbr", "ins", "del", "image", "img", "iframe", "embed",
  "object", "param", "video", "audio", "source", "track", "canvas", "map",
  "area", "math", "mi", "mo", "mn", "ms", "mtext", "mglyph", "malignmark",
  // The MathML element is spelled with a hyphen; the previous
  // "annotation_xml" produced an element name that matches nothing.
  "annotation-xml",
  "svg", "foreignobject", "desc", "table", "caption", "colgroup", "col",
  "tbody", "thead", "tfoot", "tr", "td", "th", "form", "fieldset", "legend",
  "label", "input", "button", "select", "datalist", "optgroup", "option",
  "textarea", "keygen", "output", "progress", "meter", "details", "summary",
  "command", "menu", "applet", "acronym", "bgsound", "dir", "frame",
  "frameset", "noframes", "isindex", "listing", "xmp", "nextid", "noembed",
  "plaintext", "rb", "strike", "basefont", "big", "blink", "center", "font",
  "marquee", "multicol", "nobr", "spacer", "tt", "u",
  "unknown"
};

// Index of the trailing "unknown" placeholder, i.e. the tag value for which
// the element name has to be recovered from the original markup instead.
static const int Unknown = sizeof(TAGS) / sizeof(TAGS[0]) - 1;
161
-
162
- // determine tag name for a given node
163
- static xmlNodePtr new_element(GumboElement *node) {
164
- xmlNodePtr element;
165
- if (node->tag != Unknown) {
166
- element = xmlNewNode(NULL, BAD_CAST TAGS[(int)node->tag]);
167
- } else {
168
- // Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
169
- // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
170
- GumboStringPiece *tag = &node->original_tag;
171
- int length;
172
- for (length = 1; length < tag->length-1; length++) {
173
- if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
174
- }
175
- char name[length];
176
- strncpy(name, 1+(char *)tag->data, length-1);
177
- name[length-1] = '\0';
178
- element = xmlNewNode(NULL, BAD_CAST name);
179
- }
180
- return element;
181
- }
182
-
183
- // Build a Nokogiri Element for a given GumboElement (recursively)
184
- static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
185
- xmlNodePtr element = new_element(node);
186
-
187
- // add in the attributes
188
- GumboVector* attrs = &node->attributes;
189
- for (int i=0; i < attrs->length; i++) {
190
- GumboAttribute *attr = attrs->data[i];
191
- xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value);
192
- }
193
-
194
- // add in the children
195
- GumboVector* children = &node->children;
196
- for (int i=0; i < children->length; i++) {
197
- GumboNode* child = children->data[i];
198
-
199
- xmlNodePtr node = NULL;
200
-
201
- switch (child->type) {
202
- case GUMBO_NODE_ELEMENT:
203
- node = walk_tree(document, &child->v.element);
204
- break;
205
- case GUMBO_NODE_WHITESPACE:
206
- case GUMBO_NODE_TEXT:
207
- node = xmlNewText(BAD_CAST child->v.text.text);
208
- break;
209
- case GUMBO_NODE_CDATA:
210
- node = xmlNewCDataBlock(document,
211
- BAD_CAST child->v.text.original_text.data,
212
- child->v.text.original_text.length);
213
- break;
214
- case GUMBO_NODE_COMMENT:
215
- node = xmlNewComment(BAD_CAST child->v.text.text);
216
- break;
217
- case GUMBO_NODE_DOCUMENT:
218
- break; // should never happen -- ignore
219
- }
220
-
221
- if (node) xmlAddChild(element, node);
222
- }
223
-
224
- return element;
225
- }
226
-
227
- // Parse a string using gumbo_parse into a Nokogiri document
228
- static VALUE t_parse(VALUE self, VALUE string) {
229
- GumboOutput *output = gumbo_parse_with_options(
230
- &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
231
- );
232
- xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
233
- xmlNodePtr root = walk_tree(doc, (GumboElement*)&output->root->v.element);
234
- xmlDocSetRootElement(doc, root);
235
- gumbo_destroy_output(&kGumboDefaultOptions, output);
236
-
237
- return Nokogiri_wrap_xml_document(Document, doc);
238
- }
239
-
240
- // Initialize the Nokogumbo class and fetch constants we will use later
241
- void Init_nokogumboc() {
242
- rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
243
- rb_require("nokogiri");
244
-
245
- // class constants
246
- VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
247
- VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
248
- Document = rb_const_get(HTML, rb_intern("Document"));
249
-
250
- // define Nokogumbo class with a singleton parse method
251
- VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
252
- rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
253
- }
254
-
data/work/parser.c DELETED
@@ -1,3893 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include <assert.h>
18
- #include <ctype.h>
19
- #include <stdarg.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "attribute.h"
25
- #include "error.h"
26
- #include "gumbo.h"
27
- #include "insertion_mode.h"
28
- #include "parser.h"
29
- #include "tokenizer.h"
30
- #include "tokenizer_states.h"
31
- #include "utf8.h"
32
- #include "util.h"
33
- #include "vector.h"
34
-
35
-
36
- #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)  // Evaluates i and discards the value, so the variable counts as used.
37
-
38
- #define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }  // Brace initializer: the string plus sizeof - 1, i.e. its length without the trailing NUL. Only meaningful for string literals/arrays, not pointers.
39
- #define TERMINATOR { "", 0 }  // Zero-length entry used to mark the end of a GumboStringPiece table.
40
-
41
// Allocation hook stored in kGumboDefaultOptions: forwards to malloc().
// The first argument is part of the hook signature and is not used here.
static void* malloc_wrapper(void* unused, size_t size) {
  (void)unused;
  void* block = malloc(size);
  return block;
}
44
-
45
// Deallocation hook stored in kGumboDefaultOptions: forwards to free().
// The first argument is part of the hook signature and is not used here.
// free(NULL) is a no-op, so a NULL `ptr` is accepted.
static void free_wrapper(void* unused, void* ptr) {
  (void)unused;
  // Plain call: a void function may not `return` an expression in ISO C.
  free(ptr);
}
48
-
49
- const GumboOptions kGumboDefaultOptions = {  // Default options. Positional initializer: the order must match the field order of GumboOptions (declared outside this file).
50
- &malloc_wrapper,  // allocation hook; forwards to malloc()
51
- &free_wrapper,  // deallocation hook; forwards to free()
52
- NULL,  // presumably the opaque value passed to both hooks as their first argument -- confirm against GumboOptions
53
- 8,  // presumably the tab-stop width -- confirm against GumboOptions
54
- false,  // presumably "stop on first error" -- confirm against GumboOptions
55
- -1,  // presumably the error limit, -1 meaning no limit -- confirm against GumboOptions
56
- };
57
-
58
- static const GumboStringPiece kDoctypeHtml = GUMBO_STRING("html");  // Doctype name, public identifiers and system identifiers as string/length pairs.
59
- static const GumboStringPiece kPublicIdHtml4_0 = GUMBO_STRING(
60
- "-//W3C//DTD HTML 4.0//EN");
61
- static const GumboStringPiece kPublicIdHtml4_01 = GUMBO_STRING(
62
- "-//W3C//DTD HTML 4.01//EN");
63
- static const GumboStringPiece kPublicIdXhtml1_0 = GUMBO_STRING(
64
- "-//W3C//DTD XHTML 1.0 Strict//EN");
65
- static const GumboStringPiece kPublicIdXhtml1_1 = GUMBO_STRING(
66
- "-//W3C//DTD XHTML 1.1//EN");
67
- static const GumboStringPiece kSystemIdRecHtml4_0 = GUMBO_STRING(
68
- "http://www.w3.org/TR/REC-html40/strict.dtd");
69
- static const GumboStringPiece kSystemIdHtml4 = GUMBO_STRING(
70
- "http://www.w3.org/TR/html4/strict.dtd");
71
- static const GumboStringPiece kSystemIdXhtmlStrict1_1 = GUMBO_STRING(  // NOTE(review): despite the "1_1" in the name, the value is the XHTML 1.0 Strict DTD URL.
72
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd");
73
- static const GumboStringPiece kSystemIdXhtml1_1 = GUMBO_STRING(
74
- "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd");
75
- static const GumboStringPiece kSystemIdLegacyCompat = GUMBO_STRING(
76
- "about:legacy-compat");
77
-
78
- // The doctype arrays have an explicit terminator because we want to pass them
79
- // to a helper function, and passing them as a pointer discards sizeof
80
- // information. The SVG arrays are used only by one-off functions, and so loops
81
- // over them use sizeof directly instead of a terminator.
82
-
83
- static const GumboStringPiece kQuirksModePublicIdPrefixes[] = {
84
- GUMBO_STRING("+//Silmaril//dtd html Pro v0r11 19970101//"),
85
- GUMBO_STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"),
86
- GUMBO_STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"),
87
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 1//"),
88
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Level 2//"),
89
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"),
90
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"),
91
- GUMBO_STRING("-//IETF//DTD HTML 2.0 Strict//"),
92
- GUMBO_STRING("-//IETF//DTD HTML 2.0//"),
93
- GUMBO_STRING("-//IETF//DTD HTML 2.1E//"),
94
- GUMBO_STRING("-//IETF//DTD HTML 3.0//"),
95
- GUMBO_STRING("-//IETF//DTD HTML 3.2 Final//"),
96
- GUMBO_STRING("-//IETF//DTD HTML 3.2//"),
97
- GUMBO_STRING("-//IETF//DTD HTML 3//"),
98
- GUMBO_STRING("-//IETF//DTD HTML Level 0//"),
99
- GUMBO_STRING("-//IETF//DTD HTML Level 1//"),
100
- GUMBO_STRING("-//IETF//DTD HTML Level 2//"),
101
- GUMBO_STRING("-//IETF//DTD HTML Level 3//"),
102
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 0//"),
103
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 1//"),
104
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 2//"),
105
- GUMBO_STRING("-//IETF//DTD HTML Strict Level 3//"),
106
- GUMBO_STRING("-//IETF//DTD HTML Strict//"),
107
- GUMBO_STRING("-//IETF//DTD HTML//"),
108
- GUMBO_STRING("-//Metrius//DTD Metrius Presentational//"),
109
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"),
110
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"),
111
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"),
112
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"),
113
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"),
114
- GUMBO_STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"),
115
- GUMBO_STRING("-//Netscape Comm. Corp.//DTD HTML//"),
116
- GUMBO_STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"),
117
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML 2.0//"),
118
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"),
119
- GUMBO_STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"),
120
- GUMBO_STRING("-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)"
121
- "extensions to HTML 4.0//"),
122
- GUMBO_STRING("-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::"
123
- "extensions to HTML 4.0//"),
124
- GUMBO_STRING("-//Spyglass//DTD HTML 2.0 Extended//"),
125
- GUMBO_STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"),
126
- GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"),
127
- GUMBO_STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"),
128
- GUMBO_STRING("-//W3C//DTD HTML 3 1995-03-24//"),
129
- GUMBO_STRING("-//W3C//DTD HTML 3.2 Draft//"),
130
- GUMBO_STRING("-//W3C//DTD HTML 3.2 Final//"),
131
- GUMBO_STRING("-//W3C//DTD HTML 3.2//"),
132
- GUMBO_STRING("-//W3C//DTD HTML 3.2S Draft//"),
133
- GUMBO_STRING("-//W3C//DTD HTML 4.0 Frameset//"),
134
- GUMBO_STRING("-//W3C//DTD HTML 4.0 Transitional//"),
135
- GUMBO_STRING("-//W3C//DTD HTML Experimental 19960712//"),
136
- GUMBO_STRING("-//W3C//DTD HTML Experimental 970421//"),
137
- GUMBO_STRING("-//W3C//DTD W3 HTML//"),
138
- GUMBO_STRING("-//W3O//DTD W3 HTML 3.0//"),
139
- GUMBO_STRING("-//WebTechs//DTD Mozilla HTML 2.0//"),
140
- GUMBO_STRING("-//WebTechs//DTD Mozilla HTML//"),
141
- TERMINATOR
142
- };
143
-
144
- static const GumboStringPiece kQuirksModePublicIdExactMatches[] = {  // Ends with TERMINATOR so a loop like the one in is_in_static_list (runs until a zero-length entry) can find the end. Presumably compared against the whole public identifier -- confirm at the call site.
145
- GUMBO_STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"),
146
- GUMBO_STRING("-/W3C/DTD HTML 4.0 Transitional/EN"),
147
- GUMBO_STRING("HTML"),
148
- TERMINATOR
149
- };
150
-
151
- static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = {  // TERMINATOR-ended table. Presumably compared against the whole system identifier -- confirm at the call site.
152
- GUMBO_STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"),
153
- TERMINATOR
154
- };
155
-
156
- static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = {  // TERMINATOR-ended table. Presumably public-identifier prefixes that select limited-quirks mode -- confirm at the call site.
157
- GUMBO_STRING("-//W3C//DTD XHTML 1.0 Frameset//"),
158
- GUMBO_STRING("-//W3C//DTD XHTML 1.0 Transitional//"),
159
- TERMINATOR
160
- };
161
-
162
- static const GumboStringPiece kLimitedQuirksRequiresSystemIdPublicIdPrefixes[] = {  // TERMINATOR-ended table. Judging by the name, these prefixes only count when a system identifier is also present -- confirm at the call site.
163
- GUMBO_STRING("-//W3C//DTD HTML 4.01 Frameset//"),
164
- GUMBO_STRING("-//W3C//DTD HTML 4.01 Transitional//"),
165
- TERMINATOR
166
- };
167
-
168
- // Namespace URIs, indexed by GumboNamespaceEnum; keep the order in sync with that enum.
169
- static const char* kLegalXmlns[] = {
170
- "http://www.w3.org/1999/xhtml",  // XHTML namespace URI
171
- "http://www.w3.org/2000/svg",  // SVG namespace URI
172
- "http://www.w3.org/1998/Math/MathML"  // MathML namespace URI
173
- };
174
-
175
- typedef struct _ReplacementEntry {  // One row of a name-replacement table.
176
- const GumboStringPiece from;  // name to look for (all lower case in the tables in this file)
177
- const GumboStringPiece to;  // replacement spelling (mixed case in the tables in this file)
178
- } ReplacementEntry;
179
-
180
- #define REPLACEMENT_ENTRY(from, to) \
181
- { GUMBO_STRING(from), GUMBO_STRING(to) }
182
-
183
- // Static data for SVG attribute replacements: lower-cased attribute name -> mixed-case spelling.
184
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
185
- static const ReplacementEntry kSvgAttributeReplacements[] = {  // No TERMINATOR entry: per the note above the doctype tables, loops over the SVG arrays use sizeof.
186
- REPLACEMENT_ENTRY("attributename", "attributeName"),
187
- REPLACEMENT_ENTRY("attributetype", "attributeType"),
188
- REPLACEMENT_ENTRY("basefrequency", "baseFrequency"),
189
- REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
190
- REPLACEMENT_ENTRY("calcmode", "calcMode"),
191
- REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
192
- REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
193
- REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
194
- REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
195
- REPLACEMENT_ENTRY("edgemode", "edgeMode"),
196
- REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
197
- REPLACEMENT_ENTRY("filterres", "filterRes"),
198
- REPLACEMENT_ENTRY("filterunits", "filterUnits"),
199
- REPLACEMENT_ENTRY("glyphref", "glyphRef"),
200
- REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
201
- REPLACEMENT_ENTRY("gradientunits", "gradientUnits"),
202
- REPLACEMENT_ENTRY("kernelmatrix", "kernelMatrix"),
203
- REPLACEMENT_ENTRY("kernelunitlength", "kernelUnitLength"),
204
- REPLACEMENT_ENTRY("keypoints", "keyPoints"),
205
- REPLACEMENT_ENTRY("keysplines", "keySplines"),
206
- REPLACEMENT_ENTRY("keytimes", "keyTimes"),
207
- REPLACEMENT_ENTRY("lengthadjust", "lengthAdjust"),
208
- REPLACEMENT_ENTRY("limitingconeangle", "limitingConeAngle"),
209
- REPLACEMENT_ENTRY("markerheight", "markerHeight"),
210
- REPLACEMENT_ENTRY("markerunits", "markerUnits"),
211
- REPLACEMENT_ENTRY("markerwidth", "markerWidth"),
212
- REPLACEMENT_ENTRY("maskcontentunits", "maskContentUnits"),
213
- REPLACEMENT_ENTRY("maskunits", "maskUnits"),
214
- REPLACEMENT_ENTRY("numoctaves", "numOctaves"),
215
- REPLACEMENT_ENTRY("pathlength", "pathLength"),
216
- REPLACEMENT_ENTRY("patterncontentunits", "patternContentUnits"),
217
- REPLACEMENT_ENTRY("patterntransform", "patternTransform"),
218
- REPLACEMENT_ENTRY("patternunits", "patternUnits"),
219
- REPLACEMENT_ENTRY("pointsatx", "pointsAtX"),
220
- REPLACEMENT_ENTRY("pointsaty", "pointsAtY"),
221
- REPLACEMENT_ENTRY("pointsatz", "pointsAtZ"),
222
- REPLACEMENT_ENTRY("preservealpha", "preserveAlpha"),
223
- REPLACEMENT_ENTRY("preserveaspectratio", "preserveAspectRatio"),
224
- REPLACEMENT_ENTRY("primitiveunits", "primitiveUnits"),
225
- REPLACEMENT_ENTRY("refx", "refX"),
226
- REPLACEMENT_ENTRY("refy", "refY"),
227
- REPLACEMENT_ENTRY("repeatcount", "repeatCount"),
228
- REPLACEMENT_ENTRY("repeatdur", "repeatDur"),
229
- REPLACEMENT_ENTRY("requiredextensions", "requiredExtensions"),
230
- REPLACEMENT_ENTRY("requiredfeatures", "requiredFeatures"),
231
- REPLACEMENT_ENTRY("specularconstant", "specularConstant"),
232
- REPLACEMENT_ENTRY("specularexponent", "specularExponent"),
233
- REPLACEMENT_ENTRY("spreadmethod", "spreadMethod"),
234
- REPLACEMENT_ENTRY("startoffset", "startOffset"),
235
- REPLACEMENT_ENTRY("stddeviation", "stdDeviation"),
236
- REPLACEMENT_ENTRY("stitchtiles", "stitchTiles"),
237
- REPLACEMENT_ENTRY("surfacescale", "surfaceScale"),
238
- REPLACEMENT_ENTRY("systemlanguage", "systemLanguage"),
239
- REPLACEMENT_ENTRY("tablevalues", "tableValues"),
240
- REPLACEMENT_ENTRY("targetx", "targetX"),
241
- REPLACEMENT_ENTRY("targety", "targetY"),
242
- REPLACEMENT_ENTRY("textlength", "textLength"),
243
- REPLACEMENT_ENTRY("viewbox", "viewBox"),
244
- REPLACEMENT_ENTRY("viewtarget", "viewTarget"),
245
- REPLACEMENT_ENTRY("xchannelselector", "xChannelSelector"),
246
- REPLACEMENT_ENTRY("ychannelselector", "yChannelSelector"),
247
- REPLACEMENT_ENTRY("zoomandpan", "zoomAndPan"),
248
- };
249
-
250
- static const ReplacementEntry kSvgTagReplacements[] = {
251
- REPLACEMENT_ENTRY("altglyph", "altGlyph"),
252
- REPLACEMENT_ENTRY("altglyphdef", "altGlyphDef"),
253
- REPLACEMENT_ENTRY("altglyphitem", "altGlyphItem"),
254
- REPLACEMENT_ENTRY("animatecolor", "animateColor"),
255
- REPLACEMENT_ENTRY("animatemotion", "animateMotion"),
256
- REPLACEMENT_ENTRY("animatetransform", "animateTransform"),
257
- REPLACEMENT_ENTRY("clippath", "clipPath"),
258
- REPLACEMENT_ENTRY("feblend", "feBlend"),
259
- REPLACEMENT_ENTRY("fecolormatrix", "feColorMatrix"),
260
- REPLACEMENT_ENTRY("fecomponenttransfer", "feComponentTransfer"),
261
- REPLACEMENT_ENTRY("fecomposite", "feComposite"),
262
- REPLACEMENT_ENTRY("feconvolvematrix", "feConvolveMatrix"),
263
- REPLACEMENT_ENTRY("fediffuselighting", "feDiffuseLighting"),
264
- REPLACEMENT_ENTRY("fedisplacementmap", "feDisplacementMap"),
265
- REPLACEMENT_ENTRY("fedistantlight", "feDistantLight"),
266
- REPLACEMENT_ENTRY("feflood", "feFlood"),
267
- REPLACEMENT_ENTRY("fefunca", "feFuncA"),
268
- REPLACEMENT_ENTRY("fefuncb", "feFuncB"),
269
- REPLACEMENT_ENTRY("fefuncg", "feFuncG"),
270
- REPLACEMENT_ENTRY("fefuncr", "feFuncR"),
271
- REPLACEMENT_ENTRY("fegaussianblur", "feGaussianBlur"),
272
- REPLACEMENT_ENTRY("feimage", "feImage"),
273
- REPLACEMENT_ENTRY("femerge", "feMerge"),
274
- REPLACEMENT_ENTRY("femergenode", "feMergeNode"),
275
- REPLACEMENT_ENTRY("femorphology", "feMorphology"),
276
- REPLACEMENT_ENTRY("feoffset", "feOffset"),
277
- REPLACEMENT_ENTRY("fepointlight", "fePointLight"),
278
- REPLACEMENT_ENTRY("fespecularlighting", "feSpecularLighting"),
279
- REPLACEMENT_ENTRY("fespotlight", "feSpotLight"),
280
- REPLACEMENT_ENTRY("fetile", "feTile"),
281
- REPLACEMENT_ENTRY("feturbulence", "feTurbulence"),
282
- REPLACEMENT_ENTRY("foreignobject", "foreignObject"),
283
- REPLACEMENT_ENTRY("glyphref", "glyphRef"),
284
- REPLACEMENT_ENTRY("lineargradient", "linearGradient"),
285
- REPLACEMENT_ENTRY("radialgradient", "radialGradient"),
286
- REPLACEMENT_ENTRY("textpath", "textPath"),
287
- };
288
-
289
- typedef struct _NamespacedAttributeReplacement {  // One row of kForeignAttributeReplacements.
290
- const char* from;  // attribute name as written in the markup, prefix included (e.g. "xlink:href")
291
- const char* local_name;  // name to use once the namespace is recorded separately (e.g. "href")
292
- const GumboAttributeNamespaceEnum attr_namespace;  // namespace assigned to the attribute
293
- } NamespacedAttributeReplacement;
294
-
295
- static const NamespacedAttributeReplacement kForeignAttributeReplacements[] = {
296
- { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
297
- { "xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK },
298
- { "xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK },
299
- { "xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK },
300
- { "xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK },
301
- { "xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK },
302
- { "xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK },
303
- { "xml:base", "base", GUMBO_ATTR_NAMESPACE_XML },
304
- { "xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML },
305
- { "xml:space", "space", GUMBO_ATTR_NAMESPACE_XML },
306
- { "xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS },
307
- { "xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS },
308
- };
309
-
310
- // The "scope marker" for the list of active formatting elements. We use a
311
- // pointer to this as a generic marker element, since the particular element
312
- // scope doesn't matter.
313
- static const GumboNode kActiveFormattingScopeMarker;  // Only its address is meant to be used; it is never given an initializer.
314
-
315
- // The tag_is and tag_in functions use true & false to denote start & end tags,
316
- // but for readability, we define constants for them here.
317
- static const bool kStartTag = true;
318
- static const bool kEndTag = false;
319
-
320
- // Because GumboStringPieces are immutable, we can't insert a character directly
321
- // into a text node. Instead, we accumulate all pending characters here and
322
- // flush them out to a text node whenever a new element is inserted.
323
- //
324
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-a-character
325
- typedef struct _TextNodeBufferState {
326
- // The accumulated text to be inserted into the current text node.
327
- GumboStringBuffer _buffer;  // set up by parser_state_init, released by parser_state_destroy
328
-
329
- // A pointer to the original text represented by this text node. Note that
330
- // because of foster parenting and other strange DOM manipulations, this may
331
- // include other non-text HTML tags in it; it is defined as the span of
332
- // original text from the first character in this text node to the last
333
- // character in this text node.
334
- const char* _start_original_text;
335
-
336
- // The source position of the start of this text node.
337
- GumboSourcePosition _start_position;
338
-
339
- // The type of node that will be inserted (TEXT or WHITESPACE).
340
- GumboNodeType _type;  // parser_state_init starts this out as GUMBO_NODE_WHITESPACE
341
- } TextNodeBufferState;
342
-
343
- typedef struct _GumboParserState {  // Tree-construction state; allocated in parser_state_init and released in parser_state_destroy.
344
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
345
- GumboInsertionMode _insertion_mode;  // starts as GUMBO_INSERTION_MODE_INITIAL; changed through set_insertion_mode
346
-
347
- // Used for run_generic_parsing_algorithm, which needs to switch back to the
348
- // original insertion mode at its conclusion.
349
- GumboInsertionMode _original_insertion_mode;
350
-
351
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-stack-of-open-elements
352
- GumboVector /*GumboNode*/ _open_elements;  // get_current_node returns the last entry
353
-
354
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
355
- GumboVector /*GumboNode*/ _active_formatting_elements;
356
-
357
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
358
- GumboNode* _head_element;  // NULL until set
359
- GumboNode* _form_element;  // NULL until set
360
-
361
- // The flag for when the spec says "Reprocess the current token in..."
362
- bool _reprocess_current_token;
363
-
364
- // The flag for "acknowledge the token's self-closing flag".
365
- bool _self_closing_flag_acknowledged;
366
-
367
- // The "frameset-ok" flag from the spec.
368
- bool _frameset_ok;  // starts true; cleared by set_frameset_not_ok
369
-
370
- // The flag for "If the next token is a LINE FEED, ignore that token...".
371
- bool _ignore_next_linefeed;
372
-
373
- // The flag for "whenever a node would be inserted into the current node, it
374
- // must instead be foster parented". This is used for misnested table
375
- // content, which needs to be handled according to "in body" rules yet foster
376
- // parented outside of the table.
377
- // It would perhaps be more explicit to have this as a parameter to
378
- // handle_in_body and insert_element, but given how special-purpose this is
379
- // and the number of call-sites that would need to take the extra parameter,
380
- // it's easier just to have a state flag.
381
- bool _foster_parent_insertions;
382
-
383
- // The accumulated text node buffer state.
384
- TextNodeBufferState _text_node;
385
-
386
- // The current token.
387
- GumboToken* _current_token;  // NULL until a token is being processed
388
-
389
- // The way that the spec is written, the </body> and </html> tags are *always*
390
- // implicit, because encountering one of those tokens merely switches the
391
- // insertion mode out of "in body". So we have individual state flags for
392
- // those end tags that are then inspected by pop_current_node when the <body>
393
- // and <html> nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG
394
- // flag appropriately.
395
- bool _closed_body_tag;
396
- bool _closed_html_tag;
397
- } GumboParserState;
398
-
399
- static bool token_has_attribute(const GumboToken* token, const char* name) {
400
- assert(token->type == GUMBO_TOKEN_START_TAG);
401
- return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL;
402
- }
403
-
404
- // Checks if the value of the specified attribute is a case-insensitive match
405
- // for the specified string.
406
- static bool attribute_matches(
407
- const GumboVector* attributes, const char* name, const char* value) {
408
- const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
409
- return attr ? strcasecmp(value, attr->value) == 0 : false;
410
- }
411
-
412
- // Checks if the value of the specified attribute is a case-sensitive match
413
- // for the specified string.
414
- static bool attribute_matches_case_sensitive(
415
- const GumboVector* attributes, const char* name, const char* value) {
416
- const GumboAttribute* attr = gumbo_get_attribute(attributes, name);
417
- return attr ? strcmp(value, attr->value) == 0 : false;
418
- }
419
-
420
- // Checks if the specified attribute vectors are identical.
421
- static bool all_attributes_match(
422
- const GumboVector* attr1, const GumboVector* attr2) {
423
- int num_unmatched_attr2_elements = attr2->length;
424
- for (int i = 0; i < attr1->length; ++i) {
425
- const GumboAttribute* attr = attr1->data[i];
426
- if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) {
427
- --num_unmatched_attr2_elements;
428
- } else {
429
- return false;
430
- }
431
- }
432
- return num_unmatched_attr2_elements == 0;
433
- }
434
-
435
- static void set_frameset_not_ok(GumboParser* parser) {
436
- gumbo_debug("Setting frameset_ok to false.\n");
437
- parser->_parser_state->_frameset_ok = false;
438
- }
439
-
440
- static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
441
- GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
442
- node->parent = NULL;
443
- node->index_within_parent = -1;
444
- node->type = type;
445
- node->parse_flags = GUMBO_INSERTION_NORMAL;
446
- return node;
447
- }
448
-
449
- static GumboNode* new_document_node(GumboParser* parser) {
450
- GumboNode* document_node = create_node(parser, GUMBO_NODE_DOCUMENT);
451
- document_node->parse_flags = GUMBO_INSERTION_BY_PARSER;
452
- gumbo_vector_init(
453
- parser, 1, &document_node->v.document.children);
454
-
455
- // Must be initialized explicitly, as there's no guarantee that we'll see a
456
- // doc type token.
457
- GumboDocument* document = &document_node->v.document;
458
- document->has_doctype = false;
459
- document->name = NULL;
460
- document->public_identifier = NULL;
461
- document->system_identifier = NULL;
462
- return document_node;
463
- }
464
-
465
- static void output_init(GumboParser* parser) {
466
- GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput));
467
- output->root = NULL;
468
- output->document = new_document_node(parser);
469
- parser->_output = output;
470
- gumbo_init_errors(parser);
471
- }
472
-
473
- static void parser_state_init(GumboParser* parser) {
474
- GumboParserState* parser_state =
475
- gumbo_parser_allocate(parser, sizeof(GumboParserState));
476
- parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL;
477
- parser_state->_reprocess_current_token = false;
478
- parser_state->_frameset_ok = true;
479
- parser_state->_ignore_next_linefeed = false;
480
- parser_state->_foster_parent_insertions = false;
481
- parser_state->_text_node._type = GUMBO_NODE_WHITESPACE;
482
- gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
483
- gumbo_vector_init(parser, 10, &parser_state->_open_elements);
484
- gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
485
- parser_state->_head_element = NULL;
486
- parser_state->_form_element = NULL;
487
- parser_state->_current_token = NULL;
488
- parser_state->_closed_body_tag = false;
489
- parser_state->_closed_html_tag = false;
490
- parser->_parser_state = parser_state;
491
- }
492
-
493
- static void parser_state_destroy(GumboParser* parser) {
494
- GumboParserState* state = parser->_parser_state;
495
- gumbo_vector_destroy(parser, &state->_active_formatting_elements);
496
- gumbo_vector_destroy(parser, &state->_open_elements);
497
- gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
498
- gumbo_parser_deallocate(parser, state);
499
- }
500
-
501
- static GumboNode* get_document_node(GumboParser* parser) {
502
- return parser->_output->document;
503
- }
504
-
505
- // Returns the node at the bottom of the stack of open elements, or NULL if no
506
- // elements have been added yet.
507
- static GumboNode* get_current_node(GumboParser* parser) {
508
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
509
- if (open_elements->length == 0) {
510
- assert(!parser->_output->root);
511
- return NULL;
512
- }
513
- assert(open_elements->length > 0);
514
- assert(open_elements->data != NULL);
515
- return open_elements->data[open_elements->length - 1];
516
- }
517
-
518
- // Returns true if the given needle is in the given array of literal
519
- // GumboStringPieces. If exact_match is true, this requires that they match
520
- // exactly; otherwise, this performs a prefix match to check if any of the
521
- // elements in haystack start with needle. This always performs a
522
- // case-insensitive match.
523
- static bool is_in_static_list(
524
- const char* needle, const GumboStringPiece* haystack, bool exact_match) {
525
- for (int i = 0; haystack[i].length > 0; ++i) {
526
- if ((exact_match && !strcmp(needle, haystack[i].data)) ||
527
- (!exact_match && !strcasecmp(needle, haystack[i].data))) {
528
- return true;
529
- }
530
- }
531
- return false;
532
- }
533
-
534
- static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
535
- parser->_parser_state->_insertion_mode = mode;
536
- }
537
-
538
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
539
- // This is a helper function that returns the appropriate insertion mode instead
540
- // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
541
- // indicate that there is no appropriate insertion mode, and the loop should
542
- // continue.
543
- static GumboInsertionMode get_appropriate_insertion_mode(
544
- const GumboNode* node, bool is_last) {
545
- assert(node->type == GUMBO_NODE_ELEMENT);
546
- switch (node->v.element.tag) {
547
- case GUMBO_TAG_SELECT:
548
- return GUMBO_INSERTION_MODE_IN_SELECT;
549
- case GUMBO_TAG_TD:
550
- case GUMBO_TAG_TH:
551
- return is_last ?
552
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL;
553
- case GUMBO_TAG_TR:
554
- return GUMBO_INSERTION_MODE_IN_ROW;
555
- case GUMBO_TAG_TBODY:
556
- case GUMBO_TAG_THEAD:
557
- case GUMBO_TAG_TFOOT:
558
- return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
559
- case GUMBO_TAG_CAPTION:
560
- return GUMBO_INSERTION_MODE_IN_CAPTION;
561
- case GUMBO_TAG_COLGROUP:
562
- return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
563
- case GUMBO_TAG_TABLE:
564
- return GUMBO_INSERTION_MODE_IN_TABLE;
565
- case GUMBO_TAG_HEAD:
566
- case GUMBO_TAG_BODY:
567
- return GUMBO_INSERTION_MODE_IN_BODY;
568
- case GUMBO_TAG_FRAMESET:
569
- return GUMBO_INSERTION_MODE_IN_FRAMESET;
570
- case GUMBO_TAG_HTML:
571
- return GUMBO_INSERTION_MODE_BEFORE_HEAD;
572
- default:
573
- return is_last ?
574
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
575
- }
576
- }
577
-
578
- // This performs the actual "reset the insertion mode" loop.
579
- static void reset_insertion_mode_appropriately(GumboParser* parser) {
580
- const GumboVector* open_elements = &parser->_parser_state->_open_elements;
581
- for (int i = open_elements->length - 1; i >= 0; --i) {
582
- GumboInsertionMode mode =
583
- get_appropriate_insertion_mode(open_elements->data[i], i == 0);
584
- if (mode != GUMBO_INSERTION_MODE_INITIAL) {
585
- set_insertion_mode(parser, mode);
586
- return;
587
- }
588
- }
589
- // Should never get here, because is_last will be set on the last iteration
590
- // and will force GUMBO_INSERTION_MODE_IN_BODY.
591
- assert(0);
592
- }
593
-
594
- static GumboError* add_parse_error(GumboParser* parser, const GumboToken* token) {
595
- gumbo_debug("Adding parse error.\n");
596
- GumboError* error = gumbo_add_error(parser);
597
- if (!error) {
598
- return NULL;
599
- }
600
- error->type = GUMBO_ERR_PARSER;
601
- error->position = token->position;
602
- error->original_text = token->original_text.data;
603
- GumboParserError* extra_data = &error->v.parser;
604
- extra_data->input_type = token->type;
605
- extra_data->input_tag = GUMBO_TAG_UNKNOWN;
606
- if (token->type == GUMBO_TOKEN_START_TAG) {
607
- extra_data->input_tag = token->v.start_tag.tag;
608
- } else if (token->type == GUMBO_TOKEN_END_TAG) {
609
- extra_data->input_tag = token->v.end_tag;
610
- }
611
- GumboParserState* state = parser->_parser_state;
612
- extra_data->parser_state = state->_insertion_mode;
613
- gumbo_vector_init(parser, state->_open_elements.length,
614
- &extra_data->tag_stack);
615
- for (int i = 0; i < state->_open_elements.length; ++i) {
616
- const GumboNode* node = state->_open_elements.data[i];
617
- assert(node->type == GUMBO_NODE_ELEMENT);
618
- gumbo_vector_add(parser, (void*) node->v.element.tag,
619
- &extra_data->tag_stack);
620
- }
621
- return error;
622
- }
623
-
624
- // Returns true if the specified token is either a start or end tag (specified
625
- // by is_start) with one of the tag types in the varargs list. Terminate the
626
- // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
627
- // the spec references tags that are not in the spec.
628
- // TODO(jdtang): A lot of the tag lists for this function are repeated in many
629
- // places in the code. This is how it's written in the spec (and it's done this
630
- // way so it's easy to verify the code against the spec), but it may be worth
631
- // coming up with a notion of a "tag set" that includes a list of tags, and
632
- // using that in many places. It'd probably also help performance, but I want
633
- // to profile before optimizing.
634
- static bool tag_in(const GumboToken* token, bool is_start, ...) {
635
- GumboTag token_tag;
636
- if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
637
- token_tag = token->v.start_tag.tag;
638
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
639
- token_tag = token->v.end_tag;
640
- } else {
641
- return false;
642
- }
643
-
644
- va_list tags;
645
- va_start(tags, is_start);
646
- bool result = false;
647
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
648
- tag = va_arg(tags, GumboTag)) {
649
- if (tag == token_tag) {
650
- result = true;
651
- break;
652
- }
653
- }
654
- va_end(tags);
655
- return result;
656
- }
657
-
658
- // Like tag_in, but for the single-tag case.
659
- static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
660
- if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
661
- return token->v.start_tag.tag == tag;
662
- } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) {
663
- return token->v.end_tag == tag;
664
- } else {
665
- return false;
666
- }
667
- }
668
-
669
- // Like tag_in, but checks for the tag of a node, rather than a token.
670
- static bool node_tag_in(const GumboNode* node, ...) {
671
- assert(node != NULL);
672
- if (node->type != GUMBO_NODE_ELEMENT) {
673
- return false;
674
- }
675
- GumboTag node_tag = node->v.element.tag;
676
-
677
- va_list tags;
678
- va_start(tags, node);
679
- bool result = false;
680
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
681
- tag = va_arg(tags, GumboTag)) {
682
- assert(tag <= GUMBO_TAG_LAST);
683
- if (tag == node_tag) {
684
- result = true;
685
- break;
686
- }
687
- }
688
- va_end(tags);
689
- return result;
690
- }
691
-
692
- // Like node_tag_in, but for the single-tag case.
693
- static bool node_tag_is(const GumboNode* node, GumboTag tag) {
694
- return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
695
- }
696
-
697
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
698
- static bool is_mathml_integration_point(const GumboNode* node) {
699
- return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
700
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) &&
701
- node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
702
- }
703
-
704
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
705
- static bool is_html_integration_point(const GumboNode* node) {
706
- return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
707
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
708
- node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
709
- (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
710
- attribute_matches(&node->v.element.attributes,
711
- "encoding", "text/html") ||
712
- attribute_matches(&node->v.element.attributes,
713
- "encoding", "application/xhtml+xml")));
714
- }
715
-
716
- // Appends a node to the end of its parent, setting the "parent" and
717
- // "index_within_parent" fields appropriately.
718
- static void append_node(
719
- GumboParser* parser, GumboNode* parent, GumboNode* node) {
720
- assert(node->parent == NULL);
721
- assert(node->index_within_parent = -1);
722
- GumboVector* children;
723
- if (parent->type == GUMBO_NODE_ELEMENT) {
724
- children = &parent->v.element.children;
725
- } else {
726
- assert(parent->type == GUMBO_NODE_DOCUMENT);
727
- children = &parent->v.document.children;
728
- }
729
- node->parent = parent;
730
- node->index_within_parent = children->length;
731
- gumbo_vector_add(parser, (void*) node, children);
732
- assert(node->index_within_parent < children->length);
733
- }
734
-
735
- // Inserts a node at the specified index within its parent, updating the
736
- // "parent" and "index_within_parent" fields of it and all its siblings.
737
- static void insert_node(
738
- GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
739
- assert(node->parent == NULL);
740
- assert(node->index_within_parent = -1);
741
- assert(parent->type == GUMBO_NODE_ELEMENT);
742
- GumboVector* children = &parent->v.element.children;
743
- assert(index >= 0);
744
- assert(index < children->length);
745
- node->parent = parent;
746
- node->index_within_parent = index;
747
- gumbo_vector_insert_at(parser, (void*) node, index, children);
748
- assert(node->index_within_parent < children->length);
749
- for (int i = index + 1; i < children->length; ++i) {
750
- GumboNode* sibling = children->data[i];
751
- sibling->index_within_parent = i;
752
- assert(sibling->index_within_parent < children->length);
753
- }
754
- }
755
-
756
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
757
- static void foster_parent_element(GumboParser* parser, GumboNode* node) {
758
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
759
- assert(open_elements->length > 2);
760
-
761
- node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
762
- GumboNode* foster_parent_element = open_elements->data[0];
763
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
764
- assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
765
- for (int i = open_elements->length - 1; i > 1; --i) {
766
- GumboNode* table_element = open_elements->data[i];
767
- if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
768
- foster_parent_element = table_element->parent;
769
- if (!foster_parent_element ||
770
- foster_parent_element->type != GUMBO_NODE_ELEMENT) {
771
- // Table has no parent; spec says it's possible if a script manipulated
772
- // the DOM, although I don't think we have to worry about this case.
773
- gumbo_debug("Table has no parent.\n");
774
- foster_parent_element = open_elements->data[i - 1];
775
- break;
776
- }
777
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
778
- gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
779
- table_element, i, gumbo_normalized_tagname(
780
- foster_parent_element->v.element.tag),
781
- table_element->index_within_parent);
782
- assert(foster_parent_element->v.element.children.data[
783
- table_element->index_within_parent] == table_element);
784
- insert_node(parser, foster_parent_element,
785
- table_element->index_within_parent, node);
786
- return;
787
- }
788
- }
789
- if (node->type == GUMBO_NODE_ELEMENT) {
790
- gumbo_vector_add(parser, (void*) node, open_elements);
791
- }
792
- append_node(parser, foster_parent_element, node);
793
- }
794
-
795
- static void maybe_flush_text_node_buffer(GumboParser* parser) {
796
- GumboParserState* state = parser->_parser_state;
797
- TextNodeBufferState* buffer_state = &state->_text_node;
798
- if (buffer_state->_buffer.length == 0) {
799
- return;
800
- }
801
-
802
- assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
803
- buffer_state->_type == GUMBO_NODE_TEXT);
804
- GumboNode* text_node = create_node(parser, buffer_state->_type);
805
- GumboText* text_node_data = &text_node->v.text;
806
- text_node_data->text = gumbo_string_buffer_to_string(
807
- parser, &buffer_state->_buffer);
808
- text_node_data->original_text.data = buffer_state->_start_original_text;
809
- text_node_data->original_text.length =
810
- state->_current_token->original_text.data -
811
- buffer_state->_start_original_text;
812
- text_node_data->start_pos = buffer_state->_start_position;
813
- if (state->_foster_parent_insertions && node_tag_in(
814
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
815
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
816
- foster_parent_element(parser, text_node);
817
- } else {
818
- append_node(
819
- parser, parser->_output->root ?
820
- get_current_node(parser) : parser->_output->document, text_node);
821
- }
822
- gumbo_debug("Flushing text node buffer of %.*s.\n",
823
- (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
824
-
825
- gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
826
- gumbo_string_buffer_init(parser, &buffer_state->_buffer);
827
- buffer_state->_type = GUMBO_NODE_WHITESPACE;
828
- assert(buffer_state->_buffer.length == 0);
829
- }
830
-
831
- static void record_end_of_element(
832
- GumboToken* current_token, GumboElement* element) {
833
- element->end_pos = current_token->position;
834
- element->original_end_tag =
835
- current_token->type == GUMBO_TOKEN_END_TAG ?
836
- current_token->original_text : kGumboEmptyString;
837
- }
838
-
839
- static GumboNode* pop_current_node(GumboParser* parser) {
840
- GumboParserState* state = parser->_parser_state;
841
- maybe_flush_text_node_buffer(parser);
842
- if (state->_open_elements.length > 0) {
843
- assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
844
- gumbo_debug(
845
- "Popping %s node.\n",
846
- gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
847
- }
848
- GumboNode* current_node = gumbo_vector_pop(parser, &state->_open_elements);
849
- if (!current_node) {
850
- assert(state->_open_elements.length == 0);
851
- return NULL;
852
- }
853
- assert(current_node->type == GUMBO_NODE_ELEMENT);
854
- bool is_closed_body_or_html_tag =
855
- (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
856
- (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
857
- if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
858
- !node_tag_is(current_node, state->_current_token->v.end_tag)) &&
859
- !is_closed_body_or_html_tag) {
860
- current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
861
- }
862
- if (!is_closed_body_or_html_tag) {
863
- record_end_of_element(state->_current_token, &current_node->v.element);
864
- }
865
- return current_node;
866
- }
867
-
868
- static void append_comment_node(
869
- GumboParser* parser, GumboNode* node, const GumboToken* token) {
870
- maybe_flush_text_node_buffer(parser);
871
- GumboNode* comment = create_node(parser, GUMBO_NODE_COMMENT);
872
- comment->type = GUMBO_NODE_COMMENT;
873
- comment->parse_flags = GUMBO_INSERTION_NORMAL;
874
- comment->v.text.text = token->v.text;
875
- comment->v.text.original_text = token->original_text;
876
- comment->v.text.start_pos = token->position;
877
- append_node(parser, node, comment);
878
- }
879
-
880
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
881
- static void clear_stack_to_table_row_context(GumboParser* parser) {
882
- while (!node_tag_in(get_current_node(parser),
883
- GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
884
- pop_current_node(parser);
885
- }
886
- }
887
-
888
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
889
- static void clear_stack_to_table_context(GumboParser* parser) {
890
- while (!node_tag_in(get_current_node(parser),
891
- GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
892
- pop_current_node(parser);
893
- }
894
- }
895
-
896
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
897
- void clear_stack_to_table_body_context(GumboParser* parser) {
898
- while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
899
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
900
- GUMBO_TAG_LAST)) {
901
- pop_current_node(parser);
902
- }
903
- }
904
-
905
- // Creates a parser-inserted element in the HTML namespace and returns it.
906
- static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
907
- GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
908
- GumboElement* element = &node->v.element;
909
- gumbo_vector_init(parser, 1, &element->children);
910
- gumbo_vector_init(parser, 0, &element->attributes);
911
- element->tag = tag;
912
- element->tag_namespace = GUMBO_NAMESPACE_HTML;
913
- element->original_tag = kGumboEmptyString;
914
- element->original_end_tag = kGumboEmptyString;
915
- element->start_pos = parser->_parser_state->_current_token->position;
916
- element->end_pos = kGumboEmptySourcePosition;
917
- return node;
918
- }
919
-
920
- // Constructs an element from the given start tag token.
921
- static GumboNode* create_element_from_token(
922
- GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
923
- assert(token->type == GUMBO_TOKEN_START_TAG);
924
- GumboTokenStartTag* start_tag = &token->v.start_tag;
925
-
926
- GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
927
- GumboElement* element = &node->v.element;
928
- gumbo_vector_init(parser, 1, &element->children);
929
- element->attributes = start_tag->attributes;
930
- element->tag = start_tag->tag;
931
- element->tag_namespace = tag_namespace;
932
-
933
- assert(token->original_text.length >= 2);
934
- assert(token->original_text.data[0] == '<');
935
- assert(token->original_text.data[token->original_text.length - 1] == '>');
936
- element->original_tag = token->original_text;
937
- element->start_pos = token->position;
938
- element->original_end_tag = kGumboEmptyString;
939
- element->end_pos = kGumboEmptySourcePosition;
940
-
941
- // The element takes ownership of the attributes from the token, so any
942
- // allocated-memory fields should be nulled out.
943
- start_tag->attributes = kGumboEmptyVector;
944
- return node;
945
- }
946
-
947
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#insert-an-html-element
948
- static void insert_element(GumboParser* parser, GumboNode* node,
949
- bool is_reconstructing_formatting_elements) {
950
- GumboParserState* state = parser->_parser_state;
951
- // NOTE(jdtang): The text node buffer must always be flushed before inserting
952
- // a node, otherwise we're handling nodes in a different order than the spec
953
- // mandated. However, one clause of the spec (character tokens in the body)
954
- // requires that we reconstruct the active formatting elements *before* adding
955
- // the character, and reconstructing the active formatting elements may itself
956
- // result in the insertion of new elements (which should be pushed onto the
957
- // stack of open elements before the buffer is flushed). We solve this (for
958
- // the time being, the spec has been rewritten for <template> and the new
959
- // version may be simpler here) with a boolean flag to this method.
960
- if (!is_reconstructing_formatting_elements) {
961
- maybe_flush_text_node_buffer(parser);
962
- }
963
- if (state->_foster_parent_insertions && node_tag_in(
964
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
965
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
966
- foster_parent_element(parser, node);
967
- gumbo_vector_add(parser, (void*) node, &state->_open_elements);
968
- return;
969
- }
970
-
971
- // This is called to insert the root HTML element, but get_current_node
972
- // assumes the stack of open elements is non-empty, so we need special
973
- // handling for this case.
974
- append_node(
975
- parser, parser->_output->root ?
976
- get_current_node(parser) : parser->_output->document, node);
977
- gumbo_vector_add(parser, (void*) node, &state->_open_elements);
978
- }
979
-
980
- // Convenience method that combines create_element_from_token and
981
- // insert_element, inserting the generated element directly into the current
982
- // node. Returns the node inserted.
983
- static GumboNode* insert_element_from_token(
984
- GumboParser* parser, GumboToken* token) {
985
- GumboNode* element =
986
- create_element_from_token(parser, token, GUMBO_NAMESPACE_HTML);
987
- insert_element(parser, element, false);
988
- gumbo_debug("Inserting <%s> element (@%x) from token.\n",
989
- gumbo_normalized_tagname(element->v.element.tag), element);
990
- return element;
991
- }
992
-
993
- // Convenience method that combines create_element and insert_element, inserting
994
- // a parser-generated element of a specific tag type. Returns the node
995
- // inserted.
996
- static GumboNode* insert_element_of_tag_type(
997
- GumboParser* parser, GumboTag tag, GumboParseFlags reason) {
998
- GumboNode* element = create_element(parser, tag);
999
- element->parse_flags |= GUMBO_INSERTION_BY_PARSER | reason;
1000
- insert_element(parser, element, false);
1001
- gumbo_debug("Inserting %s element (@%x) from tag type.\n",
1002
- gumbo_normalized_tagname(tag), element);
1003
- return element;
1004
- }
1005
-
1006
- // Convenience method for creating foreign namespaced element. Returns the node
1007
- // inserted.
1008
- static GumboNode* insert_foreign_element(
1009
- GumboParser* parser, GumboToken* token, GumboNamespaceEnum tag_namespace) {
1010
- assert(token->type == GUMBO_TOKEN_START_TAG);
1011
- GumboNode* element = create_element_from_token(parser, token, tag_namespace);
1012
- insert_element(parser, element, false);
1013
- if (token_has_attribute(token, "xmlns") &&
1014
- !attribute_matches_case_sensitive(
1015
- &token->v.start_tag.attributes, "xmlns",
1016
- kLegalXmlns[tag_namespace])) {
1017
- // TODO(jdtang): Since there're multiple possible error codes here, we
1018
- // eventually need reason codes to differentiate them.
1019
- add_parse_error(parser, token);
1020
- }
1021
- if (token_has_attribute(token, "xmlns:xlink") &&
1022
- !attribute_matches_case_sensitive(
1023
- &token->v.start_tag.attributes,
1024
- "xmlns:xlink", "http://www.w3.org/1999/xlink")) {
1025
- add_parse_error(parser, token);
1026
- }
1027
- return element;
1028
- }
1029
-
1030
- static void insert_text_token(GumboParser* parser, GumboToken* token) {
1031
- assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1032
- token->type == GUMBO_TOKEN_CHARACTER);
1033
- TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1034
- if (buffer_state->_buffer.length == 0) {
1035
- // Initialize position fields.
1036
- buffer_state->_start_original_text = token->original_text.data;
1037
- buffer_state->_start_position = token->position;
1038
- }
1039
- gumbo_string_buffer_append_codepoint(
1040
- parser, token->v.character, &buffer_state->_buffer);
1041
- if (token->type == GUMBO_TOKEN_CHARACTER) {
1042
- buffer_state->_type = GUMBO_NODE_TEXT;
1043
- }
1044
- gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1045
- }
1046
-
1047
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generic-rcdata-element-parsing-algorithm
1048
- static void run_generic_parsing_algorithm(
1049
- GumboParser* parser, GumboToken* token, GumboTokenizerEnum lexer_state) {
1050
- insert_element_from_token(parser, token);
1051
- gumbo_tokenizer_set_state(parser, lexer_state);
1052
- parser->_parser_state->_original_insertion_mode =
1053
- parser->_parser_state->_insertion_mode;
1054
- parser->_parser_state->_insertion_mode = GUMBO_INSERTION_MODE_TEXT;
1055
- }
1056
-
1057
- static void acknowledge_self_closing_tag(GumboParser* parser) {
1058
- parser->_parser_state->_self_closing_flag_acknowledged = true;
1059
- }
1060
-
1061
- // Returns true if there's an anchor tag in the list of active formatting
1062
- // elements, and fills in its index if so.
1063
- static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1064
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1065
- for (int i = elements->length - 1; i >= 0; --i) {
1066
- GumboNode* node = elements->data[i];
1067
- if (node == &kActiveFormattingScopeMarker) {
1068
- return false;
1069
- }
1070
- if (node_tag_is(node, GUMBO_TAG_A)) {
1071
- *anchor_index = i;
1072
- return true;
1073
- }
1074
- }
1075
- return false;
1076
- }
1077
-
1078
- // Counts the number of open formatting elements in the list of active
1079
- // formatting elements (after the last active scope marker) that have a specific
1080
- // tag. If this is > 0, then earliest_matching_index will be filled in with the
1081
- // index of the first such element.
1082
- static int count_formatting_elements_of_tag(
1083
- GumboParser* parser, const GumboNode* desired_node,
1084
- int* earliest_matching_index) {
1085
- const GumboElement* desired_element = &desired_node->v.element;
1086
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1087
- int num_identical_elements = 0;
1088
- for (int i = elements->length - 1; i >= 0; --i) {
1089
- GumboNode* node = elements->data[i];
1090
- if (node == &kActiveFormattingScopeMarker) {
1091
- break;
1092
- }
1093
- assert(node->type == GUMBO_NODE_ELEMENT);
1094
- GumboElement* element = &node->v.element;
1095
- if (node_tag_is(node, desired_element->tag) &&
1096
- element->tag_namespace == desired_element->tag_namespace &&
1097
- all_attributes_match(&element->attributes,
1098
- &desired_element->attributes)) {
1099
- num_identical_elements++;
1100
- *earliest_matching_index = i;
1101
- }
1102
- }
1103
- return num_identical_elements;
1104
- }
1105
-
1106
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reconstruct-the-active-formatting-elements
1107
- static void add_formatting_element(GumboParser* parser, const GumboNode* node) {
1108
- assert(node == &kActiveFormattingScopeMarker ||
1109
- node->type == GUMBO_NODE_ELEMENT);
1110
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1111
- if (node == &kActiveFormattingScopeMarker) {
1112
- gumbo_debug("Adding a scope marker.\n");
1113
- } else {
1114
- gumbo_debug("Adding a formatting element.\n");
1115
- }
1116
-
1117
- // Hunt for identical elements.
1118
- int earliest_identical_element = elements->length;
1119
- int num_identical_elements = count_formatting_elements_of_tag(
1120
- parser, node, &earliest_identical_element);
1121
-
1122
- // Noah's Ark clause: if there're at least 3, remove the earliest.
1123
- if (num_identical_elements >= 3) {
1124
- gumbo_debug("Noah's ark clause: removing element at %d.\n",
1125
- earliest_identical_element);
1126
- gumbo_vector_remove_at(parser, earliest_identical_element, elements);
1127
- }
1128
-
1129
- gumbo_vector_add(parser, (void*) node, elements);
1130
- }
1131
-
1132
- static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1133
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1134
- for (int i = 0; i < open_elements->length; ++i) {
1135
- if (open_elements->data[i] == node) {
1136
- return true;
1137
- }
1138
- }
1139
- return false;
1140
- }
1141
-
1142
- // Clones attributes, tags, etc. of a node, but does not copy the content. The
1143
- // clone shares no structure with the original node: all owned strings and
1144
- // values are fresh copies.
1145
- GumboNode* clone_node(
1146
- GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
1147
- assert(node->type == GUMBO_NODE_ELEMENT);
1148
- GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1149
- *new_node = *node;
1150
- new_node->parent = NULL;
1151
- new_node->index_within_parent = -1;
1152
- // Clear the GUMBO_INSERTION_IMPLICIT_END_TAG flag, as the cloned node may
1153
- // have a separate end tag.
1154
- new_node->parse_flags &= ~GUMBO_INSERTION_IMPLICIT_END_TAG;
1155
- new_node->parse_flags |= reason | GUMBO_INSERTION_BY_PARSER;
1156
- GumboElement* element = &new_node->v.element;
1157
- gumbo_vector_init(parser, 1, &element->children);
1158
-
1159
- const GumboVector* old_attributes = &node->v.element.attributes;
1160
- gumbo_vector_init(parser, old_attributes->length, &element->attributes);
1161
- for (int i = 0; i < old_attributes->length; ++i) {
1162
- const GumboAttribute* old_attr = old_attributes->data[i];
1163
- GumboAttribute* attr =
1164
- gumbo_parser_allocate(parser, sizeof(GumboAttribute));
1165
- *attr = *old_attr;
1166
- attr->name = gumbo_copy_stringz(parser, old_attr->name);
1167
- attr->value = gumbo_copy_stringz(parser, old_attr->value);
1168
- gumbo_vector_add(parser, attr, &element->attributes);
1169
- }
1170
- return new_node;
1171
- }
1172
-
1173
- // "Reconstruct active formatting elements" part of the spec.
1174
- // This implementation is based on the html5lib translation from the mess of
1175
- // GOTOs in the spec to reasonably structured programming.
1176
- // http://code.google.com/p/html5lib/source/browse/python/html5lib/treebuilders/_base.py
1177
- static void reconstruct_active_formatting_elements(GumboParser* parser) {
1178
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1179
- // Step 1
1180
- if (elements->length == 0) {
1181
- return;
1182
- }
1183
-
1184
- // Step 2 & 3
1185
- int i = elements->length - 1;
1186
- const GumboNode* element = elements->data[i];
1187
- if (element == &kActiveFormattingScopeMarker ||
1188
- is_open_element(parser, element)) {
1189
- return;
1190
- }
1191
-
1192
- // Step 6
1193
- do {
1194
- if (i == 0) {
1195
- // Step 4
1196
- i = -1; // Incremented to 0 below.
1197
- break;
1198
- }
1199
- // Step 5
1200
- element = elements->data[--i];
1201
- } while (element != &kActiveFormattingScopeMarker &&
1202
- !is_open_element(parser, element));
1203
-
1204
- ++i;
1205
- gumbo_debug("Reconstructing elements from %d on %s parent.\n", i,
1206
- gumbo_normalized_tagname(
1207
- get_current_node(parser)->v.element.tag));
1208
- for(; i < elements->length; ++i) {
1209
- // Step 7 & 8.
1210
- assert(elements->length > 0);
1211
- assert(i < elements->length);
1212
- element = elements->data[i];
1213
- assert(element != &kActiveFormattingScopeMarker);
1214
- GumboNode* clone = clone_node(
1215
- parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1216
- // Step 9.
1217
- insert_element(parser, clone, true);
1218
- // Step 10.
1219
- elements->data[i] = clone;
1220
- gumbo_debug("Reconstructed %s element at %d.\n",
1221
- gumbo_normalized_tagname(clone->v.element.tag), i);
1222
- }
1223
- }
1224
-
1225
- static void clear_active_formatting_elements(GumboParser* parser) {
1226
- GumboVector* elements = &parser->_parser_state->_active_formatting_elements;
1227
- int num_elements_cleared = 0;
1228
- const GumboNode* node;
1229
- do {
1230
- node = gumbo_vector_pop(parser, elements);
1231
- ++num_elements_cleared;
1232
- } while(node && node != &kActiveFormattingScopeMarker);
1233
- gumbo_debug("Cleared %d elements from active formatting list.\n",
1234
- num_elements_cleared);
1235
- }
1236
-
1237
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-initial-insertion-mode
1238
- static GumboQuirksModeEnum compute_quirks_mode(
1239
- const GumboTokenDocType* doctype) {
1240
- if (doctype->force_quirks ||
1241
- strcmp(doctype->name, kDoctypeHtml.data) ||
1242
- is_in_static_list(doctype->public_identifier,
1243
- kQuirksModePublicIdPrefixes, false) ||
1244
- is_in_static_list(doctype->public_identifier,
1245
- kQuirksModePublicIdExactMatches, true) ||
1246
- is_in_static_list(doctype->system_identifier,
1247
- kQuirksModeSystemIdExactMatches, true) ||
1248
- (is_in_static_list(doctype->public_identifier,
1249
- kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1250
- && !doctype->has_system_identifier)) {
1251
- return GUMBO_DOCTYPE_QUIRKS;
1252
- } else if (
1253
- is_in_static_list(doctype->public_identifier,
1254
- kLimitedQuirksPublicIdPrefixes, false) ||
1255
- (is_in_static_list(doctype->public_identifier,
1256
- kLimitedQuirksRequiresSystemIdPublicIdPrefixes, false)
1257
- && doctype->has_system_identifier)) {
1258
- return GUMBO_DOCTYPE_LIMITED_QUIRKS;
1259
- }
1260
- return GUMBO_DOCTYPE_NO_QUIRKS;
1261
- }
1262
-
1263
- // The following functions are all defined by the "has an element in __ scope"
1264
- // sections of the HTML5 spec:
1265
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1266
- // The basic idea behind them is that they check for an element of the given tag
1267
- // name, contained within a scope formed by a set of other tag names. For
1268
- // example, "has an element in list scope" looks for an element of the given tag
1269
- // within the nearest enclosing <ol> or <ul>, along with a bunch of generic
1270
- // element types that serve to "firewall" their content from the rest of the
1271
- // document.
1272
- static bool has_an_element_in_specific_scope(
1273
- GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
1274
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1275
- va_list args;
1276
- va_start(args, negate);
1277
- // va_arg can only run through the list once, so we copy it to an GumboVector
1278
- // here. I wonder if it'd make more sense to make tags the GumboVector*
1279
- // parameter and 'expected' a vararg list, but that'd require changing a lot
1280
- // of code for unknown benefit. We may want to change the representation of
1281
- // these tag sets anyway, to something more efficient.
1282
- GumboVector tags;
1283
- gumbo_vector_init(parser, 10, &tags);
1284
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1285
- tag = va_arg(args, GumboTag)) {
1286
- // We store the tags inline instead of storing pointers to them.
1287
- gumbo_vector_add(parser, (void*) tag, &tags);
1288
- }
1289
- va_end(args);
1290
-
1291
- bool result = false;
1292
- for (int i = open_elements->length - 1; i >= 0; --i) {
1293
- const GumboNode* node = open_elements->data[i];
1294
- if (node->type != GUMBO_NODE_ELEMENT) {
1295
- continue;
1296
- }
1297
- GumboTag node_tag = node->v.element.tag;
1298
- for (int j = 0; j < expected->length; ++j) {
1299
- GumboTag expected_tag = (GumboTag) expected->data[j];
1300
- if (node_tag == expected_tag) {
1301
- result = true;
1302
- goto cleanup;
1303
- }
1304
- }
1305
-
1306
- bool found_tag = false;
1307
- for (int j = 0; j < tags.length; ++j) {
1308
- GumboTag tag = (GumboTag) tags.data[j];
1309
- if (tag == node_tag) {
1310
- found_tag = true;
1311
- break;
1312
- }
1313
- }
1314
- if (negate != found_tag) {
1315
- result = false;
1316
- goto cleanup;
1317
- }
1318
- }
1319
- cleanup:
1320
- gumbo_vector_destroy(parser, &tags);
1321
- return result;
1322
- }
1323
-
1324
// This is a bit of a hack to stack-allocate a one-element GumboVector named
// 'varname' containing the 'from_var' value, since it's used in nearly all
// the subsequent helper functions. Note the use of void* and casts instead of
// GumboTag; this is so the alignment requirements are the same as GumboVector
// and the data inside it can be freely accessed as if it were a normal
// GumboVector.
//
// The expansion declares two variables (varname_tmp_array and varname), so it
// must be used where declarations are allowed and the caller supplies the
// trailing semicolon. 'from_var' is parenthesized so that an expression
// argument is cast as a whole.
#define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
  void* varname ## _tmp_array[1] = { (void*) (from_var) }; \
  GumboVector varname = { varname ## _tmp_array, 1, 1 }
1333
-
1334
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1335
- static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1336
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1337
- return has_an_element_in_specific_scope(
1338
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1339
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1340
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1341
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1342
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1343
- }
1344
-
1345
- // Like "has an element in scope", but for the specific case of looking for a
1346
- // unique target node, not for any node with a given tag name. This duplicates
1347
- // much of the algorithm from has_an_element_in_specific_scope because the
1348
- // predicate is different when checking for an exact node, and it's easier &
1349
- // faster just to duplicate the code for this one case than to try and
1350
- // parameterize it.
1351
- static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1352
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
1353
- for (int i = open_elements->length - 1; i >= 0; --i) {
1354
- const GumboNode* current = open_elements->data[i];
1355
- if (current == node) {
1356
- return true;
1357
- }
1358
- if (current->type != GUMBO_NODE_ELEMENT) {
1359
- continue;
1360
- }
1361
- if (node_tag_in(
1362
- current, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1363
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1364
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
1365
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
1366
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
1367
- GUMBO_TAG_LAST)) {
1368
- return false;
1369
- }
1370
- }
1371
- assert(false);
1372
- return false;
1373
- }
1374
-
1375
- // Like has_an_element_in_scope, but restricts the expected tag to a range of
1376
- // possible tag names instead of just a single one.
1377
- static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
1378
- GumboVector tags;
1379
- // 6 = arbitrary initial size for vector, chosen because the major use-case
1380
- // for this method is heading tags, of which there are 6.
1381
- gumbo_vector_init(parser, 6, &tags);
1382
- va_list args;
1383
- va_start(args, parser);
1384
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1385
- tag = va_arg(args, GumboTag)) {
1386
- gumbo_vector_add(parser, (void*) tag, &tags);
1387
- }
1388
- bool found = has_an_element_in_specific_scope(
1389
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1390
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1391
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1392
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1393
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1394
- gumbo_vector_destroy(parser, &tags);
1395
- va_end(args);
1396
- return found;
1397
- }
1398
-
1399
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1400
- static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1401
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1402
- return has_an_element_in_specific_scope(
1403
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1404
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1405
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1406
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1407
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
1408
- GUMBO_TAG_LAST);
1409
- }
1410
-
1411
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1412
- static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1413
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1414
- return has_an_element_in_specific_scope(
1415
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1416
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1417
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1418
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1419
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
1420
- }
1421
-
1422
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1423
- static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1424
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1425
- return has_an_element_in_specific_scope(
1426
- parser, &tags, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
1427
- }
1428
-
1429
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1430
- static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1431
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1432
- return has_an_element_in_specific_scope(
1433
- parser, &tags, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
1434
- GUMBO_TAG_LAST);
1435
- }
1436
-
1437
-
1438
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1439
- // "exception" is the "element to exclude from the process" listed in the spec.
1440
- // Pass GUMBO_TAG_LAST to not exclude any of them.
1441
- static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1442
- for (;
1443
- node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
1444
- GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
1445
- GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
1446
- !node_tag_is(get_current_node(parser), exception);
1447
- pop_current_node(parser));
1448
- }
1449
-
1450
- // This factors out the clauses relating to "act as if an end tag token with tag
1451
- // name "table" had been seen. Returns true if there's a table element in table
1452
- // scope which was successfully closed, false if not and the token should be
1453
- // ignored. Does not add parse errors; callers should handle that.
1454
- static bool close_table(GumboParser* parser) {
1455
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TABLE)) {
1456
- return false;
1457
- }
1458
-
1459
- GumboNode* node = pop_current_node(parser);
1460
- while (!node_tag_is(node, GUMBO_TAG_TABLE)) {
1461
- node = pop_current_node(parser);
1462
- }
1463
- reset_insertion_mode_appropriately(parser);
1464
- return true;
1465
- }
1466
-
1467
- // This factors out the clauses relating to "act as if an end tag token with tag
1468
- // name `cell_tag` had been seen".
1469
- static bool close_table_cell(GumboParser* parser, const GumboToken* token,
1470
- GumboTag cell_tag) {
1471
- bool result = true;
1472
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1473
- const GumboNode* node = get_current_node(parser);
1474
- if (!node_tag_is(node, cell_tag)) {
1475
- add_parse_error(parser, token);
1476
- result = false;
1477
- }
1478
- do {
1479
- node = pop_current_node(parser);
1480
- } while (!node_tag_is(node, cell_tag));
1481
-
1482
- clear_active_formatting_elements(parser);
1483
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
1484
- return result;
1485
- }
1486
-
1487
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#close-the-cell
1488
- // This holds the logic to determine whether we should close a <td> or a <th>.
1489
- static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1490
- if (has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
1491
- assert(!has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1492
- return close_table_cell(parser, token, GUMBO_TAG_TD);
1493
- } else {
1494
- assert(has_an_element_in_table_scope(parser, GUMBO_TAG_TH));
1495
- return close_table_cell(parser, token, GUMBO_TAG_TH);
1496
- }
1497
- }
1498
-
1499
- // This factors out the "act as if an end tag of tag name 'select' had been
1500
- // seen" clause of the spec, since it's referenced in several places. It pops
1501
- // all nodes from the stack until the current <select> has been closed, then
1502
- // resets the insertion mode appropriately.
1503
- static void close_current_select(GumboParser* parser) {
1504
- GumboNode* node = pop_current_node(parser);
1505
- while (!node_tag_is(node, GUMBO_TAG_SELECT)) {
1506
- node = pop_current_node(parser);
1507
- }
1508
- reset_insertion_mode_appropriately(parser);
1509
- }
1510
-
1511
- // The list of nodes in the "special" category:
1512
- // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1513
- static bool is_special_node(const GumboNode* node) {
1514
- assert(node->type == GUMBO_NODE_ELEMENT);
1515
- switch (node->v.element.tag_namespace) {
1516
- case GUMBO_NAMESPACE_HTML:
1517
- return node_tag_in(node,
1518
- GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
1519
- GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
1520
- GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
1521
- GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
1522
- GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
1523
- GUMBO_TAG_COMMAND, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
1524
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
1525
- GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
1526
- GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
1527
- GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
1528
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
1529
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
1530
- GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
1531
- GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
1532
- GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
1533
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
1534
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
1535
- GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
1536
- GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1537
- GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
1538
- GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
1539
- GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
1540
- case GUMBO_NAMESPACE_MATHML:
1541
- return node_tag_in(node,
1542
- GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1543
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
1544
- case GUMBO_NAMESPACE_SVG:
1545
- return node_tag_in(node,
1546
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
1547
- }
1548
- abort();
1549
- return false; // Pacify compiler.
1550
- }
1551
-
1552
- // Implicitly closes currently open tags until it reaches an element with the
1553
- // specified tag name. If the elements closed are in the set handled by
1554
- // generate_implied_end_tags, this is normal operation and this function returns
1555
- // true. Otherwise, a parse error is recorded and this function returns false.
1556
- static bool implicitly_close_tags(
1557
- GumboParser* parser, GumboToken* token, GumboTag target) {
1558
- bool result = true;
1559
- generate_implied_end_tags(parser, target);
1560
- if (!node_tag_is(get_current_node(parser), target)) {
1561
- add_parse_error(parser, token);
1562
- while (!node_tag_is(get_current_node(parser), target)) {
1563
- pop_current_node(parser);
1564
- }
1565
- result = false;
1566
- }
1567
- assert(node_tag_is(get_current_node(parser), target));
1568
- pop_current_node(parser);
1569
- return result;
1570
- }
1571
-
1572
- // If the stack of open elements has a <p> tag in button scope, this acts as if
1573
- // a </p> tag was encountered, implicitly closing tags. Returns false if a
1574
- // parse error occurs. This is a convenience function because this particular
1575
- // clause appears several times in the spec.
1576
- static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
1577
- if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1578
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
1579
- }
1580
- return true;
1581
- }
1582
-
1583
- // Convenience function to encapsulate the logic for closing <li> or <dd>/<dt>
1584
- // tags. Pass true to is_li for handling <li> tags, false for <dd> and <dt>.
1585
- static void maybe_implicitly_close_list_tag(
1586
- GumboParser* parser, GumboToken* token, bool is_li) {
1587
- GumboParserState* state = parser->_parser_state;
1588
- state->_frameset_ok = false;
1589
- for (int i = state->_open_elements.length - 1; i >= 0; --i) {
1590
- const GumboNode* node = state->_open_elements.data[i];
1591
- bool is_list_tag = is_li ?
1592
- node_tag_is(node, GUMBO_TAG_LI) :
1593
- node_tag_in(node, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
1594
- if (is_list_tag) {
1595
- implicitly_close_tags(parser, token, node->v.element.tag);
1596
- return;
1597
- }
1598
- if (is_special_node(node) &&
1599
- !node_tag_in(node, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
1600
- GUMBO_TAG_LAST)) {
1601
- return;
1602
- }
1603
- }
1604
- }
1605
-
1606
- static void merge_attributes(
1607
- GumboParser* parser, GumboToken* token, GumboNode* node) {
1608
- assert(token->type == GUMBO_TOKEN_START_TAG);
1609
- assert(node->type == GUMBO_NODE_ELEMENT);
1610
- const GumboVector* token_attr = &token->v.start_tag.attributes;
1611
- GumboVector* node_attr = &node->v.element.attributes;
1612
-
1613
- for (int i = 0; i < token_attr->length; ++i) {
1614
- GumboAttribute* attr = token_attr->data[i];
1615
- if (!gumbo_get_attribute(node_attr, attr->name)) {
1616
- // Ownership of the attribute is transferred by this gumbo_vector_add,
1617
- // so it has to be nulled out of the original token so it doesn't get
1618
- // double-deleted.
1619
- gumbo_vector_add(parser, attr, node_attr);
1620
- token_attr->data[i] = NULL;
1621
- }
1622
- }
1623
- // When attributes are merged, it means the token has been ignored and merged
1624
- // with another token, so we need to free its memory. The attributes that are
1625
- // transferred need to be nulled-out in the vector above so that they aren't
1626
- // double-deleted.
1627
- gumbo_token_destroy(parser, token);
1628
-
1629
- #ifndef NDEBUG
1630
- // Mark this sentinel so the assertion in the main loop knows it's been
1631
- // destroyed.
1632
- token->v.start_tag.attributes = kGumboEmptyVector;
1633
- #endif
1634
- }
1635
-
1636
- const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tag) {
1637
- for (int i = 0;
1638
- i < sizeof(kSvgTagReplacements) / sizeof(ReplacementEntry); ++i) {
1639
- const ReplacementEntry* entry = &kSvgTagReplacements[i];
1640
- if (gumbo_string_equals_ignore_case(tag, &entry->from)) {
1641
- return entry->to.data;
1642
- }
1643
- }
1644
- return NULL;
1645
- }
1646
-
1647
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
1648
- // This destructively modifies any matching attributes on the token and sets the
1649
- // namespace appropriately.
1650
- static void adjust_foreign_attributes(GumboParser* parser, GumboToken* token) {
1651
- assert(token->type == GUMBO_TOKEN_START_TAG);
1652
- const GumboVector* attributes = &token->v.start_tag.attributes;
1653
- for (int i = 0;
1654
- i < sizeof(kForeignAttributeReplacements) /
1655
- sizeof(NamespacedAttributeReplacement); ++i) {
1656
- const NamespacedAttributeReplacement* entry =
1657
- &kForeignAttributeReplacements[i];
1658
- GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from);
1659
- if (!attr) {
1660
- continue;
1661
- }
1662
- gumbo_parser_deallocate(parser, (void*) attr->name);
1663
- attr->attr_namespace = entry->attr_namespace;
1664
- attr->name = gumbo_copy_stringz(parser, entry->local_name);
1665
- }
1666
- }
1667
-
1668
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-svg-attributes
1669
- // This destructively modifies any matching attributes on the token.
1670
- static void adjust_svg_attributes(GumboParser* parser, GumboToken* token) {
1671
- assert(token->type == GUMBO_TOKEN_START_TAG);
1672
- const GumboVector* attributes = &token->v.start_tag.attributes;
1673
- for (int i = 0;
1674
- i < sizeof(kSvgAttributeReplacements) / sizeof(ReplacementEntry); ++i) {
1675
- const ReplacementEntry* entry = &kSvgAttributeReplacements[i];
1676
- GumboAttribute* attr = gumbo_get_attribute(attributes, entry->from.data);
1677
- if (!attr) {
1678
- continue;
1679
- }
1680
- gumbo_parser_deallocate(parser, (void*) attr->name);
1681
- attr->name = gumbo_copy_stringz(parser, entry->to.data);
1682
- }
1683
- }
1684
-
1685
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#adjust-mathml-attributes
1686
- // Note that this may destructively modify the token with the new attribute
1687
- // value.
1688
- static void adjust_mathml_attributes(GumboParser* parser, GumboToken* token) {
1689
- assert(token->type == GUMBO_TOKEN_START_TAG);
1690
- GumboAttribute* attr = gumbo_get_attribute(
1691
- &token->v.start_tag.attributes, "definitionurl");
1692
- if (!attr) {
1693
- return;
1694
- }
1695
- gumbo_parser_deallocate(parser, (void*) attr->name);
1696
- attr->name = gumbo_copy_stringz(parser, "definitionURL");
1697
- }
1698
-
1699
- static bool doctype_matches(
1700
- const GumboTokenDocType* doctype,
1701
- const GumboStringPiece* public_id,
1702
- const GumboStringPiece* system_id,
1703
- bool allow_missing_system_id) {
1704
- return !strcmp(doctype->public_identifier, public_id->data) &&
1705
- (allow_missing_system_id || doctype->has_system_identifier) &&
1706
- !strcmp(doctype->system_identifier, system_id->data);
1707
- }
1708
-
1709
- static bool maybe_add_doctype_error(
1710
- GumboParser* parser, const GumboToken* token) {
1711
- const GumboTokenDocType* doctype = &token->v.doc_type;
1712
- bool html_doctype = !strcmp(doctype->name, kDoctypeHtml.data);
1713
- if (!html_doctype ||
1714
- doctype->has_public_identifier ||
1715
- (doctype->has_system_identifier && !strcmp(
1716
- doctype->system_identifier, kSystemIdLegacyCompat.data)) ||
1717
- !(html_doctype && (
1718
- doctype_matches(doctype, &kPublicIdHtml4_0,
1719
- &kSystemIdRecHtml4_0, true) ||
1720
- doctype_matches(doctype, &kPublicIdHtml4_01, &kSystemIdHtml4, true) ||
1721
- doctype_matches(doctype, &kPublicIdXhtml1_0,
1722
- &kSystemIdXhtmlStrict1_1, false) ||
1723
- doctype_matches(doctype, &kPublicIdXhtml1_1,
1724
- &kSystemIdXhtml1_1, false)))) {
1725
- add_parse_error(parser, token);
1726
- return false;
1727
- }
1728
- return true;
1729
- }
1730
-
1731
- static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1732
- if (!node->parent) {
1733
- // The node may not have a parent if, for example, it is a newly-cloned copy
1734
- // of an active formatting element. DOM manipulations continue with the
1735
- // orphaned fragment of the DOM tree until it's appended/foster-parented to
1736
- // the common ancestor at the end of the adoption agency algorithm.
1737
- return;
1738
- }
1739
- assert(node->parent->type == GUMBO_NODE_ELEMENT);
1740
- GumboVector* children = &node->parent->v.element.children;
1741
- int index = gumbo_vector_index_of(children, node);
1742
- assert(index != -1);
1743
-
1744
- gumbo_vector_remove_at(parser, index, children);
1745
- node->parent = NULL;
1746
- node->index_within_parent = -1;
1747
- for (int i = index; i < children->length; ++i) {
1748
- GumboNode* child = children->data[i];
1749
- child->index_within_parent = i;
1750
- }
1751
- }
1752
-
1753
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1754
- // Also described in the "in body" handling for end formatting tags.
1755
- static bool adoption_agency_algorithm(
1756
- GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
1757
- GumboParserState* state = parser->_parser_state;
1758
- gumbo_debug("Entering adoption agency algorithm.\n");
1759
- // Steps 1-3 & 16:
1760
- for (int i = 0; i < 8; ++i) {
1761
- // Step 4.
1762
- GumboNode* formatting_node = NULL;
1763
- int formatting_node_in_open_elements = -1;
1764
- for (int j = state->_active_formatting_elements.length - 1; j >= 0; --j) {
1765
- GumboNode* current_node = state->_active_formatting_elements.data[j];
1766
- if (current_node == &kActiveFormattingScopeMarker) {
1767
- gumbo_debug("Broke on scope marker; aborting.\n");
1768
- // Last scope marker; abort the algorithm.
1769
- return false;
1770
- }
1771
- if (node_tag_is(current_node, closing_tag)) {
1772
- // Found it.
1773
- formatting_node = current_node;
1774
- formatting_node_in_open_elements = gumbo_vector_index_of(
1775
- &state->_open_elements, formatting_node);
1776
- gumbo_debug("Formatting element of tag %s at %d.\n",
1777
- gumbo_normalized_tagname(closing_tag),
1778
- formatting_node_in_open_elements);
1779
- break;
1780
- }
1781
- }
1782
- if (!formatting_node) {
1783
- // No matching tag; not a parse error outright, but fall through to the
1784
- // "any other end tag" clause (which may potentially add a parse error,
1785
- // but not always).
1786
- gumbo_debug("No active formatting elements; aborting.\n");
1787
- return false;
1788
- }
1789
-
1790
- if (formatting_node_in_open_elements == -1) {
1791
- gumbo_debug("Formatting node not on stack of open elements.\n");
1792
- gumbo_vector_remove(parser, formatting_node,
1793
- &state->_active_formatting_elements);
1794
- return false;
1795
- }
1796
-
1797
- if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1798
- add_parse_error(parser, token);
1799
- gumbo_debug("Element not in scope.\n");
1800
- return false;
1801
- }
1802
- if (formatting_node != get_current_node(parser)) {
1803
- add_parse_error(parser, token); // But continue onwards.
1804
- }
1805
- assert(formatting_node);
1806
- assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
1807
- assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
1808
-
1809
- // Step 5 & 6.
1810
- GumboNode* furthest_block = NULL;
1811
- for (int j = formatting_node_in_open_elements;
1812
- j < state->_open_elements.length; ++j) {
1813
- assert(j > 0);
1814
- GumboNode* current = state->_open_elements.data[j];
1815
- if (is_special_node(current)) {
1816
- // Step 5.
1817
- furthest_block = current;
1818
- break;
1819
- }
1820
- }
1821
- if (!furthest_block) {
1822
- // Step 6.
1823
- while (get_current_node(parser) != formatting_node) {
1824
- pop_current_node(parser);
1825
- }
1826
- // And the formatting element itself.
1827
- pop_current_node(parser);
1828
- gumbo_vector_remove(parser, formatting_node,
1829
- &state->_active_formatting_elements);
1830
- return false;
1831
- }
1832
- assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
1833
- assert(furthest_block);
1834
-
1835
- // Step 7.
1836
- // Elements may be moved and reparented by this algorithm, so
1837
- // common_ancestor is not necessarily the same as formatting_node->parent.
1838
- GumboNode* common_ancestor =
1839
- state->_open_elements.data[gumbo_vector_index_of(
1840
- &state->_open_elements, formatting_node) - 1];
1841
- gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1842
- gumbo_normalized_tagname(common_ancestor->v.element.tag),
1843
- gumbo_normalized_tagname(furthest_block->v.element.tag));
1844
-
1845
- // Step 8.
1846
- int bookmark = gumbo_vector_index_of(
1847
- &state->_active_formatting_elements, formatting_node);;
1848
- // Step 9.
1849
- GumboNode* node = furthest_block;
1850
- GumboNode* last_node = furthest_block;
1851
- // Must be stored explicitly, in case node is removed from the stack of open
1852
- // elements, to handle step 9.4.
1853
- int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1854
- assert(saved_node_index > 0);
1855
- // Step 9.1-9.3 & 9.11.
1856
- for (int j = 0; j < 3; ++j) {
1857
- // Step 9.4.
1858
- int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1859
- gumbo_debug(
1860
- "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1861
- if (node_index == -1) {
1862
- node_index = saved_node_index;
1863
- }
1864
- saved_node_index = --node_index;
1865
- assert(node_index > 0);
1866
- assert(node_index < state->_open_elements.capacity);
1867
- node = state->_open_elements.data[node_index];
1868
- assert(node->parent);
1869
- // Step 9.5.
1870
- if (gumbo_vector_index_of(
1871
- &state->_active_formatting_elements, node) == -1) {
1872
- gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1873
- continue;
1874
- } else if (node == formatting_node) {
1875
- // Step 9.6.
1876
- break;
1877
- }
1878
- // Step 9.7.
1879
- int formatting_index = gumbo_vector_index_of(
1880
- &state->_active_formatting_elements, node);
1881
- node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1882
- state->_active_formatting_elements.data[formatting_index] = node;
1883
- state->_open_elements.data[node_index] = node;
1884
- // Step 9.8.
1885
- if (last_node == furthest_block) {
1886
- bookmark = formatting_index + 1;
1887
- assert(bookmark <= state->_active_formatting_elements.length);
1888
- }
1889
- // Step 9.9.
1890
- last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1891
- remove_from_parent(parser, last_node);
1892
- append_node(parser, node, last_node);
1893
- // Step 9.10.
1894
- last_node = node;
1895
- }
1896
-
1897
- // Step 10.
1898
- gumbo_debug("Removing %s node from parent ",
1899
- gumbo_normalized_tagname(last_node->v.element.tag));
1900
- remove_from_parent(parser, last_node);
1901
- last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1902
- if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1903
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
1904
- GUMBO_TAG_LAST)) {
1905
- gumbo_debug("and foster-parenting it.\n");
1906
- foster_parent_element(parser, last_node);
1907
- } else {
1908
- gumbo_debug("and inserting it into %s.\n",
1909
- gumbo_normalized_tagname(common_ancestor->v.element.tag));
1910
- append_node(parser, common_ancestor, last_node);
1911
- }
1912
-
1913
- // Step 11.
1914
- GumboNode* new_formatting_node = clone_node(
1915
- parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1916
- formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1917
-
1918
- // Step 12. Instead of appending nodes one-by-one, we swap the children
1919
- // vector of furthest_block with the empty children of new_formatting_node,
1920
- // reducing memory traffic and allocations. We still have to reset their
1921
- // parent pointers, though.
1922
- GumboVector temp = new_formatting_node->v.element.children;
1923
- new_formatting_node->v.element.children =
1924
- furthest_block->v.element.children;
1925
- furthest_block->v.element.children = temp;
1926
-
1927
- temp = new_formatting_node->v.element.children;
1928
- for (int i = 0; i < temp.length; ++i) {
1929
- GumboNode* child = temp.data[i];
1930
- child->parent = new_formatting_node;
1931
- }
1932
-
1933
- // Step 13.
1934
- append_node(parser, furthest_block, new_formatting_node);
1935
-
1936
- // Step 14.
1937
- // If the formatting node was before the bookmark, it may shift over all
1938
- // indices after it, so we need to explicitly find the index and possibly
1939
- // adjust the bookmark.
1940
- int formatting_node_index = gumbo_vector_index_of(
1941
- &state->_active_formatting_elements, formatting_node);
1942
- assert(formatting_node_index != -1);
1943
- if (formatting_node_index < bookmark) {
1944
- --bookmark;
1945
- }
1946
- gumbo_vector_remove_at(
1947
- parser, formatting_node_index, &state->_active_formatting_elements);
1948
- assert(bookmark >= 0);
1949
- assert(bookmark <= state->_active_formatting_elements.length);
1950
- gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
1951
- &state->_active_formatting_elements);
1952
-
1953
- // Step 15.
1954
- gumbo_vector_remove(
1955
- parser, formatting_node, &state->_open_elements);
1956
- int insert_at = gumbo_vector_index_of(
1957
- &state->_open_elements, furthest_block) + 1;
1958
- assert(insert_at >= 0);
1959
- assert(insert_at <= state->_open_elements.length);
1960
- gumbo_vector_insert_at(
1961
- parser, new_formatting_node, insert_at, &state->_open_elements);
1962
- }
1963
- return true;
1964
- }
1965
-
1966
- // This is here to clean up memory when the spec says "Ignore current token."
1967
- static void ignore_token(GumboParser* parser) {
1968
- GumboToken* token = parser->_parser_state->_current_token;
1969
- // Ownership of the token's internal buffers are normally transferred to the
1970
- // element, but if no element is emitted (as happens in non-verbatim-mode
1971
- // when a token is ignored), we need to free it here to prevent a memory
1972
- // leak.
1973
- gumbo_token_destroy(parser, token);
1974
- #ifndef NDEBUG
1975
- if (token->type == GUMBO_TOKEN_START_TAG) {
1976
- // Mark this sentinel so the assertion in the main loop knows it's been
1977
- // destroyed.
1978
- token->v.start_tag.attributes = kGumboEmptyVector;
1979
- }
1980
- #endif
1981
- }
1982
-
1983
- // http://www.whatwg.org/specs/web-apps/current-work/complete/the-end.html
1984
- static void finish_parsing(GumboParser* parser) {
1985
- maybe_flush_text_node_buffer(parser);
1986
- GumboParserState* state = parser->_parser_state;
1987
- for (GumboNode* node = pop_current_node(parser); node;
1988
- node = pop_current_node(parser)) {
1989
- if ((node_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
1990
- (node_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
1991
- continue;
1992
- }
1993
- node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1994
- }
1995
- while (pop_current_node(parser)); // Pop them all.
1996
- }
1997
-
1998
- static bool handle_initial(GumboParser* parser, GumboToken* token) {
1999
- GumboDocument* document = &get_document_node(parser)->v.document;
2000
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2001
- ignore_token(parser);
2002
- return true;
2003
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2004
- append_comment_node(parser, get_document_node(parser), token);
2005
- return true;
2006
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2007
- document->has_doctype = true;
2008
- document->name = token->v.doc_type.name;
2009
- document->public_identifier = token->v.doc_type.public_identifier;
2010
- document->system_identifier = token->v.doc_type.system_identifier;
2011
- document->doc_type_quirks_mode = compute_quirks_mode(&token->v.doc_type);
2012
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2013
- return maybe_add_doctype_error(parser, token);
2014
- }
2015
- add_parse_error(parser, token);
2016
- document->doc_type_quirks_mode = GUMBO_DOCTYPE_QUIRKS;
2017
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HTML);
2018
- parser->_parser_state->_reprocess_current_token = true;
2019
- return true;
2020
- }
2021
-
2022
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-html-insertion-mode
2023
- static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2024
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2025
- add_parse_error(parser, token);
2026
- ignore_token(parser);
2027
- return false;
2028
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2029
- append_comment_node(parser, get_document_node(parser), token);
2030
- return true;
2031
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2032
- ignore_token(parser);
2033
- return true;
2034
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2035
- GumboNode* html_node = insert_element_from_token(parser, token);
2036
- parser->_output->root = html_node;
2037
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2038
- return true;
2039
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2040
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2041
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2042
- add_parse_error(parser, token);
2043
- ignore_token(parser);
2044
- return false;
2045
- } else {
2046
- GumboNode* html_node = insert_element_of_tag_type(
2047
- parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
2048
- assert(html_node);
2049
- parser->_output->root = html_node;
2050
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2051
- parser->_parser_state->_reprocess_current_token = true;
2052
- return true;
2053
- }
2054
- }
2055
-
2056
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-before-head-insertion-mode
2057
- static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2058
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2059
- add_parse_error(parser, token);
2060
- ignore_token(parser);
2061
- return false;
2062
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2063
- append_comment_node(parser, get_current_node(parser), token);
2064
- return true;
2065
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2066
- ignore_token(parser);
2067
- return true;
2068
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2069
- GumboNode* node = insert_element_from_token(parser, token);
2070
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2071
- parser->_parser_state->_head_element = node;
2072
- return true;
2073
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2074
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2075
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2076
- add_parse_error(parser, token);
2077
- ignore_token(parser);
2078
- return false;
2079
- } else {
2080
- GumboNode* node = insert_element_of_tag_type(
2081
- parser, GUMBO_TAG_HEAD, GUMBO_INSERTION_IMPLIED);
2082
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2083
- parser->_parser_state->_head_element = node;
2084
- parser->_parser_state->_reprocess_current_token = true;
2085
- return true;
2086
- }
2087
- }
2088
-
2089
- // Forward declarations because of mutual dependencies.
2090
- static bool handle_token(GumboParser* parser, GumboToken* token);
2091
- static bool handle_in_body(GumboParser* parser, GumboToken* token);
2092
-
2093
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inhead
2094
- static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2095
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2096
- insert_text_token(parser, token);
2097
- return true;
2098
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2099
- add_parse_error(parser, token);
2100
- ignore_token(parser);
2101
- return false;
2102
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2103
- append_comment_node(parser, get_current_node(parser), token);
2104
- return true;
2105
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2106
- return handle_in_body(parser, token);
2107
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2108
- GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
2109
- GUMBO_TAG_LAST)) {
2110
- insert_element_from_token(parser, token);
2111
- pop_current_node(parser);
2112
- acknowledge_self_closing_tag(parser);
2113
- return true;
2114
- } else if (tag_is(token, kStartTag, GUMBO_TAG_META)) {
2115
- insert_element_from_token(parser, token);
2116
- pop_current_node(parser);
2117
- acknowledge_self_closing_tag(parser);
2118
- // NOTE(jdtang): Gumbo handles only UTF-8, so the encoding clause of the
2119
- // spec doesn't apply. If clients want to handle meta-tag re-encoding, they
2120
- // should specifically look for that string in the document and re-encode it
2121
- // before passing to Gumbo.
2122
- return true;
2123
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2124
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2125
- return true;
2126
- } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
2127
- GUMBO_TAG_LAST)) {
2128
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2129
- return true;
2130
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
2131
- insert_element_from_token(parser, token);
2132
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT);
2133
- return true;
2134
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
2135
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_SCRIPT);
2136
- return true;
2137
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2138
- GumboNode* head = pop_current_node(parser);
2139
- AVOID_UNUSED_VARIABLE_WARNING(head);
2140
- assert(node_tag_is(head, GUMBO_TAG_HEAD));
2141
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2142
- return true;
2143
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2144
- add_parse_error(parser, token);
2145
- ignore_token(parser);
2146
- return false;
2147
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2148
- (token->type == GUMBO_TOKEN_END_TAG &&
2149
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2150
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2151
- add_parse_error(parser, token);
2152
- return false;
2153
- } else {
2154
- const GumboNode* node = pop_current_node(parser);
2155
- assert(node_tag_is(node, GUMBO_TAG_HEAD));
2156
- AVOID_UNUSED_VARIABLE_WARNING(node);
2157
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2158
- parser->_parser_state->_reprocess_current_token = true;
2159
- return true;
2160
- }
2161
-
2162
- return true;
2163
- }
2164
-
2165
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inheadnoscript
2166
- static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2167
- if (token->type == GUMBO_TOKEN_DOCTYPE) {
2168
- add_parse_error(parser, token);
2169
- return false;
2170
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2171
- return handle_in_body(parser, token);
2172
- } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2173
- const GumboNode* node = pop_current_node(parser);
2174
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2175
- AVOID_UNUSED_VARIABLE_WARNING(node);
2176
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2177
- return true;
2178
- } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2179
- token->type == GUMBO_TOKEN_COMMENT ||
2180
- tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
2181
- GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
2182
- GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
2183
- return handle_in_head(parser, token);
2184
- } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
2185
- GUMBO_TAG_LAST) ||
2186
- (token->type == GUMBO_TOKEN_END_TAG &&
2187
- !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2188
- add_parse_error(parser, token);
2189
- ignore_token(parser);
2190
- return false;
2191
- } else {
2192
- add_parse_error(parser, token);
2193
- const GumboNode* node = pop_current_node(parser);
2194
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2195
- AVOID_UNUSED_VARIABLE_WARNING(node);
2196
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2197
- parser->_parser_state->_reprocess_current_token = true;
2198
- return false;
2199
- }
2200
- }
2201
-
2202
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-head-insertion-mode
2203
- static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2204
- GumboParserState* state = parser->_parser_state;
2205
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
2206
- insert_text_token(parser, token);
2207
- return true;
2208
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2209
- add_parse_error(parser, token);
2210
- ignore_token(parser);
2211
- return false;
2212
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2213
- append_comment_node(parser, get_current_node(parser), token);
2214
- return true;
2215
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2216
- return handle_in_body(parser, token);
2217
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2218
- insert_element_from_token(parser, token);
2219
- state->_frameset_ok = false;
2220
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2221
- return true;
2222
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2223
- insert_element_from_token(parser, token);
2224
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2225
- return true;
2226
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2227
- GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
2228
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
2229
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2230
- add_parse_error(parser, token);
2231
- assert(state->_head_element != NULL);
2232
- // This must be flushed before we push the head element on, as there may be
2233
- // pending character tokens that should be attached to the root.
2234
- maybe_flush_text_node_buffer(parser);
2235
- gumbo_vector_add(parser, state->_head_element, &state->_open_elements);
2236
- bool result = handle_in_head(parser, token);
2237
- gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2238
- return result;
2239
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2240
- (token->type == GUMBO_TOKEN_END_TAG &&
2241
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2242
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2243
- add_parse_error(parser, token);
2244
- ignore_token(parser);
2245
- return false;
2246
- } else {
2247
- insert_element_of_tag_type(parser, GUMBO_TAG_BODY, GUMBO_INSERTION_IMPLIED);
2248
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
2249
- state->_reprocess_current_token = true;
2250
- return true;
2251
- }
2252
- }
2253
-
2254
- static void destroy_node(GumboParser* parser, GumboNode* node) {
2255
- switch (node->type) {
2256
- case GUMBO_NODE_DOCUMENT:
2257
- {
2258
- GumboDocument* doc = &node->v.document;
2259
- for (int i = 0; i < doc->children.length; ++i) {
2260
- destroy_node(parser, doc->children.data[i]);
2261
- }
2262
- gumbo_parser_deallocate(parser, (void*) doc->children.data);
2263
- gumbo_parser_deallocate(parser, (void*) doc->name);
2264
- gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2265
- gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2266
- }
2267
- break;
2268
- case GUMBO_NODE_ELEMENT:
2269
- for (int i = 0; i < node->v.element.attributes.length; ++i) {
2270
- gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2271
- }
2272
- gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2273
- for (int i = 0; i < node->v.element.children.length; ++i) {
2274
- destroy_node(parser, node->v.element.children.data[i]);
2275
- }
2276
- gumbo_parser_deallocate(parser, node->v.element.children.data);
2277
- break;
2278
- case GUMBO_NODE_TEXT:
2279
- case GUMBO_NODE_CDATA:
2280
- case GUMBO_NODE_COMMENT:
2281
- case GUMBO_NODE_WHITESPACE:
2282
- gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2283
- break;
2284
- }
2285
- gumbo_parser_deallocate(parser, node);
2286
- }
2287
-
2288
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
2289
- static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2290
- GumboParserState* state = parser->_parser_state;
2291
- assert(state->_open_elements.length > 0);
2292
- if (token->type == GUMBO_TOKEN_NULL) {
2293
- add_parse_error(parser, token);
2294
- ignore_token(parser);
2295
- return false;
2296
- } else if (token->type == GUMBO_TOKEN_WHITESPACE) {
2297
- reconstruct_active_formatting_elements(parser);
2298
- insert_text_token(parser, token);
2299
- return true;
2300
- } else if (token->type == GUMBO_TOKEN_CHARACTER) {
2301
- reconstruct_active_formatting_elements(parser);
2302
- insert_text_token(parser, token);
2303
- set_frameset_not_ok(parser);
2304
- return true;
2305
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2306
- append_comment_node(parser, get_current_node(parser), token);
2307
- return true;
2308
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2309
- add_parse_error(parser, token);
2310
- ignore_token(parser);
2311
- return false;
2312
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2313
- assert(parser->_output->root != NULL);
2314
- assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2315
- add_parse_error(parser, token);
2316
- merge_attributes(parser, token, parser->_output->root);
2317
- return false;
2318
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2319
- GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
2320
- GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
2321
- GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2322
- return handle_in_head(parser, token);
2323
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2324
- add_parse_error(parser, token);
2325
- if (state->_open_elements.length < 2 ||
2326
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
2327
- ignore_token(parser);
2328
- return false;
2329
- }
2330
- state->_frameset_ok = false;
2331
- merge_attributes(parser, token, state->_open_elements.data[1]);
2332
- return false;
2333
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2334
- add_parse_error(parser, token);
2335
- if (state->_open_elements.length < 2 ||
2336
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2337
- !state->_frameset_ok) {
2338
- ignore_token(parser);
2339
- return false;
2340
- }
2341
- // Save the body node for later removal.
2342
- GumboNode* body_node = state->_open_elements.data[1];
2343
-
2344
- // Pop all nodes except root HTML element.
2345
- GumboNode* node;
2346
- do {
2347
- node = pop_current_node(parser);
2348
- } while (node != state->_open_elements.data[1]);
2349
-
2350
- // Remove the body node. We may want to factor this out into a generic
2351
- // helper, but right now this is the only code that needs to do this.
2352
- GumboVector* children = &parser->_output->root->v.element.children;
2353
- for (int i = 0; i < children->length; ++i) {
2354
- if (children->data[i] == body_node) {
2355
- gumbo_vector_remove_at(parser, i, children);
2356
- break;
2357
- }
2358
- }
2359
- destroy_node(parser, body_node);
2360
-
2361
- // Insert the <frameset>, and switch the insertion mode.
2362
- insert_element_from_token(parser, token);
2363
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2364
- return true;
2365
- } else if (token->type == GUMBO_TOKEN_EOF) {
2366
- for (int i = 0; i < state->_open_elements.length; ++i) {
2367
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2368
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
2369
- GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
2370
- GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
2371
- GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
2372
- add_parse_error(parser, token);
2373
- return false;
2374
- }
2375
- }
2376
- return true;
2377
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2378
- GUMBO_TAG_LAST)) {
2379
- if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2380
- add_parse_error(parser, token);
2381
- ignore_token(parser);
2382
- return false;
2383
- }
2384
- bool success = true;
2385
- for (int i = 0; i < state->_open_elements.length; ++i) {
2386
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2387
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
2388
- GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
2389
- GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
2390
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
2391
- GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2392
- GUMBO_TAG_LAST)) {
2393
- add_parse_error(parser, token);
2394
- success = false;
2395
- break;
2396
- }
2397
- }
2398
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_BODY);
2399
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
2400
- parser->_parser_state->_reprocess_current_token = true;
2401
- } else {
2402
- GumboNode* body = state->_open_elements.data[1];
2403
- assert(node_tag_is(body, GUMBO_TAG_BODY));
2404
- record_end_of_element(state->_current_token, &body->v.element);
2405
- }
2406
- return success;
2407
- } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2408
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
2409
- GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
2410
- GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
2411
- GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
2412
- GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
2413
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
2414
- GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
2415
- bool result = maybe_implicitly_close_p_tag(parser, token);
2416
- insert_element_from_token(parser, token);
2417
- return result;
2418
- } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2419
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2420
- bool result = maybe_implicitly_close_p_tag(parser, token);
2421
- if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
2422
- GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
2423
- GUMBO_TAG_LAST)) {
2424
- add_parse_error(parser, token);
2425
- pop_current_node(parser);
2426
- result = false;
2427
- }
2428
- insert_element_from_token(parser, token);
2429
- return result;
2430
- } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
2431
- GUMBO_TAG_LAST)) {
2432
- bool result = maybe_implicitly_close_p_tag(parser, token);
2433
- insert_element_from_token(parser, token);
2434
- state->_ignore_next_linefeed = true;
2435
- state->_frameset_ok = false;
2436
- return result;
2437
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2438
- if (state->_form_element != NULL) {
2439
- gumbo_debug("Ignoring nested form.\n");
2440
- add_parse_error(parser, token);
2441
- ignore_token(parser);
2442
- return false;
2443
- }
2444
- bool result = maybe_implicitly_close_p_tag(parser, token);
2445
- state->_form_element =
2446
- insert_element_from_token(parser, token);
2447
- return result;
2448
- } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2449
- maybe_implicitly_close_list_tag(parser, token, true);
2450
- bool result = maybe_implicitly_close_p_tag(parser, token);
2451
- insert_element_from_token(parser, token);
2452
- return result;
2453
- } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2454
- GUMBO_TAG_LAST)) {
2455
- maybe_implicitly_close_list_tag(parser, token, false);
2456
- bool result = maybe_implicitly_close_p_tag(parser, token);
2457
- insert_element_from_token(parser, token);
2458
- return result;
2459
- } else if (tag_is(token, kStartTag, GUMBO_TAG_PLAINTEXT)) {
2460
- bool result = maybe_implicitly_close_p_tag(parser, token);
2461
- insert_element_from_token(parser, token);
2462
- gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
2463
- return result;
2464
- } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2465
- if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2466
- add_parse_error(parser, token);
2467
- implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
2468
- state->_reprocess_current_token = true;
2469
- return false;
2470
- }
2471
- reconstruct_active_formatting_elements(parser);
2472
- insert_element_from_token(parser, token);
2473
- state->_frameset_ok = false;
2474
- return true;
2475
- } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2476
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
2477
- GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
2478
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
2479
- GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
2480
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
2481
- GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
2482
- GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
2483
- GUMBO_TAG_LAST)) {
2484
- GumboTag tag = token->v.end_tag;
2485
- if (!has_an_element_in_scope(parser, tag)) {
2486
- add_parse_error(parser, token);
2487
- ignore_token(parser);
2488
- return false;
2489
- }
2490
- implicitly_close_tags(parser, token, token->v.end_tag);
2491
- return true;
2492
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2493
- bool result = true;
2494
- const GumboNode* node = state->_form_element;
2495
- assert(!node || node->type == GUMBO_NODE_ELEMENT);
2496
- state->_form_element = NULL;
2497
- if (!node || !has_node_in_scope(parser, node)) {
2498
- gumbo_debug("Closing an unopened form.\n");
2499
- add_parse_error(parser, token);
2500
- ignore_token(parser);
2501
- return false;
2502
- }
2503
- // This differs from implicitly_close_tags because we remove *only* the
2504
- // <form> element; other nodes are left in scope.
2505
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2506
- if (get_current_node(parser) != node) {
2507
- add_parse_error(parser, token);
2508
- result = false;
2509
- }
2510
-
2511
- GumboVector* open_elements = &state->_open_elements;
2512
- int index = open_elements->length - 1;
2513
- for (; index >= 0 && open_elements->data[index] != node; --index);
2514
- assert(index >= 0);
2515
- gumbo_vector_remove_at(parser, index, open_elements);
2516
- return result;
2517
- } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2518
- if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2519
- add_parse_error(parser, token);
2520
- reconstruct_active_formatting_elements(parser);
2521
- insert_element_of_tag_type(
2522
- parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2523
- state->_reprocess_current_token = true;
2524
- return false;
2525
- }
2526
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
2527
- } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2528
- if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2529
- add_parse_error(parser, token);
2530
- ignore_token(parser);
2531
- return false;
2532
- }
2533
- return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
2534
- } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2535
- GUMBO_TAG_LAST)) {
2536
- assert(token->type == GUMBO_TOKEN_END_TAG);
2537
- GumboTag token_tag = token->v.end_tag;
2538
- if (!has_an_element_in_scope(parser, token_tag)) {
2539
- add_parse_error(parser, token);
2540
- ignore_token(parser);
2541
- return false;
2542
- }
2543
- return implicitly_close_tags(parser, token, token_tag);
2544
- } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2545
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2546
- if (!has_an_element_in_scope_with_tagname(
2547
- parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
2548
- GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2549
- // No heading open; ignore the token entirely.
2550
- add_parse_error(parser, token);
2551
- ignore_token(parser);
2552
- return false;
2553
- } else {
2554
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2555
- const GumboNode* current_node = get_current_node(parser);
2556
- bool success = node_tag_is(current_node, token->v.end_tag);
2557
- if (!success) {
2558
- // There're children of the heading currently open; close them below and
2559
- // record a parse error.
2560
- // TODO(jdtang): Add a way to distinguish this error case from the one
2561
- // above.
2562
- add_parse_error(parser, token);
2563
- }
2564
- do {
2565
- current_node = pop_current_node(parser);
2566
- } while (!node_tag_in(current_node, GUMBO_TAG_H1, GUMBO_TAG_H2,
2567
- GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5,
2568
- GUMBO_TAG_H6, GUMBO_TAG_LAST));
2569
- return success;
2570
- }
2571
- } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
2572
- bool success = true;
2573
- int last_a;
2574
- int has_matching_a = find_last_anchor_index(parser, &last_a);
2575
- if (has_matching_a) {
2576
- assert(has_matching_a == 1);
2577
- add_parse_error(parser, token);
2578
- adoption_agency_algorithm(parser, token, GUMBO_TAG_A);
2579
- // The adoption agency algorithm usually removes all instances of <a>
2580
- // from the list of active formatting elements, but in case it doesn't,
2581
- // we're supposed to do this. (The conditions where it might not are
2582
- // listed in the spec.)
2583
- if (find_last_anchor_index(parser, &last_a)) {
2584
- void* last_element = gumbo_vector_remove_at(
2585
- parser, last_a, &state->_active_formatting_elements);
2586
- gumbo_vector_remove(
2587
- parser, last_element, &state->_open_elements);
2588
- }
2589
- success = false;
2590
- }
2591
- reconstruct_active_formatting_elements(parser);
2592
- add_formatting_element(parser, insert_element_from_token(parser, token));
2593
- return success;
2594
- } else if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
2595
- GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2596
- GUMBO_TAG_S, GUMBO_TAG_SMALL, GUMBO_TAG_STRIKE,
2597
- GUMBO_TAG_STRONG, GUMBO_TAG_TT, GUMBO_TAG_U,
2598
- GUMBO_TAG_LAST)) {
2599
- reconstruct_active_formatting_elements(parser);
2600
- add_formatting_element(parser, insert_element_from_token(parser, token));
2601
- return true;
2602
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOBR)) {
2603
- bool result = true;
2604
- reconstruct_active_formatting_elements(parser);
2605
- if (has_an_element_in_scope(parser, GUMBO_TAG_NOBR)) {
2606
- result = false;
2607
- add_parse_error(parser, token);
2608
- adoption_agency_algorithm(parser, token, GUMBO_TAG_NOBR);
2609
- reconstruct_active_formatting_elements(parser);
2610
- }
2611
- insert_element_from_token(parser, token);
2612
- add_formatting_element(parser, get_current_node(parser));
2613
- return result;
2614
- } else if (tag_in(token, kEndTag, GUMBO_TAG_A, GUMBO_TAG_B, GUMBO_TAG_BIG,
2615
- GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2616
- GUMBO_TAG_NOBR, GUMBO_TAG_S, GUMBO_TAG_SMALL,
2617
- GUMBO_TAG_STRIKE, GUMBO_TAG_STRONG, GUMBO_TAG_TT,
2618
- GUMBO_TAG_U, GUMBO_TAG_LAST)) {
2619
- return adoption_agency_algorithm(parser, token, token->v.end_tag);
2620
- } else if (tag_in(token, kStartTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2621
- GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2622
- reconstruct_active_formatting_elements(parser);
2623
- insert_element_from_token(parser, token);
2624
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
2625
- set_frameset_not_ok(parser);
2626
- return true;
2627
- } else if (tag_in(token, kEndTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2628
- GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2629
- GumboTag token_tag = token->v.end_tag;
2630
- if (!has_an_element_in_table_scope(parser, token_tag)) {
2631
- add_parse_error(parser, token);
2632
- ignore_token(parser);
2633
- return false;
2634
- }
2635
- implicitly_close_tags(parser, token, token_tag);
2636
- clear_active_formatting_elements(parser);
2637
- return true;
2638
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2639
- if (get_document_node(parser)->v.document.doc_type_quirks_mode !=
2640
- GUMBO_DOCTYPE_QUIRKS) {
2641
- maybe_implicitly_close_p_tag(parser, token);
2642
- }
2643
- insert_element_from_token(parser, token);
2644
- set_frameset_not_ok(parser);
2645
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2646
- return true;
2647
- } else if (tag_in(token, kStartTag, GUMBO_TAG_AREA, GUMBO_TAG_BR,
2648
- GUMBO_TAG_EMBED, GUMBO_TAG_IMG, GUMBO_TAG_IMAGE,
2649
- GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
2650
- bool success = true;
2651
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2652
- success = false;
2653
- add_parse_error(parser, token);
2654
- token->v.start_tag.tag = GUMBO_TAG_IMG;
2655
- }
2656
- reconstruct_active_formatting_elements(parser);
2657
- GumboNode* node = insert_element_from_token(parser, token);
2658
- if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2659
- success = false;
2660
- add_parse_error(parser, token);
2661
- node->v.element.tag = GUMBO_TAG_IMG;
2662
- node->parse_flags |= GUMBO_INSERTION_FROM_IMAGE;
2663
- }
2664
- pop_current_node(parser);
2665
- acknowledge_self_closing_tag(parser);
2666
- set_frameset_not_ok(parser);
2667
- return success;
2668
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT)) {
2669
- if (!attribute_matches(&token->v.start_tag.attributes, "type", "hidden")) {
2670
- // Must be before the element is inserted, as that takes ownership of the
2671
- // token's attribute vector.
2672
- set_frameset_not_ok(parser);
2673
- }
2674
- reconstruct_active_formatting_elements(parser);
2675
- insert_element_from_token(parser, token);
2676
- pop_current_node(parser);
2677
- acknowledge_self_closing_tag(parser);
2678
- return true;
2679
- } else if (tag_in(token, kStartTag, GUMBO_TAG_PARAM, GUMBO_TAG_SOURCE,
2680
- GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
2681
- insert_element_from_token(parser, token);
2682
- pop_current_node(parser);
2683
- acknowledge_self_closing_tag(parser);
2684
- return true;
2685
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HR)) {
2686
- bool result = maybe_implicitly_close_p_tag(parser, token);
2687
- insert_element_from_token(parser, token);
2688
- pop_current_node(parser);
2689
- acknowledge_self_closing_tag(parser);
2690
- set_frameset_not_ok(parser);
2691
- return result;
2692
- } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2693
- add_parse_error(parser, token);
2694
- if (parser->_parser_state->_form_element != NULL) {
2695
- ignore_token(parser);
2696
- return false;
2697
- }
2698
- acknowledge_self_closing_tag(parser);
2699
- maybe_implicitly_close_p_tag(parser, token);
2700
- set_frameset_not_ok(parser);
2701
-
2702
- GumboVector* token_attrs = &token->v.start_tag.attributes;
2703
- GumboAttribute* prompt_attr = gumbo_get_attribute(token_attrs, "prompt");
2704
- GumboAttribute* action_attr = gumbo_get_attribute(token_attrs, "action");
2705
- GumboAttribute* name_attr = gumbo_get_attribute(token_attrs, "isindex");
2706
-
2707
- GumboNode* form = insert_element_of_tag_type(
2708
- parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2709
- if (action_attr) {
2710
- gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2711
- }
2712
- insert_element_of_tag_type(parser, GUMBO_TAG_HR,
2713
- GUMBO_INSERTION_FROM_ISINDEX);
2714
- pop_current_node(parser); // <hr>
2715
-
2716
- insert_element_of_tag_type(parser, GUMBO_TAG_LABEL,
2717
- GUMBO_INSERTION_FROM_ISINDEX);
2718
- TextNodeBufferState* text_state = &parser->_parser_state->_text_node;
2719
- text_state->_start_original_text = token->original_text.data;
2720
- text_state->_start_position = token->position;
2721
- text_state->_type = GUMBO_NODE_TEXT;
2722
- if (prompt_attr) {
2723
- int prompt_attr_length = strlen(prompt_attr->value);
2724
- gumbo_string_buffer_destroy(parser, &text_state->_buffer);
2725
- text_state->_buffer.data = gumbo_copy_stringz(parser, prompt_attr->value);
2726
- text_state->_buffer.length = prompt_attr_length;
2727
- text_state->_buffer.capacity = prompt_attr_length + 1;
2728
- gumbo_destroy_attribute(parser, prompt_attr);
2729
- } else {
2730
- GumboStringPiece prompt_text = GUMBO_STRING(
2731
- "This is a searchable index. Enter search keywords: ");
2732
- gumbo_string_buffer_append_string(
2733
- parser, &prompt_text, &text_state->_buffer);
2734
- }
2735
-
2736
- GumboNode* input = insert_element_of_tag_type(
2737
- parser, GUMBO_TAG_INPUT, GUMBO_INSERTION_FROM_ISINDEX);
2738
- for (int i = 0; i < token_attrs->length; ++i) {
2739
- GumboAttribute* attr = token_attrs->data[i];
2740
- if (attr != prompt_attr && attr != action_attr && attr != name_attr) {
2741
- gumbo_vector_add(parser, attr, &input->v.element.attributes);
2742
- }
2743
- token_attrs->data[i] = NULL;
2744
- }
2745
-
2746
- // All attributes have been successfully transferred and nulled out at this
2747
- // point, so the call to ignore_token will free the memory for it without
2748
- // touching the attributes.
2749
- ignore_token(parser);
2750
-
2751
- GumboAttribute* name =
2752
- gumbo_parser_allocate(parser, sizeof(GumboAttribute));
2753
- GumboStringPiece name_str = GUMBO_STRING("name");
2754
- GumboStringPiece isindex_str = GUMBO_STRING("isindex");
2755
- name->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
2756
- name->name = gumbo_copy_stringz(parser, "name");
2757
- name->value = gumbo_copy_stringz(parser, "isindex");
2758
- name->original_name = name_str;
2759
- name->original_value = isindex_str;
2760
- name->name_start = kGumboEmptySourcePosition;
2761
- name->name_end = kGumboEmptySourcePosition;
2762
- name->value_start = kGumboEmptySourcePosition;
2763
- name->value_end = kGumboEmptySourcePosition;
2764
- gumbo_vector_add(parser, name, &input->v.element.attributes);
2765
-
2766
- pop_current_node(parser); // <input>
2767
- pop_current_node(parser); // <label>
2768
- insert_element_of_tag_type(
2769
- parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2770
- pop_current_node(parser); // <hr>
2771
- pop_current_node(parser); // <form>
2772
- return false;
2773
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2774
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2775
- parser->_parser_state->_ignore_next_linefeed = true;
2776
- set_frameset_not_ok(parser);
2777
- return true;
2778
- } else if (tag_is(token, kStartTag, GUMBO_TAG_XMP)) {
2779
- bool result = maybe_implicitly_close_p_tag(parser, token);
2780
- reconstruct_active_formatting_elements(parser);
2781
- set_frameset_not_ok(parser);
2782
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2783
- return result;
2784
- } else if (tag_is(token, kStartTag, GUMBO_TAG_IFRAME)) {
2785
- set_frameset_not_ok(parser);
2786
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2787
- return true;
2788
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOEMBED)) {
2789
- run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2790
- return true;
2791
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
2792
- reconstruct_active_formatting_elements(parser);
2793
- insert_element_from_token(parser, token);
2794
- set_frameset_not_ok(parser);
2795
- GumboInsertionMode state = parser->_parser_state->_insertion_mode;
2796
- if (state == GUMBO_INSERTION_MODE_IN_TABLE ||
2797
- state == GUMBO_INSERTION_MODE_IN_CAPTION ||
2798
- state == GUMBO_INSERTION_MODE_IN_TABLE_BODY ||
2799
- state == GUMBO_INSERTION_MODE_IN_ROW ||
2800
- state == GUMBO_INSERTION_MODE_IN_CELL) {
2801
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE);
2802
- } else {
2803
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2804
- }
2805
- return true;
2806
- } else if (tag_in(token, kStartTag, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
2807
- GUMBO_TAG_LAST)) {
2808
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2809
- pop_current_node(parser);
2810
- }
2811
- reconstruct_active_formatting_elements(parser);
2812
- insert_element_from_token(parser, token);
2813
- return true;
2814
- } else if (tag_in(token, kStartTag, GUMBO_TAG_RP, GUMBO_TAG_RT,
2815
- GUMBO_TAG_LAST)) {
2816
- bool success = true;
2817
- if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2818
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2819
- }
2820
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
2821
- add_parse_error(parser, token);
2822
- success = false;
2823
- }
2824
- insert_element_from_token(parser, token);
2825
- return success;
2826
- } else if (tag_is(token, kEndTag, GUMBO_TAG_BR)) {
2827
- add_parse_error(parser, token);
2828
- reconstruct_active_formatting_elements(parser);
2829
- insert_element_of_tag_type(
2830
- parser, GUMBO_TAG_BR, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2831
- pop_current_node(parser);
2832
- return false;
2833
- } else if (tag_is(token, kStartTag, GUMBO_TAG_MATH)) {
2834
- reconstruct_active_formatting_elements(parser);
2835
- adjust_mathml_attributes(parser, token);
2836
- adjust_foreign_attributes(parser, token);
2837
- insert_foreign_element(parser, token, GUMBO_NAMESPACE_MATHML);
2838
- if (token->v.start_tag.is_self_closing) {
2839
- pop_current_node(parser);
2840
- acknowledge_self_closing_tag(parser);
2841
- }
2842
- return true;
2843
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SVG)) {
2844
- reconstruct_active_formatting_elements(parser);
2845
- adjust_svg_attributes(parser, token);
2846
- adjust_foreign_attributes(parser, token);
2847
- insert_foreign_element(parser, token, GUMBO_NAMESPACE_SVG);
2848
- if (token->v.start_tag.is_self_closing) {
2849
- pop_current_node(parser);
2850
- acknowledge_self_closing_tag(parser);
2851
- }
2852
- return true;
2853
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
2854
- GUMBO_TAG_COLGROUP, GUMBO_TAG_FRAME, GUMBO_TAG_HEAD,
2855
- GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2856
- GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2857
- GUMBO_TAG_LAST)) {
2858
- add_parse_error(parser, token);
2859
- ignore_token(parser);
2860
- return false;
2861
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
2862
- reconstruct_active_formatting_elements(parser);
2863
- insert_element_from_token(parser, token);
2864
- return true;
2865
- } else {
2866
- assert(token->type == GUMBO_TOKEN_END_TAG);
2867
- GumboTag end_tag = token->v.end_tag;
2868
- assert(state->_open_elements.length > 0);
2869
- assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2870
- // Walk up the stack of open elements until we find one that either:
2871
- // a) Matches the tag name we saw
2872
- // b) Is in the "special" category.
2873
- // If we see a), implicitly close everything up to and including it. If we
2874
- // see b), then record a parse error, don't close anything (except the
2875
- // implied end tags) and ignore the end tag token.
2876
- for (int i = state->_open_elements.length - 1; ; --i) {
2877
- const GumboNode* node = state->_open_elements.data[i];
2878
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
2879
- node_tag_is(node, end_tag)) {
2880
- generate_implied_end_tags(parser, end_tag);
2881
- // TODO(jdtang): Do I need to add a parse error here? The condition in
2882
- // the spec seems like it's the inverse of the loop condition above, and
2883
- // so would never fire.
2884
- while (node != pop_current_node(parser)); // Pop everything.
2885
- return true;
2886
- } else if (is_special_node(node)) {
2887
- add_parse_error(parser, token);
2888
- ignore_token(parser);
2889
- return false;
2890
- }
2891
- }
2892
- // <html> is in the special category, so we should never get here.
2893
- assert(0);
2894
- return false;
2895
- }
2896
- }
2897
-
2898
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incdata
2899
- static bool handle_text(GumboParser* parser, GumboToken* token) {
2900
- if (token->type == GUMBO_TOKEN_CHARACTER || token->type == GUMBO_TOKEN_WHITESPACE) {
2901
- insert_text_token(parser, token);
2902
- } else {
2903
- // We provide only bare-bones script handling that doesn't involve any of
2904
- // the parser-pause/already-started/script-nesting flags or re-entrant
2905
- // invocations of the tokenizer. Because the intended usage of this library
2906
- // is mostly for templating, refactoring, and static-analysis libraries, we
2907
- // provide the script body as a text-node child of the <script> element.
2908
- // This behavior doesn't support document.write of partial HTML elements,
2909
- // but should be adequate for almost all other scripting support.
2910
- if (token->type == GUMBO_TOKEN_EOF) {
2911
- add_parse_error(parser, token);
2912
- parser->_parser_state->_reprocess_current_token = true;
2913
- }
2914
- pop_current_node(parser);
2915
- set_insertion_mode(parser, parser->_parser_state->_original_insertion_mode);
2916
- }
2917
- return true;
2918
- }
2919
-
2920
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intable
2921
- static bool handle_in_table(GumboParser* parser, GumboToken* token) {
2922
- GumboParserState* state = parser->_parser_state;
2923
- if (token->type == GUMBO_TOKEN_CHARACTER ||
2924
- token->type == GUMBO_TOKEN_WHITESPACE) {
2925
- // The "pending table character tokens" list described in the spec is
2926
- // nothing more than the TextNodeBufferState. We accumulate text tokens as
2927
- // normal, except that when we go to flush them in the handle_in_table_text,
2928
- // we set _foster_parent_insertions if there're non-whitespace characters in
2929
- // the buffer.
2930
- assert(state->_text_node._buffer.length == 0);
2931
- state->_original_insertion_mode = state->_insertion_mode;
2932
- state->_reprocess_current_token = true;
2933
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_TEXT);
2934
- return true;
2935
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
2936
- add_parse_error(parser, token);
2937
- ignore_token(parser);
2938
- return false;
2939
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
2940
- append_comment_node(parser, get_current_node(parser), token);
2941
- return true;
2942
- } else if (tag_is(token, kStartTag, GUMBO_TAG_CAPTION)) {
2943
- clear_stack_to_table_context(parser);
2944
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
2945
- insert_element_from_token(parser, token);
2946
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CAPTION);
2947
- return true;
2948
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COLGROUP)) {
2949
- clear_stack_to_table_context(parser);
2950
- insert_element_from_token(parser, token);
2951
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2952
- return true;
2953
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
2954
- clear_stack_to_table_context(parser);
2955
- insert_element_of_tag_type(
2956
- parser, GUMBO_TAG_COLGROUP, GUMBO_INSERTION_IMPLIED);
2957
- parser->_parser_state->_reprocess_current_token = true;
2958
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2959
- return true;
2960
- } else if (tag_in(token, kStartTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
2961
- GUMBO_TAG_THEAD, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2962
- GUMBO_TAG_LAST)) {
2963
- clear_stack_to_table_context(parser);
2964
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
2965
- if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2966
- GUMBO_TAG_LAST)) {
2967
- insert_element_of_tag_type(
2968
- parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
2969
- state->_reprocess_current_token = true;
2970
- } else {
2971
- insert_element_from_token(parser, token);
2972
- }
2973
- return true;
2974
- } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
2975
- add_parse_error(parser, token);
2976
- if (close_table(parser)) {
2977
- parser->_parser_state->_reprocess_current_token = true;
2978
- } else {
2979
- ignore_token(parser);
2980
- }
2981
- return false;
2982
- } else if (tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
2983
- if (!close_table(parser)) {
2984
- add_parse_error(parser, token);
2985
- return false;
2986
- }
2987
- return true;
2988
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
2989
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
2990
- GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2991
- GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2992
- GUMBO_TAG_LAST)) {
2993
- add_parse_error(parser, token);
2994
- ignore_token(parser);
2995
- return false;
2996
- } else if (tag_in(token, kStartTag, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
2997
- GUMBO_TAG_LAST)) {
2998
- return handle_in_head(parser, token);
2999
- } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3000
- attribute_matches(&token->v.start_tag.attributes,
3001
- "type", "hidden")) {
3002
- add_parse_error(parser, token);
3003
- insert_element_from_token(parser, token);
3004
- pop_current_node(parser);
3005
- return false;
3006
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3007
- add_parse_error(parser, token);
3008
- if (state->_form_element) {
3009
- ignore_token(parser);
3010
- return false;
3011
- }
3012
- state->_form_element = insert_element_from_token(parser, token);
3013
- pop_current_node(parser);
3014
- return false;
3015
- } else if (token->type == GUMBO_TOKEN_EOF) {
3016
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3017
- add_parse_error(parser, token);
3018
- return false;
3019
- }
3020
- return true;
3021
- } else {
3022
- add_parse_error(parser, token);
3023
- state->_foster_parent_insertions = true;
3024
- bool result = handle_in_body(parser, token);
3025
- state->_foster_parent_insertions = false;
3026
- return result;
3027
- }
3028
- }
3029
-
3030
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intabletext
3031
- static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3032
- if (token->type == GUMBO_TOKEN_NULL) {
3033
- add_parse_error(parser, token);
3034
- ignore_token(parser);
3035
- return false;
3036
- } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3037
- token->type == GUMBO_TOKEN_WHITESPACE) {
3038
- insert_text_token(parser, token);
3039
- return true;
3040
- } else {
3041
- GumboParserState* state = parser->_parser_state;
3042
- GumboStringBuffer* buffer = &state->_text_node._buffer;
3043
- // Can't use strspn for this because GumboStringBuffers are not
3044
- // null-terminated.
3045
- // Note that TextNodeBuffer may contain UTF-8 characters, but the presence
3046
- // of any one byte that is not whitespace means we flip the flag, so this
3047
- // loop is still valid.
3048
- for (int i = 0; i < buffer->length; ++i) {
3049
- if (!isspace(buffer->data[i]) || buffer->data[i] == '\v') {
3050
- state->_foster_parent_insertions = true;
3051
- reconstruct_active_formatting_elements(parser);
3052
- break;
3053
- }
3054
- }
3055
- maybe_flush_text_node_buffer(parser);
3056
- state->_foster_parent_insertions = false;
3057
- state->_reprocess_current_token = true;
3058
- state->_insertion_mode = state->_original_insertion_mode;
3059
- return true;
3060
- }
3061
- }
3062
-
3063
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3064
- static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3065
- if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3066
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3067
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3068
- GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3069
- tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3070
- GUMBO_TAG_LAST)) {
3071
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3072
- add_parse_error(parser, token);
3073
- ignore_token(parser);
3074
- return false;
3075
- }
3076
- if (!tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3077
- add_parse_error(parser, token);
3078
- parser->_parser_state->_reprocess_current_token = true;
3079
- }
3080
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3081
- bool result = true;
3082
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3083
- add_parse_error(parser, token);
3084
- while (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3085
- pop_current_node(parser);
3086
- }
3087
- result = false;
3088
- }
3089
- pop_current_node(parser); // The <caption> itself.
3090
- clear_active_formatting_elements(parser);
3091
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3092
- return result;
3093
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_COL,
3094
- GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML, GUMBO_TAG_TBODY,
3095
- GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
3096
- GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3097
- add_parse_error(parser, token);
3098
- ignore_token(parser);
3099
- return false;
3100
- } else {
3101
- return handle_in_body(parser, token);
3102
- }
3103
- }
3104
-
3105
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incolgroup
3106
- static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3107
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3108
- insert_text_token(parser, token);
3109
- return true;
3110
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3111
- add_parse_error(parser, token);
3112
- ignore_token(parser);
3113
- return false;
3114
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3115
- append_comment_node(parser, get_current_node(parser), token);
3116
- return true;
3117
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3118
- return handle_in_body(parser, token);
3119
- } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3120
- insert_element_from_token(parser, token);
3121
- pop_current_node(parser);
3122
- acknowledge_self_closing_tag(parser);
3123
- return true;
3124
- } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3125
- add_parse_error(parser, token);
3126
- ignore_token(parser);
3127
- return false;
3128
- } else if (token->type == GUMBO_TOKEN_EOF &&
3129
- get_current_node(parser) == parser->_output->root) {
3130
- return true;
3131
- } else {
3132
- if (get_current_node(parser) == parser->_output->root) {
3133
- add_parse_error(parser, token);
3134
- return false;
3135
- }
3136
- assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
3137
- pop_current_node(parser);
3138
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3139
- if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3140
- parser->_parser_state->_reprocess_current_token = true;
3141
- }
3142
- return true;
3143
- }
3144
- }
3145
-
3146
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intbody
3147
- static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3148
- if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3149
- clear_stack_to_table_body_context(parser);
3150
- insert_element_from_token(parser, token);
3151
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3152
- return true;
3153
- } else if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH,
3154
- GUMBO_TAG_LAST)) {
3155
- add_parse_error(parser, token);
3156
- clear_stack_to_table_body_context(parser);
3157
- insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3158
- parser->_parser_state->_reprocess_current_token = true;
3159
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3160
- return false;
3161
- } else if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3162
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
3163
- if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3164
- add_parse_error(parser, token);
3165
- ignore_token(parser);
3166
- return false;
3167
- }
3168
- clear_stack_to_table_body_context(parser);
3169
- pop_current_node(parser);
3170
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3171
- return true;
3172
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3173
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3174
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
3175
- tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3176
- if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3177
- has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
3178
- has_an_element_in_table_scope(parser, GUMBO_TAG_TFOOT))) {
3179
- add_parse_error(parser, token);
3180
- ignore_token(parser);
3181
- return false;
3182
- }
3183
- clear_stack_to_table_body_context(parser);
3184
- pop_current_node(parser);
3185
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3186
- parser->_parser_state->_reprocess_current_token = true;
3187
- return true;
3188
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3189
- GUMBO_TAG_COL, GUMBO_TAG_TR, GUMBO_TAG_COLGROUP,
3190
- GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST))
3191
- {
3192
- add_parse_error(parser, token);
3193
- ignore_token(parser);
3194
- return false;
3195
- } else {
3196
- return handle_in_table(parser, token);
3197
- }
3198
- }
3199
-
3200
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3201
- static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3202
- if (tag_in(token, kStartTag, GUMBO_TAG_TH, GUMBO_TAG_TD, GUMBO_TAG_LAST)) {
3203
- clear_stack_to_table_row_context(parser);
3204
- insert_element_from_token(parser, token);
3205
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3206
- add_formatting_element(parser, &kActiveFormattingScopeMarker);
3207
- return true;
3208
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP,
3209
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3210
- GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3211
- tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
3212
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3213
- GUMBO_TAG_LAST)) {
3214
- // This case covers 4 clauses of the spec, each of which say "Otherwise, act
3215
- // as if an end tag with the tag name "tr" had been seen." The differences
3216
- // are in error handling and whether the current token is reprocessed.
3217
- GumboTag desired_tag =
3218
- tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3219
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST)
3220
- ? token->v.end_tag : GUMBO_TAG_TR;
3221
- if (!has_an_element_in_table_scope(parser, desired_tag)) {
3222
- gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
3223
- gumbo_normalized_tagname(desired_tag));
3224
- for (int i = 0; i < parser->_parser_state->_open_elements.length; ++i) {
3225
- const GumboNode* node = parser->_parser_state->_open_elements.data[i];
3226
- gumbo_debug("%s\n", gumbo_normalized_tagname(node->v.element.tag));
3227
- }
3228
- add_parse_error(parser, token);
3229
- ignore_token(parser);
3230
- return false;
3231
- }
3232
- clear_stack_to_table_row_context(parser);
3233
- GumboNode* last_element = pop_current_node(parser);
3234
- assert(node_tag_is(last_element, GUMBO_TAG_TR));
3235
- AVOID_UNUSED_VARIABLE_WARNING(last_element);
3236
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3237
- if (!tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3238
- parser->_parser_state->_reprocess_current_token = true;
3239
- }
3240
- return true;
3241
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3242
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3243
- GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3244
- add_parse_error(parser, token);
3245
- ignore_token(parser);
3246
- return false;
3247
- } else {
3248
- return handle_in_table(parser, token);
3249
- }
3250
- }
3251
-
3252
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3253
- static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3254
- if (tag_in(token, kEndTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3255
- GumboTag token_tag = token->v.end_tag;
3256
- if (!has_an_element_in_table_scope(parser, token_tag)) {
3257
- add_parse_error(parser, token);
3258
- return false;
3259
- }
3260
- return close_table_cell(parser, token, token_tag);
3261
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3262
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3263
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3264
- GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3265
- gumbo_debug("Handling <td> in cell.\n");
3266
- if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3267
- !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
3268
- gumbo_debug("Bailing out because there's no <td> or <th> in scope.\n");
3269
- add_parse_error(parser, token);
3270
- ignore_token(parser);
3271
- return false;
3272
- }
3273
- parser->_parser_state->_reprocess_current_token = true;
3274
- return close_current_cell(parser, token);
3275
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3276
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3277
- GUMBO_TAG_LAST)) {
3278
- add_parse_error(parser, token);
3279
- ignore_token(parser);
3280
- return false;
3281
- } else if (tag_in(token, kEndTag, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
3282
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3283
- GUMBO_TAG_LAST)) {
3284
- if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3285
- add_parse_error(parser, token);
3286
- ignore_token(parser);
3287
- return false;
3288
- }
3289
- parser->_parser_state->_reprocess_current_token = true;
3290
- return close_current_cell(parser, token);
3291
- } else {
3292
- return handle_in_body(parser, token);
3293
- }
3294
- }
3295
-
3296
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselect
3297
- static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3298
- if (token->type == GUMBO_TOKEN_NULL) {
3299
- add_parse_error(parser, token);
3300
- ignore_token(parser);
3301
- return false;
3302
- } else if (token->type == GUMBO_TOKEN_CHARACTER ||
3303
- token->type == GUMBO_TOKEN_WHITESPACE) {
3304
- insert_text_token(parser, token);
3305
- return true;
3306
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3307
- add_parse_error(parser, token);
3308
- ignore_token(parser);
3309
- return false;
3310
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3311
- append_comment_node(parser, get_current_node(parser), token);
3312
- return true;
3313
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3314
- return handle_in_body(parser, token);
3315
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3316
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3317
- pop_current_node(parser);
3318
- }
3319
- insert_element_from_token(parser, token);
3320
- return true;
3321
- } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3322
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3323
- pop_current_node(parser);
3324
- }
3325
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3326
- pop_current_node(parser);
3327
- }
3328
- insert_element_from_token(parser, token);
3329
- return true;
3330
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3331
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
3332
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3333
- node_tag_is(open_elements->data[open_elements->length - 2],
3334
- GUMBO_TAG_OPTGROUP)) {
3335
- pop_current_node(parser);
3336
- }
3337
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3338
- pop_current_node(parser);
3339
- return true;
3340
- } else {
3341
- add_parse_error(parser, token);
3342
- ignore_token(parser);
3343
- return false;
3344
- }
3345
- } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3346
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3347
- pop_current_node(parser);
3348
- return true;
3349
- } else {
3350
- add_parse_error(parser, token);
3351
- ignore_token(parser);
3352
- return false;
3353
- }
3354
- } else if (tag_is(token, kEndTag, GUMBO_TAG_SELECT)) {
3355
- if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3356
- add_parse_error(parser, token);
3357
- ignore_token(parser);
3358
- return false;
3359
- }
3360
- close_current_select(parser);
3361
- return true;
3362
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SELECT)) {
3363
- add_parse_error(parser, token);
3364
- ignore_token(parser);
3365
- close_current_select(parser);
3366
- return false;
3367
- } else if (tag_in(token, kStartTag, GUMBO_TAG_INPUT, GUMBO_TAG_KEYGEN,
3368
- GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
3369
- add_parse_error(parser, token);
3370
- if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3371
- ignore_token(parser);
3372
- } else {
3373
- close_current_select(parser);
3374
- parser->_parser_state->_reprocess_current_token = true;
3375
- }
3376
- return false;
3377
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
3378
- return handle_in_head(parser, token);
3379
- } else if (token->type == GUMBO_TOKEN_EOF) {
3380
- if (get_current_node(parser) != parser->_output->root) {
3381
- add_parse_error(parser, token);
3382
- return false;
3383
- }
3384
- return true;
3385
- } else {
3386
- add_parse_error(parser, token);
3387
- ignore_token(parser);
3388
- return false;
3389
- }
3390
- }
3391
-
3392
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3393
- static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3394
- if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3395
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3396
- GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3397
- add_parse_error(parser, token);
3398
- close_current_select(parser);
3399
- parser->_parser_state->_reprocess_current_token = true;
3400
- return false;
3401
- } else if (tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3402
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3403
- GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3404
- add_parse_error(parser, token);
3405
- if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
3406
- close_current_select(parser);
3407
- reset_insertion_mode_appropriately(parser);
3408
- parser->_parser_state->_reprocess_current_token = true;
3409
- } else {
3410
- ignore_token(parser);
3411
- }
3412
- return false;
3413
- } else {
3414
- return handle_in_select(parser, token);
3415
- }
3416
- }
3417
-
3418
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
3419
- static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3420
- if (token->type == GUMBO_TOKEN_WHITESPACE ||
3421
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3422
- return handle_in_body(parser, token);
3423
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3424
- GumboNode* html_node = parser->_output->root;
3425
- assert(html_node != NULL);
3426
- append_comment_node(parser, html_node, token);
3427
- return true;
3428
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3429
- add_parse_error(parser, token);
3430
- ignore_token(parser);
3431
- return false;
3432
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3433
- // TODO(jdtang): Handle fragment parsing algorithm case.
3434
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3435
- GumboNode* html = parser->_parser_state->_open_elements.data[0];
3436
- assert(node_tag_is(html, GUMBO_TAG_HTML));
3437
- record_end_of_element(
3438
- parser->_parser_state->_current_token, &html->v.element);
3439
- return true;
3440
- } else if (token->type == GUMBO_TOKEN_EOF) {
3441
- return true;
3442
- } else {
3443
- add_parse_error(parser, token);
3444
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3445
- parser->_parser_state->_reprocess_current_token = true;
3446
- return false;
3447
- }
3448
- }
3449
-
3450
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inframeset
3451
- static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3452
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3453
- insert_text_token(parser, token);
3454
- return true;
3455
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3456
- append_comment_node(parser, get_current_node(parser), token);
3457
- return true;
3458
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3459
- add_parse_error(parser, token);
3460
- ignore_token(parser);
3461
- return false;
3462
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3463
- return handle_in_body(parser, token);
3464
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
3465
- insert_element_from_token(parser, token);
3466
- return true;
3467
- } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3468
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3469
- add_parse_error(parser, token);
3470
- ignore_token(parser);
3471
- return false;
3472
- }
3473
- pop_current_node(parser);
3474
- // TODO(jdtang): Add a condition to ignore this for the fragment parsing
3475
- // algorithm.
3476
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3477
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3478
- }
3479
- return true;
3480
- } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAME)) {
3481
- insert_element_from_token(parser, token);
3482
- pop_current_node(parser);
3483
- acknowledge_self_closing_tag(parser);
3484
- return true;
3485
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3486
- return handle_in_head(parser, token);
3487
- } else if (token->type == GUMBO_TOKEN_EOF) {
3488
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3489
- add_parse_error(parser, token);
3490
- return false;
3491
- }
3492
- return true;
3493
- } else {
3494
- add_parse_error(parser, token);
3495
- ignore_token(parser);
3496
- return false;
3497
- }
3498
- }
3499
-
3500
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterframeset
3501
- static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3502
- if (token->type == GUMBO_TOKEN_WHITESPACE) {
3503
- insert_text_token(parser, token);
3504
- return true;
3505
- } else if (token->type == GUMBO_TOKEN_COMMENT) {
3506
- append_comment_node(parser, get_current_node(parser), token);
3507
- return true;
3508
- } else if (token->type == GUMBO_TOKEN_DOCTYPE) {
3509
- add_parse_error(parser, token);
3510
- ignore_token(parser);
3511
- return false;
3512
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3513
- return handle_in_body(parser, token);
3514
- } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3515
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
3516
- return true;
3517
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3518
- return handle_in_head(parser, token);
3519
- } else if (token->type == GUMBO_TOKEN_EOF) {
3520
- return true;
3521
- } else {
3522
- add_parse_error(parser, token);
3523
- ignore_token(parser);
3524
- return false;
3525
- }
3526
- }
3527
-
3528
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-body-insertion-mode
3529
- static bool handle_after_after_body(GumboParser* parser, GumboToken* token) {
3530
- if (token->type == GUMBO_TOKEN_COMMENT) {
3531
- append_comment_node(parser, get_document_node(parser), token);
3532
- return true;
3533
- } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3534
- token->type == GUMBO_TOKEN_WHITESPACE ||
3535
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3536
- return handle_in_body(parser, token);
3537
- } else if (token->type == GUMBO_TOKEN_EOF) {
3538
- return true;
3539
- } else {
3540
- add_parse_error(parser, token);
3541
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3542
- parser->_parser_state->_reprocess_current_token = true;
3543
- return false;
3544
- }
3545
- }
3546
-
3547
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#the-after-after-frameset-insertion-mode
3548
- static bool handle_after_after_frameset(
3549
- GumboParser* parser, GumboToken* token) {
3550
- if (token->type == GUMBO_TOKEN_COMMENT) {
3551
- append_comment_node(parser, get_document_node(parser), token);
3552
- return true;
3553
- } else if (token->type == GUMBO_TOKEN_DOCTYPE ||
3554
- token->type == GUMBO_TOKEN_WHITESPACE ||
3555
- tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3556
- return handle_in_body(parser, token);
3557
- } else if (token->type == GUMBO_TOKEN_EOF) {
3558
- return true;
3559
- } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3560
- return handle_in_head(parser, token);
3561
- } else {
3562
- add_parse_error(parser, token);
3563
- ignore_token(parser);
3564
- return false;
3565
- }
3566
- }
3567
-
3568
- // Function pointers for each insertion mode. Keep in sync with
3569
- // insertion_mode.h.
3570
- typedef bool (*TokenHandler)(GumboParser* parser, GumboToken* token);
3571
- static const TokenHandler kTokenHandlers[] = {
3572
- handle_initial,
3573
- handle_before_html,
3574
- handle_before_head,
3575
- handle_in_head,
3576
- handle_in_head_noscript,
3577
- handle_after_head,
3578
- handle_in_body,
3579
- handle_text,
3580
- handle_in_table,
3581
- handle_in_table_text,
3582
- handle_in_caption,
3583
- handle_in_column_group,
3584
- handle_in_table_body,
3585
- handle_in_row,
3586
- handle_in_cell,
3587
- handle_in_select,
3588
- handle_in_select_in_table,
3589
- handle_after_body,
3590
- handle_in_frameset,
3591
- handle_after_frameset,
3592
- handle_after_after_body,
3593
- handle_after_after_frameset
3594
- };
3595
-
3596
- static bool handle_html_content(GumboParser* parser, GumboToken* token) {
3597
- return kTokenHandlers[parser->_parser_state->_insertion_mode](
3598
- parser, token);
3599
- }
3600
-
3601
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inforeign
3602
- static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3603
- switch (token->type) {
3604
- case GUMBO_TOKEN_NULL:
3605
- add_parse_error(parser, token);
3606
- token->type = GUMBO_TOKEN_CHARACTER;
3607
- token->v.character = kUtf8ReplacementChar;
3608
- insert_text_token(parser, token);
3609
- return false;
3610
- case GUMBO_TOKEN_WHITESPACE:
3611
- insert_text_token(parser, token);
3612
- return true;
3613
- case GUMBO_TOKEN_CHARACTER:
3614
- insert_text_token(parser, token);
3615
- set_frameset_not_ok(parser);
3616
- return true;
3617
- case GUMBO_TOKEN_COMMENT:
3618
- append_comment_node(parser, get_current_node(parser), token);
3619
- return true;
3620
- case GUMBO_TOKEN_DOCTYPE:
3621
- add_parse_error(parser, token);
3622
- ignore_token(parser);
3623
- return false;
3624
- default:
3625
- // Fall through to the if-statements below.
3626
- break;
3627
- }
3628
- // Order matters for these clauses.
3629
- if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
3630
- GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BODY, GUMBO_TAG_BR,
3631
- GUMBO_TAG_CENTER, GUMBO_TAG_CODE, GUMBO_TAG_DD, GUMBO_TAG_DIV,
3632
- GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EM, GUMBO_TAG_EMBED,
3633
- GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3634
- GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD, GUMBO_TAG_HR,
3635
- GUMBO_TAG_I, GUMBO_TAG_IMG, GUMBO_TAG_LI, GUMBO_TAG_LISTING,
3636
- GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NOBR, GUMBO_TAG_OL,
3637
- GUMBO_TAG_P, GUMBO_TAG_PRE, GUMBO_TAG_RUBY, GUMBO_TAG_S,
3638
- GUMBO_TAG_SMALL, GUMBO_TAG_SPAN, GUMBO_TAG_STRONG,
3639
- GUMBO_TAG_STRIKE, GUMBO_TAG_SUB, GUMBO_TAG_SUP,
3640
- GUMBO_TAG_TABLE, GUMBO_TAG_TT, GUMBO_TAG_U, GUMBO_TAG_UL,
3641
- GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
3642
- (tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
3643
- token_has_attribute(token, "color") ||
3644
- token_has_attribute(token, "face") ||
3645
- token_has_attribute(token, "size")))) {
3646
- add_parse_error(parser, token);
3647
- do {
3648
- pop_current_node(parser);
3649
- } while(!(is_mathml_integration_point(get_current_node(parser)) ||
3650
- is_html_integration_point(get_current_node(parser)) ||
3651
- get_current_node(parser)->v.element.tag_namespace ==
3652
- GUMBO_NAMESPACE_HTML));
3653
- parser->_parser_state->_reprocess_current_token = true;
3654
- return false;
3655
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3656
- const GumboNamespaceEnum current_namespace =
3657
- get_current_node(parser)->v.element.tag_namespace;
3658
- if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3659
- adjust_mathml_attributes(parser, token);
3660
- }
3661
- if (current_namespace == GUMBO_NAMESPACE_SVG) {
3662
- // Tag adjustment is left to the gumbo_normalize_svg_tagname helper
3663
- // function.
3664
- adjust_svg_attributes(parser, token);
3665
- }
3666
- adjust_foreign_attributes(parser, token);
3667
- insert_foreign_element(parser, token, current_namespace);
3668
- if (token->v.start_tag.is_self_closing) {
3669
- pop_current_node(parser);
3670
- acknowledge_self_closing_tag(parser);
3671
- }
3672
- return true;
3673
- // </script> tags are handled like any other end tag, putting the script's
3674
- // text into a text node child and closing the current node.
3675
- } else {
3676
- assert(token->type == GUMBO_TOKEN_END_TAG);
3677
- GumboNode* node = get_current_node(parser);
3678
- assert(node != NULL);
3679
- GumboStringPiece token_tagname = token->original_text;
3680
- GumboStringPiece node_tagname = node->v.element.original_tag;
3681
- gumbo_tag_from_original_text(&token_tagname);
3682
- gumbo_tag_from_original_text(&node_tagname);
3683
-
3684
- bool is_success = true;
3685
- if (!gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3686
- add_parse_error(parser, token);
3687
- is_success = false;
3688
- }
3689
- int i = parser->_parser_state->_open_elements.length - 1;
3690
- while (i > 0) {
3691
- // Here we move up the stack until we find an HTML element (in which
3692
- // case we do nothing) or we find the element that we're about to
3693
- // close (in which case we pop everything we've seen until that
3694
- // point.)
3695
- gumbo_debug("Foreign %.*s node at %d.\n", node_tagname.length,
3696
- node_tagname.data, i);
3697
- if (gumbo_string_equals_ignore_case(&node_tagname, &token_tagname)) {
3698
- gumbo_debug("Matches.\n");
3699
- while (pop_current_node(parser) != node) {
3700
- // Pop all the nodes below the current one. Node is guaranteed to
3701
- // be an element on the stack of open elements (set below), so
3702
- // this loop is guaranteed to terminate.
3703
- }
3704
- return is_success;
3705
- }
3706
- --i;
3707
- node = parser->_parser_state->_open_elements.data[i];
3708
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML) {
3709
- // Must break before gumbo_tag_from_original_text to avoid passing
3710
- // parser-inserted nodes through.
3711
- break;
3712
- }
3713
- node_tagname = node->v.element.original_tag;
3714
- gumbo_tag_from_original_text(&node_tagname);
3715
- }
3716
- assert(node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML);
3717
- // We can't call handle_token directly because the current node is still in
3718
- // the SVG namespace, so it would re-enter this and result in infinite
3719
- // recursion.
3720
- return handle_html_content(parser, token) && is_success;
3721
- }
3722
- }
3723
-
3724
-
3725
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#tree-construction
3726
- static bool handle_token(GumboParser* parser, GumboToken* token) {
3727
- if (parser->_parser_state->_ignore_next_linefeed &&
3728
- token->type == GUMBO_TOKEN_WHITESPACE && token->v.character == '\n') {
3729
- parser->_parser_state->_ignore_next_linefeed = false;
3730
- ignore_token(parser);
3731
- return true;
3732
- }
3733
- // This needs to be reset both here and in the conditional above to catch both
3734
- // the case where the next token is not whitespace (so we don't ignore
3735
- // whitespace in the middle of <pre> tags) and where there are multiple
3736
- // whitespace tokens (so we don't ignore the second one).
3737
- parser->_parser_state->_ignore_next_linefeed = false;
3738
-
3739
- if (tag_is(token, kEndTag, GUMBO_TAG_BODY)) {
3740
- parser->_parser_state->_closed_body_tag = true;
3741
- }
3742
- if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3743
- parser->_parser_state->_closed_html_tag = true;
3744
- }
3745
-
3746
- const GumboNode* current_node = get_current_node(parser);
3747
- assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT);
3748
- if (current_node) {
3749
- gumbo_debug("Current node: <%s>.\n",
3750
- gumbo_normalized_tagname(current_node->v.element.tag));
3751
- }
3752
- if (!current_node ||
3753
- current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML ||
3754
- (is_mathml_integration_point(current_node) &&
3755
- (token->type == GUMBO_TOKEN_CHARACTER ||
3756
- token->type == GUMBO_TOKEN_WHITESPACE ||
3757
- token->type == GUMBO_TOKEN_NULL ||
3758
- (token->type == GUMBO_TOKEN_START_TAG &&
3759
- !tag_in(token, kStartTag, GUMBO_TAG_MGLYPH, GUMBO_TAG_MALIGNMARK,
3760
- GUMBO_TAG_LAST)))) ||
3761
- (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3762
- node_tag_is(current_node, GUMBO_TAG_ANNOTATION_XML) &&
3763
- tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3764
- (is_html_integration_point(current_node) && (
3765
- token->type == GUMBO_TOKEN_START_TAG ||
3766
- token->type == GUMBO_TOKEN_CHARACTER ||
3767
- token->type == GUMBO_TOKEN_NULL ||
3768
- token->type == GUMBO_TOKEN_WHITESPACE)) ||
3769
- token->type == GUMBO_TOKEN_EOF) {
3770
- return handle_html_content(parser, token);
3771
- } else {
3772
- return handle_in_foreign_content(parser, token);
3773
- }
3774
- }
3775
-
3776
- GumboOutput* gumbo_parse(const char* buffer) {
3777
- return gumbo_parse_with_options(
3778
- &kGumboDefaultOptions, buffer, strlen(buffer));
3779
- }
3780
-
3781
- GumboOutput* gumbo_parse_with_options(
3782
- const GumboOptions* options, const char* buffer, size_t length) {
3783
- GumboParser parser;
3784
- parser._options = options;
3785
- output_init(&parser);
3786
- gumbo_tokenizer_state_init(&parser, buffer, length);
3787
- parser_state_init(&parser);
3788
-
3789
- GumboParserState* state = parser._parser_state;
3790
- gumbo_debug("Parsing %.*s.\n", length, buffer);
3791
-
3792
- // Sanity check so that infinite loops die with an assertion failure instead
3793
- // of hanging the process before we ever get an error.
3794
- int loop_count = 0;
3795
-
3796
- GumboToken token;
3797
- bool has_error = false;
3798
- do {
3799
- if (state->_reprocess_current_token) {
3800
- state->_reprocess_current_token = false;
3801
- } else {
3802
- GumboNode* current_node = get_current_node(&parser);
3803
- gumbo_tokenizer_set_is_current_node_foreign(
3804
- &parser, current_node &&
3805
- current_node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML);
3806
- has_error = !gumbo_lex(&parser, &token) || has_error;
3807
- }
3808
- const char* token_type = "text";
3809
- switch (token.type) {
3810
- case GUMBO_TOKEN_DOCTYPE:
3811
- token_type = "doctype";
3812
- break;
3813
- case GUMBO_TOKEN_START_TAG:
3814
- token_type = gumbo_normalized_tagname(token.v.start_tag.tag);
3815
- break;
3816
- case GUMBO_TOKEN_END_TAG:
3817
- token_type = gumbo_normalized_tagname(token.v.end_tag);
3818
- break;
3819
- case GUMBO_TOKEN_COMMENT:
3820
- token_type = "comment";
3821
- break;
3822
- default:
3823
- break;
3824
- }
3825
- gumbo_debug("Handling %s token @%d:%d in state %d.\n",
3826
- (char*) token_type, token.position.line, token.position.column,
3827
- state->_insertion_mode);
3828
-
3829
- state->_current_token = &token;
3830
- state->_self_closing_flag_acknowledged =
3831
- !(token.type == GUMBO_TOKEN_START_TAG &&
3832
- token.v.start_tag.is_self_closing);
3833
-
3834
- has_error = !handle_token(&parser, &token) || has_error;
3835
-
3836
- // Check for memory leaks when ownership is transferred from start tag
3837
- // tokens to nodes.
3838
- assert(state->_reprocess_current_token ||
3839
- token.type != GUMBO_TOKEN_START_TAG ||
3840
- token.v.start_tag.attributes.data == NULL);
3841
-
3842
- if (!state->_self_closing_flag_acknowledged) {
3843
- GumboError* error = add_parse_error(&parser, &token);
3844
- if (error) {
3845
- error->type = GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG;
3846
- }
3847
- }
3848
-
3849
- ++loop_count;
3850
- assert(loop_count < 1000000000);
3851
-
3852
- } while ((token.type != GUMBO_TOKEN_EOF || state->_reprocess_current_token) &&
3853
- !(options->stop_on_first_error && has_error));
3854
-
3855
- finish_parsing(&parser);
3856
- // For API uniformity reasons, if the doctype still has nulls, convert them to
3857
- // empty strings.
3858
- GumboDocument* doc_type = &parser._output->document->v.document;
3859
- if (doc_type->name == NULL) {
3860
- doc_type->name = gumbo_copy_stringz(&parser, "");
3861
- }
3862
- if (doc_type->public_identifier == NULL) {
3863
- doc_type->public_identifier = gumbo_copy_stringz(&parser, "");
3864
- }
3865
- if (doc_type->system_identifier == NULL) {
3866
- doc_type->system_identifier = gumbo_copy_stringz(&parser, "");
3867
- }
3868
-
3869
- parser_state_destroy(&parser);
3870
- gumbo_tokenizer_state_destroy(&parser);
3871
- return parser._output;
3872
- }
3873
-
3874
- void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
3875
- // Need a dummy GumboParser because the allocator comes along with the
3876
- // options object.
3877
- GumboParser parser;
3878
- parser._options = options;
3879
- destroy_node(&parser, node);
3880
- }
3881
-
3882
- void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
3883
- // Need a dummy GumboParser because the allocator comes along with the
3884
- // options object.
3885
- GumboParser parser;
3886
- parser._options = options;
3887
- destroy_node(&parser, output->document);
3888
- for (int i = 0; i < output->errors.length; ++i) {
3889
- gumbo_error_destroy(&parser, output->errors.data[i]);
3890
- }
3891
- gumbo_vector_destroy(&parser, &output->errors);
3892
- gumbo_parser_deallocate(&parser, output);
3893
- }