nokogumbo 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/ext/nokogumboc/nokogumbo.c +1 -0
- data/gumbo-parser/src/error.c +6 -3
- data/gumbo-parser/src/gumbo.h +36 -170
- data/gumbo-parser/src/parser.c +1030 -779
- data/gumbo-parser/src/string_buffer.c +8 -1
- data/gumbo-parser/src/string_buffer.h +5 -0
- data/gumbo-parser/src/tag.c +35 -162
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +150 -0
- data/gumbo-parser/src/tag_gperf.h +343 -0
- data/gumbo-parser/src/tag_sizes.h +1 -0
- data/gumbo-parser/src/tag_strings.h +150 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +29 -21
- data/gumbo-parser/src/utf8.c +9 -8
- data/gumbo-parser/src/vector.c +1 -1
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/test-nokogumbo.rb +140 -0
- metadata +16 -10
data/README.md
CHANGED
data/ext/nokogumboc/nokogumbo.c
CHANGED
data/gumbo-parser/src/error.c
CHANGED
@@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256;
|
|
35
35
|
static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
36
36
|
const char* format, ...) {
|
37
37
|
va_list args;
|
38
|
-
va_start(args, format);
|
39
38
|
int remaining_capacity = output->capacity - output->length;
|
39
|
+
va_start(args, format);
|
40
40
|
int bytes_written = vsnprintf(output->data + output->length,
|
41
41
|
remaining_capacity, format, args);
|
42
|
+
va_end(args);
|
42
43
|
#ifdef _MSC_VER
|
43
44
|
if (bytes_written == -1) {
|
44
45
|
// vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
|
@@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
47
48
|
// we retry (letting it fail and returning 0 if it doesn't), since there's
|
48
49
|
// no way to smartly resize the buffer.
|
49
50
|
gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
|
51
|
+
va_start(args, format);
|
50
52
|
int result = vsnprintf(output->data + output->length,
|
51
53
|
remaining_capacity, format, args);
|
52
54
|
va_end(args);
|
@@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
55
57
|
#else
|
56
58
|
// -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
|
57
59
|
if (bytes_written == -1) {
|
58
|
-
va_end(args);
|
59
60
|
return 0;
|
60
61
|
}
|
61
62
|
#endif
|
@@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
|
|
64
65
|
gumbo_string_buffer_reserve(
|
65
66
|
parser, output->capacity + bytes_written, output);
|
66
67
|
remaining_capacity = output->capacity - output->length;
|
68
|
+
va_start(args, format);
|
67
69
|
bytes_written = vsnprintf(output->data + output->length,
|
68
70
|
remaining_capacity, format, args);
|
71
|
+
va_end(args);
|
69
72
|
}
|
70
73
|
output->length += bytes_written;
|
71
|
-
va_end(args);
|
72
74
|
return bytes_written;
|
73
75
|
}
|
74
76
|
|
@@ -106,6 +108,7 @@ static void handle_parser_error(GumboParser* parser,
|
|
106
108
|
// But just in case...
|
107
109
|
print_message(parser, output, "Comments aren't legal here");
|
108
110
|
return;
|
111
|
+
case GUMBO_TOKEN_CDATA:
|
109
112
|
case GUMBO_TOKEN_WHITESPACE:
|
110
113
|
case GUMBO_TOKEN_CHARACTER:
|
111
114
|
print_message(parser, output, "Character tokens aren't legal here");
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -141,7 +141,7 @@ extern const GumboVector kGumboEmptyVector;
|
|
141
141
|
* Returns the first index at which an element appears in this vector (testing
|
142
142
|
* by pointer equality), or -1 if it never does.
|
143
143
|
*/
|
144
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element);
|
144
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
145
145
|
|
146
146
|
|
147
147
|
/**
|
@@ -157,172 +157,10 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
|
|
157
157
|
* strings.
|
158
158
|
*/
|
159
159
|
typedef enum {
|
160
|
-
//
|
161
|
-
|
162
|
-
//
|
163
|
-
|
164
|
-
GUMBO_TAG_TITLE,
|
165
|
-
GUMBO_TAG_BASE,
|
166
|
-
GUMBO_TAG_LINK,
|
167
|
-
GUMBO_TAG_META,
|
168
|
-
GUMBO_TAG_STYLE,
|
169
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
170
|
-
GUMBO_TAG_SCRIPT,
|
171
|
-
GUMBO_TAG_NOSCRIPT,
|
172
|
-
GUMBO_TAG_TEMPLATE,
|
173
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
174
|
-
GUMBO_TAG_BODY,
|
175
|
-
GUMBO_TAG_ARTICLE,
|
176
|
-
GUMBO_TAG_SECTION,
|
177
|
-
GUMBO_TAG_NAV,
|
178
|
-
GUMBO_TAG_ASIDE,
|
179
|
-
GUMBO_TAG_H1,
|
180
|
-
GUMBO_TAG_H2,
|
181
|
-
GUMBO_TAG_H3,
|
182
|
-
GUMBO_TAG_H4,
|
183
|
-
GUMBO_TAG_H5,
|
184
|
-
GUMBO_TAG_H6,
|
185
|
-
GUMBO_TAG_HGROUP,
|
186
|
-
GUMBO_TAG_HEADER,
|
187
|
-
GUMBO_TAG_FOOTER,
|
188
|
-
GUMBO_TAG_ADDRESS,
|
189
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
|
190
|
-
GUMBO_TAG_P,
|
191
|
-
GUMBO_TAG_HR,
|
192
|
-
GUMBO_TAG_PRE,
|
193
|
-
GUMBO_TAG_BLOCKQUOTE,
|
194
|
-
GUMBO_TAG_OL,
|
195
|
-
GUMBO_TAG_UL,
|
196
|
-
GUMBO_TAG_LI,
|
197
|
-
GUMBO_TAG_DL,
|
198
|
-
GUMBO_TAG_DT,
|
199
|
-
GUMBO_TAG_DD,
|
200
|
-
GUMBO_TAG_FIGURE,
|
201
|
-
GUMBO_TAG_FIGCAPTION,
|
202
|
-
GUMBO_TAG_MAIN,
|
203
|
-
GUMBO_TAG_DIV,
|
204
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
205
|
-
GUMBO_TAG_A,
|
206
|
-
GUMBO_TAG_EM,
|
207
|
-
GUMBO_TAG_STRONG,
|
208
|
-
GUMBO_TAG_SMALL,
|
209
|
-
GUMBO_TAG_S,
|
210
|
-
GUMBO_TAG_CITE,
|
211
|
-
GUMBO_TAG_Q,
|
212
|
-
GUMBO_TAG_DFN,
|
213
|
-
GUMBO_TAG_ABBR,
|
214
|
-
GUMBO_TAG_DATA,
|
215
|
-
GUMBO_TAG_TIME,
|
216
|
-
GUMBO_TAG_CODE,
|
217
|
-
GUMBO_TAG_VAR,
|
218
|
-
GUMBO_TAG_SAMP,
|
219
|
-
GUMBO_TAG_KBD,
|
220
|
-
GUMBO_TAG_SUB,
|
221
|
-
GUMBO_TAG_SUP,
|
222
|
-
GUMBO_TAG_I,
|
223
|
-
GUMBO_TAG_B,
|
224
|
-
GUMBO_TAG_U,
|
225
|
-
GUMBO_TAG_MARK,
|
226
|
-
GUMBO_TAG_RUBY,
|
227
|
-
GUMBO_TAG_RT,
|
228
|
-
GUMBO_TAG_RP,
|
229
|
-
GUMBO_TAG_BDI,
|
230
|
-
GUMBO_TAG_BDO,
|
231
|
-
GUMBO_TAG_SPAN,
|
232
|
-
GUMBO_TAG_BR,
|
233
|
-
GUMBO_TAG_WBR,
|
234
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
|
235
|
-
GUMBO_TAG_INS,
|
236
|
-
GUMBO_TAG_DEL,
|
237
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
|
238
|
-
GUMBO_TAG_IMAGE,
|
239
|
-
GUMBO_TAG_IMG,
|
240
|
-
GUMBO_TAG_IFRAME,
|
241
|
-
GUMBO_TAG_EMBED,
|
242
|
-
GUMBO_TAG_OBJECT,
|
243
|
-
GUMBO_TAG_PARAM,
|
244
|
-
GUMBO_TAG_VIDEO,
|
245
|
-
GUMBO_TAG_AUDIO,
|
246
|
-
GUMBO_TAG_SOURCE,
|
247
|
-
GUMBO_TAG_TRACK,
|
248
|
-
GUMBO_TAG_CANVAS,
|
249
|
-
GUMBO_TAG_MAP,
|
250
|
-
GUMBO_TAG_AREA,
|
251
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
|
252
|
-
GUMBO_TAG_MATH,
|
253
|
-
GUMBO_TAG_MI,
|
254
|
-
GUMBO_TAG_MO,
|
255
|
-
GUMBO_TAG_MN,
|
256
|
-
GUMBO_TAG_MS,
|
257
|
-
GUMBO_TAG_MTEXT,
|
258
|
-
GUMBO_TAG_MGLYPH,
|
259
|
-
GUMBO_TAG_MALIGNMARK,
|
260
|
-
GUMBO_TAG_ANNOTATION_XML,
|
261
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
|
262
|
-
GUMBO_TAG_SVG,
|
263
|
-
GUMBO_TAG_FOREIGNOBJECT,
|
264
|
-
GUMBO_TAG_DESC,
|
265
|
-
// SVG title tags will have GUMBO_TAG_TITLE as with HTML.
|
266
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
|
267
|
-
GUMBO_TAG_TABLE,
|
268
|
-
GUMBO_TAG_CAPTION,
|
269
|
-
GUMBO_TAG_COLGROUP,
|
270
|
-
GUMBO_TAG_COL,
|
271
|
-
GUMBO_TAG_TBODY,
|
272
|
-
GUMBO_TAG_THEAD,
|
273
|
-
GUMBO_TAG_TFOOT,
|
274
|
-
GUMBO_TAG_TR,
|
275
|
-
GUMBO_TAG_TD,
|
276
|
-
GUMBO_TAG_TH,
|
277
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
|
278
|
-
GUMBO_TAG_FORM,
|
279
|
-
GUMBO_TAG_FIELDSET,
|
280
|
-
GUMBO_TAG_LEGEND,
|
281
|
-
GUMBO_TAG_LABEL,
|
282
|
-
GUMBO_TAG_INPUT,
|
283
|
-
GUMBO_TAG_BUTTON,
|
284
|
-
GUMBO_TAG_SELECT,
|
285
|
-
GUMBO_TAG_DATALIST,
|
286
|
-
GUMBO_TAG_OPTGROUP,
|
287
|
-
GUMBO_TAG_OPTION,
|
288
|
-
GUMBO_TAG_TEXTAREA,
|
289
|
-
GUMBO_TAG_KEYGEN,
|
290
|
-
GUMBO_TAG_OUTPUT,
|
291
|
-
GUMBO_TAG_PROGRESS,
|
292
|
-
GUMBO_TAG_METER,
|
293
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
294
|
-
GUMBO_TAG_DETAILS,
|
295
|
-
GUMBO_TAG_SUMMARY,
|
296
|
-
GUMBO_TAG_MENU,
|
297
|
-
GUMBO_TAG_MENUITEM,
|
298
|
-
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
299
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
300
|
-
GUMBO_TAG_APPLET,
|
301
|
-
GUMBO_TAG_ACRONYM,
|
302
|
-
GUMBO_TAG_BGSOUND,
|
303
|
-
GUMBO_TAG_DIR,
|
304
|
-
GUMBO_TAG_FRAME,
|
305
|
-
GUMBO_TAG_FRAMESET,
|
306
|
-
GUMBO_TAG_NOFRAMES,
|
307
|
-
GUMBO_TAG_ISINDEX,
|
308
|
-
GUMBO_TAG_LISTING,
|
309
|
-
GUMBO_TAG_XMP,
|
310
|
-
GUMBO_TAG_NEXTID,
|
311
|
-
GUMBO_TAG_NOEMBED,
|
312
|
-
GUMBO_TAG_PLAINTEXT,
|
313
|
-
GUMBO_TAG_RB,
|
314
|
-
GUMBO_TAG_STRIKE,
|
315
|
-
GUMBO_TAG_BASEFONT,
|
316
|
-
GUMBO_TAG_BIG,
|
317
|
-
GUMBO_TAG_BLINK,
|
318
|
-
GUMBO_TAG_CENTER,
|
319
|
-
GUMBO_TAG_FONT,
|
320
|
-
GUMBO_TAG_MARQUEE,
|
321
|
-
GUMBO_TAG_MULTICOL,
|
322
|
-
GUMBO_TAG_NOBR,
|
323
|
-
GUMBO_TAG_SPACER,
|
324
|
-
GUMBO_TAG_TT,
|
325
|
-
// Used for all tags that don't have special handling in HTML.
|
160
|
+
// Load all the tags from an external source, generated from tag.in.
|
161
|
+
# include "tag_enum.h"
|
162
|
+
// Used for all tags that don't have special handling in HTML. Add new tags
|
163
|
+
// to the end of tag.in so as to preserve backwards-compatibility.
|
326
164
|
GUMBO_TAG_UNKNOWN,
|
327
165
|
// A marker value to indicate the end of the enum, for iterating over it.
|
328
166
|
// Also used as the terminator for varargs functions that take tags.
|
@@ -364,9 +202,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
|
364
202
|
|
365
203
|
/**
|
366
204
|
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
367
|
-
* enum.
|
205
|
+
* enum. The `tag` version expects `tagname` to be NULL-terminated
|
368
206
|
*/
|
369
207
|
GumboTag gumbo_tag_enum(const char* tagname);
|
208
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
370
209
|
|
371
210
|
/**
|
372
211
|
* Attribute namespaces.
|
@@ -461,10 +300,16 @@ typedef enum {
|
|
461
300
|
GUMBO_NODE_TEXT,
|
462
301
|
/** CDATA node. v will be a GumboText. */
|
463
302
|
GUMBO_NODE_CDATA,
|
464
|
-
/** Comment node. v. will be a GumboText, excluding comment delimiters. */
|
303
|
+
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
465
304
|
GUMBO_NODE_COMMENT,
|
466
305
|
/** Text node, where all contents is whitespace. v will be a GumboText. */
|
467
|
-
GUMBO_NODE_WHITESPACE
|
306
|
+
GUMBO_NODE_WHITESPACE,
|
307
|
+
/** Template node. This is separate from GUMBO_NODE_ELEMENT because many
|
308
|
+
* client libraries will want to ignore the contents of template nodes, as
|
309
|
+
* the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
|
310
|
+
* here, while clients that want to include template contents should also
|
311
|
+
* check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
|
312
|
+
GUMBO_NODE_TEMPLATE
|
468
313
|
} GumboNodeType;
|
469
314
|
|
470
315
|
/**
|
@@ -678,6 +523,19 @@ struct GumboInternalNode {
|
|
678
523
|
/** Pointer back to parent node. Not owned. */
|
679
524
|
GumboNode* parent;
|
680
525
|
|
526
|
+
/**
|
527
|
+
* Pointer to next node in document order. This is the next node by start tag
|
528
|
+
* position in the document, or by position of the tag that forces the parser
|
529
|
+
* to insert it for parser-inserted nodes. It's necessary to maintain API
|
530
|
+
* compatibility with some other libraries, eg. BeautifulSoup. Not owned.
|
531
|
+
*/
|
532
|
+
GumboNode* next;
|
533
|
+
|
534
|
+
/**
|
535
|
+
* Pointer to previous node in document order.
|
536
|
+
*/
|
537
|
+
GumboNode* prev;
|
538
|
+
|
681
539
|
/** The index within the parent's children vector of this node. */
|
682
540
|
size_t index_within_parent;
|
683
541
|
|
@@ -795,6 +653,14 @@ GumboOutput* gumbo_parse(const char* buffer);
|
|
795
653
|
GumboOutput* gumbo_parse_with_options(
|
796
654
|
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
797
655
|
|
656
|
+
/**
|
657
|
+
* Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
|
658
|
+
* is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
|
659
|
+
*/
|
660
|
+
GumboOutput* gumbo_parse_fragment(
|
661
|
+
const GumboOptions* options, const char* buffer, size_t length,
|
662
|
+
const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace);
|
663
|
+
|
798
664
|
/** Release the memory used for the parse tree & parse errors. */
|
799
665
|
void gumbo_destroy_output(
|
800
666
|
const GumboOptions* options, GumboOutput* output);
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -32,12 +32,30 @@
|
|
32
32
|
#include "util.h"
|
33
33
|
#include "vector.h"
|
34
34
|
|
35
|
-
|
36
35
|
#define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
|
37
36
|
|
38
37
|
#define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }
|
39
38
|
#define TERMINATOR { "", 0 }
|
40
39
|
|
40
|
+
typedef char gumbo_tagset[GUMBO_TAG_LAST];
|
41
|
+
#define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
|
42
|
+
#define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
|
43
|
+
#define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
|
44
|
+
|
45
|
+
#define TAGSET_INCLUDES(tagset, namespace, tag) \
|
46
|
+
(tag < GUMBO_TAG_LAST && \
|
47
|
+
tagset[(int)tag] == (1 << (int)namespace))
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
// selected forward declarations as it is getting hard to find
|
52
|
+
// an appropriate order
|
53
|
+
static bool node_html_tag_is(const GumboNode*, GumboTag);
|
54
|
+
static GumboInsertionMode get_current_template_insertion_mode(const GumboParser*);
|
55
|
+
static bool handle_in_template(GumboParser*, GumboToken*);
|
56
|
+
static GumboNode* destroy_node(GumboParser*, GumboNode*);
|
57
|
+
|
58
|
+
|
41
59
|
static void* malloc_wrapper(void* unused, size_t size) {
|
42
60
|
return malloc(size);
|
43
61
|
}
|
@@ -181,7 +199,7 @@ typedef struct _ReplacementEntry {
|
|
181
199
|
{ GUMBO_STRING(from), GUMBO_STRING(to) }
|
182
200
|
|
183
201
|
// Static data for SVG attribute replacements.
|
184
|
-
//
|
202
|
+
// https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
|
185
203
|
static const ReplacementEntry kSvgAttributeReplacements[] = {
|
186
204
|
REPLACEMENT_ENTRY("attributename", "attributeName"),
|
187
205
|
REPLACEMENT_ENTRY("attributetype", "attributeType"),
|
@@ -189,12 +207,12 @@ static const ReplacementEntry kSvgAttributeReplacements[] = {
|
|
189
207
|
REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
|
190
208
|
REPLACEMENT_ENTRY("calcmode", "calcMode"),
|
191
209
|
REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
|
192
|
-
REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
|
193
|
-
REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
|
210
|
+
// REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
|
211
|
+
// REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
|
194
212
|
REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
|
195
213
|
REPLACEMENT_ENTRY("edgemode", "edgeMode"),
|
196
|
-
REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
|
197
|
-
REPLACEMENT_ENTRY("filterres", "filterRes"),
|
214
|
+
// REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
|
215
|
+
// REPLACEMENT_ENTRY("filterres", "filterRes"),
|
198
216
|
REPLACEMENT_ENTRY("filterunits", "filterUnits"),
|
199
217
|
REPLACEMENT_ENTRY("glyphref", "glyphRef"),
|
200
218
|
REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
|
@@ -336,7 +354,7 @@ typedef struct _TextNodeBufferState {
|
|
336
354
|
// The source position of the start of this text node.
|
337
355
|
GumboSourcePosition _start_position;
|
338
356
|
|
339
|
-
// The type of node that will be inserted (TEXT or WHITESPACE).
|
357
|
+
// The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
|
340
358
|
GumboNodeType _type;
|
341
359
|
} TextNodeBufferState;
|
342
360
|
|
@@ -362,6 +380,9 @@ typedef struct GumboInternalParserState {
|
|
362
380
|
GumboNode* _head_element;
|
363
381
|
GumboNode* _form_element;
|
364
382
|
|
383
|
+
// The element used as fragment context when parsing in fragment mode
|
384
|
+
GumboNode* _fragment_ctx;
|
385
|
+
|
365
386
|
// The flag for when the spec says "Reprocess the current token in..."
|
366
387
|
bool _reprocess_current_token;
|
367
388
|
|
@@ -390,6 +411,10 @@ typedef struct GumboInternalParserState {
|
|
390
411
|
// The current token.
|
391
412
|
GumboToken* _current_token;
|
392
413
|
|
414
|
+
// The current (most recently inserted) node. This is used to link together
|
415
|
+
// nodes in document order.
|
416
|
+
GumboNode* _current_node;
|
417
|
+
|
393
418
|
// The way that the spec is written, the </body> and </html> tags are *always*
|
394
419
|
// implicit, because encountering one of those tokens merely switches the
|
395
420
|
// insertion mode out of "in body". So we have individual state flags for
|
@@ -442,7 +467,17 @@ static void set_frameset_not_ok(GumboParser* parser) {
|
|
442
467
|
}
|
443
468
|
|
444
469
|
static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
|
470
|
+
GumboParserState* state = parser->_parser_state;
|
445
471
|
GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
|
472
|
+
|
473
|
+
node->next = NULL;
|
474
|
+
node->prev = state->_current_node;
|
475
|
+
if (state->_current_node != NULL) {
|
476
|
+
// May be null for the initial document node.
|
477
|
+
state->_current_node->next = node;
|
478
|
+
}
|
479
|
+
state->_current_node = node;
|
480
|
+
|
446
481
|
node->parent = NULL;
|
447
482
|
node->index_within_parent = -1;
|
448
483
|
node->type = type;
|
@@ -489,7 +524,9 @@ static void parser_state_init(GumboParser* parser) {
|
|
489
524
|
gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
|
490
525
|
parser_state->_head_element = NULL;
|
491
526
|
parser_state->_form_element = NULL;
|
527
|
+
parser_state->_fragment_ctx = NULL;
|
492
528
|
parser_state->_current_token = NULL;
|
529
|
+
parser_state->_current_node = NULL;
|
493
530
|
parser_state->_closed_body_tag = false;
|
494
531
|
parser_state->_closed_html_tag = false;
|
495
532
|
parser->_parser_state = parser_state;
|
@@ -497,17 +534,25 @@ static void parser_state_init(GumboParser* parser) {
|
|
497
534
|
|
498
535
|
static void parser_state_destroy(GumboParser* parser) {
|
499
536
|
GumboParserState* state = parser->_parser_state;
|
537
|
+
if (state->_fragment_ctx) {
|
538
|
+
destroy_node(parser, state->_fragment_ctx);
|
539
|
+
}
|
500
540
|
gumbo_vector_destroy(parser, &state->_active_formatting_elements);
|
501
541
|
gumbo_vector_destroy(parser, &state->_open_elements);
|
502
542
|
gumbo_vector_destroy(parser, &state->_template_insertion_modes);
|
503
543
|
gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
|
504
544
|
gumbo_parser_deallocate(parser, state);
|
545
|
+
parser->_parser_state = NULL;
|
505
546
|
}
|
506
547
|
|
507
548
|
static GumboNode* get_document_node(GumboParser* parser) {
|
508
549
|
return parser->_output->document;
|
509
550
|
}
|
510
551
|
|
552
|
+
static bool is_fragment_parser(const GumboParser *parser) {
|
553
|
+
return !!parser->_parser_state->_fragment_ctx;
|
554
|
+
}
|
555
|
+
|
511
556
|
// Returns the node at the bottom of the stack of open elements, or NULL if no
|
512
557
|
// elements have been added yet.
|
513
558
|
static GumboNode* get_current_node(GumboParser* parser) {
|
@@ -521,6 +566,14 @@ static GumboNode* get_current_node(GumboParser* parser) {
|
|
521
566
|
return open_elements->data[open_elements->length - 1];
|
522
567
|
}
|
523
568
|
|
569
|
+
static GumboNode* get_adjusted_current_node(GumboParser* parser) {
|
570
|
+
GumboParserState *state = parser->_parser_state;
|
571
|
+
if (state->_open_elements.length == 1 && state->_fragment_ctx) {
|
572
|
+
return state->_fragment_ctx;
|
573
|
+
}
|
574
|
+
return get_current_node(parser);
|
575
|
+
}
|
576
|
+
|
524
577
|
// Returns true if the given needle is in the given array of literal
|
525
578
|
// GumboStringPieces. If exact_match is true, this requires that they match
|
526
579
|
// exactly; otherwise, this performs a prefix match to check if any of the
|
@@ -541,52 +594,80 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
|
|
541
594
|
parser->_parser_state->_insertion_mode = mode;
|
542
595
|
}
|
543
596
|
|
597
|
+
|
544
598
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
|
545
599
|
// This is a helper function that returns the appropriate insertion mode instead
|
546
600
|
// of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
|
547
601
|
// indicate that there is no appropriate insertion mode, and the loop should
|
548
602
|
// continue.
|
549
|
-
static GumboInsertionMode get_appropriate_insertion_mode(
|
550
|
-
const GumboNode* node, bool is_last) {
|
551
|
-
assert(node->type == GUMBO_NODE_ELEMENT);
|
603
|
+
static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) {
|
604
|
+
const GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
605
|
+
const GumboNode* node = open_elements->data[index];
|
606
|
+
const bool is_last = index == 0;
|
607
|
+
|
608
|
+
if (is_last && is_fragment_parser(parser)) {
|
609
|
+
node = parser->_parser_state->_fragment_ctx;
|
610
|
+
}
|
611
|
+
|
612
|
+
assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
|
552
613
|
switch (node->v.element.tag) {
|
553
|
-
|
614
|
+
case GUMBO_TAG_SELECT: {
|
615
|
+
if (is_last) {
|
554
616
|
return GUMBO_INSERTION_MODE_IN_SELECT;
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
return GUMBO_INSERTION_MODE_IN_CAPTION;
|
567
|
-
case GUMBO_TAG_COLGROUP:
|
568
|
-
return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
|
569
|
-
case GUMBO_TAG_TABLE:
|
570
|
-
return GUMBO_INSERTION_MODE_IN_TABLE;
|
571
|
-
case GUMBO_TAG_HEAD:
|
572
|
-
case GUMBO_TAG_BODY:
|
573
|
-
return GUMBO_INSERTION_MODE_IN_BODY;
|
574
|
-
case GUMBO_TAG_FRAMESET:
|
575
|
-
return GUMBO_INSERTION_MODE_IN_FRAMESET;
|
576
|
-
case GUMBO_TAG_HTML:
|
577
|
-
return GUMBO_INSERTION_MODE_BEFORE_HEAD;
|
578
|
-
default:
|
579
|
-
return is_last ?
|
580
|
-
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
|
617
|
+
}
|
618
|
+
for (int i = index; i > 0; --i) {
|
619
|
+
const GumboNode* ancestor = open_elements->data[i];
|
620
|
+
if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
|
621
|
+
return GUMBO_INSERTION_MODE_IN_SELECT;
|
622
|
+
}
|
623
|
+
if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
|
624
|
+
return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
|
625
|
+
}
|
626
|
+
}
|
627
|
+
return GUMBO_INSERTION_MODE_IN_SELECT;
|
581
628
|
}
|
629
|
+
case GUMBO_TAG_TD:
|
630
|
+
case GUMBO_TAG_TH:
|
631
|
+
if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
|
632
|
+
break;
|
633
|
+
case GUMBO_TAG_TR:
|
634
|
+
return GUMBO_INSERTION_MODE_IN_ROW;
|
635
|
+
case GUMBO_TAG_TBODY:
|
636
|
+
case GUMBO_TAG_THEAD:
|
637
|
+
case GUMBO_TAG_TFOOT:
|
638
|
+
return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
|
639
|
+
case GUMBO_TAG_CAPTION:
|
640
|
+
return GUMBO_INSERTION_MODE_IN_CAPTION;
|
641
|
+
case GUMBO_TAG_COLGROUP:
|
642
|
+
return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
|
643
|
+
case GUMBO_TAG_TABLE:
|
644
|
+
return GUMBO_INSERTION_MODE_IN_TABLE;
|
645
|
+
case GUMBO_TAG_TEMPLATE:
|
646
|
+
return get_current_template_insertion_mode(parser);
|
647
|
+
case GUMBO_TAG_HEAD:
|
648
|
+
if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
|
649
|
+
break;
|
650
|
+
case GUMBO_TAG_BODY:
|
651
|
+
return GUMBO_INSERTION_MODE_IN_BODY;
|
652
|
+
case GUMBO_TAG_FRAMESET:
|
653
|
+
return GUMBO_INSERTION_MODE_IN_FRAMESET;
|
654
|
+
case GUMBO_TAG_HTML:
|
655
|
+
return parser->_parser_state->_head_element ?
|
656
|
+
GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD;
|
657
|
+
default:
|
658
|
+
break;
|
659
|
+
}
|
660
|
+
return is_last ?
|
661
|
+
GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
|
582
662
|
}
|
583
663
|
|
664
|
+
|
584
665
|
// This performs the actual "reset the insertion mode" loop.
|
585
666
|
static void reset_insertion_mode_appropriately(GumboParser* parser) {
|
586
667
|
const GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
587
668
|
for (int i = open_elements->length; --i >= 0; ) {
|
588
669
|
GumboInsertionMode mode =
|
589
|
-
get_appropriate_insertion_mode(open_elements->data[i], i == 0);
|
670
|
+
get_appropriate_insertion_mode(parser, i);
|
590
671
|
if (mode != GUMBO_INSERTION_MODE_INITIAL) {
|
591
672
|
set_insertion_mode(parser, mode);
|
592
673
|
return;
|
@@ -620,7 +701,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
|
|
620
701
|
&extra_data->tag_stack);
|
621
702
|
for (int i = 0; i < state->_open_elements.length; ++i) {
|
622
703
|
const GumboNode* node = state->_open_elements.data[i];
|
623
|
-
assert(node->type == GUMBO_NODE_ELEMENT);
|
704
|
+
assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
|
624
705
|
gumbo_vector_add(parser, (void*) node->v.element.tag,
|
625
706
|
&extra_data->tag_stack);
|
626
707
|
}
|
@@ -631,13 +712,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
|
|
631
712
|
// by is_start) with one of the tag types in the varargs list. Terminate the
|
632
713
|
// list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
|
633
714
|
// the spec references tags that are not in the spec.
|
634
|
-
// TODO(jdtang): A lot of the tag lists for this function are repeated in many
|
635
|
-
// places in the code. This is how it's written in the spec (and it's done this
|
636
|
-
// way so it's easy to verify the code against the spec), but it may be worth
|
637
|
-
// coming up with a notion of a "tag set" that includes a list of tags, and
|
638
|
-
// using that in many places. It'd probably also help performance, but I want
|
639
|
-
// to profile before optimizing.
|
640
|
-
static bool tag_in(const GumboToken* token, bool is_start, ...) {
|
715
|
+
static bool tag_in(const GumboToken* token, bool is_start, const gumbo_tagset tags) {
|
641
716
|
GumboTag token_tag;
|
642
717
|
if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
|
643
718
|
token_tag = token->v.start_tag.tag;
|
@@ -646,19 +721,7 @@ static bool tag_in(const GumboToken* token, bool is_start, ...) {
|
|
646
721
|
} else {
|
647
722
|
return false;
|
648
723
|
}
|
649
|
-
|
650
|
-
va_list tags;
|
651
|
-
va_start(tags, is_start);
|
652
|
-
bool result = false;
|
653
|
-
for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
|
654
|
-
tag = va_arg(tags, GumboTag)) {
|
655
|
-
if (tag == token_tag) {
|
656
|
-
result = true;
|
657
|
-
break;
|
658
|
-
}
|
659
|
-
}
|
660
|
-
va_end(tags);
|
661
|
-
return result;
|
724
|
+
return (token_tag < GUMBO_TAG_LAST && tags[(int)token_tag] != 0);
|
662
725
|
}
|
663
726
|
|
664
727
|
// Like tag_in, but for the single-tag case.
|
@@ -673,52 +736,119 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
|
|
673
736
|
}
|
674
737
|
|
675
738
|
// Like tag_in, but checks for the tag of a node, rather than a token.
|
676
|
-
static bool node_tag_in(const GumboNode* node, ...) {
|
739
|
+
static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
|
677
740
|
assert(node != NULL);
|
678
|
-
if (node->type != GUMBO_NODE_ELEMENT) {
|
741
|
+
if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
|
679
742
|
return false;
|
680
743
|
}
|
681
|
-
GumboTag node_tag = node->v.element.tag;
|
682
|
-
|
683
|
-
va_list tags;
|
684
|
-
va_start(tags, node);
|
685
|
-
bool result = false;
|
686
|
-
for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
|
687
|
-
tag = va_arg(tags, GumboTag)) {
|
688
|
-
assert(tag <= GUMBO_TAG_LAST);
|
689
|
-
if (tag == node_tag) {
|
690
|
-
result = true;
|
691
|
-
break;
|
692
|
-
}
|
693
|
-
}
|
694
|
-
va_end(tags);
|
695
|
-
return result;
|
744
|
+
return TAGSET_INCLUDES(tags, node->v.element.tag_namespace, node->v.element.tag);
|
696
745
|
}
|
697
746
|
|
747
|
+
|
698
748
|
// Like node_tag_in, but for the single-tag case.
|
699
|
-
static bool node_tag_is(const GumboNode* node, GumboTag tag) {
|
700
|
-
return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
|
749
|
+
static bool node_qualified_tag_is(const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
|
750
|
+
return (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) &&
|
751
|
+
node->v.element.tag == tag &&
|
752
|
+
node->v.element.tag_namespace == ns;
|
753
|
+
}
|
754
|
+
|
755
|
+
// Like node_tag_in, but for the single-tag case in the HTML namespace
|
756
|
+
static bool node_html_tag_is(const GumboNode* node, GumboTag tag)
|
757
|
+
{
|
758
|
+
return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
|
759
|
+
}
|
760
|
+
|
761
|
+
static void push_template_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
|
762
|
+
gumbo_vector_add(parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
|
763
|
+
}
|
764
|
+
|
765
|
+
static void pop_template_insertion_mode(GumboParser* parser) {
|
766
|
+
gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
|
767
|
+
}
|
768
|
+
|
769
|
+
// Returns the current template insertion mode. If the stack of template
|
770
|
+
// insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
|
771
|
+
static GumboInsertionMode get_current_template_insertion_mode(const GumboParser* parser) {
|
772
|
+
GumboVector* template_insertion_modes = &parser->_parser_state->_template_insertion_modes;
|
773
|
+
if (template_insertion_modes->length == 0) {
|
774
|
+
return GUMBO_INSERTION_MODE_INITIAL;
|
775
|
+
}
|
776
|
+
return (GumboInsertionMode) template_insertion_modes->data[(template_insertion_modes->length - 1)];
|
701
777
|
}
|
702
778
|
|
703
779
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
|
704
780
|
static bool is_mathml_integration_point(const GumboNode* node) {
|
705
|
-
return
|
706
|
-
|
707
|
-
node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
|
781
|
+
return node_tag_in_set(node, (gumbo_tagset) { TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
|
782
|
+
TAG_MATHML(MS), TAG_MATHML(MTEXT) });
|
708
783
|
}
|
709
784
|
|
710
785
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
|
711
786
|
static bool is_html_integration_point(const GumboNode* node) {
|
712
|
-
return (
|
713
|
-
|
714
|
-
node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
|
715
|
-
(node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
|
787
|
+
return node_tag_in_set(node, (gumbo_tagset) { TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) }) ||
|
788
|
+
(node_qualified_tag_is(node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && (
|
716
789
|
attribute_matches(&node->v.element.attributes,
|
717
790
|
"encoding", "text/html") ||
|
718
791
|
attribute_matches(&node->v.element.attributes,
|
719
792
|
"encoding", "application/xhtml+xml")));
|
720
793
|
}
|
721
794
|
|
795
|
+
|
796
|
+
// This represents a place to insert a node, consisting of a target parent and a
|
797
|
+
// child index within that parent. If the node should be inserted at the end of
|
798
|
+
// the parent's children, index will be -1.
|
799
|
+
typedef struct {
|
800
|
+
GumboNode* target;
|
801
|
+
int index;
|
802
|
+
} InsertionLocation;
|
803
|
+
|
804
|
+
InsertionLocation get_appropriate_insertion_location(GumboParser* parser, GumboNode* override_target) {
|
805
|
+
InsertionLocation retval = { override_target, -1 };
|
806
|
+
if (retval.target == NULL) {
|
807
|
+
// No override target; default to the current node, but special-case the
|
808
|
+
// root node since get_current_node() assumes the stack of open elements is
|
809
|
+
// non-empty.
|
810
|
+
retval.target = parser->_output->root != NULL ?
|
811
|
+
get_current_node(parser) : get_document_node(parser);
|
812
|
+
}
|
813
|
+
if (!parser->_parser_state->_foster_parent_insertions ||
|
814
|
+
!node_tag_in_set(retval.target, (gumbo_tagset) { TAG(TABLE), TAG(TBODY),
|
815
|
+
TAG(TFOOT), TAG(THEAD), TAG(TR) })) {
|
816
|
+
return retval;
|
817
|
+
}
|
818
|
+
|
819
|
+
// Foster-parenting case.
|
820
|
+
int last_template_index = -1;
|
821
|
+
int last_table_index = -1;
|
822
|
+
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
823
|
+
for (int i = 0; i < open_elements->length; ++i) {
|
824
|
+
if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
|
825
|
+
last_template_index = i;
|
826
|
+
}
|
827
|
+
if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
|
828
|
+
last_table_index = i;
|
829
|
+
}
|
830
|
+
}
|
831
|
+
if (last_template_index != -1 &&
|
832
|
+
(last_table_index == -1 || last_template_index > last_table_index)) {
|
833
|
+
retval.target = open_elements->data[last_template_index];
|
834
|
+
return retval;
|
835
|
+
}
|
836
|
+
if (last_table_index == -1) {
|
837
|
+
retval.target = open_elements->data[0];
|
838
|
+
return retval;
|
839
|
+
}
|
840
|
+
GumboNode* last_table = open_elements->data[last_table_index];
|
841
|
+
if (last_table->parent != NULL) {
|
842
|
+
retval.target = last_table->parent;
|
843
|
+
retval.index = last_table->index_within_parent;
|
844
|
+
return retval;
|
845
|
+
}
|
846
|
+
|
847
|
+
retval.target = open_elements->data[last_table_index - 1];
|
848
|
+
return retval;
|
849
|
+
}
|
850
|
+
|
851
|
+
|
722
852
|
// Appends a node to the end of its parent, setting the "parent" and
|
723
853
|
// "index_within_parent" fields appropriately.
|
724
854
|
static void append_node(
|
@@ -726,7 +856,7 @@ static void append_node(
|
|
726
856
|
assert(node->parent == NULL);
|
727
857
|
assert(node->index_within_parent == -1);
|
728
858
|
GumboVector* children;
|
729
|
-
if (parent->type == GUMBO_NODE_ELEMENT) {
|
859
|
+
if (parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE) {
|
730
860
|
children = &parent->v.element.children;
|
731
861
|
} else {
|
732
862
|
assert(parent->type == GUMBO_NODE_DOCUMENT);
|
@@ -738,66 +868,44 @@ static void append_node(
|
|
738
868
|
assert(node->index_within_parent < children->length);
|
739
869
|
}
|
740
870
|
|
741
|
-
// Inserts a node at the specified
|
871
|
+
// Inserts a node at the specified InsertionLocation, updating the
|
742
872
|
// "parent" and "index_within_parent" fields of it and all its siblings.
|
873
|
+
// If the index of the location is -1, this calls append_node.
|
743
874
|
static void insert_node(
|
744
|
-
|
875
|
+
GumboParser* parser, GumboNode* node, InsertionLocation location) {
|
745
876
|
assert(node->parent == NULL);
|
746
877
|
assert(node->index_within_parent == -1);
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
}
|
878
|
+
GumboNode* parent = location.target;
|
879
|
+
int index = location.index;
|
880
|
+
if (index != -1) {
|
881
|
+
GumboVector* children = NULL;
|
882
|
+
if (parent->type == GUMBO_NODE_ELEMENT ||
|
883
|
+
parent->type == GUMBO_NODE_TEMPLATE) {
|
884
|
+
children = &parent->v.element.children;
|
885
|
+
} else if (parent->type == GUMBO_NODE_DOCUMENT) {
|
886
|
+
children = &parent->v.document.children;
|
887
|
+
assert(children->length == 0);
|
888
|
+
} else {
|
889
|
+
assert(0);
|
890
|
+
}
|
761
891
|
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
GumboNode* table_element = open_elements->data[i];
|
773
|
-
if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
|
774
|
-
foster_parent_element = table_element->parent;
|
775
|
-
if (!foster_parent_element ||
|
776
|
-
foster_parent_element->type != GUMBO_NODE_ELEMENT) {
|
777
|
-
// Table has no parent; spec says it's possible if a script manipulated
|
778
|
-
// the DOM, although I don't think we have to worry about this case.
|
779
|
-
gumbo_debug("Table has no parent.\n");
|
780
|
-
foster_parent_element = open_elements->data[i - 1];
|
781
|
-
break;
|
782
|
-
}
|
783
|
-
assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
|
784
|
-
gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
|
785
|
-
table_element, i, gumbo_normalized_tagname(
|
786
|
-
foster_parent_element->v.element.tag),
|
787
|
-
table_element->index_within_parent);
|
788
|
-
assert(foster_parent_element->v.element.children.data[
|
789
|
-
table_element->index_within_parent] == table_element);
|
790
|
-
insert_node(parser, foster_parent_element,
|
791
|
-
table_element->index_within_parent, node);
|
792
|
-
return;
|
892
|
+
assert(index >= 0);
|
893
|
+
assert(index < children->length);
|
894
|
+
node->parent = parent;
|
895
|
+
node->index_within_parent = index;
|
896
|
+
gumbo_vector_insert_at(parser, (void*) node, index, children);
|
897
|
+
assert(node->index_within_parent < children->length);
|
898
|
+
for (int i = index + 1; i < children->length; ++i) {
|
899
|
+
GumboNode* sibling = children->data[i];
|
900
|
+
sibling->index_within_parent = i;
|
901
|
+
assert(sibling->index_within_parent < children->length);
|
793
902
|
}
|
903
|
+
} else {
|
904
|
+
append_node(parser, parent, node);
|
794
905
|
}
|
795
|
-
if (node->type == GUMBO_NODE_ELEMENT) {
|
796
|
-
gumbo_vector_add(parser, (void*) node, open_elements);
|
797
|
-
}
|
798
|
-
append_node(parser, foster_parent_element, node);
|
799
906
|
}
|
800
907
|
|
908
|
+
|
801
909
|
static void maybe_flush_text_node_buffer(GumboParser* parser) {
|
802
910
|
GumboParserState* state = parser->_parser_state;
|
803
911
|
TextNodeBufferState* buffer_state = &state->_text_node;
|
@@ -806,7 +914,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
|
|
806
914
|
}
|
807
915
|
|
808
916
|
assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
|
809
|
-
buffer_state->_type == GUMBO_NODE_TEXT
|
917
|
+
buffer_state->_type == GUMBO_NODE_TEXT ||
|
918
|
+
buffer_state->_type == GUMBO_NODE_CDATA);
|
810
919
|
GumboNode* text_node = create_node(parser, buffer_state->_type);
|
811
920
|
GumboText* text_node_data = &text_node->v.text;
|
812
921
|
text_node_data->text = gumbo_string_buffer_to_string(
|
@@ -816,20 +925,20 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
|
|
816
925
|
state->_current_token->original_text.data -
|
817
926
|
buffer_state->_start_original_text;
|
818
927
|
text_node_data->start_pos = buffer_state->_start_position;
|
819
|
-
|
820
|
-
get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
|
821
|
-
GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
|
822
|
-
foster_parent_element(parser, text_node);
|
823
|
-
} else {
|
824
|
-
append_node(
|
825
|
-
parser, parser->_output->root ?
|
826
|
-
get_current_node(parser) : parser->_output->document, text_node);
|
827
|
-
}
|
928
|
+
|
828
929
|
gumbo_debug("Flushing text node buffer of %.*s.\n",
|
829
930
|
(int) buffer_state->_buffer.length, buffer_state->_buffer.data);
|
830
931
|
|
831
|
-
|
832
|
-
|
932
|
+
InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
|
933
|
+
if (location.target->type == GUMBO_NODE_DOCUMENT) {
|
934
|
+
// The DOM does not allow Document nodes to have Text children, so per the
|
935
|
+
// spec, they are dropped on the floor.
|
936
|
+
destroy_node(parser, text_node);
|
937
|
+
} else {
|
938
|
+
insert_node(parser, text_node, location);
|
939
|
+
}
|
940
|
+
|
941
|
+
gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
|
833
942
|
buffer_state->_type = GUMBO_NODE_WHITESPACE;
|
834
943
|
assert(buffer_state->_buffer.length == 0);
|
835
944
|
}
|
@@ -846,7 +955,7 @@ static GumboNode* pop_current_node(GumboParser* parser) {
|
|
846
955
|
GumboParserState* state = parser->_parser_state;
|
847
956
|
maybe_flush_text_node_buffer(parser);
|
848
957
|
if (state->_open_elements.length > 0) {
|
849
|
-
assert(
|
958
|
+
assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
|
850
959
|
gumbo_debug(
|
851
960
|
"Popping %s node.\n",
|
852
961
|
gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
|
@@ -856,12 +965,12 @@ static GumboNode* pop_current_node(GumboParser* parser) {
|
|
856
965
|
assert(state->_open_elements.length == 0);
|
857
966
|
return NULL;
|
858
967
|
}
|
859
|
-
assert(current_node->type == GUMBO_NODE_ELEMENT);
|
968
|
+
assert(current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE);
|
860
969
|
bool is_closed_body_or_html_tag =
|
861
|
-
(
|
862
|
-
(
|
970
|
+
(node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
|
971
|
+
(node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
|
863
972
|
if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
|
864
|
-
!
|
973
|
+
!node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
|
865
974
|
!is_closed_body_or_html_tag) {
|
866
975
|
current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
|
867
976
|
}
|
@@ -885,25 +994,22 @@ static void append_comment_node(
|
|
885
994
|
|
886
995
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
|
887
996
|
static void clear_stack_to_table_row_context(GumboParser* parser) {
  // Pop until the current node is an <html>, <tr> or <template> element.
  gumbo_tagset row_context = { TAG(HTML), TAG(TR), TAG(TEMPLATE) };
  while (!node_tag_in_set(get_current_node(parser), row_context)) {
    pop_current_node(parser);
  }
}
|
893
1001
|
|
894
1002
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
|
895
1003
|
static void clear_stack_to_table_context(GumboParser* parser) {
  // Pop until the current node is an <html>, <table> or <template> element.
  gumbo_tagset table_context = { TAG(HTML), TAG(TABLE), TAG(TEMPLATE) };
  while (!node_tag_in_set(get_current_node(parser), table_context)) {
    pop_current_node(parser);
  }
}
|
901
1008
|
|
902
1009
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
|
903
1010
|
void clear_stack_to_table_body_context(GumboParser* parser) {
  // Pop until the current node is an <html>, <tbody>, <tfoot>, <thead> or
  // <template> element.
  gumbo_tagset table_body_context = {
    TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE)
  };
  while (!node_tag_in_set(get_current_node(parser), table_body_context)) {
    pop_current_node(parser);
  }
}
|
@@ -918,7 +1024,8 @@ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
|
|
918
1024
|
element->tag_namespace = GUMBO_NAMESPACE_HTML;
|
919
1025
|
element->original_tag = kGumboEmptyString;
|
920
1026
|
element->original_end_tag = kGumboEmptyString;
|
921
|
-
element->start_pos = parser->_parser_state->_current_token
|
1027
|
+
element->start_pos = (parser->_parser_state->_current_token) ?
|
1028
|
+
parser->_parser_state->_current_token->position : kGumboEmptySourcePosition;
|
922
1029
|
element->end_pos = kGumboEmptySourcePosition;
|
923
1030
|
return node;
|
924
1031
|
}
|
@@ -929,7 +1036,12 @@ static GumboNode* create_element_from_token(
|
|
929
1036
|
assert(token->type == GUMBO_TOKEN_START_TAG);
|
930
1037
|
GumboTokenStartTag* start_tag = &token->v.start_tag;
|
931
1038
|
|
932
|
-
|
1039
|
+
GumboNodeType type = (
|
1040
|
+
tag_namespace == GUMBO_NAMESPACE_HTML &&
|
1041
|
+
start_tag->tag == GUMBO_TAG_TEMPLATE)
|
1042
|
+
? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT;
|
1043
|
+
|
1044
|
+
GumboNode* node = create_node(parser, type);
|
933
1045
|
GumboElement* element = &node->v.element;
|
934
1046
|
gumbo_vector_init(parser, 1, &element->children);
|
935
1047
|
element->attributes = start_tag->attributes;
|
@@ -966,20 +1078,9 @@ static void insert_element(GumboParser* parser, GumboNode* node,
|
|
966
1078
|
if (!is_reconstructing_formatting_elements) {
|
967
1079
|
maybe_flush_text_node_buffer(parser);
|
968
1080
|
}
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
foster_parent_element(parser, node);
|
973
|
-
gumbo_vector_add(parser, (void*) node, &state->_open_elements);
|
974
|
-
return;
|
975
|
-
}
|
976
|
-
|
977
|
-
// This is called to insert the root HTML element, but get_current_node
|
978
|
-
// assumes the stack of open elements is non-empty, so we need special
|
979
|
-
// handling for this case.
|
980
|
-
append_node(
|
981
|
-
parser, parser->_output->root ?
|
982
|
-
get_current_node(parser) : parser->_output->document, node);
|
1081
|
+
InsertionLocation location =
|
1082
|
+
get_appropriate_insertion_location(parser, NULL);
|
1083
|
+
insert_node(parser, node, location);
|
983
1084
|
gumbo_vector_add(parser, (void*) node, &state->_open_elements);
|
984
1085
|
}
|
985
1086
|
|
@@ -1035,7 +1136,9 @@ static GumboNode* insert_foreign_element(
|
|
1035
1136
|
|
1036
1137
|
static void insert_text_token(GumboParser* parser, GumboToken* token) {
|
1037
1138
|
assert(token->type == GUMBO_TOKEN_WHITESPACE ||
|
1038
|
-
token->type == GUMBO_TOKEN_CHARACTER
|
1139
|
+
token->type == GUMBO_TOKEN_CHARACTER ||
|
1140
|
+
token->type == GUMBO_TOKEN_NULL ||
|
1141
|
+
token->type == GUMBO_TOKEN_CDATA);
|
1039
1142
|
TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
|
1040
1143
|
if (buffer_state->_buffer.length == 0) {
|
1041
1144
|
// Initialize position fields.
|
@@ -1046,6 +1149,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
|
|
1046
1149
|
parser, token->v.character, &buffer_state->_buffer);
|
1047
1150
|
if (token->type == GUMBO_TOKEN_CHARACTER) {
|
1048
1151
|
buffer_state->_type = GUMBO_NODE_TEXT;
|
1152
|
+
} else if (token->type == GUMBO_TOKEN_CDATA) {
|
1153
|
+
buffer_state->_type = GUMBO_NODE_CDATA;
|
1049
1154
|
}
|
1050
1155
|
gumbo_debug("Inserting text token '%c'.\n", token->v.character);
|
1051
1156
|
}
|
@@ -1073,7 +1178,7 @@ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
|
|
1073
1178
|
if (node == &kActiveFormattingScopeMarker) {
|
1074
1179
|
return false;
|
1075
1180
|
}
|
1076
|
-
if (
|
1181
|
+
if (node_html_tag_is(node, GUMBO_TAG_A)) {
|
1077
1182
|
*anchor_index = i;
|
1078
1183
|
return true;
|
1079
1184
|
}
|
@@ -1097,10 +1202,8 @@ static int count_formatting_elements_of_tag(
|
|
1097
1202
|
break;
|
1098
1203
|
}
|
1099
1204
|
assert(node->type == GUMBO_NODE_ELEMENT);
|
1100
|
-
|
1101
|
-
|
1102
|
-
element->tag_namespace == desired_element->tag_namespace &&
|
1103
|
-
all_attributes_match(&element->attributes,
|
1205
|
+
if (node_qualified_tag_is(node, desired_element->tag_namespace, desired_element->tag) &&
|
1206
|
+
all_attributes_match(&node->v.element.attributes,
|
1104
1207
|
&desired_element->attributes)) {
|
1105
1208
|
num_identical_elements++;
|
1106
1209
|
*earliest_matching_index = i;
|
@@ -1150,7 +1253,7 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
|
|
1150
1253
|
// values are fresh copies.
|
1151
1254
|
GumboNode* clone_node(
|
1152
1255
|
GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
|
1153
|
-
assert(node->type == GUMBO_NODE_ELEMENT);
|
1256
|
+
assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
|
1154
1257
|
GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
|
1155
1258
|
*new_node = *node;
|
1156
1259
|
new_node->parent = NULL;
|
@@ -1220,7 +1323,10 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) {
|
|
1220
1323
|
GumboNode* clone = clone_node(
|
1221
1324
|
parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
|
1222
1325
|
// Step 9.
|
1223
|
-
|
1326
|
+
InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
|
1327
|
+
insert_node(parser, clone, location);
|
1328
|
+
gumbo_vector_add(parser, (void*) clone, &parser->_parser_state->_open_elements);
|
1329
|
+
|
1224
1330
|
// Step 10.
|
1225
1331
|
elements->data[i] = clone;
|
1226
1332
|
gumbo_debug("Reconstructed %s element at %d.\n",
|
@@ -1269,83 +1375,47 @@ static GumboQuirksModeEnum compute_quirks_mode(
|
|
1269
1375
|
// The following functions are all defined by the "has an element in __ scope"
|
1270
1376
|
// sections of the HTML5 spec:
|
1271
1377
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
|
1272
|
-
// The basic idea behind them is that they check for an element of the given
|
1273
|
-
// name, contained within a scope formed by a set of other
|
1274
|
-
// example, "has an element in list scope" looks for an element of
|
1275
|
-
// within the nearest enclosing <ol> or <ul>, along
|
1276
|
-
// element types that serve to "firewall" their content
|
1277
|
-
// document.
|
1278
|
-
|
1279
|
-
|
1378
|
+
// The basic idea behind them is that they check for an element of the given
|
1379
|
+
// qualified name, contained within a scope formed by a set of other qualified
|
1380
|
+
// names. For example, "has an element in list scope" looks for an element of
|
1381
|
+
// the given qualified name within the nearest enclosing <ol> or <ul>, along
|
1382
|
+
// with a bunch of generic element types that serve to "firewall" their content
|
1383
|
+
// from the rest of the document. Note that because of the way the spec is written,
|
1384
|
+
// all expected elements are assumed to be in the HTML namespace.
|
1385
|
+
static bool has_an_element_in_specific_scope(GumboParser* parser,
    int expected_size, const GumboTag *expected, bool negate, const gumbo_tagset tags) {
  GumboVector* stack = &parser->_parser_state->_open_elements;
  // Walk the stack of open elements from the current node downwards.
  for (int pos = (int) stack->length - 1; pos >= 0; --pos) {
    const GumboNode* candidate = stack->data[pos];
    bool is_element_like = candidate->type == GUMBO_NODE_ELEMENT ||
                           candidate->type == GUMBO_NODE_TEMPLATE;
    if (!is_element_like) {
      continue;
    }

    GumboTag candidate_tag = candidate->v.element.tag;
    GumboNamespaceEnum candidate_ns = candidate->v.element.tag_namespace;

    // Success: the candidate is an HTML element with one of the expected tags.
    for (int k = 0; k < expected_size; ++k) {
      if (candidate_ns == GUMBO_NAMESPACE_HTML && candidate_tag == expected[k]) {
        return true;
      }
    }

    // Failure: the candidate ends the scope. Normally that means it is in
    // `tags`; with `negate` set it means it is NOT in `tags`.
    bool in_tagset = TAGSET_INCLUDES(tags, candidate_ns, candidate_tag);
    if (in_tagset != negate) {
      return false;
    }
  }
  return false;
}
|
1329
1406
|
|
1330
|
-
//
|
1331
|
-
|
1332
|
-
|
1333
|
-
|
1334
|
-
// and the data inside it can be freely accessed as if it were a normal
|
1335
|
-
// GumboVector.
|
1336
|
-
#define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
|
1337
|
-
void* varname ## _tmp_array[1] = { (void*) from_var }; \
|
1338
|
-
GumboVector varname = { varname ## _tmp_array, 1, 1 }
|
1407
|
+
// Checks for the presence of an open element of the specified tag type.
|
1408
|
+
static bool has_open_element(GumboParser* parser, GumboTag tag) {
  // Only <html> ends the search, so this looks through the whole stack of
  // open elements above the root.
  gumbo_tagset scope_boundary = { TAG(HTML) };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, false, scope_boundary);
}
|
1339
1411
|
|
1340
1412
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
|
1341
1413
|
static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
  // Elements that end the default scope: a set of HTML elements plus the
  // MathML and SVG integration-point elements.
  gumbo_tagset scope_boundaries = {
    TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH),
    TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
    TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
    TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
    TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
  };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, false, scope_boundaries);
}
|
1350
1420
|
|
1351
1421
|
// Like "has an element in scope", but for the specific case of looking for a
|
@@ -1361,16 +1431,14 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
|
|
1361
1431
|
if (current == node) {
|
1362
1432
|
return true;
|
1363
1433
|
}
|
1364
|
-
if (current->type != GUMBO_NODE_ELEMENT) {
|
1434
|
+
if (current->type != GUMBO_NODE_ELEMENT && current->type != GUMBO_NODE_TEMPLATE) {
|
1365
1435
|
continue;
|
1366
1436
|
}
|
1367
|
-
if (
|
1368
|
-
|
1369
|
-
|
1370
|
-
|
1371
|
-
|
1372
|
-
GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
|
1373
|
-
GUMBO_TAG_LAST)) {
|
1437
|
+
if (node_tag_in_set(current, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML),
|
1438
|
+
TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
|
1439
|
+
TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
|
1440
|
+
TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT),
|
1441
|
+
TAG_SVG(DESC), TAG_SVG(TITLE) } )) {
|
1374
1442
|
return false;
|
1375
1443
|
}
|
1376
1444
|
}
|
@@ -1378,78 +1446,66 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
|
|
1378
1446
|
return false;
|
1379
1447
|
}
|
1380
1448
|
|
1381
|
-
// Like has_an_element_in_scope, but restricts the expected
|
1382
|
-
// possible
|
1383
|
-
static bool has_an_element_in_scope_with_tagname(GumboParser* parser,
|
1384
|
-
|
1385
|
-
|
1386
|
-
|
1387
|
-
|
1388
|
-
|
1389
|
-
va_start(args, parser);
|
1390
|
-
for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
|
1391
|
-
tag = va_arg(args, GumboTag)) {
|
1392
|
-
gumbo_vector_add(parser, (void*) tag, &tags);
|
1393
|
-
}
|
1394
|
-
bool found = has_an_element_in_specific_scope(
|
1395
|
-
parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
|
1396
|
-
GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
|
1397
|
-
GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
|
1398
|
-
GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
|
1399
|
-
GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
|
1400
|
-
gumbo_vector_destroy(parser, &tags);
|
1401
|
-
va_end(args);
|
1402
|
-
return found;
|
1449
|
+
// Like has_an_element_in_scope, but restricts the expected qualified name to a
|
1450
|
+
// range of possible qualified names instead of just a single one.
|
1451
|
+
static bool has_an_element_in_scope_with_tagname(GumboParser* parser, int expected_len, const GumboTag expected[]) {
  // Same boundary set as has_an_element_in_scope; only the list of acceptable
  // tags differs.
  gumbo_tagset scope_boundaries = {
    TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH),
    TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
    TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
    TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
    TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE)
  };
  return has_an_element_in_specific_scope(
      parser, expected_len, expected, false, scope_boundaries);
}
|
1404
1458
|
|
1405
1459
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
|
1406
1460
|
static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
  // The default scope boundaries, extended with <ol> and <ul>.
  gumbo_tagset list_scope_boundaries = {
    TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH),
    TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
    TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
    TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
    TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE),
    TAG(OL), TAG(UL)
  };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, false, list_scope_boundaries);
}
|
1416
1468
|
|
1417
1469
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
|
1418
1470
|
static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
  // The default scope boundaries, extended with <button>.
  gumbo_tagset button_scope_boundaries = {
    TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH),
    TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
    TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
    TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
    TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE),
    TAG(BUTTON)
  };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, false, button_scope_boundaries);
}
|
1427
1477
|
|
1428
1478
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
|
1429
1479
|
static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
  // Table scope ends at <html>, <table> or <template>.
  gumbo_tagset table_scope_boundaries = {
    TAG(HTML), TAG(TABLE), TAG(TEMPLATE)
  };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, false, table_scope_boundaries);
}
|
1434
1483
|
|
1435
1484
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
|
1436
1485
|
static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
  // Select scope is inverted (negate == true): every element EXCEPT
  // <optgroup> and <option> ends the scope.
  gumbo_tagset transparent_elements = { TAG(OPTGROUP), TAG(OPTION) };
  return has_an_element_in_specific_scope(
      parser, 1, &tag, true, transparent_elements);
}
|
1442
1488
|
|
1443
|
-
|
1444
1489
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
|
1445
1490
|
// "exception" is the "element to exclude from the process" listed in the spec.
|
1446
1491
|
// Pass GUMBO_TAG_LAST to not exclude any of them.
|
1447
1492
|
static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
  // Elements whose end tags may be implied.
  gumbo_tagset implied_end_tag_elements = {
    TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP),
    TAG(RB), TAG(RT), TAG(RTC)
  };
  // Keep popping while the current node is one of those elements and is not
  // the HTML element excluded by the caller.
  while (node_tag_in_set(get_current_node(parser), implied_end_tag_elements) &&
         !node_html_tag_is(get_current_node(parser), exception)) {
    pop_current_node(parser);
  }
}
|
1500
|
+
|
1501
|
+
// This is the "generate all implied end tags thoroughly" clause of the spec.
|
1502
|
+
// https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
|
1503
|
+
static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
  // Element list from the spec's "generate all implied end tags thoroughly":
  // caption, colgroup, dd, dt, li, optgroup, option, p, rb, rp, rt, rtc,
  // tbody, td, tfoot, th, thead, tr.
  //
  // The list previously contained HEAD where the spec has THEAD, so an open
  // <thead> stopped the loop while an open <head> would have been popped, and
  // RB was missing although generate_implied_end_tags() handles it.
  gumbo_tagset thoroughly_implied = {
    TAG(CAPTION), TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION),
    TAG(OPTGROUP), TAG(P), TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY),
    TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR)
  };
  // Pop open elements for as long as the current node is in the list.
  while (node_tag_in_set(get_current_node(parser), thoroughly_implied)) {
    pop_current_node(parser);
  }
}
|
1455
1511
|
|
@@ -1463,7 +1519,7 @@ static bool close_table(GumboParser* parser) {
|
|
1463
1519
|
}
|
1464
1520
|
|
1465
1521
|
GumboNode* node = pop_current_node(parser);
|
1466
|
-
while (!
|
1522
|
+
while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
|
1467
1523
|
node = pop_current_node(parser);
|
1468
1524
|
}
|
1469
1525
|
reset_insertion_mode_appropriately(parser);
|
@@ -1477,13 +1533,13 @@ static bool close_table_cell(GumboParser* parser, const GumboToken* token,
|
|
1477
1533
|
bool result = true;
|
1478
1534
|
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
1479
1535
|
const GumboNode* node = get_current_node(parser);
|
1480
|
-
if (!
|
1536
|
+
if (!node_html_tag_is(node, cell_tag)) {
|
1481
1537
|
parser_add_parse_error(parser, token);
|
1482
1538
|
result = false;
|
1483
1539
|
}
|
1484
1540
|
do {
|
1485
1541
|
node = pop_current_node(parser);
|
1486
|
-
} while (!
|
1542
|
+
} while (!node_html_tag_is(node, cell_tag));
|
1487
1543
|
|
1488
1544
|
clear_active_formatting_elements(parser);
|
1489
1545
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
@@ -1508,7 +1564,7 @@ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
|
|
1508
1564
|
// resets the insertion mode appropriately.
|
1509
1565
|
static void close_current_select(GumboParser* parser) {
|
1510
1566
|
GumboNode* node = pop_current_node(parser);
|
1511
|
-
while (!
|
1567
|
+
while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
|
1512
1568
|
node = pop_current_node(parser);
|
1513
1569
|
}
|
1514
1570
|
reset_insertion_mode_appropriately(parser);
|
@@ -1517,60 +1573,43 @@ static void close_current_select(GumboParser* parser) {
|
|
1517
1573
|
// The list of nodes in the "special" category:
|
1518
1574
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
|
1519
1575
|
static bool is_special_node(const GumboNode* node) {
|
1520
|
-
assert(node->type == GUMBO_NODE_ELEMENT);
|
1521
|
-
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1525
|
-
|
1526
|
-
|
1527
|
-
|
1528
|
-
|
1529
|
-
|
1530
|
-
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
1535
|
-
|
1536
|
-
|
1537
|
-
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
|
1544
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
|
1545
|
-
GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
|
1546
|
-
case GUMBO_NAMESPACE_MATHML:
|
1547
|
-
return node_tag_in(node,
|
1548
|
-
GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
|
1549
|
-
GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
|
1550
|
-
case GUMBO_NAMESPACE_SVG:
|
1551
|
-
return node_tag_in(node,
|
1552
|
-
GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
|
1553
|
-
}
|
1554
|
-
abort();
|
1555
|
-
return false; // Pacify compiler.
|
1556
|
-
}
|
1557
|
-
|
1558
|
-
// Implicitly closes currently open tags until it reaches an element with the
|
1559
|
-
// specified tag name. If the elements closed are in the set handled by
|
1576
|
+
assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
|
1577
|
+
return node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(APPLET), TAG(AREA),
|
1578
|
+
TAG(ARTICLE), TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
|
1579
|
+
TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
|
1580
|
+
TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), TAG(DIV), TAG(DL),
|
1581
|
+
TAG(DT), TAG(EMBED), TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER),
|
1582
|
+
TAG(FORM), TAG(FRAME), TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4),
|
1583
|
+
TAG(H5), TAG(H6), TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML),
|
1584
|
+
TAG(IFRAME), TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK),
|
1585
|
+
TAG(LISTING), TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
|
1586
|
+
TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), TAG(PARAM),
|
1587
|
+
TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), TAG(SELECT), TAG(STYLE),
|
1588
|
+
TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA),
|
1589
|
+
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
|
1590
|
+
|
1591
|
+
TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
|
1592
|
+
TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
|
1593
|
+
|
1594
|
+
TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC) });
|
1595
|
+
}
|
1596
|
+
|
1597
|
+
// Implicitly closes currently open elements until it reaches an element with the
|
1598
|
+
// specified qualified name. If the elements closed are in the set handled by
|
1560
1599
|
// generate_implied_end_tags, this is normal operation and this function returns
|
1561
1600
|
// true. Otherwise, a parse error is recorded and this function returns false.
|
1562
1601
|
static bool implicitly_close_tags(
|
1563
|
-
GumboParser* parser, GumboToken* token, GumboTag target) {
|
1602
|
+
GumboParser* parser, GumboToken* token, GumboNamespaceEnum target_ns, GumboTag target) {
|
1564
1603
|
bool result = true;
|
1565
1604
|
generate_implied_end_tags(parser, target);
|
1566
|
-
if (!
|
1605
|
+
if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
|
1567
1606
|
parser_add_parse_error(parser, token);
|
1568
|
-
while (!
|
1607
|
+
while (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
|
1569
1608
|
pop_current_node(parser);
|
1570
1609
|
}
|
1571
1610
|
result = false;
|
1572
1611
|
}
|
1573
|
-
assert(
|
1612
|
+
assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
|
1574
1613
|
pop_current_node(parser);
|
1575
1614
|
return result;
|
1576
1615
|
}
|
@@ -1581,7 +1620,7 @@ static bool implicitly_close_tags(
|
|
1581
1620
|
// clause appears several times in the spec.
|
1582
1621
|
static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
|
1583
1622
|
if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
|
1584
|
-
return implicitly_close_tags(parser, token, GUMBO_TAG_P);
|
1623
|
+
return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
|
1585
1624
|
}
|
1586
1625
|
return true;
|
1587
1626
|
}
|
@@ -1595,15 +1634,14 @@ static void maybe_implicitly_close_list_tag(
|
|
1595
1634
|
for (int i = state->_open_elements.length; --i >= 0; ) {
|
1596
1635
|
const GumboNode* node = state->_open_elements.data[i];
|
1597
1636
|
bool is_list_tag = is_li ?
|
1598
|
-
|
1599
|
-
|
1637
|
+
node_html_tag_is(node, GUMBO_TAG_LI) :
|
1638
|
+
node_tag_in_set(node, (gumbo_tagset) { TAG(DD), TAG(DT) } );
|
1600
1639
|
if (is_list_tag) {
|
1601
|
-
implicitly_close_tags(parser, token, node->v.element.tag);
|
1640
|
+
implicitly_close_tags(parser, token, node->v.element.tag_namespace, node->v.element.tag);
|
1602
1641
|
return;
|
1603
1642
|
}
|
1604
1643
|
if (is_special_node(node) &&
|
1605
|
-
!
|
1606
|
-
GUMBO_TAG_LAST)) {
|
1644
|
+
!node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(DIV), TAG(P) })) {
|
1607
1645
|
return;
|
1608
1646
|
}
|
1609
1647
|
}
|
@@ -1758,13 +1796,20 @@ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
|
|
1758
1796
|
|
1759
1797
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
|
1760
1798
|
// Also described in the "in body" handling for end formatting tags.
|
1761
|
-
static bool adoption_agency_algorithm(
|
1762
|
-
GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
|
1799
|
+
static bool adoption_agency_algorithm(GumboParser* parser, GumboToken* token, GumboTag subject) {
|
1763
1800
|
GumboParserState* state = parser->_parser_state;
|
1764
1801
|
gumbo_debug("Entering adoption agency algorithm.\n");
|
1765
|
-
//
|
1802
|
+
// Step 1.
|
1803
|
+
GumboNode* current_node = get_current_node(parser);
|
1804
|
+
if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
|
1805
|
+
current_node->v.element.tag == subject &&
|
1806
|
+
gumbo_vector_index_of(&state->_active_formatting_elements, current_node) == -1) {
|
1807
|
+
pop_current_node(parser);
|
1808
|
+
return false;
|
1809
|
+
}
|
1810
|
+
// Steps 2-4 & 20:
|
1766
1811
|
for (int i = 0; i < 8; ++i) {
|
1767
|
-
// Step
|
1812
|
+
// Step 5.
|
1768
1813
|
GumboNode* formatting_node = NULL;
|
1769
1814
|
int formatting_node_in_open_elements = -1;
|
1770
1815
|
for (int j = state->_active_formatting_elements.length; --j >= 0; ) {
|
@@ -1774,13 +1819,13 @@ static bool adoption_agency_algorithm(
|
|
1774
1819
|
// Last scope marker; abort the algorithm.
|
1775
1820
|
return false;
|
1776
1821
|
}
|
1777
|
-
if (
|
1822
|
+
if (node_html_tag_is(current_node, subject)) {
|
1778
1823
|
// Found it.
|
1779
1824
|
formatting_node = current_node;
|
1780
1825
|
formatting_node_in_open_elements = gumbo_vector_index_of(
|
1781
|
-
|
1826
|
+
&state->_open_elements, formatting_node);
|
1782
1827
|
gumbo_debug("Formatting element of tag %s at %d.\n",
|
1783
|
-
gumbo_normalized_tagname(
|
1828
|
+
gumbo_normalized_tagname(subject),
|
1784
1829
|
formatting_node_in_open_elements);
|
1785
1830
|
break;
|
1786
1831
|
}
|
@@ -1793,39 +1838,44 @@ static bool adoption_agency_algorithm(
|
|
1793
1838
|
return false;
|
1794
1839
|
}
|
1795
1840
|
|
1841
|
+
// Step 6
|
1796
1842
|
if (formatting_node_in_open_elements == -1) {
|
1797
1843
|
gumbo_debug("Formatting node not on stack of open elements.\n");
|
1844
|
+
parser_add_parse_error(parser, token);
|
1798
1845
|
gumbo_vector_remove(parser, formatting_node,
|
1799
1846
|
&state->_active_formatting_elements);
|
1800
1847
|
return false;
|
1801
1848
|
}
|
1802
1849
|
|
1850
|
+
// Step 7
|
1803
1851
|
if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
|
1804
1852
|
parser_add_parse_error(parser, token);
|
1805
1853
|
gumbo_debug("Element not in scope.\n");
|
1806
1854
|
return false;
|
1807
1855
|
}
|
1856
|
+
|
1857
|
+
// Step 8
|
1808
1858
|
if (formatting_node != get_current_node(parser)) {
|
1809
1859
|
parser_add_parse_error(parser, token); // But continue onwards.
|
1810
1860
|
}
|
1811
1861
|
assert(formatting_node);
|
1812
|
-
assert(!
|
1813
|
-
assert(!
|
1862
|
+
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
|
1863
|
+
assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
|
1814
1864
|
|
1815
|
-
// Step
|
1865
|
+
// Step 9 & 10
|
1816
1866
|
GumboNode* furthest_block = NULL;
|
1817
1867
|
for (int j = formatting_node_in_open_elements;
|
1818
1868
|
j < state->_open_elements.length; ++j) {
|
1819
1869
|
assert(j > 0);
|
1820
1870
|
GumboNode* current = state->_open_elements.data[j];
|
1821
1871
|
if (is_special_node(current)) {
|
1822
|
-
// Step
|
1872
|
+
// Step 9.
|
1823
1873
|
furthest_block = current;
|
1824
1874
|
break;
|
1825
1875
|
}
|
1826
1876
|
}
|
1827
1877
|
if (!furthest_block) {
|
1828
|
-
// Step
|
1878
|
+
// Step 10.
|
1829
1879
|
while (get_current_node(parser) != formatting_node) {
|
1830
1880
|
pop_current_node(parser);
|
1831
1881
|
}
|
@@ -1835,35 +1885,38 @@ static bool adoption_agency_algorithm(
|
|
1835
1885
|
&state->_active_formatting_elements);
|
1836
1886
|
return false;
|
1837
1887
|
}
|
1838
|
-
assert(!
|
1888
|
+
assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
|
1839
1889
|
assert(furthest_block);
|
1840
1890
|
|
1841
|
-
// Step
|
1891
|
+
// Step 11.
|
1842
1892
|
// Elements may be moved and reparented by this algorithm, so
|
1843
1893
|
// common_ancestor is not necessarily the same as formatting_node->parent.
|
1844
1894
|
GumboNode* common_ancestor =
|
1845
|
-
|
1846
|
-
|
1895
|
+
state->_open_elements.data[gumbo_vector_index_of(
|
1896
|
+
&state->_open_elements, formatting_node) - 1];
|
1847
1897
|
gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
|
1848
1898
|
gumbo_normalized_tagname(common_ancestor->v.element.tag),
|
1849
1899
|
gumbo_normalized_tagname(furthest_block->v.element.tag));
|
1850
1900
|
|
1851
|
-
// Step
|
1901
|
+
// Step 12.
|
1852
1902
|
int bookmark = gumbo_vector_index_of(
|
1853
|
-
|
1854
|
-
|
1903
|
+
&state->_active_formatting_elements, formatting_node) + 1;
|
1904
|
+
gumbo_debug("Bookmark at %d.\n", bookmark);
|
1905
|
+
// Step 13.
|
1855
1906
|
GumboNode* node = furthest_block;
|
1856
1907
|
GumboNode* last_node = furthest_block;
|
1857
1908
|
// Must be stored explicitly, in case node is removed from the stack of open
|
1858
1909
|
// elements, to handle step 9.4.
|
1859
1910
|
int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
1860
1911
|
assert(saved_node_index > 0);
|
1861
|
-
// Step
|
1862
|
-
for (int j = 0
|
1863
|
-
// Step
|
1912
|
+
// Step 13.1.
|
1913
|
+
for (int j = 0;;) {
|
1914
|
+
// Step 13.2.
|
1915
|
+
++j;
|
1916
|
+
// Step 13.3.
|
1864
1917
|
int node_index = gumbo_vector_index_of(&state->_open_elements, node);
|
1865
1918
|
gumbo_debug(
|
1866
|
-
|
1919
|
+
"Current index: %d, last index: %d.\n", node_index, saved_node_index);
|
1867
1920
|
if (node_index == -1) {
|
1868
1921
|
node_index = saved_node_index;
|
1869
1922
|
}
|
@@ -1872,62 +1925,78 @@ static bool adoption_agency_algorithm(
|
|
1872
1925
|
assert(node_index < state->_open_elements.capacity);
|
1873
1926
|
node = state->_open_elements.data[node_index];
|
1874
1927
|
assert(node->parent);
|
1875
|
-
|
1876
|
-
|
1877
|
-
|
1928
|
+
if (node == formatting_node) {
|
1929
|
+
// Step 13.4.
|
1930
|
+
break;
|
1931
|
+
}
|
1932
|
+
int formatting_index =
|
1933
|
+
gumbo_vector_index_of(&state->_active_formatting_elements, node);
|
1934
|
+
if (j > 3 && formatting_index != -1) {
|
1935
|
+
// Step 13.5.
|
1936
|
+
gumbo_debug(
|
1937
|
+
"Removing formatting element at %d.\n", formatting_index);
|
1938
|
+
gumbo_vector_remove_at(
|
1939
|
+
parser,
|
1940
|
+
formatting_index,
|
1941
|
+
&state->_active_formatting_elements);
|
1942
|
+
// Removing the element shifts all indices over by one, so we may need
|
1943
|
+
// to move the bookmark.
|
1944
|
+
if (formatting_index < bookmark) {
|
1945
|
+
--bookmark;
|
1946
|
+
gumbo_debug("Moving bookmark to %d.\n", bookmark);
|
1947
|
+
}
|
1948
|
+
continue;
|
1949
|
+
}
|
1950
|
+
if (formatting_index == -1) {
|
1951
|
+
// Step 13.6.
|
1878
1952
|
gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
|
1879
1953
|
continue;
|
1880
|
-
} else if (node == formatting_node) {
|
1881
|
-
// Step 9.6.
|
1882
|
-
break;
|
1883
1954
|
}
|
1884
|
-
// Step
|
1885
|
-
|
1886
|
-
|
1955
|
+
// Step 13.7.
|
1956
|
+
// "common ancestor as the intended parent" doesn't actually mean insert
|
1957
|
+
// it into the common ancestor; that happens below.
|
1887
1958
|
node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
|
1959
|
+
assert(formatting_index >= 0);
|
1888
1960
|
state->_active_formatting_elements.data[formatting_index] = node;
|
1961
|
+
assert(node_index >= 0);
|
1889
1962
|
state->_open_elements.data[node_index] = node;
|
1890
|
-
// Step
|
1963
|
+
// Step 13.8.
|
1891
1964
|
if (last_node == furthest_block) {
|
1892
1965
|
bookmark = formatting_index + 1;
|
1966
|
+
gumbo_debug("Bookmark moved to %d.\n", bookmark);
|
1893
1967
|
assert(bookmark <= state->_active_formatting_elements.length);
|
1894
1968
|
}
|
1895
|
-
// Step
|
1969
|
+
// Step 13.9.
|
1896
1970
|
last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
|
1897
1971
|
remove_from_parent(parser, last_node);
|
1898
1972
|
append_node(parser, node, last_node);
|
1899
|
-
// Step
|
1973
|
+
// Step 13.10.
|
1900
1974
|
last_node = node;
|
1901
|
-
}
|
1975
|
+
} // Step 13.11.
|
1902
1976
|
|
1903
|
-
// Step
|
1977
|
+
// Step 14.
|
1904
1978
|
gumbo_debug("Removing %s node from parent ",
|
1905
1979
|
gumbo_normalized_tagname(last_node->v.element.tag));
|
1906
1980
|
remove_from_parent(parser, last_node);
|
1907
1981
|
last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
|
1908
|
-
|
1909
|
-
|
1910
|
-
|
1911
|
-
|
1912
|
-
|
1913
|
-
} else {
|
1914
|
-
gumbo_debug("and inserting it into %s.\n",
|
1915
|
-
gumbo_normalized_tagname(common_ancestor->v.element.tag));
|
1916
|
-
append_node(parser, common_ancestor, last_node);
|
1917
|
-
}
|
1982
|
+
InsertionLocation location =
|
1983
|
+
get_appropriate_insertion_location(parser, common_ancestor);
|
1984
|
+
gumbo_debug("and inserting it into %s.\n",
|
1985
|
+
gumbo_normalized_tagname(location.target->v.element.tag));
|
1986
|
+
insert_node(parser, last_node, location);
|
1918
1987
|
|
1919
|
-
// Step
|
1988
|
+
// Step 15.
|
1920
1989
|
GumboNode* new_formatting_node = clone_node(
|
1921
|
-
|
1990
|
+
parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
|
1922
1991
|
formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
|
1923
1992
|
|
1924
|
-
// Step
|
1993
|
+
// Step 16. Instead of appending nodes one-by-one, we swap the children
|
1925
1994
|
// vector of furthest_block with the empty children of new_formatting_node,
|
1926
1995
|
// reducing memory traffic and allocations. We still have to reset their
|
1927
1996
|
// parent pointers, though.
|
1928
1997
|
GumboVector temp = new_formatting_node->v.element.children;
|
1929
1998
|
new_formatting_node->v.element.children =
|
1930
|
-
|
1999
|
+
furthest_block->v.element.children;
|
1931
2000
|
furthest_block->v.element.children = temp;
|
1932
2001
|
|
1933
2002
|
temp = new_formatting_node->v.element.children;
|
@@ -1936,36 +2005,39 @@ static bool adoption_agency_algorithm(
|
|
1936
2005
|
child->parent = new_formatting_node;
|
1937
2006
|
}
|
1938
2007
|
|
1939
|
-
// Step
|
2008
|
+
// Step 17.
|
1940
2009
|
append_node(parser, furthest_block, new_formatting_node);
|
1941
2010
|
|
1942
|
-
// Step
|
2011
|
+
// Step 18.
|
1943
2012
|
// If the formatting node was before the bookmark, it may shift over all
|
1944
2013
|
// indices after it, so we need to explicitly find the index and possibly
|
1945
2014
|
// adjust the bookmark.
|
1946
2015
|
int formatting_node_index = gumbo_vector_index_of(
|
1947
|
-
|
2016
|
+
&state->_active_formatting_elements, formatting_node);
|
1948
2017
|
assert(formatting_node_index != -1);
|
1949
2018
|
if (formatting_node_index < bookmark) {
|
2019
|
+
gumbo_debug(
|
2020
|
+
"Formatting node at %d is before bookmark at %d; decrementing.\n",
|
2021
|
+
formatting_node_index, bookmark);
|
1950
2022
|
--bookmark;
|
1951
2023
|
}
|
1952
2024
|
gumbo_vector_remove_at(
|
1953
|
-
|
2025
|
+
parser, formatting_node_index, &state->_active_formatting_elements);
|
1954
2026
|
assert(bookmark >= 0);
|
1955
2027
|
assert(bookmark <= state->_active_formatting_elements.length);
|
1956
2028
|
gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
|
1957
2029
|
&state->_active_formatting_elements);
|
1958
2030
|
|
1959
|
-
// Step
|
2031
|
+
// Step 19.
|
1960
2032
|
gumbo_vector_remove(
|
1961
|
-
|
2033
|
+
parser, formatting_node, &state->_open_elements);
|
1962
2034
|
int insert_at = gumbo_vector_index_of(
|
1963
|
-
|
2035
|
+
&state->_open_elements, furthest_block) + 1;
|
1964
2036
|
assert(insert_at >= 0);
|
1965
2037
|
assert(insert_at <= state->_open_elements.length);
|
1966
2038
|
gumbo_vector_insert_at(
|
1967
|
-
|
1968
|
-
}
|
2039
|
+
parser, new_formatting_node, insert_at, &state->_open_elements);
|
2040
|
+
} // Step 20.
|
1969
2041
|
return true;
|
1970
2042
|
}
|
1971
2043
|
|
@@ -1992,8 +2064,8 @@ static void finish_parsing(GumboParser* parser) {
|
|
1992
2064
|
GumboParserState* state = parser->_parser_state;
|
1993
2065
|
for (GumboNode* node = pop_current_node(parser); node;
|
1994
2066
|
node = pop_current_node(parser)) {
|
1995
|
-
if ((
|
1996
|
-
(
|
2067
|
+
if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
|
2068
|
+
(node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
|
1997
2069
|
continue;
|
1998
2070
|
}
|
1999
2071
|
node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
|
@@ -2042,9 +2114,9 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
|
|
2042
2114
|
parser->_output->root = html_node;
|
2043
2115
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
|
2044
2116
|
return true;
|
2045
|
-
} else if (token->type == GUMBO_TOKEN_END_TAG &&
|
2046
|
-
|
2047
|
-
|
2117
|
+
} else if (token->type == GUMBO_TOKEN_END_TAG &&
|
2118
|
+
!tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
|
2119
|
+
TAG(BR) } )) {
|
2048
2120
|
parser_add_parse_error(parser, token);
|
2049
2121
|
ignore_token(parser);
|
2050
2122
|
return false;
|
@@ -2076,9 +2148,9 @@ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
|
|
2076
2148
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2077
2149
|
parser->_parser_state->_head_element = node;
|
2078
2150
|
return true;
|
2079
|
-
} else if (token->type == GUMBO_TOKEN_END_TAG &&
|
2080
|
-
|
2081
|
-
|
2151
|
+
} else if (token->type == GUMBO_TOKEN_END_TAG &&
|
2152
|
+
!tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
|
2153
|
+
TAG(BR) })) {
|
2082
2154
|
parser_add_parse_error(parser, token);
|
2083
2155
|
ignore_token(parser);
|
2084
2156
|
return false;
|
@@ -2110,9 +2182,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2110
2182
|
return true;
|
2111
2183
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2112
2184
|
return handle_in_body(parser, token);
|
2113
|
-
} else if (tag_in(token, kStartTag,
|
2114
|
-
|
2115
|
-
GUMBO_TAG_LAST)) {
|
2185
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
|
2186
|
+
TAG(BGSOUND), TAG(MENUITEM), TAG(LINK) })) {
|
2116
2187
|
insert_element_from_token(parser, token);
|
2117
2188
|
pop_current_node(parser);
|
2118
2189
|
acknowledge_self_closing_tag(parser);
|
@@ -2129,8 +2200,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2129
2200
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
|
2130
2201
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
2131
2202
|
return true;
|
2132
|
-
} else if (tag_in(token, kStartTag,
|
2133
|
-
GUMBO_TAG_LAST)) {
|
2203
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(NOFRAMES), TAG(STYLE) })) {
|
2134
2204
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
|
2135
2205
|
return true;
|
2136
2206
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
|
@@ -2143,32 +2213,48 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2143
2213
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
|
2144
2214
|
GumboNode* head = pop_current_node(parser);
|
2145
2215
|
AVOID_UNUSED_VARIABLE_WARNING(head);
|
2146
|
-
assert(
|
2216
|
+
assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
|
2147
2217
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2148
2218
|
return true;
|
2149
|
-
} else if (
|
2150
|
-
|
2151
|
-
|
2152
|
-
|
2153
|
-
|
2154
|
-
|
2155
|
-
|
2156
|
-
|
2157
|
-
|
2158
|
-
|
2159
|
-
|
2219
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) })) {
|
2220
|
+
pop_current_node(parser);
|
2221
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2222
|
+
parser->_parser_state->_reprocess_current_token = true;
|
2223
|
+
return true;
|
2224
|
+
} else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
|
2225
|
+
insert_element_from_token(parser, token);
|
2226
|
+
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
2227
|
+
parser->_parser_state->_frameset_ok = false;
|
2228
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2229
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
2230
|
+
return true;
|
2231
|
+
} else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2232
|
+
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2233
|
+
parser_add_parse_error(parser, token);
|
2234
|
+
ignore_token(parser);
|
2235
|
+
return false;
|
2236
|
+
}
|
2237
|
+
generate_all_implied_end_tags_thoroughly(parser);
|
2238
|
+
bool success = true;
|
2239
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
|
2240
|
+
parser_add_parse_error(parser, token);
|
2241
|
+
success = false;
|
2242
|
+
}
|
2243
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE));
|
2244
|
+
clear_active_formatting_elements(parser);
|
2245
|
+
pop_template_insertion_mode(parser);
|
2246
|
+
reset_insertion_mode_appropriately(parser);
|
2247
|
+
return success;
|
2248
|
+
} else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || (token->type == GUMBO_TOKEN_END_TAG)) {
|
2160
2249
|
parser_add_parse_error(parser, token);
|
2161
2250
|
ignore_token(parser);
|
2162
2251
|
return false;
|
2163
2252
|
} else {
|
2164
|
-
|
2165
|
-
assert(node_tag_is(node, GUMBO_TAG_HEAD));
|
2166
|
-
AVOID_UNUSED_VARIABLE_WARNING(node);
|
2253
|
+
pop_current_node(parser);
|
2167
2254
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
|
2168
2255
|
parser->_parser_state->_reprocess_current_token = true;
|
2169
2256
|
return true;
|
2170
2257
|
}
|
2171
|
-
|
2172
2258
|
return true;
|
2173
2259
|
}
|
2174
2260
|
|
@@ -2181,18 +2267,16 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2181
2267
|
return handle_in_body(parser, token);
|
2182
2268
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
|
2183
2269
|
const GumboNode* node = pop_current_node(parser);
|
2184
|
-
assert(
|
2270
|
+
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2185
2271
|
AVOID_UNUSED_VARIABLE_WARNING(node);
|
2186
2272
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2187
2273
|
return true;
|
2188
2274
|
} else if (token->type == GUMBO_TOKEN_WHITESPACE ||
|
2189
2275
|
token->type == GUMBO_TOKEN_COMMENT ||
|
2190
|
-
tag_in(token, kStartTag,
|
2191
|
-
|
2192
|
-
|
2193
|
-
|
2194
|
-
} else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
|
2195
|
-
GUMBO_TAG_LAST) ||
|
2276
|
+
tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASEFONT), TAG(BGSOUND),
|
2277
|
+
TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(STYLE) })) {
|
2278
|
+
return handle_in_head(parser, token);
|
2279
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(HEAD), TAG(NOSCRIPT) }) ||
|
2196
2280
|
(token->type == GUMBO_TOKEN_END_TAG &&
|
2197
2281
|
!tag_is(token, kEndTag, GUMBO_TAG_BR))) {
|
2198
2282
|
parser_add_parse_error(parser, token);
|
@@ -2201,7 +2285,7 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
|
|
2201
2285
|
} else {
|
2202
2286
|
parser_add_parse_error(parser, token);
|
2203
2287
|
const GumboNode* node = pop_current_node(parser);
|
2204
|
-
assert(
|
2288
|
+
assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
|
2205
2289
|
AVOID_UNUSED_VARIABLE_WARNING(node);
|
2206
2290
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
|
2207
2291
|
parser->_parser_state->_reprocess_current_token = true;
|
@@ -2233,10 +2317,10 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2233
2317
|
insert_element_from_token(parser, token);
|
2234
2318
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
|
2235
2319
|
return true;
|
2236
|
-
} else if (tag_in(token, kStartTag,
|
2237
|
-
|
2238
|
-
|
2239
|
-
|
2320
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
|
2321
|
+
TAG(BGSOUND), TAG(LINK), TAG(META),
|
2322
|
+
TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
|
2323
|
+
TAG(TEMPLATE), TAG(TITLE) })) {
|
2240
2324
|
parser_add_parse_error(parser, token);
|
2241
2325
|
assert(state->_head_element != NULL);
|
2242
2326
|
// This must be flushed before we push the head element on, as there may be
|
@@ -2246,10 +2330,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2246
2330
|
bool result = handle_in_head(parser, token);
|
2247
2331
|
gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
|
2248
2332
|
return result;
|
2333
|
+
} else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2334
|
+
return handle_in_head(parser, token);
|
2249
2335
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
|
2250
2336
|
(token->type == GUMBO_TOKEN_END_TAG &&
|
2251
|
-
!tag_in(token, kEndTag,
|
2252
|
-
GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
|
2337
|
+
!tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) }))) {
|
2253
2338
|
parser_add_parse_error(parser, token);
|
2254
2339
|
ignore_token(parser);
|
2255
2340
|
return false;
|
@@ -2261,28 +2346,23 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
|
|
2261
2346
|
}
|
2262
2347
|
}
|
2263
2348
|
|
2264
|
-
static
|
2349
|
+
static GumboNode* destroy_node(GumboParser* parser, GumboNode* node) {
|
2265
2350
|
switch (node->type) {
|
2266
2351
|
case GUMBO_NODE_DOCUMENT:
|
2267
2352
|
{
|
2268
2353
|
GumboDocument* doc = &node->v.document;
|
2269
|
-
for (int i = 0; i < doc->children.length; ++i) {
|
2270
|
-
destroy_node(parser, doc->children.data[i]);
|
2271
|
-
}
|
2272
2354
|
gumbo_parser_deallocate(parser, (void*) doc->children.data);
|
2273
2355
|
gumbo_parser_deallocate(parser, (void*) doc->name);
|
2274
2356
|
gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
|
2275
2357
|
gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
|
2276
2358
|
}
|
2277
2359
|
break;
|
2360
|
+
case GUMBO_NODE_TEMPLATE:
|
2278
2361
|
case GUMBO_NODE_ELEMENT:
|
2279
2362
|
for (int i = 0; i < node->v.element.attributes.length; ++i) {
|
2280
2363
|
gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
|
2281
2364
|
}
|
2282
2365
|
gumbo_parser_deallocate(parser, node->v.element.attributes.data);
|
2283
|
-
for (int i = 0; i < node->v.element.children.length; ++i) {
|
2284
|
-
destroy_node(parser, node->v.element.children.data[i]);
|
2285
|
-
}
|
2286
2366
|
gumbo_parser_deallocate(parser, node->v.element.children.data);
|
2287
2367
|
break;
|
2288
2368
|
case GUMBO_NODE_TEXT:
|
@@ -2292,7 +2372,21 @@ static void destroy_node(GumboParser* parser, GumboNode* node) {
|
|
2292
2372
|
gumbo_parser_deallocate(parser, (void*) node->v.text.text);
|
2293
2373
|
break;
|
2294
2374
|
}
|
2375
|
+
// Remove from the next/prev linked list.
|
2376
|
+
GumboNode* prev = node->prev;
|
2377
|
+
GumboNode* next = node->next;
|
2378
|
+
if (prev != NULL) {
|
2379
|
+
prev->next = next;
|
2380
|
+
}
|
2381
|
+
if (next != NULL) {
|
2382
|
+
next->prev = prev;
|
2383
|
+
}
|
2384
|
+
if (parser->_parser_state && parser->_parser_state->_current_node == node) {
|
2385
|
+
parser->_parser_state->_current_node = prev;
|
2386
|
+
}
|
2387
|
+
|
2295
2388
|
gumbo_parser_deallocate(parser, node);
|
2389
|
+
return next;
|
2296
2390
|
}
|
2297
2391
|
|
2298
2392
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
|
@@ -2307,7 +2401,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2307
2401
|
reconstruct_active_formatting_elements(parser);
|
2308
2402
|
insert_text_token(parser, token);
|
2309
2403
|
return true;
|
2310
|
-
} else if (token->type == GUMBO_TOKEN_CHARACTER
|
2404
|
+
} else if (token->type == GUMBO_TOKEN_CHARACTER ||
|
2405
|
+
token->type == GUMBO_TOKEN_CDATA) {
|
2311
2406
|
reconstruct_active_formatting_elements(parser);
|
2312
2407
|
insert_text_token(parser, token);
|
2313
2408
|
set_frameset_not_ok(parser);
|
@@ -2320,20 +2415,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2320
2415
|
ignore_token(parser);
|
2321
2416
|
return false;
|
2322
2417
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2418
|
+
parser_add_parse_error(parser, token);
|
2419
|
+
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2420
|
+
ignore_token(parser);
|
2421
|
+
return false;
|
2422
|
+
}
|
2323
2423
|
assert(parser->_output->root != NULL);
|
2324
2424
|
assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
|
2325
|
-
parser_add_parse_error(parser, token);
|
2326
2425
|
merge_attributes(parser, token, parser->_output->root);
|
2327
2426
|
return false;
|
2328
|
-
} else if (tag_in(token, kStartTag,
|
2329
|
-
|
2330
|
-
|
2331
|
-
|
2427
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
|
2428
|
+
TAG(BGSOUND), TAG(MENUITEM), TAG(LINK),
|
2429
|
+
TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
|
2430
|
+
TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) } ) || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
2332
2431
|
return handle_in_head(parser, token);
|
2333
2432
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
|
2334
2433
|
parser_add_parse_error(parser, token);
|
2335
2434
|
if (state->_open_elements.length < 2 ||
|
2336
|
-
!
|
2435
|
+
!node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2337
2436
|
ignore_token(parser);
|
2338
2437
|
return false;
|
2339
2438
|
}
|
@@ -2343,7 +2442,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2343
2442
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
|
2344
2443
|
parser_add_parse_error(parser, token);
|
2345
2444
|
if (state->_open_elements.length < 2 ||
|
2346
|
-
!
|
2445
|
+
!node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
|
2347
2446
|
!state->_frameset_ok) {
|
2348
2447
|
ignore_token(parser);
|
2349
2448
|
return false;
|
@@ -2381,18 +2480,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2381
2480
|
return true;
|
2382
2481
|
} else if (token->type == GUMBO_TOKEN_EOF) {
|
2383
2482
|
for (int i = 0; i < state->_open_elements.length; ++i) {
|
2384
|
-
if (!
|
2385
|
-
|
2386
|
-
|
2387
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
|
2388
|
-
GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
|
2483
|
+
if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) { TAG(DD),
|
2484
|
+
TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
|
2485
|
+
TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML) } )) {
|
2389
2486
|
parser_add_parse_error(parser, token);
|
2390
|
-
return false;
|
2391
2487
|
}
|
2392
2488
|
}
|
2489
|
+
if (get_current_template_insertion_mode(parser) != GUMBO_INSERTION_MODE_INITIAL) {
|
2490
|
+
return handle_in_template(parser, token);
|
2491
|
+
}
|
2393
2492
|
return true;
|
2394
|
-
} else if (tag_in(token, kEndTag,
|
2395
|
-
GUMBO_TAG_LAST)) {
|
2493
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML) })) {
|
2396
2494
|
if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
|
2397
2495
|
parser_add_parse_error(parser, token);
|
2398
2496
|
ignore_token(parser);
|
@@ -2400,13 +2498,11 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2400
2498
|
}
|
2401
2499
|
bool success = true;
|
2402
2500
|
for (int i = 0; i < state->_open_elements.length; ++i) {
|
2403
|
-
if (!
|
2404
|
-
|
2405
|
-
|
2406
|
-
|
2407
|
-
|
2408
|
-
GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
|
2409
|
-
GUMBO_TAG_LAST)) {
|
2501
|
+
if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) {
|
2502
|
+
TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
|
2503
|
+
TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
|
2504
|
+
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
|
2505
|
+
TAG(BODY), TAG(HTML) })) {
|
2410
2506
|
parser_add_parse_error(parser, token);
|
2411
2507
|
success = false;
|
2412
2508
|
break;
|
@@ -2417,58 +2513,54 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2417
2513
|
parser->_parser_state->_reprocess_current_token = true;
|
2418
2514
|
} else {
|
2419
2515
|
GumboNode* body = state->_open_elements.data[1];
|
2420
|
-
assert(
|
2516
|
+
assert(node_html_tag_is(body, GUMBO_TAG_BODY));
|
2421
2517
|
record_end_of_element(state->_current_token, &body->v.element);
|
2422
2518
|
}
|
2423
2519
|
return success;
|
2424
|
-
} else if (tag_in(token, kStartTag,
|
2425
|
-
|
2426
|
-
|
2427
|
-
|
2428
|
-
|
2429
|
-
GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
|
2430
|
-
GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
|
2431
|
-
GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
|
2520
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
|
2521
|
+
TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS),
|
2522
|
+
TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
|
2523
|
+
TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(MENU), TAG(MAIN),
|
2524
|
+
TAG(NAV), TAG(OL), TAG(P), TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
|
2432
2525
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2433
2526
|
insert_element_from_token(parser, token);
|
2434
2527
|
return result;
|
2435
|
-
} else if (tag_in(token, kStartTag,
|
2436
|
-
|
2528
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(H1), TAG(H2), TAG(H3),
|
2529
|
+
TAG(H4), TAG(H5), TAG(H6) })) {
|
2437
2530
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2438
|
-
if (
|
2439
|
-
|
2440
|
-
GUMBO_TAG_LAST)) {
|
2531
|
+
if (node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(H1), TAG(H2),
|
2532
|
+
TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
|
2441
2533
|
parser_add_parse_error(parser, token);
|
2442
2534
|
pop_current_node(parser);
|
2443
2535
|
result = false;
|
2444
2536
|
}
|
2445
2537
|
insert_element_from_token(parser, token);
|
2446
2538
|
return result;
|
2447
|
-
|
2448
|
-
GUMBO_TAG_LAST)) {
|
2539
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(PRE), TAG(LISTING) })) {
|
2449
2540
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2450
2541
|
insert_element_from_token(parser, token);
|
2451
2542
|
state->_ignore_next_linefeed = true;
|
2452
2543
|
state->_frameset_ok = false;
|
2453
2544
|
return result;
|
2454
2545
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
2455
|
-
if (state->_form_element != NULL) {
|
2546
|
+
if (state->_form_element != NULL && !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2456
2547
|
gumbo_debug("Ignoring nested form.\n");
|
2457
2548
|
parser_add_parse_error(parser, token);
|
2458
2549
|
ignore_token(parser);
|
2459
2550
|
return false;
|
2460
2551
|
}
|
2461
2552
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2462
|
-
|
2463
|
-
|
2553
|
+
GumboNode* form_element = insert_element_from_token(parser, token);
|
2554
|
+
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2555
|
+
state->_form_element = form_element;
|
2556
|
+
}
|
2464
2557
|
return result;
|
2465
2558
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
|
2466
2559
|
maybe_implicitly_close_list_tag(parser, token, true);
|
2467
2560
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2468
2561
|
insert_element_from_token(parser, token);
|
2469
2562
|
return result;
|
2470
|
-
|
2471
|
-
GUMBO_TAG_LAST)) {
|
2563
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
|
2472
2564
|
maybe_implicitly_close_list_tag(parser, token, false);
|
2473
2565
|
bool result = maybe_implicitly_close_p_tag(parser, token);
|
2474
2566
|
insert_element_from_token(parser, token);
|
@@ -2481,7 +2573,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2481
2573
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
|
2482
2574
|
if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
|
2483
2575
|
parser_add_parse_error(parser, token);
|
2484
|
-
implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
|
2576
|
+
implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
|
2485
2577
|
state->_reprocess_current_token = true;
|
2486
2578
|
return false;
|
2487
2579
|
}
|
@@ -2489,67 +2581,78 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2489
2581
|
insert_element_from_token(parser, token);
|
2490
2582
|
state->_frameset_ok = false;
|
2491
2583
|
return true;
|
2492
|
-
|
2493
|
-
|
2494
|
-
|
2495
|
-
|
2496
|
-
|
2497
|
-
|
2498
|
-
GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
|
2499
|
-
GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
|
2500
|
-
GUMBO_TAG_LAST)) {
|
2584
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
|
2585
|
+
TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
|
2586
|
+
TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
|
2587
|
+
TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(LISTING),
|
2588
|
+
TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(PRE),
|
2589
|
+
TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
|
2501
2590
|
GumboTag tag = token->v.end_tag;
|
2502
2591
|
if (!has_an_element_in_scope(parser, tag)) {
|
2503
2592
|
parser_add_parse_error(parser, token);
|
2504
2593
|
ignore_token(parser);
|
2505
2594
|
return false;
|
2506
2595
|
}
|
2507
|
-
implicitly_close_tags(parser, token, token->v.end_tag);
|
2596
|
+
implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
|
2508
2597
|
return true;
|
2509
2598
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
|
2510
|
-
|
2511
|
-
|
2512
|
-
|
2513
|
-
|
2514
|
-
|
2515
|
-
|
2516
|
-
|
2517
|
-
|
2518
|
-
|
2519
|
-
|
2520
|
-
|
2521
|
-
|
2522
|
-
|
2523
|
-
|
2524
|
-
|
2525
|
-
result =
|
2526
|
-
|
2599
|
+
if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2600
|
+
if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
|
2601
|
+
parser_add_parse_error(parser, token);
|
2602
|
+
ignore_token(parser);
|
2603
|
+
return false;
|
2604
|
+
}
|
2605
|
+
bool success = true;
|
2606
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2607
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
|
2608
|
+
parser_add_parse_error(parser, token);
|
2609
|
+
return false;
|
2610
|
+
}
|
2611
|
+
while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM));
|
2612
|
+
return success;
|
2613
|
+
} else {
|
2614
|
+
bool result = true;
|
2615
|
+
const GumboNode* node = state->_form_element;
|
2616
|
+
assert(!node || node->type == GUMBO_NODE_ELEMENT);
|
2617
|
+
state->_form_element = NULL;
|
2618
|
+
if (!node || !has_node_in_scope(parser, node)) {
|
2619
|
+
gumbo_debug("Closing an unopened form.\n");
|
2620
|
+
parser_add_parse_error(parser, token);
|
2621
|
+
ignore_token(parser);
|
2622
|
+
return false;
|
2623
|
+
}
|
2624
|
+
// This differs from implicitly_close_tags because we remove *only* the
|
2625
|
+
// <form> element; other nodes are left in scope.
|
2626
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2627
|
+
if (get_current_node(parser) != node) {
|
2628
|
+
parser_add_parse_error(parser, token);
|
2629
|
+
result = false;
|
2630
|
+
}
|
2527
2631
|
|
2528
|
-
|
2529
|
-
|
2530
|
-
|
2531
|
-
|
2532
|
-
|
2533
|
-
|
2632
|
+
GumboVector* open_elements = &state->_open_elements;
|
2633
|
+
int index = gumbo_vector_index_of(open_elements, node);
|
2634
|
+
assert(index >= 0);
|
2635
|
+
gumbo_vector_remove_at(parser, index, open_elements);
|
2636
|
+
return result;
|
2637
|
+
}
|
2534
2638
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
|
2535
2639
|
if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
|
2536
2640
|
parser_add_parse_error(parser, token);
|
2537
|
-
reconstruct_active_formatting_elements(parser);
|
2641
|
+
// reconstruct_active_formatting_elements(parser);
|
2538
2642
|
insert_element_of_tag_type(
|
2539
2643
|
parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
|
2540
2644
|
state->_reprocess_current_token = true;
|
2541
2645
|
return false;
|
2542
2646
|
}
|
2543
|
-
return implicitly_close_tags(parser, token, GUMBO_TAG_P);
|
2647
|
+
return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
|
2544
2648
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
|
2545
2649
|
if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
|
2546
2650
|
parser_add_parse_error(parser, token);
|
2547
2651
|
ignore_token(parser);
|
2548
2652
|
return false;
|
2549
2653
|
}
|
2550
|
-
return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
|
2551
|
-
|
2552
|
-
GUMBO_TAG_LAST)) {
|
2654
|
+
return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
|
2655
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
|
2553
2656
|
assert(token->type == GUMBO_TOKEN_END_TAG);
|
2554
2657
|
GumboTag token_tag = token->v.end_tag;
|
2555
2658
|
if (!has_an_element_in_scope(parser, token_tag)) {
|
@@ -2557,12 +2660,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2557
2660
|
ignore_token(parser);
|
2558
2661
|
return false;
|
2559
2662
|
}
|
2560
|
-
return implicitly_close_tags(parser, token, token_tag);
|
2561
|
-
|
2562
|
-
|
2563
|
-
if (!has_an_element_in_scope_with_tagname(
|
2564
|
-
|
2565
|
-
|
2663
|
+
return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
|
2664
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) {
|
2665
|
+
TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
|
2666
|
+
if (!has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
|
2667
|
+
GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
|
2668
|
+
GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
|
2566
2669
|
// No heading open; ignore the token entirely.
|
2567
2670
|
parser_add_parse_error(parser, token);
|
2568
2671
|
ignore_token(parser);
|
@@ -2570,7 +2673,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2570
2673
|
} else {
|
2571
2674
|
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
2572
2675
|
const GumboNode* current_node = get_current_node(parser);
|
2573
|
-
bool success =
|
2676
|
+
bool success = node_html_tag_is(current_node, token->v.end_tag);
|
2574
2677
|
if (!success) {
|
2575
2678
|
// There're children of the heading currently open; close them below and
|
2576
2679
|
// record a parse error.
|
@@ -2580,9 +2683,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2580
2683
|
}
|
2581
2684
|
do {
|
2582
2685
|
current_node = pop_current_node(parser);
|
2583
|
-
} while (!
|
2584
|
-
|
2585
|
-
GUMBO_TAG_H6, GUMBO_TAG_LAST));
|
2686
|
+
} while (!node_tag_in_set(current_node, (gumbo_tagset) { TAG(H1), TAG(H2),
|
2687
|
+
TAG(H3), TAG(H4), TAG(H5), TAG(H6) } ));
|
2586
2688
|
return success;
|
2587
2689
|
}
|
2588
2690
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
|
@@ -2608,11 +2710,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2608
2710
|
reconstruct_active_formatting_elements(parser);
|
2609
2711
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
2610
2712
|
return success;
|
2611
|
-
|
2612
|
-
|
2613
|
-
|
2614
|
-
|
2615
|
-
GUMBO_TAG_LAST)) {
|
2713
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(B), TAG(BIG),
|
2714
|
+
TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
|
2715
|
+
TAG(S), TAG(SMALL), TAG(STRIKE),
|
2716
|
+
TAG(STRONG), TAG(TT), TAG(U) })) {
|
2616
2717
|
reconstruct_active_formatting_elements(parser);
|
2617
2718
|
add_formatting_element(parser, insert_element_from_token(parser, token));
|
2618
2719
|
return true;
|
@@ -2628,28 +2729,27 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2628
2729
|
insert_element_from_token(parser, token);
|
2629
2730
|
add_formatting_element(parser, get_current_node(parser));
|
2630
2731
|
return result;
|
2631
|
-
|
2632
|
-
|
2633
|
-
|
2634
|
-
|
2635
|
-
GUMBO_TAG_U, GUMBO_TAG_LAST)) {
|
2732
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(A), TAG(B), TAG(BIG),
|
2733
|
+
TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
|
2734
|
+
TAG(NOBR), TAG(S), TAG(SMALL),
|
2735
|
+
TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U) })) {
|
2636
2736
|
return adoption_agency_algorithm(parser, token, token->v.end_tag);
|
2637
|
-
|
2638
|
-
|
2737
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(APPLET), TAG(MARQUEE),
|
2738
|
+
TAG(OBJECT) })) {
|
2639
2739
|
reconstruct_active_formatting_elements(parser);
|
2640
2740
|
insert_element_from_token(parser, token);
|
2641
2741
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
2642
2742
|
set_frameset_not_ok(parser);
|
2643
2743
|
return true;
|
2644
|
-
|
2645
|
-
|
2744
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(APPLET), TAG(MARQUEE),
|
2745
|
+
TAG(OBJECT) })) {
|
2646
2746
|
GumboTag token_tag = token->v.end_tag;
|
2647
2747
|
if (!has_an_element_in_table_scope(parser, token_tag)) {
|
2648
2748
|
parser_add_parse_error(parser, token);
|
2649
2749
|
ignore_token(parser);
|
2650
2750
|
return false;
|
2651
2751
|
}
|
2652
|
-
implicitly_close_tags(parser, token, token_tag);
|
2752
|
+
implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
|
2653
2753
|
clear_active_formatting_elements(parser);
|
2654
2754
|
return true;
|
2655
2755
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
|
@@ -2661,9 +2761,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2661
2761
|
set_frameset_not_ok(parser);
|
2662
2762
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
2663
2763
|
return true;
|
2664
|
-
|
2665
|
-
|
2666
|
-
GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
|
2764
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(AREA), TAG(BR),
|
2765
|
+
TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN), TAG(WBR) })) {
|
2667
2766
|
bool success = true;
|
2668
2767
|
if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
|
2669
2768
|
success = false;
|
@@ -2693,8 +2792,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2693
2792
|
pop_current_node(parser);
|
2694
2793
|
acknowledge_self_closing_tag(parser);
|
2695
2794
|
return true;
|
2696
|
-
|
2697
|
-
GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
|
2795
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(PARAM), TAG(SOURCE), TAG(TRACK) })) {
|
2698
2796
|
insert_element_from_token(parser, token);
|
2699
2797
|
pop_current_node(parser);
|
2700
2798
|
acknowledge_self_closing_tag(parser);
|
@@ -2708,7 +2806,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2708
2806
|
return result;
|
2709
2807
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
|
2710
2808
|
parser_add_parse_error(parser, token);
|
2711
|
-
if (parser->_parser_state->_form_element != NULL
|
2809
|
+
if (parser->_parser_state->_form_element != NULL &&
|
2810
|
+
!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2712
2811
|
ignore_token(parser);
|
2713
2812
|
return false;
|
2714
2813
|
}
|
@@ -2723,6 +2822,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2723
2822
|
|
2724
2823
|
GumboNode* form = insert_element_of_tag_type(
|
2725
2824
|
parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
|
2825
|
+
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2826
|
+
parser->_parser_state->_form_element = form;
|
2827
|
+
}
|
2726
2828
|
if (action_attr) {
|
2727
2829
|
gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
|
2728
2830
|
}
|
@@ -2786,6 +2888,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2786
2888
|
parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
|
2787
2889
|
pop_current_node(parser); // <hr>
|
2788
2890
|
pop_current_node(parser); // <form>
|
2891
|
+
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
2892
|
+
parser->_parser_state->_form_element = NULL;
|
2893
|
+
}
|
2789
2894
|
return false;
|
2790
2895
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
|
2791
2896
|
run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
|
@@ -2820,21 +2925,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2820
2925
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
|
2821
2926
|
}
|
2822
2927
|
return true;
|
2823
|
-
|
2824
|
-
|
2825
|
-
if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
2928
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(OPTION), TAG(OPTGROUP) })) {
|
2929
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
2826
2930
|
pop_current_node(parser);
|
2827
2931
|
}
|
2828
2932
|
reconstruct_active_formatting_elements(parser);
|
2829
2933
|
insert_element_from_token(parser, token);
|
2830
2934
|
return true;
|
2831
|
-
} else if (tag_in(token, kStartTag,
|
2832
|
-
|
2935
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) {
|
2936
|
+
TAG(RB), TAG(RP), TAG(RT), TAG(RTC) })) {
|
2833
2937
|
bool success = true;
|
2938
|
+
GumboTag exception = tag_in(token, kStartTag, (gumbo_tagset) {
|
2939
|
+
TAG(RT), TAG(RP) }) ? GUMBO_TAG_RTC : GUMBO_TAG_LAST;
|
2834
2940
|
if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
|
2835
|
-
generate_implied_end_tags(parser,
|
2941
|
+
generate_implied_end_tags(parser, exception);
|
2836
2942
|
}
|
2837
|
-
if (!
|
2943
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
|
2944
|
+
!(exception == GUMBO_TAG_LAST ||
|
2945
|
+
node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
|
2838
2946
|
parser_add_parse_error(parser, token);
|
2839
2947
|
success = false;
|
2840
2948
|
}
|
@@ -2867,11 +2975,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2867
2975
|
acknowledge_self_closing_tag(parser);
|
2868
2976
|
}
|
2869
2977
|
return true;
|
2870
|
-
|
2871
|
-
|
2872
|
-
|
2873
|
-
|
2874
|
-
GUMBO_TAG_LAST)) {
|
2978
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
|
2979
|
+
TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
|
2980
|
+
TAG(TBODY), TAG(TD), TAG(TFOOT),
|
2981
|
+
TAG(TH), TAG(THEAD), TAG(TR) })) {
|
2875
2982
|
parser_add_parse_error(parser, token);
|
2876
2983
|
ignore_token(parser);
|
2877
2984
|
return false;
|
@@ -2883,7 +2990,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2883
2990
|
assert(token->type == GUMBO_TOKEN_END_TAG);
|
2884
2991
|
GumboTag end_tag = token->v.end_tag;
|
2885
2992
|
assert(state->_open_elements.length > 0);
|
2886
|
-
assert(
|
2993
|
+
assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
|
2887
2994
|
// Walk up the stack of open elements until we find one that either:
|
2888
2995
|
// a) Matches the tag name we saw
|
2889
2996
|
// b) Is in the "special" category.
|
@@ -2892,8 +2999,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2892
2999
|
// implied end tags) and ignore the end tag token.
|
2893
3000
|
for (int i = state->_open_elements.length; --i >= 0; ) {
|
2894
3001
|
const GumboNode* node = state->_open_elements.data[i];
|
2895
|
-
if (node
|
2896
|
-
node_tag_is(node, end_tag)) {
|
3002
|
+
if (node_html_tag_is(node, end_tag)) {
|
2897
3003
|
generate_implied_end_tags(parser, end_tag);
|
2898
3004
|
// TODO(jdtang): Do I need to add a parse error here? The condition in
|
2899
3005
|
// the spec seems like it's the inverse of the loop condition above, and
|
@@ -2974,13 +3080,11 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
2974
3080
|
parser->_parser_state->_reprocess_current_token = true;
|
2975
3081
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
2976
3082
|
return true;
|
2977
|
-
} else if (tag_in(token, kStartTag,
|
2978
|
-
|
2979
|
-
GUMBO_TAG_LAST)) {
|
3083
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT),
|
3084
|
+
TAG(THEAD), TAG(TD), TAG(TH), TAG(TR) })) {
|
2980
3085
|
clear_stack_to_table_context(parser);
|
2981
3086
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
2982
|
-
if (tag_in(token, kStartTag,
|
2983
|
-
GUMBO_TAG_LAST)) {
|
3087
|
+
if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH), TAG(TR) })) {
|
2984
3088
|
insert_element_of_tag_type(
|
2985
3089
|
parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
|
2986
3090
|
state->_reprocess_current_token = true;
|
@@ -3002,16 +3106,15 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3002
3106
|
return false;
|
3003
3107
|
}
|
3004
3108
|
return true;
|
3005
|
-
} else if (tag_in(token, kEndTag,
|
3006
|
-
|
3007
|
-
|
3008
|
-
|
3009
|
-
GUMBO_TAG_LAST)) {
|
3109
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
|
3110
|
+
TAG(COL), TAG(COLGROUP), TAG(HTML),
|
3111
|
+
TAG(TBODY), TAG(TD), TAG(TFOOT),
|
3112
|
+
TAG(TH), TAG(THEAD), TAG(TR) })) {
|
3010
3113
|
parser_add_parse_error(parser, token);
|
3011
3114
|
ignore_token(parser);
|
3012
3115
|
return false;
|
3013
|
-
} else if (tag_in(token, kStartTag,
|
3014
|
-
|
3116
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE) }) ||
|
3117
|
+
(tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
|
3015
3118
|
return handle_in_head(parser, token);
|
3016
3119
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
|
3017
3120
|
attribute_matches(&token->v.start_tag.attributes,
|
@@ -3022,7 +3125,7 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3022
3125
|
return false;
|
3023
3126
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
|
3024
3127
|
parser_add_parse_error(parser, token);
|
3025
|
-
if (state->_form_element) {
|
3128
|
+
if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
3026
3129
|
ignore_token(parser);
|
3027
3130
|
return false;
|
3028
3131
|
}
|
@@ -3030,11 +3133,7 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3030
3133
|
pop_current_node(parser);
|
3031
3134
|
return false;
|
3032
3135
|
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3033
|
-
|
3034
|
-
parser_add_parse_error(parser, token);
|
3035
|
-
return false;
|
3036
|
-
}
|
3037
|
-
return true;
|
3136
|
+
return handle_in_body(parser, token);
|
3038
3137
|
} else {
|
3039
3138
|
parser_add_parse_error(parser, token);
|
3040
3139
|
state->_foster_parent_insertions = true;
|
@@ -3063,7 +3162,7 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
|
|
3063
3162
|
// of any one byte that is not whitespace means we flip the flag, so this
|
3064
3163
|
// loop is still valid.
|
3065
3164
|
for (int i = 0; i < buffer->length; ++i) {
|
3066
|
-
if (!isspace(buffer->data[i]) || buffer->data[i] == '\v') {
|
3165
|
+
if (!isspace((unsigned char)buffer->data[i]) || buffer->data[i] == '\v') {
|
3067
3166
|
state->_foster_parent_insertions = true;
|
3068
3167
|
reconstruct_active_formatting_elements(parser);
|
3069
3168
|
break;
|
@@ -3079,38 +3178,37 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
|
|
3079
3178
|
|
3080
3179
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
|
3081
3180
|
static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
|
3082
|
-
if (
|
3083
|
-
GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
|
3084
|
-
GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
|
3085
|
-
GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
|
3086
|
-
tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
|
3087
|
-
GUMBO_TAG_LAST)) {
|
3181
|
+
if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
|
3088
3182
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
|
3089
3183
|
parser_add_parse_error(parser, token);
|
3090
3184
|
ignore_token(parser);
|
3091
3185
|
return false;
|
3186
|
+
} else {
|
3187
|
+
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3188
|
+
bool result = true;
|
3189
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
|
3190
|
+
parser_add_parse_error(parser, token);
|
3191
|
+
}
|
3192
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION));
|
3193
|
+
clear_active_formatting_elements(parser);
|
3194
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3195
|
+
return result;
|
3092
3196
|
}
|
3093
|
-
|
3094
|
-
|
3095
|
-
|
3096
|
-
|
3097
|
-
generate_implied_end_tags(parser, GUMBO_TAG_LAST);
|
3098
|
-
bool result = true;
|
3099
|
-
if (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
|
3197
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
|
3198
|
+
TAG(COLGROUP), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) }) ||
|
3199
|
+
(tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
|
3200
|
+
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
|
3100
3201
|
parser_add_parse_error(parser, token);
|
3101
|
-
|
3102
|
-
|
3103
|
-
}
|
3104
|
-
result = false;
|
3202
|
+
ignore_token(parser);
|
3203
|
+
return false;
|
3105
3204
|
}
|
3106
|
-
pop_current_node(parser);
|
3205
|
+
while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION));
|
3107
3206
|
clear_active_formatting_elements(parser);
|
3108
3207
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3109
|
-
|
3110
|
-
|
3111
|
-
|
3112
|
-
|
3113
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
|
3208
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3209
|
+
return true;
|
3210
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(COL), TAG(COLGROUP),
|
3211
|
+
TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) } )) {
|
3114
3212
|
parser_add_parse_error(parser, token);
|
3115
3213
|
ignore_token(parser);
|
3116
3214
|
return false;
|
@@ -3138,24 +3236,33 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
|
|
3138
3236
|
pop_current_node(parser);
|
3139
3237
|
acknowledge_self_closing_tag(parser);
|
3140
3238
|
return true;
|
3239
|
+
} else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
|
3240
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3241
|
+
parser_add_parse_error(parser, token);
|
3242
|
+
ignore_token(parser);
|
3243
|
+
return false;
|
3244
|
+
}
|
3245
|
+
pop_current_node(parser);
|
3246
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3247
|
+
return false;
|
3141
3248
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
|
3142
3249
|
parser_add_parse_error(parser, token);
|
3143
3250
|
ignore_token(parser);
|
3144
3251
|
return false;
|
3145
|
-
} else if (token
|
3146
|
-
|
3147
|
-
return
|
3252
|
+
} else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
|
3253
|
+
tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
3254
|
+
return handle_in_head(parser, token);
|
3255
|
+
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3256
|
+
return handle_in_body(parser, token);
|
3148
3257
|
} else {
|
3149
|
-
if (get_current_node(parser)
|
3258
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
|
3150
3259
|
parser_add_parse_error(parser, token);
|
3260
|
+
ignore_token(parser);
|
3151
3261
|
return false;
|
3152
3262
|
}
|
3153
|
-
assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
|
3154
3263
|
pop_current_node(parser);
|
3155
3264
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3156
|
-
|
3157
|
-
parser->_parser_state->_reprocess_current_token = true;
|
3158
|
-
}
|
3265
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3159
3266
|
return true;
|
3160
3267
|
}
|
3161
3268
|
}
|
@@ -3167,16 +3274,14 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3167
3274
|
insert_element_from_token(parser, token);
|
3168
3275
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3169
3276
|
return true;
|
3170
|
-
} else if (tag_in(token, kStartTag,
|
3171
|
-
GUMBO_TAG_LAST)) {
|
3277
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
|
3172
3278
|
parser_add_parse_error(parser, token);
|
3173
3279
|
clear_stack_to_table_body_context(parser);
|
3174
3280
|
insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
|
3175
3281
|
parser->_parser_state->_reprocess_current_token = true;
|
3176
3282
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3177
3283
|
return false;
|
3178
|
-
} else if (tag_in(token, kEndTag,
|
3179
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
|
3284
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
|
3180
3285
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
|
3181
3286
|
parser_add_parse_error(parser, token);
|
3182
3287
|
ignore_token(parser);
|
@@ -3186,9 +3291,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3186
3291
|
pop_current_node(parser);
|
3187
3292
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3188
3293
|
return true;
|
3189
|
-
} else if (tag_in(token, kStartTag,
|
3190
|
-
|
3191
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
|
3294
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
|
3295
|
+
TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD) }) ||
|
3192
3296
|
tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
|
3193
3297
|
if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
|
3194
3298
|
has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
|
@@ -3202,9 +3306,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3202
3306
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3203
3307
|
parser->_parser_state->_reprocess_current_token = true;
|
3204
3308
|
return true;
|
3205
|
-
} else if (tag_in(token, kEndTag,
|
3206
|
-
|
3207
|
-
GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST))
|
3309
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
|
3310
|
+
TAG(COL), TAG(TR), TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH) }))
|
3208
3311
|
{
|
3209
3312
|
parser_add_parse_error(parser, token);
|
3210
3313
|
ignore_token(parser);
|
@@ -3216,51 +3319,54 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
|
|
3216
3319
|
|
3217
3320
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
|
3218
3321
|
static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
3219
|
-
if (tag_in(token, kStartTag,
|
3322
|
+
if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TH), TAG(TD) })) {
|
3220
3323
|
clear_stack_to_table_row_context(parser);
|
3221
3324
|
insert_element_from_token(parser, token);
|
3222
3325
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
|
3223
3326
|
add_formatting_element(parser, &kActiveFormattingScopeMarker);
|
3224
3327
|
return true;
|
3225
|
-
} else if (
|
3226
|
-
|
3227
|
-
GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
|
3228
|
-
tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
|
3229
|
-
GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
|
3230
|
-
GUMBO_TAG_LAST)) {
|
3231
|
-
// This case covers 4 clauses of the spec, each of which say "Otherwise, act
|
3232
|
-
// as if an end tag with the tag name "tr" had been seen." The differences
|
3233
|
-
// are in error handling and whether the current token is reprocessed.
|
3234
|
-
GumboTag desired_tag =
|
3235
|
-
tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
|
3236
|
-
GUMBO_TAG_THEAD, GUMBO_TAG_LAST)
|
3237
|
-
? token->v.end_tag : GUMBO_TAG_TR;
|
3238
|
-
if (!has_an_element_in_table_scope(parser, desired_tag)) {
|
3239
|
-
gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
|
3240
|
-
gumbo_normalized_tagname(desired_tag));
|
3241
|
-
for (int i = 0; i < parser->_parser_state->_open_elements.length; ++i) {
|
3242
|
-
const GumboNode* node = parser->_parser_state->_open_elements.data[i];
|
3243
|
-
gumbo_debug("%s\n", gumbo_normalized_tagname(node->v.element.tag));
|
3244
|
-
}
|
3328
|
+
} else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
|
3329
|
+
if (!has_an_element_in_table_scope(parser,GUMBO_TAG_TR)) {
|
3245
3330
|
parser_add_parse_error(parser, token);
|
3246
3331
|
ignore_token(parser);
|
3247
3332
|
return false;
|
3333
|
+
} else {
|
3334
|
+
clear_stack_to_table_row_context(parser);
|
3335
|
+
pop_current_node(parser);
|
3336
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3337
|
+
return true;
|
3248
3338
|
}
|
3249
|
-
|
3250
|
-
|
3251
|
-
|
3252
|
-
|
3253
|
-
|
3254
|
-
|
3339
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL), TAG(COLGROUP),
|
3340
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) }) || tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
|
3341
|
+
if (!has_an_element_in_table_scope(parser,GUMBO_TAG_TR)) {
|
3342
|
+
parser_add_parse_error(parser, token);
|
3343
|
+
ignore_token(parser);
|
3344
|
+
return false;
|
3345
|
+
} else {
|
3346
|
+
clear_stack_to_table_row_context(parser);
|
3347
|
+
pop_current_node(parser);
|
3348
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3255
3349
|
parser->_parser_state->_reprocess_current_token = true;
|
3350
|
+
return true;
|
3256
3351
|
}
|
3257
|
-
|
3258
|
-
|
3259
|
-
|
3260
|
-
|
3261
|
-
|
3262
|
-
|
3263
|
-
|
3352
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
|
3353
|
+
if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
|
3354
|
+
(!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
|
3355
|
+
parser_add_parse_error(parser, token);
|
3356
|
+
ignore_token(parser);
|
3357
|
+
return false;
|
3358
|
+
} else {
|
3359
|
+
clear_stack_to_table_row_context(parser);
|
3360
|
+
pop_current_node(parser);
|
3361
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3362
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3363
|
+
return true;
|
3364
|
+
}
|
3365
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION), TAG(COL),
|
3366
|
+
TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH) })) {
|
3367
|
+
parser_add_parse_error(parser, token);
|
3368
|
+
ignore_token(parser);
|
3369
|
+
return false;
|
3264
3370
|
} else {
|
3265
3371
|
return handle_in_table(parser, token);
|
3266
3372
|
}
|
@@ -3268,17 +3374,17 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
|
|
3268
3374
|
|
3269
3375
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
|
3270
3376
|
static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
3271
|
-
if (tag_in(token, kEndTag,
|
3377
|
+
if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
|
3272
3378
|
GumboTag token_tag = token->v.end_tag;
|
3273
3379
|
if (!has_an_element_in_table_scope(parser, token_tag)) {
|
3274
3380
|
parser_add_parse_error(parser, token);
|
3381
|
+
ignore_token(parser);
|
3275
3382
|
return false;
|
3276
3383
|
}
|
3277
3384
|
return close_table_cell(parser, token, token_tag);
|
3278
|
-
} else if (tag_in(token, kStartTag,
|
3279
|
-
|
3280
|
-
|
3281
|
-
GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
|
3385
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
|
3386
|
+
TAG(COLGROUP), TAG(TBODY), TAG(TD),
|
3387
|
+
TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) })) {
|
3282
3388
|
gumbo_debug("Handling <td> in cell.\n");
|
3283
3389
|
if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
|
3284
3390
|
!has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
|
@@ -3289,15 +3395,13 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
|
|
3289
3395
|
}
|
3290
3396
|
parser->_parser_state->_reprocess_current_token = true;
|
3291
3397
|
return close_current_cell(parser, token);
|
3292
|
-
} else if (tag_in(token, kEndTag,
|
3293
|
-
|
3294
|
-
GUMBO_TAG_LAST)) {
|
3398
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
|
3399
|
+
TAG(COL), TAG(COLGROUP), TAG(HTML) })) {
|
3295
3400
|
parser_add_parse_error(parser, token);
|
3296
3401
|
ignore_token(parser);
|
3297
3402
|
return false;
|
3298
|
-
} else if (tag_in(token, kEndTag,
|
3299
|
-
|
3300
|
-
GUMBO_TAG_LAST)) {
|
3403
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TABLE), TAG(TBODY),
|
3404
|
+
TAG(TFOOT), TAG(THEAD), TAG(TR) })) {
|
3301
3405
|
if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
|
3302
3406
|
parser_add_parse_error(parser, token);
|
3303
3407
|
ignore_token(parser);
|
@@ -3330,28 +3434,28 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3330
3434
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
3331
3435
|
return handle_in_body(parser, token);
|
3332
3436
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
|
3333
|
-
if (
|
3437
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3334
3438
|
pop_current_node(parser);
|
3335
3439
|
}
|
3336
3440
|
insert_element_from_token(parser, token);
|
3337
3441
|
return true;
|
3338
3442
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
|
3339
|
-
if (
|
3443
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3340
3444
|
pop_current_node(parser);
|
3341
3445
|
}
|
3342
|
-
if (
|
3446
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
|
3343
3447
|
pop_current_node(parser);
|
3344
3448
|
}
|
3345
3449
|
insert_element_from_token(parser, token);
|
3346
3450
|
return true;
|
3347
3451
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
|
3348
3452
|
GumboVector* open_elements = &parser->_parser_state->_open_elements;
|
3349
|
-
if (
|
3350
|
-
|
3453
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
|
3454
|
+
node_html_tag_is(open_elements->data[open_elements->length - 2],
|
3351
3455
|
GUMBO_TAG_OPTGROUP)) {
|
3352
3456
|
pop_current_node(parser);
|
3353
3457
|
}
|
3354
|
-
if (
|
3458
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
|
3355
3459
|
pop_current_node(parser);
|
3356
3460
|
return true;
|
3357
3461
|
} else {
|
@@ -3360,7 +3464,7 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3360
3464
|
return false;
|
3361
3465
|
}
|
3362
3466
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
|
3363
|
-
if (
|
3467
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
|
3364
3468
|
pop_current_node(parser);
|
3365
3469
|
return true;
|
3366
3470
|
} else {
|
@@ -3381,8 +3485,7 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3381
3485
|
ignore_token(parser);
|
3382
3486
|
close_current_select(parser);
|
3383
3487
|
return false;
|
3384
|
-
} else if (tag_in(token, kStartTag,
|
3385
|
-
GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
|
3488
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA) })) {
|
3386
3489
|
parser_add_parse_error(parser, token);
|
3387
3490
|
if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
|
3388
3491
|
ignore_token(parser);
|
@@ -3391,14 +3494,11 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3391
3494
|
parser->_parser_state->_reprocess_current_token = true;
|
3392
3495
|
}
|
3393
3496
|
return false;
|
3394
|
-
} else if (
|
3497
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(SCRIPT) , TAG(TEMPLATE) }) ||
|
3498
|
+
tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
3395
3499
|
return handle_in_head(parser, token);
|
3396
3500
|
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3397
|
-
|
3398
|
-
parser_add_parse_error(parser, token);
|
3399
|
-
return false;
|
3400
|
-
}
|
3401
|
-
return true;
|
3501
|
+
return handle_in_body(parser, token);
|
3402
3502
|
} else {
|
3403
3503
|
parser_add_parse_error(parser, token);
|
3404
3504
|
ignore_token(parser);
|
@@ -3408,25 +3508,25 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
|
|
3408
3508
|
|
3409
3509
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
|
3410
3510
|
static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
3411
|
-
if (tag_in(token, kStartTag,
|
3412
|
-
|
3413
|
-
GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
|
3511
|
+
if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(TABLE),
|
3512
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH) })) {
|
3414
3513
|
parser_add_parse_error(parser, token);
|
3415
3514
|
close_current_select(parser);
|
3416
3515
|
parser->_parser_state->_reprocess_current_token = true;
|
3417
3516
|
return false;
|
3418
|
-
} else if (tag_in(token, kEndTag,
|
3419
|
-
|
3420
|
-
GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
|
3517
|
+
} else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(CAPTION), TAG(TABLE),
|
3518
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH) })) {
|
3421
3519
|
parser_add_parse_error(parser, token);
|
3422
|
-
if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
|
3520
|
+
if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
|
3521
|
+
ignore_token(parser);
|
3522
|
+
return false;
|
3523
|
+
} else {
|
3423
3524
|
close_current_select(parser);
|
3424
|
-
reset_insertion_mode_appropriately
|
3525
|
+
// close_current_select already does the reset_insertion_mode_appropriately
|
3526
|
+
// reset_insertion_mode_appropriately(parser);
|
3425
3527
|
parser->_parser_state->_reprocess_current_token = true;
|
3426
|
-
|
3427
|
-
ignore_token(parser);
|
3528
|
+
return false;
|
3428
3529
|
}
|
3429
|
-
return false;
|
3430
3530
|
} else {
|
3431
3531
|
return handle_in_select(parser, token);
|
3432
3532
|
}
|
@@ -3434,8 +3534,68 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
|
3434
3534
|
|
3435
3535
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
|
3436
3536
|
static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
3437
|
-
|
3438
|
-
|
3537
|
+
GumboParserState* state = parser->_parser_state;
|
3538
|
+
if (token->type == GUMBO_TOKEN_WHITESPACE ||
|
3539
|
+
token->type == GUMBO_TOKEN_CHARACTER ||
|
3540
|
+
token->type == GUMBO_TOKEN_COMMENT ||
|
3541
|
+
token->type == GUMBO_TOKEN_DOCTYPE) {
|
3542
|
+
return handle_in_body(parser, token);
|
3543
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
|
3544
|
+
TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
|
3545
|
+
TAG(TEMPLATE), TAG(TITLE) }) ||
|
3546
|
+
tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
|
3547
|
+
return handle_in_head(parser, token);
|
3548
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COLGROUP),
|
3549
|
+
TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
|
3550
|
+
pop_template_insertion_mode(parser);
|
3551
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3552
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
|
3553
|
+
state->_reprocess_current_token = true;
|
3554
|
+
return true;
|
3555
|
+
} else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
|
3556
|
+
pop_template_insertion_mode(parser);
|
3557
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3558
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
|
3559
|
+
state->_reprocess_current_token = true;
|
3560
|
+
return true;
|
3561
|
+
} else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
|
3562
|
+
pop_template_insertion_mode(parser);
|
3563
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3564
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
|
3565
|
+
state->_reprocess_current_token = true;
|
3566
|
+
return true;
|
3567
|
+
} else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
|
3568
|
+
pop_template_insertion_mode(parser);
|
3569
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3570
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
|
3571
|
+
state->_reprocess_current_token = true;
|
3572
|
+
return true;
|
3573
|
+
} else if (token->type == GUMBO_TOKEN_START_TAG) {
|
3574
|
+
pop_template_insertion_mode(parser);
|
3575
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
3576
|
+
set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
|
3577
|
+
state->_reprocess_current_token = true;
|
3578
|
+
return true;
|
3579
|
+
} else if (token->type == GUMBO_TOKEN_END_TAG) {
|
3580
|
+
parser_add_parse_error(parser, token);
|
3581
|
+
ignore_token(parser);
|
3582
|
+
return false;
|
3583
|
+
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3584
|
+
if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
|
3585
|
+
// Stop parsing.
|
3586
|
+
return true;
|
3587
|
+
}
|
3588
|
+
parser_add_parse_error(parser, token);
|
3589
|
+
while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE));
|
3590
|
+
clear_active_formatting_elements(parser);
|
3591
|
+
pop_template_insertion_mode(parser);
|
3592
|
+
reset_insertion_mode_appropriately(parser);
|
3593
|
+
state->_reprocess_current_token = true;
|
3594
|
+
return false;
|
3595
|
+
} else {
|
3596
|
+
assert(0);
|
3597
|
+
return false;
|
3598
|
+
}
|
3439
3599
|
}
|
3440
3600
|
|
3441
3601
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
|
@@ -3453,10 +3613,15 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
|
3453
3613
|
ignore_token(parser);
|
3454
3614
|
return false;
|
3455
3615
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
3456
|
-
|
3616
|
+
/* fragment case: ignore the closing HTML token */
|
3617
|
+
if (is_fragment_parser(parser)) {
|
3618
|
+
parser_add_parse_error(parser, token);
|
3619
|
+
ignore_token(parser);
|
3620
|
+
return false;
|
3621
|
+
}
|
3457
3622
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
|
3458
3623
|
GumboNode* html = parser->_parser_state->_open_elements.data[0];
|
3459
|
-
assert(
|
3624
|
+
assert(node_html_tag_is(html, GUMBO_TAG_HTML));
|
3460
3625
|
record_end_of_element(
|
3461
3626
|
parser->_parser_state->_current_token, &html->v.element);
|
3462
3627
|
return true;
|
@@ -3488,15 +3653,14 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
|
3488
3653
|
insert_element_from_token(parser, token);
|
3489
3654
|
return true;
|
3490
3655
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
|
3491
|
-
if (
|
3656
|
+
if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
|
3492
3657
|
parser_add_parse_error(parser, token);
|
3493
3658
|
ignore_token(parser);
|
3494
3659
|
return false;
|
3495
3660
|
}
|
3496
3661
|
pop_current_node(parser);
|
3497
|
-
|
3498
|
-
|
3499
|
-
if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
|
3662
|
+
if (!is_fragment_parser(parser) &&
|
3663
|
+
!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
|
3500
3664
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
|
3501
3665
|
}
|
3502
3666
|
return true;
|
@@ -3508,7 +3672,7 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
|
|
3508
3672
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
|
3509
3673
|
return handle_in_head(parser, token);
|
3510
3674
|
} else if (token->type == GUMBO_TOKEN_EOF) {
|
3511
|
-
if (!
|
3675
|
+
if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
|
3512
3676
|
parser_add_parse_error(parser, token);
|
3513
3677
|
return false;
|
3514
3678
|
}
|
@@ -3536,7 +3700,7 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
|
|
3536
3700
|
return handle_in_body(parser, token);
|
3537
3701
|
} else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
|
3538
3702
|
GumboNode* html = parser->_parser_state->_open_elements.data[0];
|
3539
|
-
assert(
|
3703
|
+
assert(node_html_tag_is(html, GUMBO_TAG_HTML));
|
3540
3704
|
record_end_of_element(
|
3541
3705
|
parser->_parser_state->_current_token, &html->v.element);
|
3542
3706
|
set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
|
@@ -3631,13 +3795,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
3631
3795
|
switch (token->type) {
|
3632
3796
|
case GUMBO_TOKEN_NULL:
|
3633
3797
|
parser_add_parse_error(parser, token);
|
3634
|
-
token->type = GUMBO_TOKEN_CHARACTER;
|
3635
3798
|
token->v.character = kUtf8ReplacementChar;
|
3636
3799
|
insert_text_token(parser, token);
|
3637
3800
|
return false;
|
3638
3801
|
case GUMBO_TOKEN_WHITESPACE:
|
3639
3802
|
insert_text_token(parser, token);
|
3640
3803
|
return true;
|
3804
|
+
case GUMBO_TOKEN_CDATA:
|
3641
3805
|
case GUMBO_TOKEN_CHARACTER:
|
3642
3806
|
insert_text_token(parser, token);
|
3643
3807
|
set_frameset_not_ok(parser);
|
@@ -3654,35 +3818,48 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|
|
3654
3818
|
break;
|
3655
3819
|
}
|
3656
3820
|
// Order matters for these clauses.
|
3657
|
-
if (tag_in(token, kStartTag,
|
3658
|
-
|
3659
|
-
|
3660
|
-
|
3661
|
-
|
3662
|
-
|
3663
|
-
|
3664
|
-
|
3665
|
-
|
3666
|
-
|
3667
|
-
|
3668
|
-
|
3669
|
-
GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
|
3821
|
+
if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(B), TAG(BIG),
|
3822
|
+
TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
|
3823
|
+
TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV),
|
3824
|
+
TAG(DL), TAG(DT), TAG(EM), TAG(EMBED),
|
3825
|
+
TAG(H1), TAG(H2), TAG(H3), TAG(H4),
|
3826
|
+
TAG(H5), TAG(H6), TAG(HEAD), TAG(HR),
|
3827
|
+
TAG(I), TAG(IMG), TAG(LI), TAG(LISTING),
|
3828
|
+
TAG(MENU), TAG(META), TAG(NOBR), TAG(OL),
|
3829
|
+
TAG(P), TAG(PRE), TAG(RUBY), TAG(S),
|
3830
|
+
TAG(SMALL), TAG(SPAN), TAG(STRONG),
|
3831
|
+
TAG(STRIKE), TAG(SUB), TAG(SUP),
|
3832
|
+
TAG(TABLE), TAG(TT), TAG(U), TAG(UL), TAG(VAR) }) ||
|
3670
3833
|
(tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
|
3671
3834
|
token_has_attribute(token, "color") ||
|
3672
3835
|
token_has_attribute(token, "face") ||
|
3673
3836
|
token_has_attribute(token, "size")))) {
|
3837
|
+
|
3838
|
+
/* Parse error */
|
3674
3839
|
parser_add_parse_error(parser, token);
|
3675
|
-
|
3676
|
-
|
3677
|
-
|
3678
|
-
|
3679
|
-
|
3680
|
-
|
3681
|
-
parser
|
3682
|
-
|
3683
|
-
|
3840
|
+
|
3841
|
+
/*
|
3842
|
+
* Fragment case: If the parser was originally created for the HTML
|
3843
|
+
* fragment parsing algorithm, then act as described in the "any other
|
3844
|
+
* start tag" entry below.
|
3845
|
+
*/
|
3846
|
+
if (!is_fragment_parser(parser)) {
|
3847
|
+
do {
|
3848
|
+
pop_current_node(parser);
|
3849
|
+
} while(!(is_mathml_integration_point(get_current_node(parser)) ||
|
3850
|
+
is_html_integration_point(get_current_node(parser)) ||
|
3851
|
+
get_current_node(parser)->v.element.tag_namespace ==
|
3852
|
+
GUMBO_NAMESPACE_HTML));
|
3853
|
+
parser->_parser_state->_reprocess_current_token = true;
|
3854
|
+
return false;
|
3855
|
+
}
|
3856
|
+
|
3857
|
+
assert(token->type == GUMBO_TOKEN_START_TAG);
|
3858
|
+
}
|
3859
|
+
|
3860
|
+
if (token->type == GUMBO_TOKEN_START_TAG) {
|
3684
3861
|
const GumboNamespaceEnum current_namespace =
|
3685
|
-
|
3862
|
+
get_adjusted_current_node(parser)->v.element.tag_namespace;
|
3686
3863
|
if (current_namespace == GUMBO_NAMESPACE_MATHML) {
|
3687
3864
|
adjust_mathml_attributes(parser, token);
|
3688
3865
|
}
|
@@ -3771,8 +3948,10 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
|
|
3771
3948
|
parser->_parser_state->_closed_html_tag = true;
|
3772
3949
|
}
|
3773
3950
|
|
3774
|
-
const GumboNode* current_node =
|
3775
|
-
assert(!current_node ||
|
3951
|
+
const GumboNode* current_node = get_adjusted_current_node(parser);
|
3952
|
+
assert(!current_node ||
|
3953
|
+
current_node->type == GUMBO_NODE_ELEMENT ||
|
3954
|
+
current_node->type == GUMBO_NODE_TEMPLATE);
|
3776
3955
|
if (current_node) {
|
3777
3956
|
gumbo_debug("Current node: <%s>.\n",
|
3778
3957
|
gumbo_normalized_tagname(current_node->v.element.tag));
|
@@ -3784,10 +3963,9 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
|
|
3784
3963
|
token->type == GUMBO_TOKEN_WHITESPACE ||
|
3785
3964
|
token->type == GUMBO_TOKEN_NULL ||
|
3786
3965
|
(token->type == GUMBO_TOKEN_START_TAG &&
|
3787
|
-
!tag_in(token, kStartTag,
|
3788
|
-
GUMBO_TAG_LAST)))) ||
|
3966
|
+
!tag_in(token, kStartTag, (gumbo_tagset) { TAG(MGLYPH), TAG(MALIGNMARK) })))) ||
|
3789
3967
|
(current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
|
3790
|
-
|
3968
|
+
node_qualified_tag_is(current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
|
3791
3969
|
tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
|
3792
3970
|
(is_html_integration_point(current_node) && (
|
3793
3971
|
token->type == GUMBO_TOKEN_START_TAG ||
|
@@ -3801,6 +3979,66 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
|
|
3801
3979
|
}
|
3802
3980
|
}
|
3803
3981
|
|
3982
|
+
static void fragment_parser_init(
|
3983
|
+
GumboParser *parser, GumboTag fragment_ctx,
|
3984
|
+
GumboNamespaceEnum fragment_namespace) {
|
3985
|
+
GumboNode *root;
|
3986
|
+
assert(fragment_ctx != GUMBO_TAG_LAST);
|
3987
|
+
|
3988
|
+
// 3
|
3989
|
+
parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
|
3990
|
+
parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
|
3991
|
+
fragment_namespace;
|
3992
|
+
|
3993
|
+
// 4
|
3994
|
+
if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
|
3995
|
+
// Non-HTML namespaces always start in the DATA state.
|
3996
|
+
switch (fragment_ctx) {
|
3997
|
+
case GUMBO_TAG_TITLE:
|
3998
|
+
case GUMBO_TAG_TEXTAREA:
|
3999
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
4000
|
+
break;
|
4001
|
+
|
4002
|
+
case GUMBO_TAG_STYLE:
|
4003
|
+
case GUMBO_TAG_XMP:
|
4004
|
+
case GUMBO_TAG_IFRAME:
|
4005
|
+
case GUMBO_TAG_NOEMBED:
|
4006
|
+
case GUMBO_TAG_NOFRAMES:
|
4007
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
|
4008
|
+
break;
|
4009
|
+
|
4010
|
+
case GUMBO_TAG_SCRIPT:
|
4011
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
|
4012
|
+
break;
|
4013
|
+
|
4014
|
+
case GUMBO_TAG_NOSCRIPT:
|
4015
|
+
/* scripting is disabled in Gumbo, so leave the tokenizer
|
4016
|
+
* in the default data state */
|
4017
|
+
break;
|
4018
|
+
|
4019
|
+
case GUMBO_TAG_PLAINTEXT:
|
4020
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
|
4021
|
+
break;
|
4022
|
+
|
4023
|
+
default:
|
4024
|
+
/* default data state */
|
4025
|
+
break;
|
4026
|
+
}
|
4027
|
+
}
|
4028
|
+
|
4029
|
+
// 5. 6. 7.
|
4030
|
+
root = insert_element_of_tag_type(parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
|
4031
|
+
parser->_output->root = root;
|
4032
|
+
|
4033
|
+
// 8.
|
4034
|
+
if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
|
4035
|
+
push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
|
4036
|
+
}
|
4037
|
+
|
4038
|
+
// 10.
|
4039
|
+
reset_insertion_mode_appropriately(parser);
|
4040
|
+
}
|
4041
|
+
|
3804
4042
|
GumboOutput* gumbo_parse(const char* buffer) {
|
3805
4043
|
return gumbo_parse_with_options(
|
3806
4044
|
&kGumboDefaultOptions, buffer, strlen(buffer));
|
@@ -3808,11 +4046,27 @@ GumboOutput* gumbo_parse(const char* buffer) {
|
|
3808
4046
|
|
3809
4047
|
GumboOutput* gumbo_parse_with_options(
|
3810
4048
|
const GumboOptions* options, const char* buffer, size_t length) {
|
4049
|
+
return gumbo_parse_fragment(
|
4050
|
+
options, buffer, length, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML);
|
4051
|
+
}
|
4052
|
+
|
4053
|
+
GumboOutput* gumbo_parse_fragment(
|
4054
|
+
const GumboOptions* options, const char* buffer, size_t length,
|
4055
|
+
const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace) {
|
3811
4056
|
GumboParser parser;
|
3812
4057
|
parser._options = options;
|
4058
|
+
parser_state_init(&parser);
|
4059
|
+
// Must come after parser_state_init, since creating the document node must
|
4060
|
+
// reference parser_state->_current_node.
|
3813
4061
|
output_init(&parser);
|
4062
|
+
// And this must come after output_init, because initializing the tokenizer
|
4063
|
+
// reads the first character and that may cause a UTF-8 decode error
|
4064
|
+
// (inserting into output->errors) if that's invalid.
|
3814
4065
|
gumbo_tokenizer_state_init(&parser, buffer, length);
|
3815
|
-
|
4066
|
+
|
4067
|
+
if (fragment_ctx != GUMBO_TAG_LAST) {
|
4068
|
+
fragment_parser_init(&parser, fragment_ctx, fragment_namespace);
|
4069
|
+
}
|
3816
4070
|
|
3817
4071
|
GumboParserState* state = parser._parser_state;
|
3818
4072
|
gumbo_debug("Parsing %.*s.\n", length, buffer);
|
@@ -3823,6 +4077,7 @@ GumboOutput* gumbo_parse_with_options(
|
|
3823
4077
|
|
3824
4078
|
GumboToken token;
|
3825
4079
|
bool has_error = false;
|
4080
|
+
|
3826
4081
|
do {
|
3827
4082
|
if (state->_reprocess_current_token) {
|
3828
4083
|
state->_reprocess_current_token = false;
|
@@ -3899,20 +4154,16 @@ GumboOutput* gumbo_parse_with_options(
|
|
3899
4154
|
return parser._output;
|
3900
4155
|
}
|
3901
4156
|
|
3902
|
-
void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
|
3903
|
-
// Need a dummy GumboParser because the allocator comes along with the
|
3904
|
-
// options object.
|
3905
|
-
GumboParser parser;
|
3906
|
-
parser._options = options;
|
3907
|
-
destroy_node(&parser, node);
|
3908
|
-
}
|
3909
|
-
|
3910
4157
|
void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
|
3911
4158
|
// Need a dummy GumboParser because the allocator comes along with the
|
3912
4159
|
// options object.
|
3913
4160
|
GumboParser parser;
|
4161
|
+
parser._parser_state = NULL;
|
3914
4162
|
parser._options = options;
|
3915
|
-
|
4163
|
+
GumboNode* current = output->document;
|
4164
|
+
while (current) {
|
4165
|
+
current = destroy_node(&parser, current);
|
4166
|
+
}
|
3916
4167
|
for (int i = 0; i < output->errors.length; ++i) {
|
3917
4168
|
gumbo_error_destroy(&parser, output->errors.data[i]);
|
3918
4169
|
}
|