nokogumbo 1.3.0 → 1.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -36,7 +36,7 @@ Example
36
36
  -----
37
37
  ```ruby
38
38
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
39
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
40
40
  ```
41
41
 
42
42
  Notes
@@ -157,6 +157,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
157
157
 
158
158
  switch (child->type) {
159
159
  case GUMBO_NODE_ELEMENT:
160
+ case GUMBO_NODE_TEMPLATE:
160
161
  node = walk_tree(document, &child->v.element);
161
162
  break;
162
163
  case GUMBO_NODE_WHITESPACE:
@@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256;
35
35
  static int print_message(GumboParser* parser, GumboStringBuffer* output,
36
36
  const char* format, ...) {
37
37
  va_list args;
38
- va_start(args, format);
39
38
  int remaining_capacity = output->capacity - output->length;
39
+ va_start(args, format);
40
40
  int bytes_written = vsnprintf(output->data + output->length,
41
41
  remaining_capacity, format, args);
42
+ va_end(args);
42
43
  #ifdef _MSC_VER
43
44
  if (bytes_written == -1) {
44
45
  // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
@@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
47
48
  // we retry (letting it fail and returning 0 if it doesn't), since there's
48
49
  // no way to smartly resize the buffer.
49
50
  gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
51
+ va_start(args, format);
50
52
  int result = vsnprintf(output->data + output->length,
51
53
  remaining_capacity, format, args);
52
54
  va_end(args);
@@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
55
57
  #else
56
58
  // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
59
  if (bytes_written == -1) {
58
- va_end(args);
59
60
  return 0;
60
61
  }
61
62
  #endif
@@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
64
65
  gumbo_string_buffer_reserve(
65
66
  parser, output->capacity + bytes_written, output);
66
67
  remaining_capacity = output->capacity - output->length;
68
+ va_start(args, format);
67
69
  bytes_written = vsnprintf(output->data + output->length,
68
70
  remaining_capacity, format, args);
71
+ va_end(args);
69
72
  }
70
73
  output->length += bytes_written;
71
- va_end(args);
72
74
  return bytes_written;
73
75
  }
74
76
 
@@ -106,6 +108,7 @@ static void handle_parser_error(GumboParser* parser,
106
108
  // But just in case...
107
109
  print_message(parser, output, "Comments aren't legal here");
108
110
  return;
111
+ case GUMBO_TOKEN_CDATA:
109
112
  case GUMBO_TOKEN_WHITESPACE:
110
113
  case GUMBO_TOKEN_CHARACTER:
111
114
  print_message(parser, output, "Character tokens aren't legal here");
@@ -141,7 +141,7 @@ extern const GumboVector kGumboEmptyVector;
141
141
  * Returns the first index at which an element appears in this vector (testing
142
142
  * by pointer equality), or -1 if it never does.
143
143
  */
144
- int gumbo_vector_index_of(GumboVector* vector, void* element);
144
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
145
145
 
146
146
 
147
147
  /**
@@ -157,172 +157,10 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
157
157
  * strings.
158
158
  */
159
159
  typedef enum {
160
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
161
- GUMBO_TAG_HTML,
162
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
163
- GUMBO_TAG_HEAD,
164
- GUMBO_TAG_TITLE,
165
- GUMBO_TAG_BASE,
166
- GUMBO_TAG_LINK,
167
- GUMBO_TAG_META,
168
- GUMBO_TAG_STYLE,
169
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
170
- GUMBO_TAG_SCRIPT,
171
- GUMBO_TAG_NOSCRIPT,
172
- GUMBO_TAG_TEMPLATE,
173
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
174
- GUMBO_TAG_BODY,
175
- GUMBO_TAG_ARTICLE,
176
- GUMBO_TAG_SECTION,
177
- GUMBO_TAG_NAV,
178
- GUMBO_TAG_ASIDE,
179
- GUMBO_TAG_H1,
180
- GUMBO_TAG_H2,
181
- GUMBO_TAG_H3,
182
- GUMBO_TAG_H4,
183
- GUMBO_TAG_H5,
184
- GUMBO_TAG_H6,
185
- GUMBO_TAG_HGROUP,
186
- GUMBO_TAG_HEADER,
187
- GUMBO_TAG_FOOTER,
188
- GUMBO_TAG_ADDRESS,
189
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
190
- GUMBO_TAG_P,
191
- GUMBO_TAG_HR,
192
- GUMBO_TAG_PRE,
193
- GUMBO_TAG_BLOCKQUOTE,
194
- GUMBO_TAG_OL,
195
- GUMBO_TAG_UL,
196
- GUMBO_TAG_LI,
197
- GUMBO_TAG_DL,
198
- GUMBO_TAG_DT,
199
- GUMBO_TAG_DD,
200
- GUMBO_TAG_FIGURE,
201
- GUMBO_TAG_FIGCAPTION,
202
- GUMBO_TAG_MAIN,
203
- GUMBO_TAG_DIV,
204
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
205
- GUMBO_TAG_A,
206
- GUMBO_TAG_EM,
207
- GUMBO_TAG_STRONG,
208
- GUMBO_TAG_SMALL,
209
- GUMBO_TAG_S,
210
- GUMBO_TAG_CITE,
211
- GUMBO_TAG_Q,
212
- GUMBO_TAG_DFN,
213
- GUMBO_TAG_ABBR,
214
- GUMBO_TAG_DATA,
215
- GUMBO_TAG_TIME,
216
- GUMBO_TAG_CODE,
217
- GUMBO_TAG_VAR,
218
- GUMBO_TAG_SAMP,
219
- GUMBO_TAG_KBD,
220
- GUMBO_TAG_SUB,
221
- GUMBO_TAG_SUP,
222
- GUMBO_TAG_I,
223
- GUMBO_TAG_B,
224
- GUMBO_TAG_U,
225
- GUMBO_TAG_MARK,
226
- GUMBO_TAG_RUBY,
227
- GUMBO_TAG_RT,
228
- GUMBO_TAG_RP,
229
- GUMBO_TAG_BDI,
230
- GUMBO_TAG_BDO,
231
- GUMBO_TAG_SPAN,
232
- GUMBO_TAG_BR,
233
- GUMBO_TAG_WBR,
234
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
235
- GUMBO_TAG_INS,
236
- GUMBO_TAG_DEL,
237
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
238
- GUMBO_TAG_IMAGE,
239
- GUMBO_TAG_IMG,
240
- GUMBO_TAG_IFRAME,
241
- GUMBO_TAG_EMBED,
242
- GUMBO_TAG_OBJECT,
243
- GUMBO_TAG_PARAM,
244
- GUMBO_TAG_VIDEO,
245
- GUMBO_TAG_AUDIO,
246
- GUMBO_TAG_SOURCE,
247
- GUMBO_TAG_TRACK,
248
- GUMBO_TAG_CANVAS,
249
- GUMBO_TAG_MAP,
250
- GUMBO_TAG_AREA,
251
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
252
- GUMBO_TAG_MATH,
253
- GUMBO_TAG_MI,
254
- GUMBO_TAG_MO,
255
- GUMBO_TAG_MN,
256
- GUMBO_TAG_MS,
257
- GUMBO_TAG_MTEXT,
258
- GUMBO_TAG_MGLYPH,
259
- GUMBO_TAG_MALIGNMARK,
260
- GUMBO_TAG_ANNOTATION_XML,
261
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
262
- GUMBO_TAG_SVG,
263
- GUMBO_TAG_FOREIGNOBJECT,
264
- GUMBO_TAG_DESC,
265
- // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
266
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
267
- GUMBO_TAG_TABLE,
268
- GUMBO_TAG_CAPTION,
269
- GUMBO_TAG_COLGROUP,
270
- GUMBO_TAG_COL,
271
- GUMBO_TAG_TBODY,
272
- GUMBO_TAG_THEAD,
273
- GUMBO_TAG_TFOOT,
274
- GUMBO_TAG_TR,
275
- GUMBO_TAG_TD,
276
- GUMBO_TAG_TH,
277
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
278
- GUMBO_TAG_FORM,
279
- GUMBO_TAG_FIELDSET,
280
- GUMBO_TAG_LEGEND,
281
- GUMBO_TAG_LABEL,
282
- GUMBO_TAG_INPUT,
283
- GUMBO_TAG_BUTTON,
284
- GUMBO_TAG_SELECT,
285
- GUMBO_TAG_DATALIST,
286
- GUMBO_TAG_OPTGROUP,
287
- GUMBO_TAG_OPTION,
288
- GUMBO_TAG_TEXTAREA,
289
- GUMBO_TAG_KEYGEN,
290
- GUMBO_TAG_OUTPUT,
291
- GUMBO_TAG_PROGRESS,
292
- GUMBO_TAG_METER,
293
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
294
- GUMBO_TAG_DETAILS,
295
- GUMBO_TAG_SUMMARY,
296
- GUMBO_TAG_MENU,
297
- GUMBO_TAG_MENUITEM,
298
- // Non-conforming elements that nonetheless appear in the HTML5 spec.
299
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
300
- GUMBO_TAG_APPLET,
301
- GUMBO_TAG_ACRONYM,
302
- GUMBO_TAG_BGSOUND,
303
- GUMBO_TAG_DIR,
304
- GUMBO_TAG_FRAME,
305
- GUMBO_TAG_FRAMESET,
306
- GUMBO_TAG_NOFRAMES,
307
- GUMBO_TAG_ISINDEX,
308
- GUMBO_TAG_LISTING,
309
- GUMBO_TAG_XMP,
310
- GUMBO_TAG_NEXTID,
311
- GUMBO_TAG_NOEMBED,
312
- GUMBO_TAG_PLAINTEXT,
313
- GUMBO_TAG_RB,
314
- GUMBO_TAG_STRIKE,
315
- GUMBO_TAG_BASEFONT,
316
- GUMBO_TAG_BIG,
317
- GUMBO_TAG_BLINK,
318
- GUMBO_TAG_CENTER,
319
- GUMBO_TAG_FONT,
320
- GUMBO_TAG_MARQUEE,
321
- GUMBO_TAG_MULTICOL,
322
- GUMBO_TAG_NOBR,
323
- GUMBO_TAG_SPACER,
324
- GUMBO_TAG_TT,
325
- // Used for all tags that don't have special handling in HTML.
160
+ // Load all the tags from an external source, generated from tag.in.
161
+ # include "tag_enum.h"
162
+ // Used for all tags that don't have special handling in HTML. Add new tags
163
+ // to the end of tag.in so as to preserve backwards-compatibility.
326
164
  GUMBO_TAG_UNKNOWN,
327
165
  // A marker value to indicate the end of the enum, for iterating over it.
328
166
  // Also used as the terminator for varargs functions that take tags.
@@ -364,9 +202,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
364
202
 
365
203
  /**
366
204
  * Converts a tag name string (which may be in upper or mixed case) to a tag
367
- * enum.
205
+ * enum. `gumbo_tag_enum` expects `tagname` to be NUL-terminated.
368
206
  */
369
207
  GumboTag gumbo_tag_enum(const char* tagname);
208
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
370
209
 
371
210
  /**
372
211
  * Attribute namespaces.
@@ -461,10 +300,16 @@ typedef enum {
461
300
  GUMBO_NODE_TEXT,
462
301
  /** CDATA node. v will be a GumboText. */
463
302
  GUMBO_NODE_CDATA,
464
- /** Comment node. v. will be a GumboText, excluding comment delimiters. */
303
+ /** Comment node. v will be a GumboText, excluding comment delimiters. */
465
304
  GUMBO_NODE_COMMENT,
466
305
  /** Text node, where all contents is whitespace. v will be a GumboText. */
467
- GUMBO_NODE_WHITESPACE
306
+ GUMBO_NODE_WHITESPACE,
307
+ /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
308
+ * client libraries will want to ignore the contents of template nodes, as
309
+ * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
310
+ * here, while clients that want to include template contents should also
311
+ * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
312
+ GUMBO_NODE_TEMPLATE
468
313
  } GumboNodeType;
469
314
 
470
315
  /**
@@ -678,6 +523,19 @@ struct GumboInternalNode {
678
523
  /** Pointer back to parent node. Not owned. */
679
524
  GumboNode* parent;
680
525
 
526
+ /**
527
+ * Pointer to next node in document order. This is the next node by start tag
528
+ * position in the document, or by position of the tag that forces the parser
529
+ * to insert it for parser-inserted nodes. It's necessary to maintain API
530
+ * compatibility with some other libraries, e.g. BeautifulSoup. Not owned.
531
+ */
532
+ GumboNode* next;
533
+
534
+ /**
535
+ * Pointer to previous node in document order.
536
+ */
537
+ GumboNode* prev;
538
+
681
539
  /** The index within the parent's children vector of this node. */
682
540
  size_t index_within_parent;
683
541
 
@@ -795,6 +653,14 @@ GumboOutput* gumbo_parse(const char* buffer);
795
653
  GumboOutput* gumbo_parse_with_options(
796
654
  const GumboOptions* options, const char* buffer, size_t buffer_length);
797
655
 
656
+ /**
657
+ * Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
658
+ * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
659
+ */
660
+ GumboOutput* gumbo_parse_fragment(
661
+ const GumboOptions* options, const char* buffer, size_t length,
662
+ const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace);
663
+
798
664
  /** Release the memory used for the parse tree & parse errors. */
799
665
  void gumbo_destroy_output(
800
666
  const GumboOptions* options, GumboOutput* output);
@@ -32,12 +32,30 @@
32
32
  #include "util.h"
33
33
  #include "vector.h"
34
34
 
35
-
36
35
  #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
37
36
 
38
37
  #define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }
39
38
  #define TERMINATOR { "", 0 }
40
39
 
40
+ typedef char gumbo_tagset[GUMBO_TAG_LAST];
41
+ #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
42
+ #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
43
+ #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
44
+
45
+ #define TAGSET_INCLUDES(tagset, namespace, tag) \
46
+ (tag < GUMBO_TAG_LAST && \
47
+ tagset[(int)tag] == (1 << (int)namespace))
48
+
49
+
50
+
51
+ // selected forward declarations as it is getting hard to find
52
+ // an appropriate order
53
+ static bool node_html_tag_is(const GumboNode*, GumboTag);
54
+ static GumboInsertionMode get_current_template_insertion_mode(const GumboParser*);
55
+ static bool handle_in_template(GumboParser*, GumboToken*);
56
+ static GumboNode* destroy_node(GumboParser*, GumboNode*);
57
+
58
+
41
59
  static void* malloc_wrapper(void* unused, size_t size) {
42
60
  return malloc(size);
43
61
  }
@@ -181,7 +199,7 @@ typedef struct _ReplacementEntry {
181
199
  { GUMBO_STRING(from), GUMBO_STRING(to) }
182
200
 
183
201
  // Static data for SVG attribute replacements.
184
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
202
+ // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
185
203
  static const ReplacementEntry kSvgAttributeReplacements[] = {
186
204
  REPLACEMENT_ENTRY("attributename", "attributeName"),
187
205
  REPLACEMENT_ENTRY("attributetype", "attributeType"),
@@ -189,12 +207,12 @@ static const ReplacementEntry kSvgAttributeReplacements[] = {
189
207
  REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
190
208
  REPLACEMENT_ENTRY("calcmode", "calcMode"),
191
209
  REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
192
- REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
193
- REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
210
+ // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
211
+ // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
194
212
  REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
195
213
  REPLACEMENT_ENTRY("edgemode", "edgeMode"),
196
- REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
197
- REPLACEMENT_ENTRY("filterres", "filterRes"),
214
+ // REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
215
+ // REPLACEMENT_ENTRY("filterres", "filterRes"),
198
216
  REPLACEMENT_ENTRY("filterunits", "filterUnits"),
199
217
  REPLACEMENT_ENTRY("glyphref", "glyphRef"),
200
218
  REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
@@ -336,7 +354,7 @@ typedef struct _TextNodeBufferState {
336
354
  // The source position of the start of this text node.
337
355
  GumboSourcePosition _start_position;
338
356
 
339
- // The type of node that will be inserted (TEXT or WHITESPACE).
357
+ // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
340
358
  GumboNodeType _type;
341
359
  } TextNodeBufferState;
342
360
 
@@ -362,6 +380,9 @@ typedef struct GumboInternalParserState {
362
380
  GumboNode* _head_element;
363
381
  GumboNode* _form_element;
364
382
 
383
+ // The element used as fragment context when parsing in fragment mode
384
+ GumboNode* _fragment_ctx;
385
+
365
386
  // The flag for when the spec says "Reprocess the current token in..."
366
387
  bool _reprocess_current_token;
367
388
 
@@ -390,6 +411,10 @@ typedef struct GumboInternalParserState {
390
411
  // The current token.
391
412
  GumboToken* _current_token;
392
413
 
414
+ // The current (most recently inserted) node. This is used to link together
415
+ // nodes in document order.
416
+ GumboNode* _current_node;
417
+
393
418
  // The way that the spec is written, the </body> and </html> tags are *always*
394
419
  // implicit, because encountering one of those tokens merely switches the
395
420
  // insertion mode out of "in body". So we have individual state flags for
@@ -442,7 +467,17 @@ static void set_frameset_not_ok(GumboParser* parser) {
442
467
  }
443
468
 
444
469
  static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
470
+ GumboParserState* state = parser->_parser_state;
445
471
  GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
472
+
473
+ node->next = NULL;
474
+ node->prev = state->_current_node;
475
+ if (state->_current_node != NULL) {
476
+ // May be null for the initial document node.
477
+ state->_current_node->next = node;
478
+ }
479
+ state->_current_node = node;
480
+
446
481
  node->parent = NULL;
447
482
  node->index_within_parent = -1;
448
483
  node->type = type;
@@ -489,7 +524,9 @@ static void parser_state_init(GumboParser* parser) {
489
524
  gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
490
525
  parser_state->_head_element = NULL;
491
526
  parser_state->_form_element = NULL;
527
+ parser_state->_fragment_ctx = NULL;
492
528
  parser_state->_current_token = NULL;
529
+ parser_state->_current_node = NULL;
493
530
  parser_state->_closed_body_tag = false;
494
531
  parser_state->_closed_html_tag = false;
495
532
  parser->_parser_state = parser_state;
@@ -497,17 +534,25 @@ static void parser_state_init(GumboParser* parser) {
497
534
 
498
535
  static void parser_state_destroy(GumboParser* parser) {
499
536
  GumboParserState* state = parser->_parser_state;
537
+ if (state->_fragment_ctx) {
538
+ destroy_node(parser, state->_fragment_ctx);
539
+ }
500
540
  gumbo_vector_destroy(parser, &state->_active_formatting_elements);
501
541
  gumbo_vector_destroy(parser, &state->_open_elements);
502
542
  gumbo_vector_destroy(parser, &state->_template_insertion_modes);
503
543
  gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
504
544
  gumbo_parser_deallocate(parser, state);
545
+ parser->_parser_state = NULL;
505
546
  }
506
547
 
507
548
  static GumboNode* get_document_node(GumboParser* parser) {
508
549
  return parser->_output->document;
509
550
  }
510
551
 
552
+ static bool is_fragment_parser(const GumboParser *parser) {
553
+ return !!parser->_parser_state->_fragment_ctx;
554
+ }
555
+
511
556
  // Returns the node at the bottom of the stack of open elements, or NULL if no
512
557
  // elements have been added yet.
513
558
  static GumboNode* get_current_node(GumboParser* parser) {
@@ -521,6 +566,14 @@ static GumboNode* get_current_node(GumboParser* parser) {
521
566
  return open_elements->data[open_elements->length - 1];
522
567
  }
523
568
 
569
+ static GumboNode* get_adjusted_current_node(GumboParser* parser) {
570
+ GumboParserState *state = parser->_parser_state;
571
+ if (state->_open_elements.length == 1 && state->_fragment_ctx) {
572
+ return state->_fragment_ctx;
573
+ }
574
+ return get_current_node(parser);
575
+ }
576
+
524
577
  // Returns true if the given needle is in the given array of literal
525
578
  // GumboStringPieces. If exact_match is true, this requires that they match
526
579
  // exactly; otherwise, this performs a prefix match to check if any of the
@@ -541,52 +594,80 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
541
594
  parser->_parser_state->_insertion_mode = mode;
542
595
  }
543
596
 
597
+
544
598
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
545
599
  // This is a helper function that returns the appropriate insertion mode instead
546
600
  // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
547
601
  // indicate that there is no appropriate insertion mode, and the loop should
548
602
  // continue.
549
- static GumboInsertionMode get_appropriate_insertion_mode(
550
- const GumboNode* node, bool is_last) {
551
- assert(node->type == GUMBO_NODE_ELEMENT);
603
+ static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) {
604
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
605
+ const GumboNode* node = open_elements->data[index];
606
+ const bool is_last = index == 0;
607
+
608
+ if (is_last && is_fragment_parser(parser)) {
609
+ node = parser->_parser_state->_fragment_ctx;
610
+ }
611
+
612
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
552
613
  switch (node->v.element.tag) {
553
- case GUMBO_TAG_SELECT:
614
+ case GUMBO_TAG_SELECT: {
615
+ if (is_last) {
554
616
  return GUMBO_INSERTION_MODE_IN_SELECT;
555
- case GUMBO_TAG_TD:
556
- case GUMBO_TAG_TH:
557
- return is_last ?
558
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL;
559
- case GUMBO_TAG_TR:
560
- return GUMBO_INSERTION_MODE_IN_ROW;
561
- case GUMBO_TAG_TBODY:
562
- case GUMBO_TAG_THEAD:
563
- case GUMBO_TAG_TFOOT:
564
- return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
565
- case GUMBO_TAG_CAPTION:
566
- return GUMBO_INSERTION_MODE_IN_CAPTION;
567
- case GUMBO_TAG_COLGROUP:
568
- return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
569
- case GUMBO_TAG_TABLE:
570
- return GUMBO_INSERTION_MODE_IN_TABLE;
571
- case GUMBO_TAG_HEAD:
572
- case GUMBO_TAG_BODY:
573
- return GUMBO_INSERTION_MODE_IN_BODY;
574
- case GUMBO_TAG_FRAMESET:
575
- return GUMBO_INSERTION_MODE_IN_FRAMESET;
576
- case GUMBO_TAG_HTML:
577
- return GUMBO_INSERTION_MODE_BEFORE_HEAD;
578
- default:
579
- return is_last ?
580
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
617
+ }
618
+ for (int i = index; i > 0; --i) {
619
+ const GumboNode* ancestor = open_elements->data[i];
620
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
621
+ return GUMBO_INSERTION_MODE_IN_SELECT;
622
+ }
623
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
624
+ return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
625
+ }
626
+ }
627
+ return GUMBO_INSERTION_MODE_IN_SELECT;
581
628
  }
629
+ case GUMBO_TAG_TD:
630
+ case GUMBO_TAG_TH:
631
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
632
+ break;
633
+ case GUMBO_TAG_TR:
634
+ return GUMBO_INSERTION_MODE_IN_ROW;
635
+ case GUMBO_TAG_TBODY:
636
+ case GUMBO_TAG_THEAD:
637
+ case GUMBO_TAG_TFOOT:
638
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
639
+ case GUMBO_TAG_CAPTION:
640
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
641
+ case GUMBO_TAG_COLGROUP:
642
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
643
+ case GUMBO_TAG_TABLE:
644
+ return GUMBO_INSERTION_MODE_IN_TABLE;
645
+ case GUMBO_TAG_TEMPLATE:
646
+ return get_current_template_insertion_mode(parser);
647
+ case GUMBO_TAG_HEAD:
648
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
649
+ break;
650
+ case GUMBO_TAG_BODY:
651
+ return GUMBO_INSERTION_MODE_IN_BODY;
652
+ case GUMBO_TAG_FRAMESET:
653
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
654
+ case GUMBO_TAG_HTML:
655
+ return parser->_parser_state->_head_element ?
656
+ GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD;
657
+ default:
658
+ break;
659
+ }
660
+ return is_last ?
661
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
582
662
  }
583
663
 
664
+
584
665
  // This performs the actual "reset the insertion mode" loop.
585
666
  static void reset_insertion_mode_appropriately(GumboParser* parser) {
586
667
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
587
668
  for (int i = open_elements->length; --i >= 0; ) {
588
669
  GumboInsertionMode mode =
589
- get_appropriate_insertion_mode(open_elements->data[i], i == 0);
670
+ get_appropriate_insertion_mode(parser, i);
590
671
  if (mode != GUMBO_INSERTION_MODE_INITIAL) {
591
672
  set_insertion_mode(parser, mode);
592
673
  return;
@@ -620,7 +701,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
620
701
  &extra_data->tag_stack);
621
702
  for (int i = 0; i < state->_open_elements.length; ++i) {
622
703
  const GumboNode* node = state->_open_elements.data[i];
623
- assert(node->type == GUMBO_NODE_ELEMENT);
704
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
624
705
  gumbo_vector_add(parser, (void*) node->v.element.tag,
625
706
  &extra_data->tag_stack);
626
707
  }
@@ -631,13 +712,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
631
712
  // by is_start) with one of the tag types in the varargs list. Terminate the
632
713
  // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
633
714
  // the spec references tags that are not in the spec.
634
- // TODO(jdtang): A lot of the tag lists for this function are repeated in many
635
- // places in the code. This is how it's written in the spec (and it's done this
636
- // way so it's easy to verify the code against the spec), but it may be worth
637
- // coming up with a notion of a "tag set" that includes a list of tags, and
638
- // using that in many places. It'd probably also help performance, but I want
639
- // to profile before optimizing.
640
- static bool tag_in(const GumboToken* token, bool is_start, ...) {
715
+ static bool tag_in(const GumboToken* token, bool is_start, const gumbo_tagset tags) {
641
716
  GumboTag token_tag;
642
717
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
643
718
  token_tag = token->v.start_tag.tag;
@@ -646,19 +721,7 @@ static bool tag_in(const GumboToken* token, bool is_start, ...) {
646
721
  } else {
647
722
  return false;
648
723
  }
649
-
650
- va_list tags;
651
- va_start(tags, is_start);
652
- bool result = false;
653
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
654
- tag = va_arg(tags, GumboTag)) {
655
- if (tag == token_tag) {
656
- result = true;
657
- break;
658
- }
659
- }
660
- va_end(tags);
661
- return result;
724
+ return (token_tag < GUMBO_TAG_LAST && tags[(int)token_tag] != 0);
662
725
  }
663
726
 
664
727
  // Like tag_in, but for the single-tag case.
@@ -673,52 +736,119 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
673
736
  }
674
737
 
675
738
  // Like tag_in, but checks for the tag of a node, rather than a token.
676
- static bool node_tag_in(const GumboNode* node, ...) {
739
+ static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
677
740
  assert(node != NULL);
678
- if (node->type != GUMBO_NODE_ELEMENT) {
741
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
679
742
  return false;
680
743
  }
681
- GumboTag node_tag = node->v.element.tag;
682
-
683
- va_list tags;
684
- va_start(tags, node);
685
- bool result = false;
686
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
687
- tag = va_arg(tags, GumboTag)) {
688
- assert(tag <= GUMBO_TAG_LAST);
689
- if (tag == node_tag) {
690
- result = true;
691
- break;
692
- }
693
- }
694
- va_end(tags);
695
- return result;
744
+ return TAGSET_INCLUDES(tags, node->v.element.tag_namespace, node->v.element.tag);
696
745
  }
697
746
 
747
+
698
748
  // Like node_tag_in_set, but for the single-tag case.
699
- static bool node_tag_is(const GumboNode* node, GumboTag tag) {
700
- return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
749
+ static bool node_qualified_tag_is(const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
750
+ return (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) &&
751
+ node->v.element.tag == tag &&
752
+ node->v.element.tag_namespace == ns;
753
+ }
754
+
755
+ // Like node_tag_in_set, but for the single-tag case in the HTML namespace.
756
+ static bool node_html_tag_is(const GumboNode* node, GumboTag tag)
757
+ {
758
+ return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
759
+ }
760
+
761
+ static void push_template_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
762
+ gumbo_vector_add(parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
763
+ }
764
+
765
+ static void pop_template_insertion_mode(GumboParser* parser) {
766
+ gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
767
+ }
768
+
769
+ // Returns the current template insertion mode. If the stack of template
770
+ // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
771
+ static GumboInsertionMode get_current_template_insertion_mode(const GumboParser* parser) {
772
+ GumboVector* template_insertion_modes = &parser->_parser_state->_template_insertion_modes;
773
+ if (template_insertion_modes->length == 0) {
774
+ return GUMBO_INSERTION_MODE_INITIAL;
775
+ }
776
+ return (GumboInsertionMode) template_insertion_modes->data[(template_insertion_modes->length - 1)];
701
777
  }
702
778
 
703
779
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
704
780
  static bool is_mathml_integration_point(const GumboNode* node) {
705
- return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
706
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) &&
707
- node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
781
+ return node_tag_in_set(node, (gumbo_tagset) { TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
782
+ TAG_MATHML(MS), TAG_MATHML(MTEXT) });
708
783
  }
709
784
 
710
785
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
711
786
  static bool is_html_integration_point(const GumboNode* node) {
712
- return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
713
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
714
- node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
715
- (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
787
+ return node_tag_in_set(node, (gumbo_tagset) { TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) }) ||
788
+ (node_qualified_tag_is(node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && (
716
789
  attribute_matches(&node->v.element.attributes,
717
790
  "encoding", "text/html") ||
718
791
  attribute_matches(&node->v.element.attributes,
719
792
  "encoding", "application/xhtml+xml")));
720
793
  }
721
794
 
795
+
796
+ // This represents a place to insert a node, consisting of a target parent and a
797
+ // child index within that parent. If the node should be inserted at the end of
798
+ // the parent's child, index will be -1.
799
+ typedef struct {
800
+ GumboNode* target;
801
+ int index;
802
+ } InsertionLocation;
803
+
804
+ InsertionLocation get_appropriate_insertion_location(GumboParser* parser, GumboNode* override_target) {
805
+ InsertionLocation retval = { override_target, -1 };
806
+ if (retval.target == NULL) {
807
+ // No override target; default to the current node, but special-case the
808
+ // root node since get_current_node() assumes the stack of open elements is
809
+ // non-empty.
810
+ retval.target = parser->_output->root != NULL ?
811
+ get_current_node(parser) : get_document_node(parser);
812
+ }
813
+ if (!parser->_parser_state->_foster_parent_insertions ||
814
+ !node_tag_in_set(retval.target, (gumbo_tagset) { TAG(TABLE), TAG(TBODY),
815
+ TAG(TFOOT), TAG(THEAD), TAG(TR) })) {
816
+ return retval;
817
+ }
818
+
819
+ // Foster-parenting case.
820
+ int last_template_index = -1;
821
+ int last_table_index = -1;
822
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
823
+ for (int i = 0; i < open_elements->length; ++i) {
824
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
825
+ last_template_index = i;
826
+ }
827
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
828
+ last_table_index = i;
829
+ }
830
+ }
831
+ if (last_template_index != -1 &&
832
+ (last_table_index == -1 || last_template_index > last_table_index)) {
833
+ retval.target = open_elements->data[last_template_index];
834
+ return retval;
835
+ }
836
+ if (last_table_index == -1) {
837
+ retval.target = open_elements->data[0];
838
+ return retval;
839
+ }
840
+ GumboNode* last_table = open_elements->data[last_table_index];
841
+ if (last_table->parent != NULL) {
842
+ retval.target = last_table->parent;
843
+ retval.index = last_table->index_within_parent;
844
+ return retval;
845
+ }
846
+
847
+ retval.target = open_elements->data[last_table_index - 1];
848
+ return retval;
849
+ }
850
+
851
+
722
852
  // Appends a node to the end of its parent, setting the "parent" and
723
853
  // "index_within_parent" fields appropriately.
724
854
  static void append_node(
@@ -726,7 +856,7 @@ static void append_node(
726
856
  assert(node->parent == NULL);
727
857
  assert(node->index_within_parent == -1);
728
858
  GumboVector* children;
729
- if (parent->type == GUMBO_NODE_ELEMENT) {
859
+ if (parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE) {
730
860
  children = &parent->v.element.children;
731
861
  } else {
732
862
  assert(parent->type == GUMBO_NODE_DOCUMENT);
@@ -738,66 +868,44 @@ static void append_node(
738
868
  assert(node->index_within_parent < children->length);
739
869
  }
740
870
 
741
- // Inserts a node at the specified index within its parent, updating the
871
+ // Inserts a node at the specified InsertionLocation, updating the
742
872
  // "parent" and "index_within_parent" fields of it and all its siblings.
873
+ // If the index of the location is -1, this calls append_node.
743
874
  static void insert_node(
744
- GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
875
+ GumboParser* parser, GumboNode* node, InsertionLocation location) {
745
876
  assert(node->parent == NULL);
746
877
  assert(node->index_within_parent == -1);
747
- assert(parent->type == GUMBO_NODE_ELEMENT);
748
- GumboVector* children = &parent->v.element.children;
749
- assert(index >= 0);
750
- assert(index < children->length);
751
- node->parent = parent;
752
- node->index_within_parent = index;
753
- gumbo_vector_insert_at(parser, (void*) node, index, children);
754
- assert(node->index_within_parent < children->length);
755
- for (int i = index + 1; i < children->length; ++i) {
756
- GumboNode* sibling = children->data[i];
757
- sibling->index_within_parent = i;
758
- assert(sibling->index_within_parent < children->length);
759
- }
760
- }
878
+ GumboNode* parent = location.target;
879
+ int index = location.index;
880
+ if (index != -1) {
881
+ GumboVector* children = NULL;
882
+ if (parent->type == GUMBO_NODE_ELEMENT ||
883
+ parent->type == GUMBO_NODE_TEMPLATE) {
884
+ children = &parent->v.element.children;
885
+ } else if (parent->type == GUMBO_NODE_DOCUMENT) {
886
+ children = &parent->v.document.children;
887
+ assert(children->length == 0);
888
+ } else {
889
+ assert(0);
890
+ }
761
891
 
762
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
763
- static void foster_parent_element(GumboParser* parser, GumboNode* node) {
764
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
765
- assert(open_elements->length > 2);
766
-
767
- node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
768
- GumboNode* foster_parent_element = open_elements->data[0];
769
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
770
- assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
771
- for (int i = open_elements->length; --i > 1; ) {
772
- GumboNode* table_element = open_elements->data[i];
773
- if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
774
- foster_parent_element = table_element->parent;
775
- if (!foster_parent_element ||
776
- foster_parent_element->type != GUMBO_NODE_ELEMENT) {
777
- // Table has no parent; spec says it's possible if a script manipulated
778
- // the DOM, although I don't think we have to worry about this case.
779
- gumbo_debug("Table has no parent.\n");
780
- foster_parent_element = open_elements->data[i - 1];
781
- break;
782
- }
783
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
784
- gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
785
- table_element, i, gumbo_normalized_tagname(
786
- foster_parent_element->v.element.tag),
787
- table_element->index_within_parent);
788
- assert(foster_parent_element->v.element.children.data[
789
- table_element->index_within_parent] == table_element);
790
- insert_node(parser, foster_parent_element,
791
- table_element->index_within_parent, node);
792
- return;
892
+ assert(index >= 0);
893
+ assert(index < children->length);
894
+ node->parent = parent;
895
+ node->index_within_parent = index;
896
+ gumbo_vector_insert_at(parser, (void*) node, index, children);
897
+ assert(node->index_within_parent < children->length);
898
+ for (int i = index + 1; i < children->length; ++i) {
899
+ GumboNode* sibling = children->data[i];
900
+ sibling->index_within_parent = i;
901
+ assert(sibling->index_within_parent < children->length);
793
902
  }
903
+ } else {
904
+ append_node(parser, parent, node);
794
905
  }
795
- if (node->type == GUMBO_NODE_ELEMENT) {
796
- gumbo_vector_add(parser, (void*) node, open_elements);
797
- }
798
- append_node(parser, foster_parent_element, node);
799
906
  }
800
907
 
908
+
801
909
  static void maybe_flush_text_node_buffer(GumboParser* parser) {
802
910
  GumboParserState* state = parser->_parser_state;
803
911
  TextNodeBufferState* buffer_state = &state->_text_node;
@@ -806,7 +914,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
806
914
  }
807
915
 
808
916
  assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
809
- buffer_state->_type == GUMBO_NODE_TEXT);
917
+ buffer_state->_type == GUMBO_NODE_TEXT ||
918
+ buffer_state->_type == GUMBO_NODE_CDATA);
810
919
  GumboNode* text_node = create_node(parser, buffer_state->_type);
811
920
  GumboText* text_node_data = &text_node->v.text;
812
921
  text_node_data->text = gumbo_string_buffer_to_string(
@@ -816,20 +925,20 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
816
925
  state->_current_token->original_text.data -
817
926
  buffer_state->_start_original_text;
818
927
  text_node_data->start_pos = buffer_state->_start_position;
819
- if (state->_foster_parent_insertions && node_tag_in(
820
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
821
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
822
- foster_parent_element(parser, text_node);
823
- } else {
824
- append_node(
825
- parser, parser->_output->root ?
826
- get_current_node(parser) : parser->_output->document, text_node);
827
- }
928
+
828
929
  gumbo_debug("Flushing text node buffer of %.*s.\n",
829
930
  (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
830
931
 
831
- gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
832
- gumbo_string_buffer_init(parser, &buffer_state->_buffer);
932
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
933
+ if (location.target->type == GUMBO_NODE_DOCUMENT) {
934
+ // The DOM does not allow Document nodes to have Text children, so per the
935
+ // spec, they are dropped on the floor.
936
+ destroy_node(parser, text_node);
937
+ } else {
938
+ insert_node(parser, text_node, location);
939
+ }
940
+
941
+ gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
833
942
  buffer_state->_type = GUMBO_NODE_WHITESPACE;
834
943
  assert(buffer_state->_buffer.length == 0);
835
944
  }
@@ -846,7 +955,7 @@ static GumboNode* pop_current_node(GumboParser* parser) {
846
955
  GumboParserState* state = parser->_parser_state;
847
956
  maybe_flush_text_node_buffer(parser);
848
957
  if (state->_open_elements.length > 0) {
849
- assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
958
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
850
959
  gumbo_debug(
851
960
  "Popping %s node.\n",
852
961
  gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
@@ -856,12 +965,12 @@ static GumboNode* pop_current_node(GumboParser* parser) {
856
965
  assert(state->_open_elements.length == 0);
857
966
  return NULL;
858
967
  }
859
- assert(current_node->type == GUMBO_NODE_ELEMENT);
968
+ assert(current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE);
860
969
  bool is_closed_body_or_html_tag =
861
- (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
862
- (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
970
+ (node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
971
+ (node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
863
972
  if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
864
- !node_tag_is(current_node, state->_current_token->v.end_tag)) &&
973
+ !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
865
974
  !is_closed_body_or_html_tag) {
866
975
  current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
867
976
  }
@@ -885,25 +994,22 @@ static void append_comment_node(
885
994
 
886
995
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
887
996
  static void clear_stack_to_table_row_context(GumboParser* parser) {
888
- while (!node_tag_in(get_current_node(parser),
889
- GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
997
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
890
998
  pop_current_node(parser);
891
999
  }
892
1000
  }
893
1001
 
894
1002
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
895
1003
  static void clear_stack_to_table_context(GumboParser* parser) {
896
- while (!node_tag_in(get_current_node(parser),
897
- GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
1004
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE), TAG(TEMPLATE) } )) {
898
1005
  pop_current_node(parser);
899
1006
  }
900
1007
  }
901
1008
 
902
1009
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
903
1010
  void clear_stack_to_table_body_context(GumboParser* parser) {
904
- while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
905
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
906
- GUMBO_TAG_LAST)) {
1011
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TBODY),
1012
+ TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) })) {
907
1013
  pop_current_node(parser);
908
1014
  }
909
1015
  }
@@ -918,7 +1024,8 @@ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
918
1024
  element->tag_namespace = GUMBO_NAMESPACE_HTML;
919
1025
  element->original_tag = kGumboEmptyString;
920
1026
  element->original_end_tag = kGumboEmptyString;
921
- element->start_pos = parser->_parser_state->_current_token->position;
1027
+ element->start_pos = (parser->_parser_state->_current_token) ?
1028
+ parser->_parser_state->_current_token->position : kGumboEmptySourcePosition;
922
1029
  element->end_pos = kGumboEmptySourcePosition;
923
1030
  return node;
924
1031
  }
@@ -929,7 +1036,12 @@ static GumboNode* create_element_from_token(
929
1036
  assert(token->type == GUMBO_TOKEN_START_TAG);
930
1037
  GumboTokenStartTag* start_tag = &token->v.start_tag;
931
1038
 
932
- GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
1039
+ GumboNodeType type = (
1040
+ tag_namespace == GUMBO_NAMESPACE_HTML &&
1041
+ start_tag->tag == GUMBO_TAG_TEMPLATE)
1042
+ ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT;
1043
+
1044
+ GumboNode* node = create_node(parser, type);
933
1045
  GumboElement* element = &node->v.element;
934
1046
  gumbo_vector_init(parser, 1, &element->children);
935
1047
  element->attributes = start_tag->attributes;
@@ -966,20 +1078,9 @@ static void insert_element(GumboParser* parser, GumboNode* node,
966
1078
  if (!is_reconstructing_formatting_elements) {
967
1079
  maybe_flush_text_node_buffer(parser);
968
1080
  }
969
- if (state->_foster_parent_insertions && node_tag_in(
970
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
971
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
972
- foster_parent_element(parser, node);
973
- gumbo_vector_add(parser, (void*) node, &state->_open_elements);
974
- return;
975
- }
976
-
977
- // This is called to insert the root HTML element, but get_current_node
978
- // assumes the stack of open elements is non-empty, so we need special
979
- // handling for this case.
980
- append_node(
981
- parser, parser->_output->root ?
982
- get_current_node(parser) : parser->_output->document, node);
1081
+ InsertionLocation location =
1082
+ get_appropriate_insertion_location(parser, NULL);
1083
+ insert_node(parser, node, location);
983
1084
  gumbo_vector_add(parser, (void*) node, &state->_open_elements);
984
1085
  }
985
1086
 
@@ -1035,7 +1136,9 @@ static GumboNode* insert_foreign_element(
1035
1136
 
1036
1137
  static void insert_text_token(GumboParser* parser, GumboToken* token) {
1037
1138
  assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1038
- token->type == GUMBO_TOKEN_CHARACTER);
1139
+ token->type == GUMBO_TOKEN_CHARACTER ||
1140
+ token->type == GUMBO_TOKEN_NULL ||
1141
+ token->type == GUMBO_TOKEN_CDATA);
1039
1142
  TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1040
1143
  if (buffer_state->_buffer.length == 0) {
1041
1144
  // Initialize position fields.
@@ -1046,6 +1149,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1046
1149
  parser, token->v.character, &buffer_state->_buffer);
1047
1150
  if (token->type == GUMBO_TOKEN_CHARACTER) {
1048
1151
  buffer_state->_type = GUMBO_NODE_TEXT;
1152
+ } else if (token->type == GUMBO_TOKEN_CDATA) {
1153
+ buffer_state->_type = GUMBO_NODE_CDATA;
1049
1154
  }
1050
1155
  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1051
1156
  }
@@ -1073,7 +1178,7 @@ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1073
1178
  if (node == &kActiveFormattingScopeMarker) {
1074
1179
  return false;
1075
1180
  }
1076
- if (node_tag_is(node, GUMBO_TAG_A)) {
1181
+ if (node_html_tag_is(node, GUMBO_TAG_A)) {
1077
1182
  *anchor_index = i;
1078
1183
  return true;
1079
1184
  }
@@ -1097,10 +1202,8 @@ static int count_formatting_elements_of_tag(
1097
1202
  break;
1098
1203
  }
1099
1204
  assert(node->type == GUMBO_NODE_ELEMENT);
1100
- GumboElement* element = &node->v.element;
1101
- if (node_tag_is(node, desired_element->tag) &&
1102
- element->tag_namespace == desired_element->tag_namespace &&
1103
- all_attributes_match(&element->attributes,
1205
+ if (node_qualified_tag_is(node, desired_element->tag_namespace, desired_element->tag) &&
1206
+ all_attributes_match(&node->v.element.attributes,
1104
1207
  &desired_element->attributes)) {
1105
1208
  num_identical_elements++;
1106
1209
  *earliest_matching_index = i;
@@ -1150,7 +1253,7 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1150
1253
  // values are fresh copies.
1151
1254
  GumboNode* clone_node(
1152
1255
  GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
1153
- assert(node->type == GUMBO_NODE_ELEMENT);
1256
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1154
1257
  GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1155
1258
  *new_node = *node;
1156
1259
  new_node->parent = NULL;
@@ -1220,7 +1323,10 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1220
1323
  GumboNode* clone = clone_node(
1221
1324
  parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1222
1325
  // Step 9.
1223
- insert_element(parser, clone, true);
1326
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1327
+ insert_node(parser, clone, location);
1328
+ gumbo_vector_add(parser, (void*) clone, &parser->_parser_state->_open_elements);
1329
+
1224
1330
  // Step 10.
1225
1331
  elements->data[i] = clone;
1226
1332
  gumbo_debug("Reconstructed %s element at %d.\n",
@@ -1269,83 +1375,47 @@ static GumboQuirksModeEnum compute_quirks_mode(
1269
1375
  // The following functions are all defined by the "has an element in __ scope"
1270
1376
  // sections of the HTML5 spec:
1271
1377
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1272
- // The basic idea behind them is that they check for an element of the given tag
1273
- // name, contained within a scope formed by a set of other tag names. For
1274
- // example, "has an element in list scope" looks for an element of the given tag
1275
- // within the nearest enclosing <ol> or <ul>, along with a bunch of generic
1276
- // element types that serve to "firewall" their content from the rest of the
1277
- // document.
1278
- static bool has_an_element_in_specific_scope(
1279
- GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
1378
+ // The basic idea behind them is that they check for an element of the given
1379
+ // qualified name, contained within a scope formed by a set of other qualified
1380
+ // names. For example, "has an element in list scope" looks for an element of
1381
+ // the given qualified name within the nearest enclosing <ol> or <ul>, along
1382
+ // with a bunch of generic element types that serve to "firewall" their content
1383
+ // from the rest of the document. Note that because of the way the spec is written,
1384
+ // the expected tags are only matched against elements in the HTML namespace.
1385
+ static bool has_an_element_in_specific_scope(GumboParser* parser,
1386
+ int expected_size, const GumboTag *expected, bool negate, const gumbo_tagset tags) {
1280
1387
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
1281
- va_list args;
1282
- va_start(args, negate);
1283
- // va_arg can only run through the list once, so we copy it to an GumboVector
1284
- // here. I wonder if it'd make more sense to make tags the GumboVector*
1285
- // parameter and 'expected' a vararg list, but that'd require changing a lot
1286
- // of code for unknown benefit. We may want to change the representation of
1287
- // these tag sets anyway, to something more efficient.
1288
- GumboVector tags;
1289
- gumbo_vector_init(parser, 10, &tags);
1290
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1291
- tag = va_arg(args, GumboTag)) {
1292
- // We store the tags inline instead of storing pointers to them.
1293
- gumbo_vector_add(parser, (void*) tag, &tags);
1294
- }
1295
- va_end(args);
1296
-
1297
- bool result = false;
1298
1388
  for (int i = open_elements->length; --i >= 0; ) {
1299
1389
  const GumboNode* node = open_elements->data[i];
1300
- if (node->type != GUMBO_NODE_ELEMENT) {
1390
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1301
1391
  continue;
1302
- }
1392
+
1303
1393
  GumboTag node_tag = node->v.element.tag;
1304
- for (int j = 0; j < expected->length; ++j) {
1305
- GumboTag expected_tag = (GumboTag) expected->data[j];
1306
- if (node_tag == expected_tag) {
1307
- result = true;
1308
- goto cleanup;
1309
- }
1394
+ GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1395
+ for (int j = 0; j < expected_size; ++j) {
1396
+ if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1397
+ return true;
1310
1398
  }
1311
1399
 
1312
- bool found_tag = false;
1313
- for (int j = 0; j < tags.length; ++j) {
1314
- GumboTag tag = (GumboTag) tags.data[j];
1315
- if (tag == node_tag) {
1316
- found_tag = true;
1317
- break;
1318
- }
1319
- }
1320
- if (negate != found_tag) {
1321
- result = false;
1322
- goto cleanup;
1323
- }
1400
+ bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1401
+ if (negate != found)
1402
+ return false;
1324
1403
  }
1325
- cleanup:
1326
- gumbo_vector_destroy(parser, &tags);
1327
- return result;
1404
+ return false;
1328
1405
  }
1329
1406
 
1330
- // This is a bit of a hack to stack-allocate a one-element GumboVector name
1331
- // 'varname' containing the 'from_var' variable, since it's used in nearly all
1332
- // the subsequent helper functions. Note the use of void* and casts instead of
1333
- // GumboTag; this is so the alignment requirements are the same as GumboVector
1334
- // and the data inside it can be freely accessed as if it were a normal
1335
- // GumboVector.
1336
- #define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
1337
- void* varname ## _tmp_array[1] = { (void*) from_var }; \
1338
- GumboVector varname = { varname ## _tmp_array, 1, 1 }
1407
+ // Checks for the presence of an open element of the specified tag type.
1408
+ static bool has_open_element(GumboParser* parser, GumboTag tag) {
1409
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(HTML) } );
1410
+ }
1339
1411
 
1340
1412
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1341
1413
  static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1342
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1343
- return has_an_element_in_specific_scope(
1344
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1345
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1346
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1347
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1348
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1414
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1415
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1416
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1417
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) });
1349
1419
  }
1350
1420
 
1351
1421
  // Like "has an element in scope", but for the specific case of looking for a
@@ -1361,16 +1431,14 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1361
1431
  if (current == node) {
1362
1432
  return true;
1363
1433
  }
1364
- if (current->type != GUMBO_NODE_ELEMENT) {
1434
+ if (current->type != GUMBO_NODE_ELEMENT && current->type != GUMBO_NODE_TEMPLATE) {
1365
1435
  continue;
1366
1436
  }
1367
- if (node_tag_in(
1368
- current, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1369
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1370
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
1371
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
1372
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
1373
- GUMBO_TAG_LAST)) {
1437
+ if (node_tag_in_set(current, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML),
1438
+ TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1439
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1440
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT),
1441
+ TAG_SVG(DESC), TAG_SVG(TITLE) } )) {
1374
1442
  return false;
1375
1443
  }
1376
1444
  }
@@ -1378,78 +1446,66 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1378
1446
  return false;
1379
1447
  }
1380
1448
 
1381
- // Like has_an_element_in_scope, but restricts the expected tag to a range of
1382
- // possible tag names instead of just a single one.
1383
- static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
1384
- GumboVector tags;
1385
- // 6 = arbitrary initial size for vector, chosen because the major use-case
1386
- // for this method is heading tags, of which there are 6.
1387
- gumbo_vector_init(parser, 6, &tags);
1388
- va_list args;
1389
- va_start(args, parser);
1390
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1391
- tag = va_arg(args, GumboTag)) {
1392
- gumbo_vector_add(parser, (void*) tag, &tags);
1393
- }
1394
- bool found = has_an_element_in_specific_scope(
1395
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1396
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1397
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1398
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1399
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1400
- gumbo_vector_destroy(parser, &tags);
1401
- va_end(args);
1402
- return found;
1449
+ // Like has_an_element_in_scope, but restricts the expected qualified name to a
1450
+ // range of possible qualified names instead of just a single one.
1451
+ static bool has_an_element_in_scope_with_tagname(GumboParser* parser, int expected_len, const GumboTag expected[]) {
1452
+ return has_an_element_in_specific_scope(parser, expected_len, expected, false, (gumbo_tagset) {
1453
+ TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1454
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1455
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1456
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) });
1403
1457
  }
1404
1458
 
1405
1459
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1406
1460
  static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1407
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1408
- return has_an_element_in_specific_scope(
1409
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1410
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1411
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1412
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1413
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
1414
- GUMBO_TAG_LAST);
1461
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1462
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1463
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1464
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1465
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(OL),
1466
+ TAG(UL) });
1415
1467
  }
1416
1468
 
1417
1469
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1418
1470
  static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1419
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1420
- return has_an_element_in_specific_scope(
1421
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1422
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1423
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1424
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1425
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
1471
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1472
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1473
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1474
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1475
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(BUTTON) });
1426
1476
  }
1427
1477
 
1428
1478
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1429
1479
  static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1430
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1431
- return has_an_element_in_specific_scope(
1432
- parser, &tags, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
1480
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(HTML),
1481
+ TAG(TABLE), TAG(TEMPLATE) });
1433
1482
  }
1434
1483
 
1435
1484
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1436
1485
  static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1437
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1438
- return has_an_element_in_specific_scope(
1439
- parser, &tags, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
1440
- GUMBO_TAG_LAST);
1486
+ return has_an_element_in_specific_scope(parser, 1, &tag, true, (gumbo_tagset) { TAG(OPTGROUP), TAG(OPTION) });
1441
1487
  }
1442
1488
 
1443
-
1444
1489
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1445
1490
  // "exception" is the "element to exclude from the process" listed in the spec.
1446
1491
  // Pass GUMBO_TAG_LAST to not exclude any of them.
1447
1492
  static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1448
1493
  for (;
1449
- node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
1450
- GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
1451
- GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
1452
- !node_tag_is(get_current_node(parser), exception);
1494
+ node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(DD),
1495
+ TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB),
1496
+ TAG(RT), TAG(RTC) }) &&
1497
+ !node_html_tag_is(get_current_node(parser), exception);
1498
+ pop_current_node(parser));
1499
+ }
1500
+
1501
+ // This is the "generate all implied end tags thoroughly" clause of the spec.
1502
+ // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
1503
+ static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1504
+ for (;
1505
+ node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(CAPTION),
1506
+ TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP),
1507
+ TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
1508
+ TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR) });
1453
1509
  pop_current_node(parser));
1454
1510
  }
1455
1511
 
@@ -1463,7 +1519,7 @@ static bool close_table(GumboParser* parser) {
1463
1519
  }
1464
1520
 
1465
1521
  GumboNode* node = pop_current_node(parser);
1466
- while (!node_tag_is(node, GUMBO_TAG_TABLE)) {
1522
+ while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1467
1523
  node = pop_current_node(parser);
1468
1524
  }
1469
1525
  reset_insertion_mode_appropriately(parser);
@@ -1477,13 +1533,13 @@ static bool close_table_cell(GumboParser* parser, const GumboToken* token,
1477
1533
  bool result = true;
1478
1534
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1479
1535
  const GumboNode* node = get_current_node(parser);
1480
- if (!node_tag_is(node, cell_tag)) {
1536
+ if (!node_html_tag_is(node, cell_tag)) {
1481
1537
  parser_add_parse_error(parser, token);
1482
1538
  result = false;
1483
1539
  }
1484
1540
  do {
1485
1541
  node = pop_current_node(parser);
1486
- } while (!node_tag_is(node, cell_tag));
1542
+ } while (!node_html_tag_is(node, cell_tag));
1487
1543
 
1488
1544
  clear_active_formatting_elements(parser);
1489
1545
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
@@ -1508,7 +1564,7 @@ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1508
1564
  // resets the insertion mode appropriately.
1509
1565
  static void close_current_select(GumboParser* parser) {
1510
1566
  GumboNode* node = pop_current_node(parser);
1511
- while (!node_tag_is(node, GUMBO_TAG_SELECT)) {
1567
+ while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1512
1568
  node = pop_current_node(parser);
1513
1569
  }
1514
1570
  reset_insertion_mode_appropriately(parser);
@@ -1517,60 +1573,43 @@ static void close_current_select(GumboParser* parser) {
1517
1573
  // The list of nodes in the "special" category:
1518
1574
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1519
1575
  static bool is_special_node(const GumboNode* node) {
1520
- assert(node->type == GUMBO_NODE_ELEMENT);
1521
- switch (node->v.element.tag_namespace) {
1522
- case GUMBO_NAMESPACE_HTML:
1523
- return node_tag_in(node,
1524
- GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
1525
- GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
1526
- GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
1527
- GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
1528
- GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
1529
- GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
1530
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
1531
- GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
1532
- GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
1533
- GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
1534
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
1535
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
1536
- GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
1537
- GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
1538
- GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
1539
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
1540
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
1541
- GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
1542
- GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1543
- GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
1544
- GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
1545
- GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
1546
- case GUMBO_NAMESPACE_MATHML:
1547
- return node_tag_in(node,
1548
- GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1549
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
1550
- case GUMBO_NAMESPACE_SVG:
1551
- return node_tag_in(node,
1552
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
1553
- }
1554
- abort();
1555
- return false; // Pacify compiler.
1556
- }
1557
-
1558
- // Implicitly closes currently open tags until it reaches an element with the
1559
- // specified tag name. If the elements closed are in the set handled by
1576
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1577
+ return node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(APPLET), TAG(AREA),
1578
+ TAG(ARTICLE), TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1579
+ TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1580
+ TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), TAG(DIV), TAG(DL),
1581
+ TAG(DT), TAG(EMBED), TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER),
1582
+ TAG(FORM), TAG(FRAME), TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4),
1583
+ TAG(H5), TAG(H6), TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML),
1584
+ TAG(IFRAME), TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK),
1585
+ TAG(LISTING), TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1586
+ TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), TAG(PARAM),
1587
+ TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), TAG(SELECT), TAG(STYLE),
1588
+ TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA),
1589
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1590
+
1591
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1592
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1593
+
1594
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC) });
1595
+ }
1596
+
1597
+ // Implicitly closes currently open elements until it reaches an element with the
1598
+ // specified qualified name. If the elements closed are in the set handled by
1560
1599
  // generate_implied_end_tags, this is normal operation and this function returns
1561
1600
  // true. Otherwise, a parse error is recorded and this function returns false.
1562
1601
  static bool implicitly_close_tags(
1563
- GumboParser* parser, GumboToken* token, GumboTag target) {
1602
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum target_ns, GumboTag target) {
1564
1603
  bool result = true;
1565
1604
  generate_implied_end_tags(parser, target);
1566
- if (!node_tag_is(get_current_node(parser), target)) {
1605
+ if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1567
1606
  parser_add_parse_error(parser, token);
1568
- while (!node_tag_is(get_current_node(parser), target)) {
1607
+ while (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1569
1608
  pop_current_node(parser);
1570
1609
  }
1571
1610
  result = false;
1572
1611
  }
1573
- assert(node_tag_is(get_current_node(parser), target));
1612
+ assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1574
1613
  pop_current_node(parser);
1575
1614
  return result;
1576
1615
  }
@@ -1581,7 +1620,7 @@ static bool implicitly_close_tags(
1581
1620
  // clause appears several times in the spec.
1582
1621
  static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
1583
1622
  if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1584
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
1623
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1585
1624
  }
1586
1625
  return true;
1587
1626
  }
@@ -1595,15 +1634,14 @@ static void maybe_implicitly_close_list_tag(
1595
1634
  for (int i = state->_open_elements.length; --i >= 0; ) {
1596
1635
  const GumboNode* node = state->_open_elements.data[i];
1597
1636
  bool is_list_tag = is_li ?
1598
- node_tag_is(node, GUMBO_TAG_LI) :
1599
- node_tag_in(node, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
1637
+ node_html_tag_is(node, GUMBO_TAG_LI) :
1638
+ node_tag_in_set(node, (gumbo_tagset) { TAG(DD), TAG(DT) } );
1600
1639
  if (is_list_tag) {
1601
- implicitly_close_tags(parser, token, node->v.element.tag);
1640
+ implicitly_close_tags(parser, token, node->v.element.tag_namespace, node->v.element.tag);
1602
1641
  return;
1603
1642
  }
1604
1643
  if (is_special_node(node) &&
1605
- !node_tag_in(node, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
1606
- GUMBO_TAG_LAST)) {
1644
+ !node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(DIV), TAG(P) })) {
1607
1645
  return;
1608
1646
  }
1609
1647
  }
@@ -1758,13 +1796,20 @@ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1758
1796
 
1759
1797
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1760
1798
  // Also described in the "in body" handling for end formatting tags.
1761
- static bool adoption_agency_algorithm(
1762
- GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
1799
+ static bool adoption_agency_algorithm(GumboParser* parser, GumboToken* token, GumboTag subject) {
1763
1800
  GumboParserState* state = parser->_parser_state;
1764
1801
  gumbo_debug("Entering adoption agency algorithm.\n");
1765
- // Steps 1-3 & 16:
1802
+ // Step 1.
1803
+ GumboNode* current_node = get_current_node(parser);
1804
+ if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
1805
+ current_node->v.element.tag == subject &&
1806
+ gumbo_vector_index_of(&state->_active_formatting_elements, current_node) == -1) {
1807
+ pop_current_node(parser);
1808
+ return false;
1809
+ }
1810
+ // Steps 2-4 & 20:
1766
1811
  for (int i = 0; i < 8; ++i) {
1767
- // Step 4.
1812
+ // Step 5.
1768
1813
  GumboNode* formatting_node = NULL;
1769
1814
  int formatting_node_in_open_elements = -1;
1770
1815
  for (int j = state->_active_formatting_elements.length; --j >= 0; ) {
@@ -1774,13 +1819,13 @@ static bool adoption_agency_algorithm(
1774
1819
  // Last scope marker; abort the algorithm.
1775
1820
  return false;
1776
1821
  }
1777
- if (node_tag_is(current_node, closing_tag)) {
1822
+ if (node_html_tag_is(current_node, subject)) {
1778
1823
  // Found it.
1779
1824
  formatting_node = current_node;
1780
1825
  formatting_node_in_open_elements = gumbo_vector_index_of(
1781
- &state->_open_elements, formatting_node);
1826
+ &state->_open_elements, formatting_node);
1782
1827
  gumbo_debug("Formatting element of tag %s at %d.\n",
1783
- gumbo_normalized_tagname(closing_tag),
1828
+ gumbo_normalized_tagname(subject),
1784
1829
  formatting_node_in_open_elements);
1785
1830
  break;
1786
1831
  }
@@ -1793,39 +1838,44 @@ static bool adoption_agency_algorithm(
1793
1838
  return false;
1794
1839
  }
1795
1840
 
1841
+ // Step 6
1796
1842
  if (formatting_node_in_open_elements == -1) {
1797
1843
  gumbo_debug("Formatting node not on stack of open elements.\n");
1844
+ parser_add_parse_error(parser, token);
1798
1845
  gumbo_vector_remove(parser, formatting_node,
1799
1846
  &state->_active_formatting_elements);
1800
1847
  return false;
1801
1848
  }
1802
1849
 
1850
+ // Step 7
1803
1851
  if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1804
1852
  parser_add_parse_error(parser, token);
1805
1853
  gumbo_debug("Element not in scope.\n");
1806
1854
  return false;
1807
1855
  }
1856
+
1857
+ // Step 8
1808
1858
  if (formatting_node != get_current_node(parser)) {
1809
1859
  parser_add_parse_error(parser, token); // But continue onwards.
1810
1860
  }
1811
1861
  assert(formatting_node);
1812
- assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
1813
- assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
1862
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
1863
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
1814
1864
 
1815
- // Step 5 & 6.
1865
+ // Step 9 & 10
1816
1866
  GumboNode* furthest_block = NULL;
1817
1867
  for (int j = formatting_node_in_open_elements;
1818
1868
  j < state->_open_elements.length; ++j) {
1819
1869
  assert(j > 0);
1820
1870
  GumboNode* current = state->_open_elements.data[j];
1821
1871
  if (is_special_node(current)) {
1822
- // Step 5.
1872
+ // Step 9.
1823
1873
  furthest_block = current;
1824
1874
  break;
1825
1875
  }
1826
1876
  }
1827
1877
  if (!furthest_block) {
1828
- // Step 6.
1878
+ // Step 10.
1829
1879
  while (get_current_node(parser) != formatting_node) {
1830
1880
  pop_current_node(parser);
1831
1881
  }
@@ -1835,35 +1885,38 @@ static bool adoption_agency_algorithm(
1835
1885
  &state->_active_formatting_elements);
1836
1886
  return false;
1837
1887
  }
1838
- assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
1888
+ assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
1839
1889
  assert(furthest_block);
1840
1890
 
1841
- // Step 7.
1891
+ // Step 11.
1842
1892
  // Elements may be moved and reparented by this algorithm, so
1843
1893
  // common_ancestor is not necessarily the same as formatting_node->parent.
1844
1894
  GumboNode* common_ancestor =
1845
- state->_open_elements.data[gumbo_vector_index_of(
1846
- &state->_open_elements, formatting_node) - 1];
1895
+ state->_open_elements.data[gumbo_vector_index_of(
1896
+ &state->_open_elements, formatting_node) - 1];
1847
1897
  gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1848
1898
  gumbo_normalized_tagname(common_ancestor->v.element.tag),
1849
1899
  gumbo_normalized_tagname(furthest_block->v.element.tag));
1850
1900
 
1851
- // Step 8.
1901
+ // Step 12.
1852
1902
  int bookmark = gumbo_vector_index_of(
1853
- &state->_active_formatting_elements, formatting_node);;
1854
- // Step 9.
1903
+ &state->_active_formatting_elements, formatting_node) + 1;
1904
+ gumbo_debug("Bookmark at %d.\n", bookmark);
1905
+ // Step 13.
1855
1906
  GumboNode* node = furthest_block;
1856
1907
  GumboNode* last_node = furthest_block;
1857
1908
  // Must be stored explicitly, in case node is removed from the stack of open
1858
1909
  // elements, to handle step 9.4.
1859
1910
  int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1860
1911
  assert(saved_node_index > 0);
1861
- // Step 9.1-9.3 & 9.11.
1862
- for (int j = 0; j < 3; ++j) {
1863
- // Step 9.4.
1912
+ // Step 13.1.
1913
+ for (int j = 0;;) {
1914
+ // Step 13.2.
1915
+ ++j;
1916
+ // Step 13.3.
1864
1917
  int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1865
1918
  gumbo_debug(
1866
- "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1919
+ "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1867
1920
  if (node_index == -1) {
1868
1921
  node_index = saved_node_index;
1869
1922
  }
@@ -1872,62 +1925,78 @@ static bool adoption_agency_algorithm(
1872
1925
  assert(node_index < state->_open_elements.capacity);
1873
1926
  node = state->_open_elements.data[node_index];
1874
1927
  assert(node->parent);
1875
- // Step 9.5.
1876
- if (gumbo_vector_index_of(
1877
- &state->_active_formatting_elements, node) == -1) {
1928
+ if (node == formatting_node) {
1929
+ // Step 13.4.
1930
+ break;
1931
+ }
1932
+ int formatting_index =
1933
+ gumbo_vector_index_of(&state->_active_formatting_elements, node);
1934
+ if (j > 3 && formatting_index != -1) {
1935
+ // Step 13.5.
1936
+ gumbo_debug(
1937
+ "Removing formatting element at %d.\n", formatting_index);
1938
+ gumbo_vector_remove_at(
1939
+ parser,
1940
+ formatting_index,
1941
+ &state->_active_formatting_elements);
1942
+ // Removing the element shifts all indices over by one, so we may need
1943
+ // to move the bookmark.
1944
+ if (formatting_index < bookmark) {
1945
+ --bookmark;
1946
+ gumbo_debug("Moving bookmark to %d.\n", bookmark);
1947
+ }
1948
+ continue;
1949
+ }
1950
+ if (formatting_index == -1) {
1951
+ // Step 13.6.
1878
1952
  gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1879
1953
  continue;
1880
- } else if (node == formatting_node) {
1881
- // Step 9.6.
1882
- break;
1883
1954
  }
1884
- // Step 9.7.
1885
- int formatting_index = gumbo_vector_index_of(
1886
- &state->_active_formatting_elements, node);
1955
+ // Step 13.7.
1956
+ // "common ancestor as the intended parent" doesn't actually mean insert
1957
+ // it into the common ancestor; that happens below.
1887
1958
  node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1959
+ assert(formatting_index >= 0);
1888
1960
  state->_active_formatting_elements.data[formatting_index] = node;
1961
+ assert(node_index >= 0);
1889
1962
  state->_open_elements.data[node_index] = node;
1890
- // Step 9.8.
1963
+ // Step 13.8.
1891
1964
  if (last_node == furthest_block) {
1892
1965
  bookmark = formatting_index + 1;
1966
+ gumbo_debug("Bookmark moved to %d.\n", bookmark);
1893
1967
  assert(bookmark <= state->_active_formatting_elements.length);
1894
1968
  }
1895
- // Step 9.9.
1969
+ // Step 13.9.
1896
1970
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1897
1971
  remove_from_parent(parser, last_node);
1898
1972
  append_node(parser, node, last_node);
1899
- // Step 9.10.
1973
+ // Step 13.10.
1900
1974
  last_node = node;
1901
- }
1975
+ } // Step 13.11.
1902
1976
 
1903
- // Step 10.
1977
+ // Step 14.
1904
1978
  gumbo_debug("Removing %s node from parent ",
1905
1979
  gumbo_normalized_tagname(last_node->v.element.tag));
1906
1980
  remove_from_parent(parser, last_node);
1907
1981
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1908
- if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1909
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
1910
- GUMBO_TAG_LAST)) {
1911
- gumbo_debug("and foster-parenting it.\n");
1912
- foster_parent_element(parser, last_node);
1913
- } else {
1914
- gumbo_debug("and inserting it into %s.\n",
1915
- gumbo_normalized_tagname(common_ancestor->v.element.tag));
1916
- append_node(parser, common_ancestor, last_node);
1917
- }
1982
+ InsertionLocation location =
1983
+ get_appropriate_insertion_location(parser, common_ancestor);
1984
+ gumbo_debug("and inserting it into %s.\n",
1985
+ gumbo_normalized_tagname(location.target->v.element.tag));
1986
+ insert_node(parser, last_node, location);
1918
1987
 
1919
- // Step 11.
1988
+ // Step 15.
1920
1989
  GumboNode* new_formatting_node = clone_node(
1921
- parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1990
+ parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1922
1991
  formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1923
1992
 
1924
- // Step 12. Instead of appending nodes one-by-one, we swap the children
1993
+ // Step 16. Instead of appending nodes one-by-one, we swap the children
1925
1994
  // vector of furthest_block with the empty children of new_formatting_node,
1926
1995
  // reducing memory traffic and allocations. We still have to reset their
1927
1996
  // parent pointers, though.
1928
1997
  GumboVector temp = new_formatting_node->v.element.children;
1929
1998
  new_formatting_node->v.element.children =
1930
- furthest_block->v.element.children;
1999
+ furthest_block->v.element.children;
1931
2000
  furthest_block->v.element.children = temp;
1932
2001
 
1933
2002
  temp = new_formatting_node->v.element.children;
@@ -1936,36 +2005,39 @@ static bool adoption_agency_algorithm(
1936
2005
  child->parent = new_formatting_node;
1937
2006
  }
1938
2007
 
1939
- // Step 13.
2008
+ // Step 17.
1940
2009
  append_node(parser, furthest_block, new_formatting_node);
1941
2010
 
1942
- // Step 14.
2011
+ // Step 18.
1943
2012
  // If the formatting node was before the bookmark, it may shift over all
1944
2013
  // indices after it, so we need to explicitly find the index and possibly
1945
2014
  // adjust the bookmark.
1946
2015
  int formatting_node_index = gumbo_vector_index_of(
1947
- &state->_active_formatting_elements, formatting_node);
2016
+ &state->_active_formatting_elements, formatting_node);
1948
2017
  assert(formatting_node_index != -1);
1949
2018
  if (formatting_node_index < bookmark) {
2019
+ gumbo_debug(
2020
+ "Formatting node at %d is before bookmark at %d; decrementing.\n",
2021
+ formatting_node_index, bookmark);
1950
2022
  --bookmark;
1951
2023
  }
1952
2024
  gumbo_vector_remove_at(
1953
- parser, formatting_node_index, &state->_active_formatting_elements);
2025
+ parser, formatting_node_index, &state->_active_formatting_elements);
1954
2026
  assert(bookmark >= 0);
1955
2027
  assert(bookmark <= state->_active_formatting_elements.length);
1956
2028
  gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
1957
2029
  &state->_active_formatting_elements);
1958
2030
 
1959
- // Step 15.
2031
+ // Step 19.
1960
2032
  gumbo_vector_remove(
1961
- parser, formatting_node, &state->_open_elements);
2033
+ parser, formatting_node, &state->_open_elements);
1962
2034
  int insert_at = gumbo_vector_index_of(
1963
- &state->_open_elements, furthest_block) + 1;
2035
+ &state->_open_elements, furthest_block) + 1;
1964
2036
  assert(insert_at >= 0);
1965
2037
  assert(insert_at <= state->_open_elements.length);
1966
2038
  gumbo_vector_insert_at(
1967
- parser, new_formatting_node, insert_at, &state->_open_elements);
1968
- }
2039
+ parser, new_formatting_node, insert_at, &state->_open_elements);
2040
+ } // Step 20.
1969
2041
  return true;
1970
2042
  }
1971
2043
 
@@ -1992,8 +2064,8 @@ static void finish_parsing(GumboParser* parser) {
1992
2064
  GumboParserState* state = parser->_parser_state;
1993
2065
  for (GumboNode* node = pop_current_node(parser); node;
1994
2066
  node = pop_current_node(parser)) {
1995
- if ((node_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
1996
- (node_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2067
+ if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2068
+ (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
1997
2069
  continue;
1998
2070
  }
1999
2071
  node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
@@ -2042,9 +2114,9 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2042
2114
  parser->_output->root = html_node;
2043
2115
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2044
2116
  return true;
2045
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2046
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2047
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2117
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2118
+ !tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
2119
+ TAG(BR) } )) {
2048
2120
  parser_add_parse_error(parser, token);
2049
2121
  ignore_token(parser);
2050
2122
  return false;
@@ -2076,9 +2148,9 @@ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2076
2148
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2077
2149
  parser->_parser_state->_head_element = node;
2078
2150
  return true;
2079
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2080
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2081
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2151
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2152
+ !tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
2153
+ TAG(BR) })) {
2082
2154
  parser_add_parse_error(parser, token);
2083
2155
  ignore_token(parser);
2084
2156
  return false;
@@ -2110,9 +2182,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2110
2182
  return true;
2111
2183
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2112
2184
  return handle_in_body(parser, token);
2113
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2114
- GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2115
- GUMBO_TAG_LAST)) {
2185
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2186
+ TAG(BGSOUND), TAG(MENUITEM), TAG(LINK) })) {
2116
2187
  insert_element_from_token(parser, token);
2117
2188
  pop_current_node(parser);
2118
2189
  acknowledge_self_closing_tag(parser);
@@ -2129,8 +2200,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2129
2200
  } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2130
2201
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2131
2202
  return true;
2132
- } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
2133
- GUMBO_TAG_LAST)) {
2203
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(NOFRAMES), TAG(STYLE) })) {
2134
2204
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2135
2205
  return true;
2136
2206
  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
@@ -2143,32 +2213,48 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2143
2213
  } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2144
2214
  GumboNode* head = pop_current_node(parser);
2145
2215
  AVOID_UNUSED_VARIABLE_WARNING(head);
2146
- assert(node_tag_is(head, GUMBO_TAG_HEAD));
2216
+ assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2147
2217
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2148
2218
  return true;
2149
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2150
- parser_add_parse_error(parser, token);
2151
- ignore_token(parser);
2152
- return false;
2153
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2154
- (token->type == GUMBO_TOKEN_END_TAG &&
2155
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2156
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2157
- parser_add_parse_error(parser, token);
2158
- return false;
2159
- } else if (tag_is(token, kStartTag, GUMBO_TAG_UNKNOWN) && token->v.start_tag.is_self_closing) {
2219
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) })) {
2220
+ pop_current_node(parser);
2221
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2222
+ parser->_parser_state->_reprocess_current_token = true;
2223
+ return true;
2224
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2225
+ insert_element_from_token(parser, token);
2226
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2227
+ parser->_parser_state->_frameset_ok = false;
2228
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2229
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2230
+ return true;
2231
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2232
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2233
+ parser_add_parse_error(parser, token);
2234
+ ignore_token(parser);
2235
+ return false;
2236
+ }
2237
+ generate_all_implied_end_tags_thoroughly(parser);
2238
+ bool success = true;
2239
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2240
+ parser_add_parse_error(parser, token);
2241
+ success = false;
2242
+ }
2243
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE));
2244
+ clear_active_formatting_elements(parser);
2245
+ pop_template_insertion_mode(parser);
2246
+ reset_insertion_mode_appropriately(parser);
2247
+ return success;
2248
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || (token->type == GUMBO_TOKEN_END_TAG)) {
2160
2249
  parser_add_parse_error(parser, token);
2161
2250
  ignore_token(parser);
2162
2251
  return false;
2163
2252
  } else {
2164
- const GumboNode* node = pop_current_node(parser);
2165
- assert(node_tag_is(node, GUMBO_TAG_HEAD));
2166
- AVOID_UNUSED_VARIABLE_WARNING(node);
2253
+ pop_current_node(parser);
2167
2254
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2168
2255
  parser->_parser_state->_reprocess_current_token = true;
2169
2256
  return true;
2170
2257
  }
2171
-
2172
2258
  return true;
2173
2259
  }
2174
2260
 
@@ -2181,18 +2267,16 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2181
2267
  return handle_in_body(parser, token);
2182
2268
  } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2183
2269
  const GumboNode* node = pop_current_node(parser);
2184
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2270
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2185
2271
  AVOID_UNUSED_VARIABLE_WARNING(node);
2186
2272
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2187
2273
  return true;
2188
2274
  } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2189
2275
  token->type == GUMBO_TOKEN_COMMENT ||
2190
- tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
2191
- GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
2192
- GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
2193
- return handle_in_head(parser, token);
2194
- } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
2195
- GUMBO_TAG_LAST) ||
2276
+ tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASEFONT), TAG(BGSOUND),
2277
+ TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(STYLE) })) {
2278
+ return handle_in_head(parser, token);
2279
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(HEAD), TAG(NOSCRIPT) }) ||
2196
2280
  (token->type == GUMBO_TOKEN_END_TAG &&
2197
2281
  !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2198
2282
  parser_add_parse_error(parser, token);
@@ -2201,7 +2285,7 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2201
2285
  } else {
2202
2286
  parser_add_parse_error(parser, token);
2203
2287
  const GumboNode* node = pop_current_node(parser);
2204
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2288
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2205
2289
  AVOID_UNUSED_VARIABLE_WARNING(node);
2206
2290
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2207
2291
  parser->_parser_state->_reprocess_current_token = true;
@@ -2233,10 +2317,10 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2233
2317
  insert_element_from_token(parser, token);
2234
2318
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2235
2319
  return true;
2236
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2237
- GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
2238
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
2239
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2320
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2321
+ TAG(BGSOUND), TAG(LINK), TAG(META),
2322
+ TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
2323
+ TAG(TEMPLATE), TAG(TITLE) })) {
2240
2324
  parser_add_parse_error(parser, token);
2241
2325
  assert(state->_head_element != NULL);
2242
2326
  // This must be flushed before we push the head element on, as there may be
@@ -2246,10 +2330,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2246
2330
  bool result = handle_in_head(parser, token);
2247
2331
  gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2248
2332
  return result;
2333
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2334
+ return handle_in_head(parser, token);
2249
2335
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2250
2336
  (token->type == GUMBO_TOKEN_END_TAG &&
2251
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2252
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2337
+ !tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) }))) {
2253
2338
  parser_add_parse_error(parser, token);
2254
2339
  ignore_token(parser);
2255
2340
  return false;
@@ -2261,28 +2346,23 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2261
2346
  }
2262
2347
  }
2263
2348
 
2264
- static void destroy_node(GumboParser* parser, GumboNode* node) {
2349
+ static GumboNode* destroy_node(GumboParser* parser, GumboNode* node) {
2265
2350
  switch (node->type) {
2266
2351
  case GUMBO_NODE_DOCUMENT:
2267
2352
  {
2268
2353
  GumboDocument* doc = &node->v.document;
2269
- for (int i = 0; i < doc->children.length; ++i) {
2270
- destroy_node(parser, doc->children.data[i]);
2271
- }
2272
2354
  gumbo_parser_deallocate(parser, (void*) doc->children.data);
2273
2355
  gumbo_parser_deallocate(parser, (void*) doc->name);
2274
2356
  gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2275
2357
  gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2276
2358
  }
2277
2359
  break;
2360
+ case GUMBO_NODE_TEMPLATE:
2278
2361
  case GUMBO_NODE_ELEMENT:
2279
2362
  for (int i = 0; i < node->v.element.attributes.length; ++i) {
2280
2363
  gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2281
2364
  }
2282
2365
  gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2283
- for (int i = 0; i < node->v.element.children.length; ++i) {
2284
- destroy_node(parser, node->v.element.children.data[i]);
2285
- }
2286
2366
  gumbo_parser_deallocate(parser, node->v.element.children.data);
2287
2367
  break;
2288
2368
  case GUMBO_NODE_TEXT:
@@ -2292,7 +2372,21 @@ static void destroy_node(GumboParser* parser, GumboNode* node) {
2292
2372
  gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2293
2373
  break;
2294
2374
  }
2375
+ // Remove from the next/prev linked list.
2376
+ GumboNode* prev = node->prev;
2377
+ GumboNode* next = node->next;
2378
+ if (prev != NULL) {
2379
+ prev->next = next;
2380
+ }
2381
+ if (next != NULL) {
2382
+ next->prev = prev;
2383
+ }
2384
+ if (parser->_parser_state && parser->_parser_state->_current_node == node) {
2385
+ parser->_parser_state->_current_node = prev;
2386
+ }
2387
+
2295
2388
  gumbo_parser_deallocate(parser, node);
2389
+ return next;
2296
2390
  }
2297
2391
 
2298
2392
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
@@ -2307,7 +2401,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2307
2401
  reconstruct_active_formatting_elements(parser);
2308
2402
  insert_text_token(parser, token);
2309
2403
  return true;
2310
- } else if (token->type == GUMBO_TOKEN_CHARACTER) {
2404
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2405
+ token->type == GUMBO_TOKEN_CDATA) {
2311
2406
  reconstruct_active_formatting_elements(parser);
2312
2407
  insert_text_token(parser, token);
2313
2408
  set_frameset_not_ok(parser);
@@ -2320,20 +2415,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2320
2415
  ignore_token(parser);
2321
2416
  return false;
2322
2417
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2418
+ parser_add_parse_error(parser, token);
2419
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2420
+ ignore_token(parser);
2421
+ return false;
2422
+ }
2323
2423
  assert(parser->_output->root != NULL);
2324
2424
  assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2325
- parser_add_parse_error(parser, token);
2326
2425
  merge_attributes(parser, token, parser->_output->root);
2327
2426
  return false;
2328
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2329
- GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2330
- GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
2331
- GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2427
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2428
+ TAG(BGSOUND), TAG(MENUITEM), TAG(LINK),
2429
+ TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2430
+ TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) } ) || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332
2431
  return handle_in_head(parser, token);
2333
2432
  } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2334
2433
  parser_add_parse_error(parser, token);
2335
2434
  if (state->_open_elements.length < 2 ||
2336
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
2435
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2337
2436
  ignore_token(parser);
2338
2437
  return false;
2339
2438
  }
@@ -2343,7 +2442,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2343
2442
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2344
2443
  parser_add_parse_error(parser, token);
2345
2444
  if (state->_open_elements.length < 2 ||
2346
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2445
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2347
2446
  !state->_frameset_ok) {
2348
2447
  ignore_token(parser);
2349
2448
  return false;
@@ -2381,18 +2480,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2381
2480
  return true;
2382
2481
  } else if (token->type == GUMBO_TOKEN_EOF) {
2383
2482
  for (int i = 0; i < state->_open_elements.length; ++i) {
2384
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2385
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
2386
- GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
2387
- GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
2388
- GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
2483
+ if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) { TAG(DD),
2484
+ TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
2485
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML) } )) {
2389
2486
  parser_add_parse_error(parser, token);
2390
- return false;
2391
2487
  }
2392
2488
  }
2489
+ if (get_current_template_insertion_mode(parser) != GUMBO_INSERTION_MODE_INITIAL) {
2490
+ return handle_in_template(parser, token);
2491
+ }
2393
2492
  return true;
2394
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2395
- GUMBO_TAG_LAST)) {
2493
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML) })) {
2396
2494
  if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2397
2495
  parser_add_parse_error(parser, token);
2398
2496
  ignore_token(parser);
@@ -2400,13 +2498,11 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2400
2498
  }
2401
2499
  bool success = true;
2402
2500
  for (int i = 0; i < state->_open_elements.length; ++i) {
2403
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2404
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
2405
- GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
2406
- GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
2407
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
2408
- GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2409
- GUMBO_TAG_LAST)) {
2501
+ if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) {
2502
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
2503
+ TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
2504
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2505
+ TAG(BODY), TAG(HTML) })) {
2410
2506
  parser_add_parse_error(parser, token);
2411
2507
  success = false;
2412
2508
  break;
@@ -2417,58 +2513,54 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2417
2513
  parser->_parser_state->_reprocess_current_token = true;
2418
2514
  } else {
2419
2515
  GumboNode* body = state->_open_elements.data[1];
2420
- assert(node_tag_is(body, GUMBO_TAG_BODY));
2516
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2421
2517
  record_end_of_element(state->_current_token, &body->v.element);
2422
2518
  }
2423
2519
  return success;
2424
- } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2425
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
2426
- GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
2427
- GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
2428
- GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
2429
- GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
2430
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
2431
- GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
2520
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
2521
+ TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS),
2522
+ TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2523
+ TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(MENU), TAG(MAIN),
2524
+ TAG(NAV), TAG(OL), TAG(P), TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
2432
2525
  bool result = maybe_implicitly_close_p_tag(parser, token);
2433
2526
  insert_element_from_token(parser, token);
2434
2527
  return result;
2435
- } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2436
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2528
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(H1), TAG(H2), TAG(H3),
2529
+ TAG(H4), TAG(H5), TAG(H6) })) {
2437
2530
  bool result = maybe_implicitly_close_p_tag(parser, token);
2438
- if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
2439
- GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
2440
- GUMBO_TAG_LAST)) {
2531
+ if (node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(H1), TAG(H2),
2532
+ TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
2441
2533
  parser_add_parse_error(parser, token);
2442
2534
  pop_current_node(parser);
2443
2535
  result = false;
2444
2536
  }
2445
2537
  insert_element_from_token(parser, token);
2446
2538
  return result;
2447
- } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
2448
- GUMBO_TAG_LAST)) {
2539
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(PRE), TAG(LISTING) })) {
2449
2540
  bool result = maybe_implicitly_close_p_tag(parser, token);
2450
2541
  insert_element_from_token(parser, token);
2451
2542
  state->_ignore_next_linefeed = true;
2452
2543
  state->_frameset_ok = false;
2453
2544
  return result;
2454
2545
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2455
- if (state->_form_element != NULL) {
2546
+ if (state->_form_element != NULL && !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2456
2547
  gumbo_debug("Ignoring nested form.\n");
2457
2548
  parser_add_parse_error(parser, token);
2458
2549
  ignore_token(parser);
2459
2550
  return false;
2460
2551
  }
2461
2552
  bool result = maybe_implicitly_close_p_tag(parser, token);
2462
- state->_form_element =
2463
- insert_element_from_token(parser, token);
2553
+ GumboNode* form_element = insert_element_from_token(parser, token);
2554
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2555
+ state->_form_element = form_element;
2556
+ }
2464
2557
  return result;
2465
2558
  } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2466
2559
  maybe_implicitly_close_list_tag(parser, token, true);
2467
2560
  bool result = maybe_implicitly_close_p_tag(parser, token);
2468
2561
  insert_element_from_token(parser, token);
2469
2562
  return result;
2470
- } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2471
- GUMBO_TAG_LAST)) {
2563
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
2472
2564
  maybe_implicitly_close_list_tag(parser, token, false);
2473
2565
  bool result = maybe_implicitly_close_p_tag(parser, token);
2474
2566
  insert_element_from_token(parser, token);
@@ -2481,7 +2573,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2481
2573
  } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2482
2574
  if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2483
2575
  parser_add_parse_error(parser, token);
2484
- implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
2576
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2485
2577
  state->_reprocess_current_token = true;
2486
2578
  return false;
2487
2579
  }
@@ -2489,67 +2581,78 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2489
2581
  insert_element_from_token(parser, token);
2490
2582
  state->_frameset_ok = false;
2491
2583
  return true;
2492
- } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2493
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
2494
- GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
2495
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
2496
- GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
2497
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
2498
- GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
2499
- GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
2500
- GUMBO_TAG_LAST)) {
2584
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
2585
+ TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2586
+ TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2587
+ TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(LISTING),
2588
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(PRE),
2589
+ TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
2501
2590
  GumboTag tag = token->v.end_tag;
2502
2591
  if (!has_an_element_in_scope(parser, tag)) {
2503
2592
  parser_add_parse_error(parser, token);
2504
2593
  ignore_token(parser);
2505
2594
  return false;
2506
2595
  }
2507
- implicitly_close_tags(parser, token, token->v.end_tag);
2596
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2508
2597
  return true;
2509
2598
  } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2510
- bool result = true;
2511
- const GumboNode* node = state->_form_element;
2512
- assert(!node || node->type == GUMBO_NODE_ELEMENT);
2513
- state->_form_element = NULL;
2514
- if (!node || !has_node_in_scope(parser, node)) {
2515
- gumbo_debug("Closing an unopened form.\n");
2516
- parser_add_parse_error(parser, token);
2517
- ignore_token(parser);
2518
- return false;
2519
- }
2520
- // This differs from implicitly_close_tags because we remove *only* the
2521
- // <form> element; other nodes are left in scope.
2522
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2523
- if (get_current_node(parser) != node) {
2524
- parser_add_parse_error(parser, token);
2525
- result = false;
2526
- }
2599
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601
+ parser_add_parse_error(parser, token);
2602
+ ignore_token(parser);
2603
+ return false;
2604
+ }
2605
+ bool success = true;
2606
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608
+ parser_add_parse_error(parser, token);
2609
+ return false;
2610
+ }
2611
+ while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM));
2612
+ return success;
2613
+ } else {
2614
+ bool result = true;
2615
+ const GumboNode* node = state->_form_element;
2616
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
2617
+ state->_form_element = NULL;
2618
+ if (!node || !has_node_in_scope(parser, node)) {
2619
+ gumbo_debug("Closing an unopened form.\n");
2620
+ parser_add_parse_error(parser, token);
2621
+ ignore_token(parser);
2622
+ return false;
2623
+ }
2624
+ // This differs from implicitly_close_tags because we remove *only* the
2625
+ // <form> element; other nodes are left in scope.
2626
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2627
+ if (get_current_node(parser) != node) {
2628
+ parser_add_parse_error(parser, token);
2629
+ result = false;
2630
+ }
2527
2631
 
2528
- GumboVector* open_elements = &state->_open_elements;
2529
- int index = open_elements->length - 1;
2530
- for (; index >= 0 && open_elements->data[index] != node; --index);
2531
- assert(index >= 0);
2532
- gumbo_vector_remove_at(parser, index, open_elements);
2533
- return result;
2632
+ GumboVector* open_elements = &state->_open_elements;
2633
+ int index = gumbo_vector_index_of(open_elements, node);
2634
+ assert(index >= 0);
2635
+ gumbo_vector_remove_at(parser, index, open_elements);
2636
+ return result;
2637
+ }
2534
2638
  } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2535
2639
  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2536
2640
  parser_add_parse_error(parser, token);
2537
- reconstruct_active_formatting_elements(parser);
2641
+ // reconstruct_active_formatting_elements(parser);
2538
2642
  insert_element_of_tag_type(
2539
2643
  parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2540
2644
  state->_reprocess_current_token = true;
2541
2645
  return false;
2542
2646
  }
2543
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
2647
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2544
2648
  } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2545
2649
  if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2546
2650
  parser_add_parse_error(parser, token);
2547
2651
  ignore_token(parser);
2548
2652
  return false;
2549
2653
  }
2550
- return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
2551
- } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2552
- GUMBO_TAG_LAST)) {
2654
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2655
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
2553
2656
  assert(token->type == GUMBO_TOKEN_END_TAG);
2554
2657
  GumboTag token_tag = token->v.end_tag;
2555
2658
  if (!has_an_element_in_scope(parser, token_tag)) {
@@ -2557,12 +2660,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2557
2660
  ignore_token(parser);
2558
2661
  return false;
2559
2662
  }
2560
- return implicitly_close_tags(parser, token, token_tag);
2561
- } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2562
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2563
- if (!has_an_element_in_scope_with_tagname(
2564
- parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
2565
- GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2663
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2664
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) {
2665
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
2666
+ if (!has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
2667
+ GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2668
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2566
2669
  // No heading open; ignore the token entirely.
2567
2670
  parser_add_parse_error(parser, token);
2568
2671
  ignore_token(parser);
@@ -2570,7 +2673,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2570
2673
  } else {
2571
2674
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2572
2675
  const GumboNode* current_node = get_current_node(parser);
2573
- bool success = node_tag_is(current_node, token->v.end_tag);
2676
+ bool success = node_html_tag_is(current_node, token->v.end_tag);
2574
2677
  if (!success) {
2575
2678
  // There're children of the heading currently open; close them below and
2576
2679