nokogumbo 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -36,7 +36,7 @@ Example
36
36
  -----
37
37
  ```ruby
38
38
  require 'nokogumbo'
39
- puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
39
+ puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
40
40
  ```
41
41
 
42
42
  Notes
@@ -157,6 +157,7 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
157
157
 
158
158
  switch (child->type) {
159
159
  case GUMBO_NODE_ELEMENT:
160
+ case GUMBO_NODE_TEMPLATE:
160
161
  node = walk_tree(document, &child->v.element);
161
162
  break;
162
163
  case GUMBO_NODE_WHITESPACE:
@@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256;
35
35
  static int print_message(GumboParser* parser, GumboStringBuffer* output,
36
36
  const char* format, ...) {
37
37
  va_list args;
38
- va_start(args, format);
39
38
  int remaining_capacity = output->capacity - output->length;
39
+ va_start(args, format);
40
40
  int bytes_written = vsnprintf(output->data + output->length,
41
41
  remaining_capacity, format, args);
42
+ va_end(args);
42
43
  #ifdef _MSC_VER
43
44
  if (bytes_written == -1) {
44
45
  // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of
@@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
47
48
  // we retry (letting it fail and returning 0 if it doesn't), since there's
48
49
  // no way to smartly resize the buffer.
49
50
  gumbo_string_buffer_reserve(parser, output->capacity * 2, output);
51
+ va_start(args, format);
50
52
  int result = vsnprintf(output->data + output->length,
51
53
  remaining_capacity, format, args);
52
54
  va_end(args);
@@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
55
57
  #else
56
58
  // -1 in standard C99 indicates an encoding error. Return 0 and do nothing.
57
59
  if (bytes_written == -1) {
58
- va_end(args);
59
60
  return 0;
60
61
  }
61
62
  #endif
@@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output,
64
65
  gumbo_string_buffer_reserve(
65
66
  parser, output->capacity + bytes_written, output);
66
67
  remaining_capacity = output->capacity - output->length;
68
+ va_start(args, format);
67
69
  bytes_written = vsnprintf(output->data + output->length,
68
70
  remaining_capacity, format, args);
71
+ va_end(args);
69
72
  }
70
73
  output->length += bytes_written;
71
- va_end(args);
72
74
  return bytes_written;
73
75
  }
74
76
 
@@ -106,6 +108,7 @@ static void handle_parser_error(GumboParser* parser,
106
108
  // But just in case...
107
109
  print_message(parser, output, "Comments aren't legal here");
108
110
  return;
111
+ case GUMBO_TOKEN_CDATA:
109
112
  case GUMBO_TOKEN_WHITESPACE:
110
113
  case GUMBO_TOKEN_CHARACTER:
111
114
  print_message(parser, output, "Character tokens aren't legal here");
@@ -141,7 +141,7 @@ extern const GumboVector kGumboEmptyVector;
141
141
  * Returns the first index at which an element appears in this vector (testing
142
142
  * by pointer equality), or -1 if it never does.
143
143
  */
144
- int gumbo_vector_index_of(GumboVector* vector, void* element);
144
+ int gumbo_vector_index_of(GumboVector* vector, const void* element);
145
145
 
146
146
 
147
147
  /**
@@ -157,172 +157,10 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
157
157
  * strings.
158
158
  */
159
159
  typedef enum {
160
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
161
- GUMBO_TAG_HTML,
162
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
163
- GUMBO_TAG_HEAD,
164
- GUMBO_TAG_TITLE,
165
- GUMBO_TAG_BASE,
166
- GUMBO_TAG_LINK,
167
- GUMBO_TAG_META,
168
- GUMBO_TAG_STYLE,
169
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
170
- GUMBO_TAG_SCRIPT,
171
- GUMBO_TAG_NOSCRIPT,
172
- GUMBO_TAG_TEMPLATE,
173
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
174
- GUMBO_TAG_BODY,
175
- GUMBO_TAG_ARTICLE,
176
- GUMBO_TAG_SECTION,
177
- GUMBO_TAG_NAV,
178
- GUMBO_TAG_ASIDE,
179
- GUMBO_TAG_H1,
180
- GUMBO_TAG_H2,
181
- GUMBO_TAG_H3,
182
- GUMBO_TAG_H4,
183
- GUMBO_TAG_H5,
184
- GUMBO_TAG_H6,
185
- GUMBO_TAG_HGROUP,
186
- GUMBO_TAG_HEADER,
187
- GUMBO_TAG_FOOTER,
188
- GUMBO_TAG_ADDRESS,
189
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
190
- GUMBO_TAG_P,
191
- GUMBO_TAG_HR,
192
- GUMBO_TAG_PRE,
193
- GUMBO_TAG_BLOCKQUOTE,
194
- GUMBO_TAG_OL,
195
- GUMBO_TAG_UL,
196
- GUMBO_TAG_LI,
197
- GUMBO_TAG_DL,
198
- GUMBO_TAG_DT,
199
- GUMBO_TAG_DD,
200
- GUMBO_TAG_FIGURE,
201
- GUMBO_TAG_FIGCAPTION,
202
- GUMBO_TAG_MAIN,
203
- GUMBO_TAG_DIV,
204
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
205
- GUMBO_TAG_A,
206
- GUMBO_TAG_EM,
207
- GUMBO_TAG_STRONG,
208
- GUMBO_TAG_SMALL,
209
- GUMBO_TAG_S,
210
- GUMBO_TAG_CITE,
211
- GUMBO_TAG_Q,
212
- GUMBO_TAG_DFN,
213
- GUMBO_TAG_ABBR,
214
- GUMBO_TAG_DATA,
215
- GUMBO_TAG_TIME,
216
- GUMBO_TAG_CODE,
217
- GUMBO_TAG_VAR,
218
- GUMBO_TAG_SAMP,
219
- GUMBO_TAG_KBD,
220
- GUMBO_TAG_SUB,
221
- GUMBO_TAG_SUP,
222
- GUMBO_TAG_I,
223
- GUMBO_TAG_B,
224
- GUMBO_TAG_U,
225
- GUMBO_TAG_MARK,
226
- GUMBO_TAG_RUBY,
227
- GUMBO_TAG_RT,
228
- GUMBO_TAG_RP,
229
- GUMBO_TAG_BDI,
230
- GUMBO_TAG_BDO,
231
- GUMBO_TAG_SPAN,
232
- GUMBO_TAG_BR,
233
- GUMBO_TAG_WBR,
234
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
235
- GUMBO_TAG_INS,
236
- GUMBO_TAG_DEL,
237
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
238
- GUMBO_TAG_IMAGE,
239
- GUMBO_TAG_IMG,
240
- GUMBO_TAG_IFRAME,
241
- GUMBO_TAG_EMBED,
242
- GUMBO_TAG_OBJECT,
243
- GUMBO_TAG_PARAM,
244
- GUMBO_TAG_VIDEO,
245
- GUMBO_TAG_AUDIO,
246
- GUMBO_TAG_SOURCE,
247
- GUMBO_TAG_TRACK,
248
- GUMBO_TAG_CANVAS,
249
- GUMBO_TAG_MAP,
250
- GUMBO_TAG_AREA,
251
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
252
- GUMBO_TAG_MATH,
253
- GUMBO_TAG_MI,
254
- GUMBO_TAG_MO,
255
- GUMBO_TAG_MN,
256
- GUMBO_TAG_MS,
257
- GUMBO_TAG_MTEXT,
258
- GUMBO_TAG_MGLYPH,
259
- GUMBO_TAG_MALIGNMARK,
260
- GUMBO_TAG_ANNOTATION_XML,
261
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
262
- GUMBO_TAG_SVG,
263
- GUMBO_TAG_FOREIGNOBJECT,
264
- GUMBO_TAG_DESC,
265
- // SVG title tags will have GUMBO_TAG_TITLE as with HTML.
266
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
267
- GUMBO_TAG_TABLE,
268
- GUMBO_TAG_CAPTION,
269
- GUMBO_TAG_COLGROUP,
270
- GUMBO_TAG_COL,
271
- GUMBO_TAG_TBODY,
272
- GUMBO_TAG_THEAD,
273
- GUMBO_TAG_TFOOT,
274
- GUMBO_TAG_TR,
275
- GUMBO_TAG_TD,
276
- GUMBO_TAG_TH,
277
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
278
- GUMBO_TAG_FORM,
279
- GUMBO_TAG_FIELDSET,
280
- GUMBO_TAG_LEGEND,
281
- GUMBO_TAG_LABEL,
282
- GUMBO_TAG_INPUT,
283
- GUMBO_TAG_BUTTON,
284
- GUMBO_TAG_SELECT,
285
- GUMBO_TAG_DATALIST,
286
- GUMBO_TAG_OPTGROUP,
287
- GUMBO_TAG_OPTION,
288
- GUMBO_TAG_TEXTAREA,
289
- GUMBO_TAG_KEYGEN,
290
- GUMBO_TAG_OUTPUT,
291
- GUMBO_TAG_PROGRESS,
292
- GUMBO_TAG_METER,
293
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
294
- GUMBO_TAG_DETAILS,
295
- GUMBO_TAG_SUMMARY,
296
- GUMBO_TAG_MENU,
297
- GUMBO_TAG_MENUITEM,
298
- // Non-conforming elements that nonetheless appear in the HTML5 spec.
299
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
300
- GUMBO_TAG_APPLET,
301
- GUMBO_TAG_ACRONYM,
302
- GUMBO_TAG_BGSOUND,
303
- GUMBO_TAG_DIR,
304
- GUMBO_TAG_FRAME,
305
- GUMBO_TAG_FRAMESET,
306
- GUMBO_TAG_NOFRAMES,
307
- GUMBO_TAG_ISINDEX,
308
- GUMBO_TAG_LISTING,
309
- GUMBO_TAG_XMP,
310
- GUMBO_TAG_NEXTID,
311
- GUMBO_TAG_NOEMBED,
312
- GUMBO_TAG_PLAINTEXT,
313
- GUMBO_TAG_RB,
314
- GUMBO_TAG_STRIKE,
315
- GUMBO_TAG_BASEFONT,
316
- GUMBO_TAG_BIG,
317
- GUMBO_TAG_BLINK,
318
- GUMBO_TAG_CENTER,
319
- GUMBO_TAG_FONT,
320
- GUMBO_TAG_MARQUEE,
321
- GUMBO_TAG_MULTICOL,
322
- GUMBO_TAG_NOBR,
323
- GUMBO_TAG_SPACER,
324
- GUMBO_TAG_TT,
325
- // Used for all tags that don't have special handling in HTML.
160
+ // Load all the tags from an external source, generated from tag.in.
161
+ # include "tag_enum.h"
162
+ // Used for all tags that don't have special handling in HTML. Add new tags
163
+ // to the end of tag.in so as to preserve backwards-compatibility.
326
164
  GUMBO_TAG_UNKNOWN,
327
165
  // A marker value to indicate the end of the enum, for iterating over it.
328
166
  // Also used as the terminator for varargs functions that take tags.
@@ -364,9 +202,10 @@ const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
364
202
 
365
203
  /**
366
204
  * Converts a tag name string (which may be in upper or mixed case) to a tag
367
- * enum.
205
+ * enum. gumbo_tag_enum expects `tagname` to be NUL-terminated; gumbo_tagn_enum takes an explicit `length` instead.
368
206
  */
369
207
  GumboTag gumbo_tag_enum(const char* tagname);
208
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
370
209
 
371
210
  /**
372
211
  * Attribute namespaces.
@@ -461,10 +300,16 @@ typedef enum {
461
300
  GUMBO_NODE_TEXT,
462
301
  /** CDATA node. v will be a GumboText. */
463
302
  GUMBO_NODE_CDATA,
464
- /** Comment node. v. will be a GumboText, excluding comment delimiters. */
303
+ /** Comment node. v will be a GumboText, excluding comment delimiters. */
465
304
  GUMBO_NODE_COMMENT,
466
305
  /** Text node, where all contents is whitespace. v will be a GumboText. */
467
- GUMBO_NODE_WHITESPACE
306
+ GUMBO_NODE_WHITESPACE,
307
+ /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
308
+ * client libraries will want to ignore the contents of template nodes, as
309
+ * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
310
+ * here, while clients that want to include template contents should also
311
+ * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
312
+ GUMBO_NODE_TEMPLATE
468
313
  } GumboNodeType;
469
314
 
470
315
  /**
@@ -678,6 +523,19 @@ struct GumboInternalNode {
678
523
  /** Pointer back to parent node. Not owned. */
679
524
  GumboNode* parent;
680
525
 
526
+ /**
527
+ * Pointer to next node in document order. This is the next node by start tag
528
+ * position in the document, or by position of the tag that forces the parser
529
+ * to insert it for parser-inserted nodes. It's necessary to maintain API
530
+ * compatibility with some other libraries, e.g. BeautifulSoup. Not owned.
531
+ */
532
+ GumboNode* next;
533
+
534
+ /**
535
+ * Pointer to previous node in document order.
536
+ */
537
+ GumboNode* prev;
538
+
681
539
  /** The index within the parent's children vector of this node. */
682
540
  size_t index_within_parent;
683
541
 
@@ -795,6 +653,14 @@ GumboOutput* gumbo_parse(const char* buffer);
795
653
  GumboOutput* gumbo_parse_with_options(
796
654
  const GumboOptions* options, const char* buffer, size_t buffer_length);
797
655
 
656
+ /**
657
+ * Parse a chunk of HTML with the given fragment context. If `fragment_ctx`
658
+ * is `GUMBO_TAG_LAST`, the fragment will be parsed as a full document.
659
+ */
660
+ GumboOutput* gumbo_parse_fragment(
661
+ const GumboOptions* options, const char* buffer, size_t length,
662
+ const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace);
663
+
798
664
  /** Release the memory used for the parse tree & parse errors. */
799
665
  void gumbo_destroy_output(
800
666
  const GumboOptions* options, GumboOutput* output);
@@ -32,12 +32,30 @@
32
32
  #include "util.h"
33
33
  #include "vector.h"
34
34
 
35
-
36
35
  #define AVOID_UNUSED_VARIABLE_WARNING(i) (void)(i)
37
36
 
38
37
  #define GUMBO_STRING(literal) { literal, sizeof(literal) - 1 }
39
38
  #define TERMINATOR { "", 0 }
40
39
 
40
+ typedef char gumbo_tagset[GUMBO_TAG_LAST];
41
+ #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML)
42
+ #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG)
43
+ #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML)
44
+
45
+ #define TAGSET_INCLUDES(tagset, namespace, tag) \
46
+ (tag < GUMBO_TAG_LAST && \
47
+ tagset[(int)tag] == (1 << (int)namespace))
48
+
49
+
50
+
51
+ // Selected forward declarations, since it is getting hard to find
52
+ // an appropriate definition order.
53
+ static bool node_html_tag_is(const GumboNode*, GumboTag);
54
+ static GumboInsertionMode get_current_template_insertion_mode(const GumboParser*);
55
+ static bool handle_in_template(GumboParser*, GumboToken*);
56
+ static GumboNode* destroy_node(GumboParser*, GumboNode*);
57
+
58
+
41
59
  static void* malloc_wrapper(void* unused, size_t size) {
42
60
  return malloc(size);
43
61
  }
@@ -181,7 +199,7 @@ typedef struct _ReplacementEntry {
181
199
  { GUMBO_STRING(from), GUMBO_STRING(to) }
182
200
 
183
201
  // Static data for SVG attribute replacements.
184
- // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-svg-attributes
202
+ // https://html.spec.whatwg.org/multipage/syntax.html#creating-and-inserting-nodes
185
203
  static const ReplacementEntry kSvgAttributeReplacements[] = {
186
204
  REPLACEMENT_ENTRY("attributename", "attributeName"),
187
205
  REPLACEMENT_ENTRY("attributetype", "attributeType"),
@@ -189,12 +207,12 @@ static const ReplacementEntry kSvgAttributeReplacements[] = {
189
207
  REPLACEMENT_ENTRY("baseprofile", "baseProfile"),
190
208
  REPLACEMENT_ENTRY("calcmode", "calcMode"),
191
209
  REPLACEMENT_ENTRY("clippathunits", "clipPathUnits"),
192
- REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
193
- REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
210
+ // REPLACEMENT_ENTRY("contentscripttype", "contentScriptType"),
211
+ // REPLACEMENT_ENTRY("contentstyletype", "contentStyleType"),
194
212
  REPLACEMENT_ENTRY("diffuseconstant", "diffuseConstant"),
195
213
  REPLACEMENT_ENTRY("edgemode", "edgeMode"),
196
- REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
197
- REPLACEMENT_ENTRY("filterres", "filterRes"),
214
+ // REPLACEMENT_ENTRY("externalresourcesrequired", "externalResourcesRequired"),
215
+ // REPLACEMENT_ENTRY("filterres", "filterRes"),
198
216
  REPLACEMENT_ENTRY("filterunits", "filterUnits"),
199
217
  REPLACEMENT_ENTRY("glyphref", "glyphRef"),
200
218
  REPLACEMENT_ENTRY("gradienttransform", "gradientTransform"),
@@ -336,7 +354,7 @@ typedef struct _TextNodeBufferState {
336
354
  // The source position of the start of this text node.
337
355
  GumboSourcePosition _start_position;
338
356
 
339
- // The type of node that will be inserted (TEXT or WHITESPACE).
357
+ // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE).
340
358
  GumboNodeType _type;
341
359
  } TextNodeBufferState;
342
360
 
@@ -362,6 +380,9 @@ typedef struct GumboInternalParserState {
362
380
  GumboNode* _head_element;
363
381
  GumboNode* _form_element;
364
382
 
383
+ // The element used as fragment context when parsing in fragment mode
384
+ GumboNode* _fragment_ctx;
385
+
365
386
  // The flag for when the spec says "Reprocess the current token in..."
366
387
  bool _reprocess_current_token;
367
388
 
@@ -390,6 +411,10 @@ typedef struct GumboInternalParserState {
390
411
  // The current token.
391
412
  GumboToken* _current_token;
392
413
 
414
+ // The current (most recently inserted) node. This is used to link together
415
+ // nodes in document order.
416
+ GumboNode* _current_node;
417
+
393
418
  // The way that the spec is written, the </body> and </html> tags are *always*
394
419
  // implicit, because encountering one of those tokens merely switches the
395
420
  // insertion mode out of "in body". So we have individual state flags for
@@ -442,7 +467,17 @@ static void set_frameset_not_ok(GumboParser* parser) {
442
467
  }
443
468
 
444
469
  static GumboNode* create_node(GumboParser* parser, GumboNodeType type) {
470
+ GumboParserState* state = parser->_parser_state;
445
471
  GumboNode* node = gumbo_parser_allocate(parser, sizeof(GumboNode));
472
+
473
+ node->next = NULL;
474
+ node->prev = state->_current_node;
475
+ if (state->_current_node != NULL) {
476
+ // May be null for the initial document node.
477
+ state->_current_node->next = node;
478
+ }
479
+ state->_current_node = node;
480
+
446
481
  node->parent = NULL;
447
482
  node->index_within_parent = -1;
448
483
  node->type = type;
@@ -489,7 +524,9 @@ static void parser_state_init(GumboParser* parser) {
489
524
  gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
490
525
  parser_state->_head_element = NULL;
491
526
  parser_state->_form_element = NULL;
527
+ parser_state->_fragment_ctx = NULL;
492
528
  parser_state->_current_token = NULL;
529
+ parser_state->_current_node = NULL;
493
530
  parser_state->_closed_body_tag = false;
494
531
  parser_state->_closed_html_tag = false;
495
532
  parser->_parser_state = parser_state;
@@ -497,17 +534,25 @@ static void parser_state_init(GumboParser* parser) {
497
534
 
498
535
  static void parser_state_destroy(GumboParser* parser) {
499
536
  GumboParserState* state = parser->_parser_state;
537
+ if (state->_fragment_ctx) {
538
+ destroy_node(parser, state->_fragment_ctx);
539
+ }
500
540
  gumbo_vector_destroy(parser, &state->_active_formatting_elements);
501
541
  gumbo_vector_destroy(parser, &state->_open_elements);
502
542
  gumbo_vector_destroy(parser, &state->_template_insertion_modes);
503
543
  gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
504
544
  gumbo_parser_deallocate(parser, state);
545
+ parser->_parser_state = NULL;
505
546
  }
506
547
 
507
548
  static GumboNode* get_document_node(GumboParser* parser) {
508
549
  return parser->_output->document;
509
550
  }
510
551
 
552
+ static bool is_fragment_parser(const GumboParser *parser) {
553
+ return !!parser->_parser_state->_fragment_ctx;
554
+ }
555
+
511
556
  // Returns the node at the bottom of the stack of open elements, or NULL if no
512
557
  // elements have been added yet.
513
558
  static GumboNode* get_current_node(GumboParser* parser) {
@@ -521,6 +566,14 @@ static GumboNode* get_current_node(GumboParser* parser) {
521
566
  return open_elements->data[open_elements->length - 1];
522
567
  }
523
568
 
569
+ static GumboNode* get_adjusted_current_node(GumboParser* parser) {
570
+ GumboParserState *state = parser->_parser_state;
571
+ if (state->_open_elements.length == 1 && state->_fragment_ctx) {
572
+ return state->_fragment_ctx;
573
+ }
574
+ return get_current_node(parser);
575
+ }
576
+
524
577
  // Returns true if the given needle is in the given array of literal
525
578
  // GumboStringPieces. If exact_match is true, this requires that they match
526
579
  // exactly; otherwise, this performs a prefix match to check if any of the
@@ -541,52 +594,80 @@ static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
541
594
  parser->_parser_state->_insertion_mode = mode;
542
595
  }
543
596
 
597
+
544
598
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#reset-the-insertion-mode-appropriately
545
599
  // This is a helper function that returns the appropriate insertion mode instead
546
600
  // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to
547
601
  // indicate that there is no appropriate insertion mode, and the loop should
548
602
  // continue.
549
- static GumboInsertionMode get_appropriate_insertion_mode(
550
- const GumboNode* node, bool is_last) {
551
- assert(node->type == GUMBO_NODE_ELEMENT);
603
+ static GumboInsertionMode get_appropriate_insertion_mode(const GumboParser* parser, int index) {
604
+ const GumboVector* open_elements = &parser->_parser_state->_open_elements;
605
+ const GumboNode* node = open_elements->data[index];
606
+ const bool is_last = index == 0;
607
+
608
+ if (is_last && is_fragment_parser(parser)) {
609
+ node = parser->_parser_state->_fragment_ctx;
610
+ }
611
+
612
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
552
613
  switch (node->v.element.tag) {
553
- case GUMBO_TAG_SELECT:
614
+ case GUMBO_TAG_SELECT: {
615
+ if (is_last) {
554
616
  return GUMBO_INSERTION_MODE_IN_SELECT;
555
- case GUMBO_TAG_TD:
556
- case GUMBO_TAG_TH:
557
- return is_last ?
558
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_IN_CELL;
559
- case GUMBO_TAG_TR:
560
- return GUMBO_INSERTION_MODE_IN_ROW;
561
- case GUMBO_TAG_TBODY:
562
- case GUMBO_TAG_THEAD:
563
- case GUMBO_TAG_TFOOT:
564
- return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
565
- case GUMBO_TAG_CAPTION:
566
- return GUMBO_INSERTION_MODE_IN_CAPTION;
567
- case GUMBO_TAG_COLGROUP:
568
- return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
569
- case GUMBO_TAG_TABLE:
570
- return GUMBO_INSERTION_MODE_IN_TABLE;
571
- case GUMBO_TAG_HEAD:
572
- case GUMBO_TAG_BODY:
573
- return GUMBO_INSERTION_MODE_IN_BODY;
574
- case GUMBO_TAG_FRAMESET:
575
- return GUMBO_INSERTION_MODE_IN_FRAMESET;
576
- case GUMBO_TAG_HTML:
577
- return GUMBO_INSERTION_MODE_BEFORE_HEAD;
578
- default:
579
- return is_last ?
580
- GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
617
+ }
618
+ for (int i = index; i > 0; --i) {
619
+ const GumboNode* ancestor = open_elements->data[i];
620
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) {
621
+ return GUMBO_INSERTION_MODE_IN_SELECT;
622
+ }
623
+ if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) {
624
+ return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE;
625
+ }
626
+ }
627
+ return GUMBO_INSERTION_MODE_IN_SELECT;
581
628
  }
629
+ case GUMBO_TAG_TD:
630
+ case GUMBO_TAG_TH:
631
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL;
632
+ break;
633
+ case GUMBO_TAG_TR:
634
+ return GUMBO_INSERTION_MODE_IN_ROW;
635
+ case GUMBO_TAG_TBODY:
636
+ case GUMBO_TAG_THEAD:
637
+ case GUMBO_TAG_TFOOT:
638
+ return GUMBO_INSERTION_MODE_IN_TABLE_BODY;
639
+ case GUMBO_TAG_CAPTION:
640
+ return GUMBO_INSERTION_MODE_IN_CAPTION;
641
+ case GUMBO_TAG_COLGROUP:
642
+ return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP;
643
+ case GUMBO_TAG_TABLE:
644
+ return GUMBO_INSERTION_MODE_IN_TABLE;
645
+ case GUMBO_TAG_TEMPLATE:
646
+ return get_current_template_insertion_mode(parser);
647
+ case GUMBO_TAG_HEAD:
648
+ if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD;
649
+ break;
650
+ case GUMBO_TAG_BODY:
651
+ return GUMBO_INSERTION_MODE_IN_BODY;
652
+ case GUMBO_TAG_FRAMESET:
653
+ return GUMBO_INSERTION_MODE_IN_FRAMESET;
654
+ case GUMBO_TAG_HTML:
655
+ return parser->_parser_state->_head_element ?
656
+ GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD;
657
+ default:
658
+ break;
659
+ }
660
+ return is_last ?
661
+ GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL;
582
662
  }
583
663
 
664
+
584
665
  // This performs the actual "reset the insertion mode" loop.
585
666
  static void reset_insertion_mode_appropriately(GumboParser* parser) {
586
667
  const GumboVector* open_elements = &parser->_parser_state->_open_elements;
587
668
  for (int i = open_elements->length; --i >= 0; ) {
588
669
  GumboInsertionMode mode =
589
- get_appropriate_insertion_mode(open_elements->data[i], i == 0);
670
+ get_appropriate_insertion_mode(parser, i);
590
671
  if (mode != GUMBO_INSERTION_MODE_INITIAL) {
591
672
  set_insertion_mode(parser, mode);
592
673
  return;
@@ -620,7 +701,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
620
701
  &extra_data->tag_stack);
621
702
  for (int i = 0; i < state->_open_elements.length; ++i) {
622
703
  const GumboNode* node = state->_open_elements.data[i];
623
- assert(node->type == GUMBO_NODE_ELEMENT);
704
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
624
705
  gumbo_vector_add(parser, (void*) node->v.element.tag,
625
706
  &extra_data->tag_stack);
626
707
  }
@@ -631,13 +712,7 @@ static GumboError* parser_add_parse_error(GumboParser* parser, const GumboToken*
631
712
  // by is_start) with one of the tag types in the varargs list. Terminate the
632
713
  // list with GUMBO_TAG_LAST; this functions as a sentinel since no portion of
633
714
  // the spec references tags that are not in the spec.
634
- // TODO(jdtang): A lot of the tag lists for this function are repeated in many
635
- // places in the code. This is how it's written in the spec (and it's done this
636
- // way so it's easy to verify the code against the spec), but it may be worth
637
- // coming up with a notion of a "tag set" that includes a list of tags, and
638
- // using that in many places. It'd probably also help performance, but I want
639
- // to profile before optimizing.
640
- static bool tag_in(const GumboToken* token, bool is_start, ...) {
715
+ static bool tag_in(const GumboToken* token, bool is_start, const gumbo_tagset tags) {
641
716
  GumboTag token_tag;
642
717
  if (is_start && token->type == GUMBO_TOKEN_START_TAG) {
643
718
  token_tag = token->v.start_tag.tag;
@@ -646,19 +721,7 @@ static bool tag_in(const GumboToken* token, bool is_start, ...) {
646
721
  } else {
647
722
  return false;
648
723
  }
649
-
650
- va_list tags;
651
- va_start(tags, is_start);
652
- bool result = false;
653
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
654
- tag = va_arg(tags, GumboTag)) {
655
- if (tag == token_tag) {
656
- result = true;
657
- break;
658
- }
659
- }
660
- va_end(tags);
661
- return result;
724
+ return (token_tag < GUMBO_TAG_LAST && tags[(int)token_tag] != 0);
662
725
  }
663
726
 
664
727
  // Like tag_in, but for the single-tag case.
@@ -673,52 +736,119 @@ static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) {
673
736
  }
674
737
 
675
738
  // Like tag_in, but checks for the tag of a node, rather than a token.
676
- static bool node_tag_in(const GumboNode* node, ...) {
739
+ static bool node_tag_in_set(const GumboNode* node, const gumbo_tagset tags) {
677
740
  assert(node != NULL);
678
- if (node->type != GUMBO_NODE_ELEMENT) {
741
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
679
742
  return false;
680
743
  }
681
- GumboTag node_tag = node->v.element.tag;
682
-
683
- va_list tags;
684
- va_start(tags, node);
685
- bool result = false;
686
- for (GumboTag tag = va_arg(tags, GumboTag); tag != GUMBO_TAG_LAST;
687
- tag = va_arg(tags, GumboTag)) {
688
- assert(tag <= GUMBO_TAG_LAST);
689
- if (tag == node_tag) {
690
- result = true;
691
- break;
692
- }
693
- }
694
- va_end(tags);
695
- return result;
744
+ return TAGSET_INCLUDES(tags, node->v.element.tag_namespace, node->v.element.tag);
696
745
  }
697
746
 
747
+
698
748
  // Like node_tag_in, but for the single-tag case.
699
- static bool node_tag_is(const GumboNode* node, GumboTag tag) {
700
- return node->type == GUMBO_NODE_ELEMENT && node->v.element.tag == tag;
749
+ static bool node_qualified_tag_is(const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag) {
750
+ return (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) &&
751
+ node->v.element.tag == tag &&
752
+ node->v.element.tag_namespace == ns;
753
+ }
754
+
755
+ // Like node_tag_in_set, but for the single-tag case in the HTML namespace.
756
+ static bool node_html_tag_is(const GumboNode* node, GumboTag tag)
757
+ {
758
+ return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag);
759
+ }
760
+
761
+ static void push_template_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
762
+ gumbo_vector_add(parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
763
+ }
764
+
765
+ static void pop_template_insertion_mode(GumboParser* parser) {
766
+ gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
767
+ }
768
+
769
+ // Returns the current template insertion mode. If the stack of template
770
+ // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL.
771
+ static GumboInsertionMode get_current_template_insertion_mode(const GumboParser* parser) {
772
+ GumboVector* template_insertion_modes = &parser->_parser_state->_template_insertion_modes;
773
+ if (template_insertion_modes->length == 0) {
774
+ return GUMBO_INSERTION_MODE_INITIAL;
775
+ }
776
+ return (GumboInsertionMode) template_insertion_modes->data[(template_insertion_modes->length - 1)];
701
777
  }
702
778
 
703
779
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#mathml-text-integration-point
704
780
  static bool is_mathml_integration_point(const GumboNode* node) {
705
- return node_tag_in(node, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
706
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST) &&
707
- node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML;
781
+ return node_tag_in_set(node, (gumbo_tagset) { TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
782
+ TAG_MATHML(MS), TAG_MATHML(MTEXT) });
708
783
  }
709
784
 
710
785
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#html-integration-point
711
786
  static bool is_html_integration_point(const GumboNode* node) {
712
- return (node_tag_in(node, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC,
713
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST) &&
714
- node->v.element.tag_namespace == GUMBO_NAMESPACE_SVG) ||
715
- (node_tag_is(node, GUMBO_TAG_ANNOTATION_XML) && (
787
+ return node_tag_in_set(node, (gumbo_tagset) { TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) }) ||
788
+ (node_qualified_tag_is(node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) && (
716
789
  attribute_matches(&node->v.element.attributes,
717
790
  "encoding", "text/html") ||
718
791
  attribute_matches(&node->v.element.attributes,
719
792
  "encoding", "application/xhtml+xml")));
720
793
  }
721
794
 
795
+
796
+ // This represents a place to insert a node, consisting of a target parent and a
797
+ // child index within that parent. If the node should be inserted at the end of
798
+ // the parent's children, index will be -1.
799
+ typedef struct {
800
+ GumboNode* target;
801
+ int index;
802
+ } InsertionLocation;
803
+
804
+ InsertionLocation get_appropriate_insertion_location(GumboParser* parser, GumboNode* override_target) {
805
+ InsertionLocation retval = { override_target, -1 };
806
+ if (retval.target == NULL) {
807
+ // No override target; default to the current node, but special-case the
808
+ // root node since get_current_node() assumes the stack of open elements is
809
+ // non-empty.
810
+ retval.target = parser->_output->root != NULL ?
811
+ get_current_node(parser) : get_document_node(parser);
812
+ }
813
+ if (!parser->_parser_state->_foster_parent_insertions ||
814
+ !node_tag_in_set(retval.target, (gumbo_tagset) { TAG(TABLE), TAG(TBODY),
815
+ TAG(TFOOT), TAG(THEAD), TAG(TR) })) {
816
+ return retval;
817
+ }
818
+
819
+ // Foster-parenting case.
820
+ int last_template_index = -1;
821
+ int last_table_index = -1;
822
+ GumboVector* open_elements = &parser->_parser_state->_open_elements;
823
+ for (int i = 0; i < open_elements->length; ++i) {
824
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) {
825
+ last_template_index = i;
826
+ }
827
+ if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) {
828
+ last_table_index = i;
829
+ }
830
+ }
831
+ if (last_template_index != -1 &&
832
+ (last_table_index == -1 || last_template_index > last_table_index)) {
833
+ retval.target = open_elements->data[last_template_index];
834
+ return retval;
835
+ }
836
+ if (last_table_index == -1) {
837
+ retval.target = open_elements->data[0];
838
+ return retval;
839
+ }
840
+ GumboNode* last_table = open_elements->data[last_table_index];
841
+ if (last_table->parent != NULL) {
842
+ retval.target = last_table->parent;
843
+ retval.index = last_table->index_within_parent;
844
+ return retval;
845
+ }
846
+
847
+ retval.target = open_elements->data[last_table_index - 1];
848
+ return retval;
849
+ }
850
+
851
+
722
852
  // Appends a node to the end of its parent, setting the "parent" and
723
853
  // "index_within_parent" fields appropriately.
724
854
  static void append_node(
@@ -726,7 +856,7 @@ static void append_node(
726
856
  assert(node->parent == NULL);
727
857
  assert(node->index_within_parent == -1);
728
858
  GumboVector* children;
729
- if (parent->type == GUMBO_NODE_ELEMENT) {
859
+ if (parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE) {
730
860
  children = &parent->v.element.children;
731
861
  } else {
732
862
  assert(parent->type == GUMBO_NODE_DOCUMENT);
@@ -738,66 +868,44 @@ static void append_node(
738
868
  assert(node->index_within_parent < children->length);
739
869
  }
740
870
 
741
- // Inserts a node at the specified index within its parent, updating the
871
+ // Inserts a node at the specified InsertionLocation, updating the
742
872
  // "parent" and "index_within_parent" fields of it and all its siblings.
873
+ // If the index of the location is -1, this calls append_node.
743
874
  static void insert_node(
744
- GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
875
+ GumboParser* parser, GumboNode* node, InsertionLocation location) {
745
876
  assert(node->parent == NULL);
746
877
  assert(node->index_within_parent == -1);
747
- assert(parent->type == GUMBO_NODE_ELEMENT);
748
- GumboVector* children = &parent->v.element.children;
749
- assert(index >= 0);
750
- assert(index < children->length);
751
- node->parent = parent;
752
- node->index_within_parent = index;
753
- gumbo_vector_insert_at(parser, (void*) node, index, children);
754
- assert(node->index_within_parent < children->length);
755
- for (int i = index + 1; i < children->length; ++i) {
756
- GumboNode* sibling = children->data[i];
757
- sibling->index_within_parent = i;
758
- assert(sibling->index_within_parent < children->length);
759
- }
760
- }
878
+ GumboNode* parent = location.target;
879
+ int index = location.index;
880
+ if (index != -1) {
881
+ GumboVector* children = NULL;
882
+ if (parent->type == GUMBO_NODE_ELEMENT ||
883
+ parent->type == GUMBO_NODE_TEMPLATE) {
884
+ children = &parent->v.element.children;
885
+ } else if (parent->type == GUMBO_NODE_DOCUMENT) {
886
+ children = &parent->v.document.children;
887
+ assert(children->length == 0);
888
+ } else {
889
+ assert(0);
890
+ }
761
891
 
762
- // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#foster-parenting
763
- static void foster_parent_element(GumboParser* parser, GumboNode* node) {
764
- GumboVector* open_elements = &parser->_parser_state->_open_elements;
765
- assert(open_elements->length > 2);
766
-
767
- node->parse_flags |= GUMBO_INSERTION_FOSTER_PARENTED;
768
- GumboNode* foster_parent_element = open_elements->data[0];
769
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
770
- assert(node_tag_is(foster_parent_element, GUMBO_TAG_HTML));
771
- for (int i = open_elements->length; --i > 1; ) {
772
- GumboNode* table_element = open_elements->data[i];
773
- if (node_tag_is(table_element, GUMBO_TAG_TABLE)) {
774
- foster_parent_element = table_element->parent;
775
- if (!foster_parent_element ||
776
- foster_parent_element->type != GUMBO_NODE_ELEMENT) {
777
- // Table has no parent; spec says it's possible if a script manipulated
778
- // the DOM, although I don't think we have to worry about this case.
779
- gumbo_debug("Table has no parent.\n");
780
- foster_parent_element = open_elements->data[i - 1];
781
- break;
782
- }
783
- assert(foster_parent_element->type == GUMBO_NODE_ELEMENT);
784
- gumbo_debug("Found enclosing table (%x) at %d; parent=%s, index=%d.\n",
785
- table_element, i, gumbo_normalized_tagname(
786
- foster_parent_element->v.element.tag),
787
- table_element->index_within_parent);
788
- assert(foster_parent_element->v.element.children.data[
789
- table_element->index_within_parent] == table_element);
790
- insert_node(parser, foster_parent_element,
791
- table_element->index_within_parent, node);
792
- return;
892
+ assert(index >= 0);
893
+ assert(index < children->length);
894
+ node->parent = parent;
895
+ node->index_within_parent = index;
896
+ gumbo_vector_insert_at(parser, (void*) node, index, children);
897
+ assert(node->index_within_parent < children->length);
898
+ for (int i = index + 1; i < children->length; ++i) {
899
+ GumboNode* sibling = children->data[i];
900
+ sibling->index_within_parent = i;
901
+ assert(sibling->index_within_parent < children->length);
793
902
  }
903
+ } else {
904
+ append_node(parser, parent, node);
794
905
  }
795
- if (node->type == GUMBO_NODE_ELEMENT) {
796
- gumbo_vector_add(parser, (void*) node, open_elements);
797
- }
798
- append_node(parser, foster_parent_element, node);
799
906
  }
800
907
 
908
+
801
909
  static void maybe_flush_text_node_buffer(GumboParser* parser) {
802
910
  GumboParserState* state = parser->_parser_state;
803
911
  TextNodeBufferState* buffer_state = &state->_text_node;
@@ -806,7 +914,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
806
914
  }
807
915
 
808
916
  assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
809
- buffer_state->_type == GUMBO_NODE_TEXT);
917
+ buffer_state->_type == GUMBO_NODE_TEXT ||
918
+ buffer_state->_type == GUMBO_NODE_CDATA);
810
919
  GumboNode* text_node = create_node(parser, buffer_state->_type);
811
920
  GumboText* text_node_data = &text_node->v.text;
812
921
  text_node_data->text = gumbo_string_buffer_to_string(
@@ -816,20 +925,20 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
816
925
  state->_current_token->original_text.data -
817
926
  buffer_state->_start_original_text;
818
927
  text_node_data->start_pos = buffer_state->_start_position;
819
- if (state->_foster_parent_insertions && node_tag_in(
820
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
821
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
822
- foster_parent_element(parser, text_node);
823
- } else {
824
- append_node(
825
- parser, parser->_output->root ?
826
- get_current_node(parser) : parser->_output->document, text_node);
827
- }
928
+
828
929
  gumbo_debug("Flushing text node buffer of %.*s.\n",
829
930
  (int) buffer_state->_buffer.length, buffer_state->_buffer.data);
830
931
 
831
- gumbo_string_buffer_destroy(parser, &buffer_state->_buffer);
832
- gumbo_string_buffer_init(parser, &buffer_state->_buffer);
932
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
933
+ if (location.target->type == GUMBO_NODE_DOCUMENT) {
934
+ // The DOM does not allow Document nodes to have Text children, so per the
935
+ // spec, they are dropped on the floor.
936
+ destroy_node(parser, text_node);
937
+ } else {
938
+ insert_node(parser, text_node, location);
939
+ }
940
+
941
+ gumbo_string_buffer_clear(parser, &buffer_state->_buffer);
833
942
  buffer_state->_type = GUMBO_NODE_WHITESPACE;
834
943
  assert(buffer_state->_buffer.length == 0);
835
944
  }
@@ -846,7 +955,7 @@ static GumboNode* pop_current_node(GumboParser* parser) {
846
955
  GumboParserState* state = parser->_parser_state;
847
956
  maybe_flush_text_node_buffer(parser);
848
957
  if (state->_open_elements.length > 0) {
849
- assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
958
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
850
959
  gumbo_debug(
851
960
  "Popping %s node.\n",
852
961
  gumbo_normalized_tagname(get_current_node(parser)->v.element.tag));
@@ -856,12 +965,12 @@ static GumboNode* pop_current_node(GumboParser* parser) {
856
965
  assert(state->_open_elements.length == 0);
857
966
  return NULL;
858
967
  }
859
- assert(current_node->type == GUMBO_NODE_ELEMENT);
968
+ assert(current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE);
860
969
  bool is_closed_body_or_html_tag =
861
- (node_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
862
- (node_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
970
+ (node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
971
+ (node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag);
863
972
  if ((state->_current_token->type != GUMBO_TOKEN_END_TAG ||
864
- !node_tag_is(current_node, state->_current_token->v.end_tag)) &&
973
+ !node_html_tag_is(current_node, state->_current_token->v.end_tag)) &&
865
974
  !is_closed_body_or_html_tag) {
866
975
  current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
867
976
  }
@@ -885,25 +994,22 @@ static void append_comment_node(
885
994
 
886
995
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-row-context
887
996
  static void clear_stack_to_table_row_context(GumboParser* parser) {
888
- while (!node_tag_in(get_current_node(parser),
889
- GUMBO_TAG_HTML, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
997
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TR), TAG(TEMPLATE)})) {
890
998
  pop_current_node(parser);
891
999
  }
892
1000
  }
893
1001
 
894
1002
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-context
895
1003
  static void clear_stack_to_table_context(GumboParser* parser) {
896
- while (!node_tag_in(get_current_node(parser),
897
- GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST)) {
1004
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TABLE), TAG(TEMPLATE) } )) {
898
1005
  pop_current_node(parser);
899
1006
  }
900
1007
  }
901
1008
 
902
1009
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#clear-the-stack-back-to-a-table-body-context
903
1010
  void clear_stack_to_table_body_context(GumboParser* parser) {
904
- while (!node_tag_in(get_current_node(parser), GUMBO_TAG_HTML,
905
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
906
- GUMBO_TAG_LAST)) {
1011
+ while (!node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(HTML), TAG(TBODY),
1012
+ TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) })) {
907
1013
  pop_current_node(parser);
908
1014
  }
909
1015
  }
@@ -918,7 +1024,8 @@ static GumboNode* create_element(GumboParser* parser, GumboTag tag) {
918
1024
  element->tag_namespace = GUMBO_NAMESPACE_HTML;
919
1025
  element->original_tag = kGumboEmptyString;
920
1026
  element->original_end_tag = kGumboEmptyString;
921
- element->start_pos = parser->_parser_state->_current_token->position;
1027
+ element->start_pos = (parser->_parser_state->_current_token) ?
1028
+ parser->_parser_state->_current_token->position : kGumboEmptySourcePosition;
922
1029
  element->end_pos = kGumboEmptySourcePosition;
923
1030
  return node;
924
1031
  }
@@ -929,7 +1036,12 @@ static GumboNode* create_element_from_token(
929
1036
  assert(token->type == GUMBO_TOKEN_START_TAG);
930
1037
  GumboTokenStartTag* start_tag = &token->v.start_tag;
931
1038
 
932
- GumboNode* node = create_node(parser, GUMBO_NODE_ELEMENT);
1039
+ GumboNodeType type = (
1040
+ tag_namespace == GUMBO_NAMESPACE_HTML &&
1041
+ start_tag->tag == GUMBO_TAG_TEMPLATE)
1042
+ ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT;
1043
+
1044
+ GumboNode* node = create_node(parser, type);
933
1045
  GumboElement* element = &node->v.element;
934
1046
  gumbo_vector_init(parser, 1, &element->children);
935
1047
  element->attributes = start_tag->attributes;
@@ -966,20 +1078,9 @@ static void insert_element(GumboParser* parser, GumboNode* node,
966
1078
  if (!is_reconstructing_formatting_elements) {
967
1079
  maybe_flush_text_node_buffer(parser);
968
1080
  }
969
- if (state->_foster_parent_insertions && node_tag_in(
970
- get_current_node(parser), GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
971
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
972
- foster_parent_element(parser, node);
973
- gumbo_vector_add(parser, (void*) node, &state->_open_elements);
974
- return;
975
- }
976
-
977
- // This is called to insert the root HTML element, but get_current_node
978
- // assumes the stack of open elements is non-empty, so we need special
979
- // handling for this case.
980
- append_node(
981
- parser, parser->_output->root ?
982
- get_current_node(parser) : parser->_output->document, node);
1081
+ InsertionLocation location =
1082
+ get_appropriate_insertion_location(parser, NULL);
1083
+ insert_node(parser, node, location);
983
1084
  gumbo_vector_add(parser, (void*) node, &state->_open_elements);
984
1085
  }
985
1086
 
@@ -1035,7 +1136,9 @@ static GumboNode* insert_foreign_element(
1035
1136
 
1036
1137
  static void insert_text_token(GumboParser* parser, GumboToken* token) {
1037
1138
  assert(token->type == GUMBO_TOKEN_WHITESPACE ||
1038
- token->type == GUMBO_TOKEN_CHARACTER);
1139
+ token->type == GUMBO_TOKEN_CHARACTER ||
1140
+ token->type == GUMBO_TOKEN_NULL ||
1141
+ token->type == GUMBO_TOKEN_CDATA);
1039
1142
  TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
1040
1143
  if (buffer_state->_buffer.length == 0) {
1041
1144
  // Initialize position fields.
@@ -1046,6 +1149,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
1046
1149
  parser, token->v.character, &buffer_state->_buffer);
1047
1150
  if (token->type == GUMBO_TOKEN_CHARACTER) {
1048
1151
  buffer_state->_type = GUMBO_NODE_TEXT;
1152
+ } else if (token->type == GUMBO_TOKEN_CDATA) {
1153
+ buffer_state->_type = GUMBO_NODE_CDATA;
1049
1154
  }
1050
1155
  gumbo_debug("Inserting text token '%c'.\n", token->v.character);
1051
1156
  }
@@ -1073,7 +1178,7 @@ static bool find_last_anchor_index(GumboParser* parser, int* anchor_index) {
1073
1178
  if (node == &kActiveFormattingScopeMarker) {
1074
1179
  return false;
1075
1180
  }
1076
- if (node_tag_is(node, GUMBO_TAG_A)) {
1181
+ if (node_html_tag_is(node, GUMBO_TAG_A)) {
1077
1182
  *anchor_index = i;
1078
1183
  return true;
1079
1184
  }
@@ -1097,10 +1202,8 @@ static int count_formatting_elements_of_tag(
1097
1202
  break;
1098
1203
  }
1099
1204
  assert(node->type == GUMBO_NODE_ELEMENT);
1100
- GumboElement* element = &node->v.element;
1101
- if (node_tag_is(node, desired_element->tag) &&
1102
- element->tag_namespace == desired_element->tag_namespace &&
1103
- all_attributes_match(&element->attributes,
1205
+ if (node_qualified_tag_is(node, desired_element->tag_namespace, desired_element->tag) &&
1206
+ all_attributes_match(&node->v.element.attributes,
1104
1207
  &desired_element->attributes)) {
1105
1208
  num_identical_elements++;
1106
1209
  *earliest_matching_index = i;
@@ -1150,7 +1253,7 @@ static bool is_open_element(GumboParser* parser, const GumboNode* node) {
1150
1253
  // values are fresh copies.
1151
1254
  GumboNode* clone_node(
1152
1255
  GumboParser* parser, const GumboNode* node, GumboParseFlags reason) {
1153
- assert(node->type == GUMBO_NODE_ELEMENT);
1256
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1154
1257
  GumboNode* new_node = gumbo_parser_allocate(parser, sizeof(GumboNode));
1155
1258
  *new_node = *node;
1156
1259
  new_node->parent = NULL;
@@ -1220,7 +1323,10 @@ static void reconstruct_active_formatting_elements(GumboParser* parser) {
1220
1323
  GumboNode* clone = clone_node(
1221
1324
  parser, element, GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT);
1222
1325
  // Step 9.
1223
- insert_element(parser, clone, true);
1326
+ InsertionLocation location = get_appropriate_insertion_location(parser, NULL);
1327
+ insert_node(parser, clone, location);
1328
+ gumbo_vector_add(parser, (void*) clone, &parser->_parser_state->_open_elements);
1329
+
1224
1330
  // Step 10.
1225
1331
  elements->data[i] = clone;
1226
1332
  gumbo_debug("Reconstructed %s element at %d.\n",
@@ -1269,83 +1375,47 @@ static GumboQuirksModeEnum compute_quirks_mode(
1269
1375
  // The following functions are all defined by the "has an element in __ scope"
1270
1376
  // sections of the HTML5 spec:
1271
1377
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-the-specific-scope
1272
- // The basic idea behind them is that they check for an element of the given tag
1273
- // name, contained within a scope formed by a set of other tag names. For
1274
- // example, "has an element in list scope" looks for an element of the given tag
1275
- // within the nearest enclosing <ol> or <ul>, along with a bunch of generic
1276
- // element types that serve to "firewall" their content from the rest of the
1277
- // document.
1278
- static bool has_an_element_in_specific_scope(
1279
- GumboParser* parser, GumboVector* /* GumboTag */ expected, bool negate, ...) {
1378
+ // The basic idea behind them is that they check for an element of the given
1379
+ // qualified name, contained within a scope formed by a set of other qualified
1380
+ // names. For example, "has an element in list scope" looks for an element of
1381
+ // the given qualified name within the nearest enclosing <ol> or <ul>, along
1382
+ // with a bunch of generic element types that serve to "firewall" their content
1383
+ // from the rest of the document. Note that because of the way the spec is written,
1384
+ // all elements are expected to be in the HTML namespace
1385
+ static bool has_an_element_in_specific_scope(GumboParser* parser,
1386
+ int expected_size, const GumboTag *expected, bool negate, const gumbo_tagset tags) {
1280
1387
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
1281
- va_list args;
1282
- va_start(args, negate);
1283
- // va_arg can only run through the list once, so we copy it to an GumboVector
1284
- // here. I wonder if it'd make more sense to make tags the GumboVector*
1285
- // parameter and 'expected' a vararg list, but that'd require changing a lot
1286
- // of code for unknown benefit. We may want to change the representation of
1287
- // these tag sets anyway, to something more efficient.
1288
- GumboVector tags;
1289
- gumbo_vector_init(parser, 10, &tags);
1290
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1291
- tag = va_arg(args, GumboTag)) {
1292
- // We store the tags inline instead of storing pointers to them.
1293
- gumbo_vector_add(parser, (void*) tag, &tags);
1294
- }
1295
- va_end(args);
1296
-
1297
- bool result = false;
1298
1388
  for (int i = open_elements->length; --i >= 0; ) {
1299
1389
  const GumboNode* node = open_elements->data[i];
1300
- if (node->type != GUMBO_NODE_ELEMENT) {
1390
+ if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE)
1301
1391
  continue;
1302
- }
1392
+
1303
1393
  GumboTag node_tag = node->v.element.tag;
1304
- for (int j = 0; j < expected->length; ++j) {
1305
- GumboTag expected_tag = (GumboTag) expected->data[j];
1306
- if (node_tag == expected_tag) {
1307
- result = true;
1308
- goto cleanup;
1309
- }
1394
+ GumboNamespaceEnum node_ns = node->v.element.tag_namespace;
1395
+ for (int j = 0; j < expected_size; ++j) {
1396
+ if (node_tag == expected[j] && node_ns == GUMBO_NAMESPACE_HTML)
1397
+ return true;
1310
1398
  }
1311
1399
 
1312
- bool found_tag = false;
1313
- for (int j = 0; j < tags.length; ++j) {
1314
- GumboTag tag = (GumboTag) tags.data[j];
1315
- if (tag == node_tag) {
1316
- found_tag = true;
1317
- break;
1318
- }
1319
- }
1320
- if (negate != found_tag) {
1321
- result = false;
1322
- goto cleanup;
1323
- }
1400
+ bool found = TAGSET_INCLUDES(tags, node_ns, node_tag);
1401
+ if (negate != found)
1402
+ return false;
1324
1403
  }
1325
- cleanup:
1326
- gumbo_vector_destroy(parser, &tags);
1327
- return result;
1404
+ return false;
1328
1405
  }
1329
1406
 
1330
- // This is a bit of a hack to stack-allocate a one-element GumboVector name
1331
- // 'varname' containing the 'from_var' variable, since it's used in nearly all
1332
- // the subsequent helper functions. Note the use of void* and casts instead of
1333
- // GumboTag; this is so the alignment requirements are the same as GumboVector
1334
- // and the data inside it can be freely accessed as if it were a normal
1335
- // GumboVector.
1336
- #define DECLARE_ONE_ELEMENT_GUMBO_VECTOR(varname, from_var) \
1337
- void* varname ## _tmp_array[1] = { (void*) from_var }; \
1338
- GumboVector varname = { varname ## _tmp_array, 1, 1 }
1407
+ // Checks for the presence of an open element of the specified tag type.
1408
+ static bool has_open_element(GumboParser* parser, GumboTag tag) {
1409
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(HTML) } );
1410
+ }
1339
1411
 
1340
1412
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-scope
1341
1413
  static bool has_an_element_in_scope(GumboParser* parser, GumboTag tag) {
1342
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1343
- return has_an_element_in_specific_scope(
1344
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1345
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1346
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1347
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1348
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1414
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1415
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1416
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1417
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1418
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) });
1349
1419
  }
1350
1420
 
1351
1421
  // Like "has an element in scope", but for the specific case of looking for a
@@ -1361,16 +1431,14 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1361
1431
  if (current == node) {
1362
1432
  return true;
1363
1433
  }
1364
- if (current->type != GUMBO_NODE_ELEMENT) {
1434
+ if (current->type != GUMBO_NODE_ELEMENT && current->type != GUMBO_NODE_TEMPLATE) {
1365
1435
  continue;
1366
1436
  }
1367
- if (node_tag_in(
1368
- current, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1369
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1370
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN,
1371
- GUMBO_TAG_MS, GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML,
1372
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_TITLE,
1373
- GUMBO_TAG_LAST)) {
1437
+ if (node_tag_in_set(current, (gumbo_tagset) { TAG(APPLET), TAG(CAPTION), TAG(HTML),
1438
+ TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE), TAG(OBJECT), TAG(TEMPLATE),
1439
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1440
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML), TAG_SVG(FOREIGNOBJECT),
1441
+ TAG_SVG(DESC), TAG_SVG(TITLE) } )) {
1374
1442
  return false;
1375
1443
  }
1376
1444
  }
@@ -1378,78 +1446,66 @@ static bool has_node_in_scope(GumboParser* parser, const GumboNode* node) {
1378
1446
  return false;
1379
1447
  }
1380
1448
 
1381
- // Like has_an_element_in_scope, but restricts the expected tag to a range of
1382
- // possible tag names instead of just a single one.
1383
- static bool has_an_element_in_scope_with_tagname(GumboParser* parser, ...) {
1384
- GumboVector tags;
1385
- // 6 = arbitrary initial size for vector, chosen because the major use-case
1386
- // for this method is heading tags, of which there are 6.
1387
- gumbo_vector_init(parser, 6, &tags);
1388
- va_list args;
1389
- va_start(args, parser);
1390
- for (GumboTag tag = va_arg(args, GumboTag); tag != GUMBO_TAG_LAST;
1391
- tag = va_arg(args, GumboTag)) {
1392
- gumbo_vector_add(parser, (void*) tag, &tags);
1393
- }
1394
- bool found = has_an_element_in_specific_scope(
1395
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1396
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1397
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1398
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1399
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_LAST);
1400
- gumbo_vector_destroy(parser, &tags);
1401
- va_end(args);
1402
- return found;
1449
+ // Like has_an_element_in_scope, but restricts the expected qualified name to a
1450
+ // range of possible qualified names instead of just a single one.
1451
+ static bool has_an_element_in_scope_with_tagname(GumboParser* parser, int expected_len, const GumboTag expected[]) {
1452
+ return has_an_element_in_specific_scope(parser, expected_len, expected, false, (gumbo_tagset) {
1453
+ TAG(APPLET), TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1454
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1455
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1456
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) });
1403
1457
  }
1404
1458
 
1405
1459
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-list-item-scope
1406
1460
  static bool has_an_element_in_list_scope(GumboParser* parser, GumboTag tag) {
1407
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1408
- return has_an_element_in_specific_scope(
1409
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1410
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1411
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1412
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1413
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_OL, GUMBO_TAG_UL,
1414
- GUMBO_TAG_LAST);
1461
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1462
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1463
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1464
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1465
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(OL),
1466
+ TAG(UL) });
1415
1467
  }
1416
1468
 
1417
1469
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-button-scope
1418
1470
  static bool has_an_element_in_button_scope(GumboParser* parser, GumboTag tag) {
1419
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1420
- return has_an_element_in_specific_scope(
1421
- parser, &tags, false, GUMBO_TAG_APPLET, GUMBO_TAG_CAPTION, GUMBO_TAG_HTML,
1422
- GUMBO_TAG_TABLE, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_MARQUEE,
1423
- GUMBO_TAG_OBJECT, GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1424
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_FOREIGNOBJECT,
1425
- GUMBO_TAG_DESC, GUMBO_TAG_TITLE, GUMBO_TAG_BUTTON, GUMBO_TAG_LAST);
1471
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(APPLET),
1472
+ TAG(CAPTION), TAG(HTML), TAG(TABLE), TAG(TD), TAG(TH), TAG(MARQUEE),
1473
+ TAG(OBJECT), TAG(TEMPLATE), TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN),
1474
+ TAG_MATHML(MS), TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1475
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE), TAG(BUTTON) });
1426
1476
  }
1427
1477
 
1428
1478
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-table-scope
1429
1479
  static bool has_an_element_in_table_scope(GumboParser* parser, GumboTag tag) {
1430
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1431
- return has_an_element_in_specific_scope(
1432
- parser, &tags, false, GUMBO_TAG_HTML, GUMBO_TAG_TABLE, GUMBO_TAG_LAST);
1480
+ return has_an_element_in_specific_scope(parser, 1, &tag, false, (gumbo_tagset) { TAG(HTML),
1481
+ TAG(TABLE), TAG(TEMPLATE) });
1433
1482
  }
1434
1483
 
1435
1484
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#has-an-element-in-select-scope
1436
1485
  static bool has_an_element_in_select_scope(GumboParser* parser, GumboTag tag) {
1437
- DECLARE_ONE_ELEMENT_GUMBO_VECTOR(tags, tag);
1438
- return has_an_element_in_specific_scope(
1439
- parser, &tags, true, GUMBO_TAG_OPTGROUP, GUMBO_TAG_OPTION,
1440
- GUMBO_TAG_LAST);
1486
+ return has_an_element_in_specific_scope(parser, 1, &tag, true, (gumbo_tagset) { TAG(OPTGROUP), TAG(OPTION) });
1441
1487
  }
1442
1488
 
1443
-
1444
1489
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#generate-implied-end-tags
1445
1490
  // "exception" is the "element to exclude from the process" listed in the spec.
1446
1491
  // Pass GUMBO_TAG_LAST to not exclude any of them.
1447
1492
  static void generate_implied_end_tags(GumboParser* parser, GumboTag exception) {
1448
1493
  for (;
1449
- node_tag_in(get_current_node(parser), GUMBO_TAG_DD, GUMBO_TAG_DT,
1450
- GUMBO_TAG_LI, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
1451
- GUMBO_TAG_P, GUMBO_TAG_RP, GUMBO_TAG_RT, GUMBO_TAG_LAST) &&
1452
- !node_tag_is(get_current_node(parser), exception);
1494
+ node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(DD),
1495
+ TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP), TAG(P), TAG(RP), TAG(RB),
1496
+ TAG(RT), TAG(RTC) }) &&
1497
+ !node_html_tag_is(get_current_node(parser), exception);
1498
+ pop_current_node(parser));
1499
+ }
1500
+
1501
+ // This is the "generate all implied end tags thoroughly" clause of the spec.
1502
+ // https://html.spec.whatwg.org/multipage/syntax.html#closing-elements-that-have-implied-end-tags
1503
+ static void generate_all_implied_end_tags_thoroughly(GumboParser* parser) {
1504
+ for (;
1505
+ node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(CAPTION),
1506
+ TAG(COLGROUP), TAG(DD), TAG(DT), TAG(LI), TAG(OPTION), TAG(OPTGROUP),
1507
+ TAG(P), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
1508
+ TAG(TFOOT), TAG(TH), TAG(HEAD), TAG(TR) });
1453
1509
  pop_current_node(parser));
1454
1510
  }
1455
1511
 
@@ -1463,7 +1519,7 @@ static bool close_table(GumboParser* parser) {
1463
1519
  }
1464
1520
 
1465
1521
  GumboNode* node = pop_current_node(parser);
1466
- while (!node_tag_is(node, GUMBO_TAG_TABLE)) {
1522
+ while (!node_html_tag_is(node, GUMBO_TAG_TABLE)) {
1467
1523
  node = pop_current_node(parser);
1468
1524
  }
1469
1525
  reset_insertion_mode_appropriately(parser);
@@ -1477,13 +1533,13 @@ static bool close_table_cell(GumboParser* parser, const GumboToken* token,
1477
1533
  bool result = true;
1478
1534
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
1479
1535
  const GumboNode* node = get_current_node(parser);
1480
- if (!node_tag_is(node, cell_tag)) {
1536
+ if (!node_html_tag_is(node, cell_tag)) {
1481
1537
  parser_add_parse_error(parser, token);
1482
1538
  result = false;
1483
1539
  }
1484
1540
  do {
1485
1541
  node = pop_current_node(parser);
1486
- } while (!node_tag_is(node, cell_tag));
1542
+ } while (!node_html_tag_is(node, cell_tag));
1487
1543
 
1488
1544
  clear_active_formatting_elements(parser);
1489
1545
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
@@ -1508,7 +1564,7 @@ static bool close_current_cell(GumboParser* parser, const GumboToken* token) {
1508
1564
  // resets the insertion mode appropriately.
1509
1565
  static void close_current_select(GumboParser* parser) {
1510
1566
  GumboNode* node = pop_current_node(parser);
1511
- while (!node_tag_is(node, GUMBO_TAG_SELECT)) {
1567
+ while (!node_html_tag_is(node, GUMBO_TAG_SELECT)) {
1512
1568
  node = pop_current_node(parser);
1513
1569
  }
1514
1570
  reset_insertion_mode_appropriately(parser);
@@ -1517,60 +1573,43 @@ static void close_current_select(GumboParser* parser) {
1517
1573
  // The list of nodes in the "special" category:
1518
1574
  // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#special
1519
1575
  static bool is_special_node(const GumboNode* node) {
1520
- assert(node->type == GUMBO_NODE_ELEMENT);
1521
- switch (node->v.element.tag_namespace) {
1522
- case GUMBO_NAMESPACE_HTML:
1523
- return node_tag_in(node,
1524
- GUMBO_TAG_ADDRESS, GUMBO_TAG_APPLET, GUMBO_TAG_AREA,
1525
- GUMBO_TAG_ARTICLE, GUMBO_TAG_ASIDE, GUMBO_TAG_BASE,
1526
- GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
1527
- GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
1528
- GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
1529
- GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
1530
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
1531
- GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
1532
- GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
1533
- GUMBO_TAG_FRAMESET, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
1534
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD,
1535
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_HR, GUMBO_TAG_HTML,
1536
- GUMBO_TAG_IFRAME, GUMBO_TAG_IMG, GUMBO_TAG_INPUT, GUMBO_TAG_ISINDEX,
1537
- GUMBO_TAG_LI, GUMBO_TAG_LINK, GUMBO_TAG_LISTING, GUMBO_TAG_MARQUEE,
1538
- GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NAV, GUMBO_TAG_NOEMBED,
1539
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_OBJECT,
1540
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_PARAM, GUMBO_TAG_PLAINTEXT,
1541
- GUMBO_TAG_PRE, GUMBO_TAG_SCRIPT, GUMBO_TAG_SECTION, GUMBO_TAG_SELECT,
1542
- GUMBO_TAG_STYLE, GUMBO_TAG_SUMMARY, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1543
- GUMBO_TAG_TD, GUMBO_TAG_TEXTAREA, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
1544
- GUMBO_TAG_THEAD, GUMBO_TAG_TITLE, GUMBO_TAG_TR, GUMBO_TAG_UL,
1545
- GUMBO_TAG_WBR, GUMBO_TAG_XMP, GUMBO_TAG_LAST);
1546
- case GUMBO_NAMESPACE_MATHML:
1547
- return node_tag_in(node,
1548
- GUMBO_TAG_MI, GUMBO_TAG_MO, GUMBO_TAG_MN, GUMBO_TAG_MS,
1549
- GUMBO_TAG_MTEXT, GUMBO_TAG_ANNOTATION_XML, GUMBO_TAG_LAST);
1550
- case GUMBO_NAMESPACE_SVG:
1551
- return node_tag_in(node,
1552
- GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_DESC, GUMBO_TAG_LAST);
1553
- }
1554
- abort();
1555
- return false; // Pacify compiler.
1556
- }
1557
-
1558
- // Implicitly closes currently open tags until it reaches an element with the
1559
- // specified tag name. If the elements closed are in the set handled by
1576
+ assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE);
1577
+ return node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(APPLET), TAG(AREA),
1578
+ TAG(ARTICLE), TAG(ASIDE), TAG(BASE), TAG(BASEFONT), TAG(BGSOUND), TAG(BLOCKQUOTE),
1579
+ TAG(BODY), TAG(BR), TAG(BUTTON), TAG(CAPTION), TAG(CENTER), TAG(COL),
1580
+ TAG(COLGROUP), TAG(MENUITEM), TAG(DD), TAG(DETAILS), TAG(DIR), TAG(DIV), TAG(DL),
1581
+ TAG(DT), TAG(EMBED), TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER),
1582
+ TAG(FORM), TAG(FRAME), TAG(FRAMESET), TAG(H1), TAG(H2), TAG(H3), TAG(H4),
1583
+ TAG(H5), TAG(H6), TAG(HEAD), TAG(HEADER), TAG(HGROUP), TAG(HR), TAG(HTML),
1584
+ TAG(IFRAME), TAG(IMG), TAG(INPUT), TAG(ISINDEX), TAG(LI), TAG(LINK),
1585
+ TAG(LISTING), TAG(MARQUEE), TAG(MENU), TAG(META), TAG(NAV), TAG(NOEMBED),
1586
+ TAG(NOFRAMES), TAG(NOSCRIPT), TAG(OBJECT), TAG(OL), TAG(P), TAG(PARAM),
1587
+ TAG(PLAINTEXT), TAG(PRE), TAG(SCRIPT), TAG(SECTION), TAG(SELECT), TAG(STYLE),
1588
+ TAG(SUMMARY), TAG(TABLE), TAG(TBODY), TAG(TD), TAG(TEMPLATE), TAG(TEXTAREA),
1589
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TITLE), TAG(TR), TAG(UL), TAG(WBR), TAG(XMP),
1590
+
1591
+ TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS),
1592
+ TAG_MATHML(MTEXT), TAG_MATHML(ANNOTATION_XML),
1593
+
1594
+ TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC) });
1595
+ }
1596
+
1597
+ // Implicitly closes currently open elements until it reaches an element with the
1598
+ // specified qualified name. If the elements closed are in the set handled by
1560
1599
  // generate_implied_end_tags, this is normal operation and this function returns
1561
1600
  // true. Otherwise, a parse error is recorded and this function returns false.
1562
1601
  static bool implicitly_close_tags(
1563
- GumboParser* parser, GumboToken* token, GumboTag target) {
1602
+ GumboParser* parser, GumboToken* token, GumboNamespaceEnum target_ns, GumboTag target) {
1564
1603
  bool result = true;
1565
1604
  generate_implied_end_tags(parser, target);
1566
- if (!node_tag_is(get_current_node(parser), target)) {
1605
+ if (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1567
1606
  parser_add_parse_error(parser, token);
1568
- while (!node_tag_is(get_current_node(parser), target)) {
1607
+ while (!node_qualified_tag_is(get_current_node(parser), target_ns, target)) {
1569
1608
  pop_current_node(parser);
1570
1609
  }
1571
1610
  result = false;
1572
1611
  }
1573
- assert(node_tag_is(get_current_node(parser), target));
1612
+ assert(node_qualified_tag_is(get_current_node(parser), target_ns, target));
1574
1613
  pop_current_node(parser);
1575
1614
  return result;
1576
1615
  }
@@ -1581,7 +1620,7 @@ static bool implicitly_close_tags(
1581
1620
  // clause appears several times in the spec.
1582
1621
  static bool maybe_implicitly_close_p_tag(GumboParser* parser, GumboToken* token) {
1583
1622
  if (has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
1584
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
1623
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
1585
1624
  }
1586
1625
  return true;
1587
1626
  }
@@ -1595,15 +1634,14 @@ static void maybe_implicitly_close_list_tag(
1595
1634
  for (int i = state->_open_elements.length; --i >= 0; ) {
1596
1635
  const GumboNode* node = state->_open_elements.data[i];
1597
1636
  bool is_list_tag = is_li ?
1598
- node_tag_is(node, GUMBO_TAG_LI) :
1599
- node_tag_in(node, GUMBO_TAG_DD, GUMBO_TAG_DT, GUMBO_TAG_LAST);
1637
+ node_html_tag_is(node, GUMBO_TAG_LI) :
1638
+ node_tag_in_set(node, (gumbo_tagset) { TAG(DD), TAG(DT) } );
1600
1639
  if (is_list_tag) {
1601
- implicitly_close_tags(parser, token, node->v.element.tag);
1640
+ implicitly_close_tags(parser, token, node->v.element.tag_namespace, node->v.element.tag);
1602
1641
  return;
1603
1642
  }
1604
1643
  if (is_special_node(node) &&
1605
- !node_tag_in(node, GUMBO_TAG_ADDRESS, GUMBO_TAG_DIV, GUMBO_TAG_P,
1606
- GUMBO_TAG_LAST)) {
1644
+ !node_tag_in_set(node, (gumbo_tagset) { TAG(ADDRESS), TAG(DIV), TAG(P) })) {
1607
1645
  return;
1608
1646
  }
1609
1647
  }
@@ -1758,13 +1796,20 @@ static void remove_from_parent(GumboParser* parser, GumboNode* node) {
1758
1796
 
1759
1797
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
1760
1798
  // Also described in the "in body" handling for end formatting tags.
1761
- static bool adoption_agency_algorithm(
1762
- GumboParser* parser, GumboToken* token, GumboTag closing_tag) {
1799
+ static bool adoption_agency_algorithm(GumboParser* parser, GumboToken* token, GumboTag subject) {
1763
1800
  GumboParserState* state = parser->_parser_state;
1764
1801
  gumbo_debug("Entering adoption agency algorithm.\n");
1765
- // Steps 1-3 & 16:
1802
+ // Step 1.
1803
+ GumboNode* current_node = get_current_node(parser);
1804
+ if (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
1805
+ current_node->v.element.tag == subject &&
1806
+ gumbo_vector_index_of(&state->_active_formatting_elements, current_node) == -1) {
1807
+ pop_current_node(parser);
1808
+ return false;
1809
+ }
1810
+ // Steps 2-4 & 20:
1766
1811
  for (int i = 0; i < 8; ++i) {
1767
- // Step 4.
1812
+ // Step 5.
1768
1813
  GumboNode* formatting_node = NULL;
1769
1814
  int formatting_node_in_open_elements = -1;
1770
1815
  for (int j = state->_active_formatting_elements.length; --j >= 0; ) {
@@ -1774,13 +1819,13 @@ static bool adoption_agency_algorithm(
1774
1819
  // Last scope marker; abort the algorithm.
1775
1820
  return false;
1776
1821
  }
1777
- if (node_tag_is(current_node, closing_tag)) {
1822
+ if (node_html_tag_is(current_node, subject)) {
1778
1823
  // Found it.
1779
1824
  formatting_node = current_node;
1780
1825
  formatting_node_in_open_elements = gumbo_vector_index_of(
1781
- &state->_open_elements, formatting_node);
1826
+ &state->_open_elements, formatting_node);
1782
1827
  gumbo_debug("Formatting element of tag %s at %d.\n",
1783
- gumbo_normalized_tagname(closing_tag),
1828
+ gumbo_normalized_tagname(subject),
1784
1829
  formatting_node_in_open_elements);
1785
1830
  break;
1786
1831
  }
@@ -1793,39 +1838,44 @@ static bool adoption_agency_algorithm(
1793
1838
  return false;
1794
1839
  }
1795
1840
 
1841
+ // Step 6
1796
1842
  if (formatting_node_in_open_elements == -1) {
1797
1843
  gumbo_debug("Formatting node not on stack of open elements.\n");
1844
+ parser_add_parse_error(parser, token);
1798
1845
  gumbo_vector_remove(parser, formatting_node,
1799
1846
  &state->_active_formatting_elements);
1800
1847
  return false;
1801
1848
  }
1802
1849
 
1850
+ // Step 7
1803
1851
  if (!has_an_element_in_scope(parser, formatting_node->v.element.tag)) {
1804
1852
  parser_add_parse_error(parser, token);
1805
1853
  gumbo_debug("Element not in scope.\n");
1806
1854
  return false;
1807
1855
  }
1856
+
1857
+ // Step 8
1808
1858
  if (formatting_node != get_current_node(parser)) {
1809
1859
  parser_add_parse_error(parser, token); // But continue onwards.
1810
1860
  }
1811
1861
  assert(formatting_node);
1812
- assert(!node_tag_is(formatting_node, GUMBO_TAG_HTML));
1813
- assert(!node_tag_is(formatting_node, GUMBO_TAG_BODY));
1862
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_HTML));
1863
+ assert(!node_html_tag_is(formatting_node, GUMBO_TAG_BODY));
1814
1864
 
1815
- // Step 5 & 6.
1865
+ // Step 9 & 10
1816
1866
  GumboNode* furthest_block = NULL;
1817
1867
  for (int j = formatting_node_in_open_elements;
1818
1868
  j < state->_open_elements.length; ++j) {
1819
1869
  assert(j > 0);
1820
1870
  GumboNode* current = state->_open_elements.data[j];
1821
1871
  if (is_special_node(current)) {
1822
- // Step 5.
1872
+ // Step 9.
1823
1873
  furthest_block = current;
1824
1874
  break;
1825
1875
  }
1826
1876
  }
1827
1877
  if (!furthest_block) {
1828
- // Step 6.
1878
+ // Step 10.
1829
1879
  while (get_current_node(parser) != formatting_node) {
1830
1880
  pop_current_node(parser);
1831
1881
  }
@@ -1835,35 +1885,38 @@ static bool adoption_agency_algorithm(
1835
1885
  &state->_active_formatting_elements);
1836
1886
  return false;
1837
1887
  }
1838
- assert(!node_tag_is(furthest_block, GUMBO_TAG_HTML));
1888
+ assert(!node_html_tag_is(furthest_block, GUMBO_TAG_HTML));
1839
1889
  assert(furthest_block);
1840
1890
 
1841
- // Step 7.
1891
+ // Step 11.
1842
1892
  // Elements may be moved and reparented by this algorithm, so
1843
1893
  // common_ancestor is not necessarily the same as formatting_node->parent.
1844
1894
  GumboNode* common_ancestor =
1845
- state->_open_elements.data[gumbo_vector_index_of(
1846
- &state->_open_elements, formatting_node) - 1];
1895
+ state->_open_elements.data[gumbo_vector_index_of(
1896
+ &state->_open_elements, formatting_node) - 1];
1847
1897
  gumbo_debug("Common ancestor tag = %s, furthest block tag = %s.\n",
1848
1898
  gumbo_normalized_tagname(common_ancestor->v.element.tag),
1849
1899
  gumbo_normalized_tagname(furthest_block->v.element.tag));
1850
1900
 
1851
- // Step 8.
1901
+ // Step 12.
1852
1902
  int bookmark = gumbo_vector_index_of(
1853
- &state->_active_formatting_elements, formatting_node);;
1854
- // Step 9.
1903
+ &state->_active_formatting_elements, formatting_node) + 1;
1904
+ gumbo_debug("Bookmark at %d.\n", bookmark);
1905
+ // Step 13.
1855
1906
  GumboNode* node = furthest_block;
1856
1907
  GumboNode* last_node = furthest_block;
1857
1908
  // Must be stored explicitly, in case node is removed from the stack of open
1858
1909
  // elements, to handle step 9.4.
1859
1910
  int saved_node_index = gumbo_vector_index_of(&state->_open_elements, node);
1860
1911
  assert(saved_node_index > 0);
1861
- // Step 9.1-9.3 & 9.11.
1862
- for (int j = 0; j < 3; ++j) {
1863
- // Step 9.4.
1912
+ // Step 13.1.
1913
+ for (int j = 0;;) {
1914
+ // Step 13.2.
1915
+ ++j;
1916
+ // Step 13.3.
1864
1917
  int node_index = gumbo_vector_index_of(&state->_open_elements, node);
1865
1918
  gumbo_debug(
1866
- "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1919
+ "Current index: %d, last index: %d.\n", node_index, saved_node_index);
1867
1920
  if (node_index == -1) {
1868
1921
  node_index = saved_node_index;
1869
1922
  }
@@ -1872,62 +1925,78 @@ static bool adoption_agency_algorithm(
1872
1925
  assert(node_index < state->_open_elements.capacity);
1873
1926
  node = state->_open_elements.data[node_index];
1874
1927
  assert(node->parent);
1875
- // Step 9.5.
1876
- if (gumbo_vector_index_of(
1877
- &state->_active_formatting_elements, node) == -1) {
1928
+ if (node == formatting_node) {
1929
+ // Step 13.4.
1930
+ break;
1931
+ }
1932
+ int formatting_index =
1933
+ gumbo_vector_index_of(&state->_active_formatting_elements, node);
1934
+ if (j > 3 && formatting_index != -1) {
1935
+ // Step 13.5.
1936
+ gumbo_debug(
1937
+ "Removing formatting element at %d.\n", formatting_index);
1938
+ gumbo_vector_remove_at(
1939
+ parser,
1940
+ formatting_index,
1941
+ &state->_active_formatting_elements);
1942
+ // Removing the element shifts all indices over by one, so we may need
1943
+ // to move the bookmark.
1944
+ if (formatting_index < bookmark) {
1945
+ --bookmark;
1946
+ gumbo_debug("Moving bookmark to %d.\n", bookmark);
1947
+ }
1948
+ continue;
1949
+ }
1950
+ if (formatting_index == -1) {
1951
+ // Step 13.6.
1878
1952
  gumbo_vector_remove_at(parser, node_index, &state->_open_elements);
1879
1953
  continue;
1880
- } else if (node == formatting_node) {
1881
- // Step 9.6.
1882
- break;
1883
1954
  }
1884
- // Step 9.7.
1885
- int formatting_index = gumbo_vector_index_of(
1886
- &state->_active_formatting_elements, node);
1955
+ // Step 13.7.
1956
+ // "common ancestor as the intended parent" doesn't actually mean insert
1957
+ // it into the common ancestor; that happens below.
1887
1958
  node = clone_node(parser, node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1959
+ assert(formatting_index >= 0);
1888
1960
  state->_active_formatting_elements.data[formatting_index] = node;
1961
+ assert(node_index >= 0);
1889
1962
  state->_open_elements.data[node_index] = node;
1890
- // Step 9.8.
1963
+ // Step 13.8.
1891
1964
  if (last_node == furthest_block) {
1892
1965
  bookmark = formatting_index + 1;
1966
+ gumbo_debug("Bookmark moved to %d.\n", bookmark);
1893
1967
  assert(bookmark <= state->_active_formatting_elements.length);
1894
1968
  }
1895
- // Step 9.9.
1969
+ // Step 13.9.
1896
1970
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1897
1971
  remove_from_parent(parser, last_node);
1898
1972
  append_node(parser, node, last_node);
1899
- // Step 9.10.
1973
+ // Step 13.10.
1900
1974
  last_node = node;
1901
- }
1975
+ } // Step 13.11.
1902
1976
 
1903
- // Step 10.
1977
+ // Step 14.
1904
1978
  gumbo_debug("Removing %s node from parent ",
1905
1979
  gumbo_normalized_tagname(last_node->v.element.tag));
1906
1980
  remove_from_parent(parser, last_node);
1907
1981
  last_node->parse_flags |= GUMBO_INSERTION_ADOPTION_AGENCY_MOVED;
1908
- if (node_tag_in(common_ancestor, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
1909
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
1910
- GUMBO_TAG_LAST)) {
1911
- gumbo_debug("and foster-parenting it.\n");
1912
- foster_parent_element(parser, last_node);
1913
- } else {
1914
- gumbo_debug("and inserting it into %s.\n",
1915
- gumbo_normalized_tagname(common_ancestor->v.element.tag));
1916
- append_node(parser, common_ancestor, last_node);
1917
- }
1982
+ InsertionLocation location =
1983
+ get_appropriate_insertion_location(parser, common_ancestor);
1984
+ gumbo_debug("and inserting it into %s.\n",
1985
+ gumbo_normalized_tagname(location.target->v.element.tag));
1986
+ insert_node(parser, last_node, location);
1918
1987
 
1919
- // Step 11.
1988
+ // Step 15.
1920
1989
  GumboNode* new_formatting_node = clone_node(
1921
- parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1990
+ parser, formatting_node, GUMBO_INSERTION_ADOPTION_AGENCY_CLONED);
1922
1991
  formatting_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
1923
1992
 
1924
- // Step 12. Instead of appending nodes one-by-one, we swap the children
1993
+ // Step 16. Instead of appending nodes one-by-one, we swap the children
1925
1994
  // vector of furthest_block with the empty children of new_formatting_node,
1926
1995
  // reducing memory traffic and allocations. We still have to reset their
1927
1996
  // parent pointers, though.
1928
1997
  GumboVector temp = new_formatting_node->v.element.children;
1929
1998
  new_formatting_node->v.element.children =
1930
- furthest_block->v.element.children;
1999
+ furthest_block->v.element.children;
1931
2000
  furthest_block->v.element.children = temp;
1932
2001
 
1933
2002
  temp = new_formatting_node->v.element.children;
@@ -1936,36 +2005,39 @@ static bool adoption_agency_algorithm(
1936
2005
  child->parent = new_formatting_node;
1937
2006
  }
1938
2007
 
1939
- // Step 13.
2008
+ // Step 17.
1940
2009
  append_node(parser, furthest_block, new_formatting_node);
1941
2010
 
1942
- // Step 14.
2011
+ // Step 18.
1943
2012
  // If the formatting node was before the bookmark, it may shift over all
1944
2013
  // indices after it, so we need to explicitly find the index and possibly
1945
2014
  // adjust the bookmark.
1946
2015
  int formatting_node_index = gumbo_vector_index_of(
1947
- &state->_active_formatting_elements, formatting_node);
2016
+ &state->_active_formatting_elements, formatting_node);
1948
2017
  assert(formatting_node_index != -1);
1949
2018
  if (formatting_node_index < bookmark) {
2019
+ gumbo_debug(
2020
+ "Formatting node at %d is before bookmark at %d; decrementing.\n",
2021
+ formatting_node_index, bookmark);
1950
2022
  --bookmark;
1951
2023
  }
1952
2024
  gumbo_vector_remove_at(
1953
- parser, formatting_node_index, &state->_active_formatting_elements);
2025
+ parser, formatting_node_index, &state->_active_formatting_elements);
1954
2026
  assert(bookmark >= 0);
1955
2027
  assert(bookmark <= state->_active_formatting_elements.length);
1956
2028
  gumbo_vector_insert_at(parser, new_formatting_node, bookmark,
1957
2029
  &state->_active_formatting_elements);
1958
2030
 
1959
- // Step 15.
2031
+ // Step 19.
1960
2032
  gumbo_vector_remove(
1961
- parser, formatting_node, &state->_open_elements);
2033
+ parser, formatting_node, &state->_open_elements);
1962
2034
  int insert_at = gumbo_vector_index_of(
1963
- &state->_open_elements, furthest_block) + 1;
2035
+ &state->_open_elements, furthest_block) + 1;
1964
2036
  assert(insert_at >= 0);
1965
2037
  assert(insert_at <= state->_open_elements.length);
1966
2038
  gumbo_vector_insert_at(
1967
- parser, new_formatting_node, insert_at, &state->_open_elements);
1968
- }
2039
+ parser, new_formatting_node, insert_at, &state->_open_elements);
2040
+ } // Step 20.
1969
2041
  return true;
1970
2042
  }
1971
2043
 
@@ -1992,8 +2064,8 @@ static void finish_parsing(GumboParser* parser) {
1992
2064
  GumboParserState* state = parser->_parser_state;
1993
2065
  for (GumboNode* node = pop_current_node(parser); node;
1994
2066
  node = pop_current_node(parser)) {
1995
- if ((node_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
1996
- (node_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
2067
+ if ((node_html_tag_is(node, GUMBO_TAG_BODY) && state->_closed_body_tag) ||
2068
+ (node_html_tag_is(node, GUMBO_TAG_HTML) && state->_closed_html_tag)) {
1997
2069
  continue;
1998
2070
  }
1999
2071
  node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG;
@@ -2042,9 +2114,9 @@ static bool handle_before_html(GumboParser* parser, GumboToken* token) {
2042
2114
  parser->_output->root = html_node;
2043
2115
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_BEFORE_HEAD);
2044
2116
  return true;
2045
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2046
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2047
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2117
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2118
+ !tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
2119
+ TAG(BR) } )) {
2048
2120
  parser_add_parse_error(parser, token);
2049
2121
  ignore_token(parser);
2050
2122
  return false;
@@ -2076,9 +2148,9 @@ static bool handle_before_head(GumboParser* parser, GumboToken* token) {
2076
2148
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2077
2149
  parser->_parser_state->_head_element = node;
2078
2150
  return true;
2079
- } else if (token->type == GUMBO_TOKEN_END_TAG && !tag_in(
2080
- token, false, GUMBO_TAG_HEAD, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2081
- GUMBO_TAG_BR, GUMBO_TAG_LAST)) {
2151
+ } else if (token->type == GUMBO_TOKEN_END_TAG &&
2152
+ !tag_in(token, false, (gumbo_tagset) { TAG(HEAD), TAG(BODY), TAG(HTML),
2153
+ TAG(BR) })) {
2082
2154
  parser_add_parse_error(parser, token);
2083
2155
  ignore_token(parser);
2084
2156
  return false;
@@ -2110,9 +2182,8 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2110
2182
  return true;
2111
2183
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2112
2184
  return handle_in_body(parser, token);
2113
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2114
- GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2115
- GUMBO_TAG_LAST)) {
2185
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2186
+ TAG(BGSOUND), TAG(MENUITEM), TAG(LINK) })) {
2116
2187
  insert_element_from_token(parser, token);
2117
2188
  pop_current_node(parser);
2118
2189
  acknowledge_self_closing_tag(parser);
@@ -2129,8 +2200,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2129
2200
  } else if (tag_is(token, kStartTag, GUMBO_TAG_TITLE)) {
2130
2201
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
2131
2202
  return true;
2132
- } else if (tag_in(token, kStartTag, GUMBO_TAG_NOFRAMES, GUMBO_TAG_STYLE,
2133
- GUMBO_TAG_LAST)) {
2203
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(NOFRAMES), TAG(STYLE) })) {
2134
2204
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RAWTEXT);
2135
2205
  return true;
2136
2206
  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOSCRIPT)) {
@@ -2143,32 +2213,48 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
2143
2213
  } else if (tag_is(token, kEndTag, GUMBO_TAG_HEAD)) {
2144
2214
  GumboNode* head = pop_current_node(parser);
2145
2215
  AVOID_UNUSED_VARIABLE_WARNING(head);
2146
- assert(node_tag_is(head, GUMBO_TAG_HEAD));
2216
+ assert(node_html_tag_is(head, GUMBO_TAG_HEAD));
2147
2217
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2148
2218
  return true;
2149
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD)) {
2150
- parser_add_parse_error(parser, token);
2151
- ignore_token(parser);
2152
- return false;
2153
- } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2154
- (token->type == GUMBO_TOKEN_END_TAG &&
2155
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2156
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2157
- parser_add_parse_error(parser, token);
2158
- return false;
2159
- } else if (tag_is(token, kStartTag, GUMBO_TAG_UNKNOWN) && token->v.start_tag.is_self_closing) {
2219
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) })) {
2220
+ pop_current_node(parser);
2221
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2222
+ parser->_parser_state->_reprocess_current_token = true;
2223
+ return true;
2224
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE)) {
2225
+ insert_element_from_token(parser, token);
2226
+ add_formatting_element(parser, &kActiveFormattingScopeMarker);
2227
+ parser->_parser_state->_frameset_ok = false;
2228
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2229
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
2230
+ return true;
2231
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2232
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2233
+ parser_add_parse_error(parser, token);
2234
+ ignore_token(parser);
2235
+ return false;
2236
+ }
2237
+ generate_all_implied_end_tags_thoroughly(parser);
2238
+ bool success = true;
2239
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_TEMPLATE)) {
2240
+ parser_add_parse_error(parser, token);
2241
+ success = false;
2242
+ }
2243
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE));
2244
+ clear_active_formatting_elements(parser);
2245
+ pop_template_insertion_mode(parser);
2246
+ reset_insertion_mode_appropriately(parser);
2247
+ return success;
2248
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) || (token->type == GUMBO_TOKEN_END_TAG)) {
2160
2249
  parser_add_parse_error(parser, token);
2161
2250
  ignore_token(parser);
2162
2251
  return false;
2163
2252
  } else {
2164
- const GumboNode* node = pop_current_node(parser);
2165
- assert(node_tag_is(node, GUMBO_TAG_HEAD));
2166
- AVOID_UNUSED_VARIABLE_WARNING(node);
2253
+ pop_current_node(parser);
2167
2254
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_HEAD);
2168
2255
  parser->_parser_state->_reprocess_current_token = true;
2169
2256
  return true;
2170
2257
  }
2171
-
2172
2258
  return true;
2173
2259
  }
2174
2260
 
@@ -2181,18 +2267,16 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2181
2267
  return handle_in_body(parser, token);
2182
2268
  } else if (tag_is(token, kEndTag, GUMBO_TAG_NOSCRIPT)) {
2183
2269
  const GumboNode* node = pop_current_node(parser);
2184
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2270
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2185
2271
  AVOID_UNUSED_VARIABLE_WARNING(node);
2186
2272
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2187
2273
  return true;
2188
2274
  } else if (token->type == GUMBO_TOKEN_WHITESPACE ||
2189
2275
  token->type == GUMBO_TOKEN_COMMENT ||
2190
- tag_in(token, kStartTag, GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND,
2191
- GUMBO_TAG_LINK, GUMBO_TAG_META, GUMBO_TAG_NOFRAMES,
2192
- GUMBO_TAG_STYLE, GUMBO_TAG_LAST)) {
2193
- return handle_in_head(parser, token);
2194
- } else if (tag_in(token, kStartTag, GUMBO_TAG_HEAD, GUMBO_TAG_NOSCRIPT,
2195
- GUMBO_TAG_LAST) ||
2276
+ tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASEFONT), TAG(BGSOUND),
2277
+ TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(STYLE) })) {
2278
+ return handle_in_head(parser, token);
2279
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(HEAD), TAG(NOSCRIPT) }) ||
2196
2280
  (token->type == GUMBO_TOKEN_END_TAG &&
2197
2281
  !tag_is(token, kEndTag, GUMBO_TAG_BR))) {
2198
2282
  parser_add_parse_error(parser, token);
@@ -2201,7 +2285,7 @@ static bool handle_in_head_noscript(GumboParser* parser, GumboToken* token) {
2201
2285
  } else {
2202
2286
  parser_add_parse_error(parser, token);
2203
2287
  const GumboNode* node = pop_current_node(parser);
2204
- assert(node_tag_is(node, GUMBO_TAG_NOSCRIPT));
2288
+ assert(node_html_tag_is(node, GUMBO_TAG_NOSCRIPT));
2205
2289
  AVOID_UNUSED_VARIABLE_WARNING(node);
2206
2290
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_HEAD);
2207
2291
  parser->_parser_state->_reprocess_current_token = true;
@@ -2233,10 +2317,10 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2233
2317
  insert_element_from_token(parser, token);
2234
2318
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_FRAMESET);
2235
2319
  return true;
2236
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2237
- GUMBO_TAG_BGSOUND, GUMBO_TAG_LINK, GUMBO_TAG_META,
2238
- GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT, GUMBO_TAG_STYLE,
2239
- GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2320
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2321
+ TAG(BGSOUND), TAG(LINK), TAG(META),
2322
+ TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
2323
+ TAG(TEMPLATE), TAG(TITLE) })) {
2240
2324
  parser_add_parse_error(parser, token);
2241
2325
  assert(state->_head_element != NULL);
2242
2326
  // This must be flushed before we push the head element on, as there may be
@@ -2246,10 +2330,11 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2246
2330
  bool result = handle_in_head(parser, token);
2247
2331
  gumbo_vector_remove(parser, state->_head_element, &state->_open_elements);
2248
2332
  return result;
2333
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2334
+ return handle_in_head(parser, token);
2249
2335
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HEAD) ||
2250
2336
  (token->type == GUMBO_TOKEN_END_TAG &&
2251
- !tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2252
- GUMBO_TAG_BR, GUMBO_TAG_LAST))) {
2337
+ !tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML), TAG(BR) }))) {
2253
2338
  parser_add_parse_error(parser, token);
2254
2339
  ignore_token(parser);
2255
2340
  return false;
@@ -2261,28 +2346,23 @@ static bool handle_after_head(GumboParser* parser, GumboToken* token) {
2261
2346
  }
2262
2347
  }
2263
2348
 
2264
- static void destroy_node(GumboParser* parser, GumboNode* node) {
2349
+ static GumboNode* destroy_node(GumboParser* parser, GumboNode* node) {
2265
2350
  switch (node->type) {
2266
2351
  case GUMBO_NODE_DOCUMENT:
2267
2352
  {
2268
2353
  GumboDocument* doc = &node->v.document;
2269
- for (int i = 0; i < doc->children.length; ++i) {
2270
- destroy_node(parser, doc->children.data[i]);
2271
- }
2272
2354
  gumbo_parser_deallocate(parser, (void*) doc->children.data);
2273
2355
  gumbo_parser_deallocate(parser, (void*) doc->name);
2274
2356
  gumbo_parser_deallocate(parser, (void*) doc->public_identifier);
2275
2357
  gumbo_parser_deallocate(parser, (void*) doc->system_identifier);
2276
2358
  }
2277
2359
  break;
2360
+ case GUMBO_NODE_TEMPLATE:
2278
2361
  case GUMBO_NODE_ELEMENT:
2279
2362
  for (int i = 0; i < node->v.element.attributes.length; ++i) {
2280
2363
  gumbo_destroy_attribute(parser, node->v.element.attributes.data[i]);
2281
2364
  }
2282
2365
  gumbo_parser_deallocate(parser, node->v.element.attributes.data);
2283
- for (int i = 0; i < node->v.element.children.length; ++i) {
2284
- destroy_node(parser, node->v.element.children.data[i]);
2285
- }
2286
2366
  gumbo_parser_deallocate(parser, node->v.element.children.data);
2287
2367
  break;
2288
2368
  case GUMBO_NODE_TEXT:
@@ -2292,7 +2372,21 @@ static void destroy_node(GumboParser* parser, GumboNode* node) {
2292
2372
  gumbo_parser_deallocate(parser, (void*) node->v.text.text);
2293
2373
  break;
2294
2374
  }
2375
+ // Remove from the next/prev linked list.
2376
+ GumboNode* prev = node->prev;
2377
+ GumboNode* next = node->next;
2378
+ if (prev != NULL) {
2379
+ prev->next = next;
2380
+ }
2381
+ if (next != NULL) {
2382
+ next->prev = prev;
2383
+ }
2384
+ if (parser->_parser_state && parser->_parser_state->_current_node == node) {
2385
+ parser->_parser_state->_current_node = prev;
2386
+ }
2387
+
2295
2388
  gumbo_parser_deallocate(parser, node);
2389
+ return next;
2296
2390
  }
2297
2391
 
2298
2392
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inbody
@@ -2307,7 +2401,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2307
2401
  reconstruct_active_formatting_elements(parser);
2308
2402
  insert_text_token(parser, token);
2309
2403
  return true;
2310
- } else if (token->type == GUMBO_TOKEN_CHARACTER) {
2404
+ } else if (token->type == GUMBO_TOKEN_CHARACTER ||
2405
+ token->type == GUMBO_TOKEN_CDATA) {
2311
2406
  reconstruct_active_formatting_elements(parser);
2312
2407
  insert_text_token(parser, token);
2313
2408
  set_frameset_not_ok(parser);
@@ -2320,20 +2415,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2320
2415
  ignore_token(parser);
2321
2416
  return false;
2322
2417
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
2418
+ parser_add_parse_error(parser, token);
2419
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2420
+ ignore_token(parser);
2421
+ return false;
2422
+ }
2323
2423
  assert(parser->_output->root != NULL);
2324
2424
  assert(parser->_output->root->type == GUMBO_NODE_ELEMENT);
2325
- parser_add_parse_error(parser, token);
2326
2425
  merge_attributes(parser, token, parser->_output->root);
2327
2426
  return false;
2328
- } else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
2329
- GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
2330
- GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
2331
- GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
2427
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT),
2428
+ TAG(BGSOUND), TAG(MENUITEM), TAG(LINK),
2429
+ TAG(META), TAG(NOFRAMES), TAG(SCRIPT),
2430
+ TAG(STYLE), TAG(TEMPLATE), TAG(TITLE) } ) || tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
2332
2431
  return handle_in_head(parser, token);
2333
2432
  } else if (tag_is(token, kStartTag, GUMBO_TAG_BODY)) {
2334
2433
  parser_add_parse_error(parser, token);
2335
2434
  if (state->_open_elements.length < 2 ||
2336
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY)) {
2435
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2337
2436
  ignore_token(parser);
2338
2437
  return false;
2339
2438
  }
@@ -2343,7 +2442,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2343
2442
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FRAMESET)) {
2344
2443
  parser_add_parse_error(parser, token);
2345
2444
  if (state->_open_elements.length < 2 ||
2346
- !node_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2445
+ !node_html_tag_is(state->_open_elements.data[1], GUMBO_TAG_BODY) ||
2347
2446
  !state->_frameset_ok) {
2348
2447
  ignore_token(parser);
2349
2448
  return false;
@@ -2381,18 +2480,17 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2381
2480
  return true;
2382
2481
  } else if (token->type == GUMBO_TOKEN_EOF) {
2383
2482
  for (int i = 0; i < state->_open_elements.length; ++i) {
2384
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2385
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_P, GUMBO_TAG_TBODY,
2386
- GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
2387
- GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_BODY,
2388
- GUMBO_TAG_HTML, GUMBO_TAG_LAST)) {
2483
+ if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) { TAG(DD),
2484
+ TAG(DT), TAG(LI), TAG(P), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH),
2485
+ TAG(THEAD), TAG(TR), TAG(BODY), TAG(HTML) } )) {
2389
2486
  parser_add_parse_error(parser, token);
2390
- return false;
2391
2487
  }
2392
2488
  }
2489
+ if (get_current_template_insertion_mode(parser) != GUMBO_INSERTION_MODE_INITIAL) {
2490
+ return handle_in_template(parser, token);
2491
+ }
2393
2492
  return true;
2394
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2395
- GUMBO_TAG_LAST)) {
2493
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(HTML) })) {
2396
2494
  if (!has_an_element_in_scope(parser, GUMBO_TAG_BODY)) {
2397
2495
  parser_add_parse_error(parser, token);
2398
2496
  ignore_token(parser);
@@ -2400,13 +2498,11 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2400
2498
  }
2401
2499
  bool success = true;
2402
2500
  for (int i = 0; i < state->_open_elements.length; ++i) {
2403
- if (!node_tag_in(state->_open_elements.data[i], GUMBO_TAG_DD,
2404
- GUMBO_TAG_DT, GUMBO_TAG_LI, GUMBO_TAG_OPTGROUP,
2405
- GUMBO_TAG_OPTION, GUMBO_TAG_P, GUMBO_TAG_RP,
2406
- GUMBO_TAG_RT, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
2407
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
2408
- GUMBO_TAG_TR, GUMBO_TAG_BODY, GUMBO_TAG_HTML,
2409
- GUMBO_TAG_LAST)) {
2501
+ if (!node_tag_in_set(state->_open_elements.data[i], (gumbo_tagset) {
2502
+ TAG(DD), TAG(DT), TAG(LI), TAG(OPTGROUP), TAG(OPTION), TAG(P),
2503
+ TAG(RB), TAG(RP), TAG(RT), TAG(RTC), TAG(TBODY), TAG(TD),
2504
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR),
2505
+ TAG(BODY), TAG(HTML) })) {
2410
2506
  parser_add_parse_error(parser, token);
2411
2507
  success = false;
2412
2508
  break;
@@ -2417,58 +2513,54 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2417
2513
  parser->_parser_state->_reprocess_current_token = true;
2418
2514
  } else {
2419
2515
  GumboNode* body = state->_open_elements.data[1];
2420
- assert(node_tag_is(body, GUMBO_TAG_BODY));
2516
+ assert(node_html_tag_is(body, GUMBO_TAG_BODY));
2421
2517
  record_end_of_element(state->_current_token, &body->v.element);
2422
2518
  }
2423
2519
  return success;
2424
- } else if (tag_in(token, kStartTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2425
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_CENTER,
2426
- GUMBO_TAG_DETAILS, GUMBO_TAG_DIR, GUMBO_TAG_DIV,
2427
- GUMBO_TAG_DL, GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION,
2428
- GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER, GUMBO_TAG_HEADER,
2429
- GUMBO_TAG_HGROUP, GUMBO_TAG_MENU, GUMBO_TAG_NAV,
2430
- GUMBO_TAG_OL, GUMBO_TAG_P, GUMBO_TAG_SECTION,
2431
- GUMBO_TAG_SUMMARY, GUMBO_TAG_UL, GUMBO_TAG_LAST)) {
2520
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
2521
+ TAG(ASIDE), TAG(BLOCKQUOTE), TAG(CENTER), TAG(DETAILS),
2522
+ TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2523
+ TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(MENU), TAG(MAIN),
2524
+ TAG(NAV), TAG(OL), TAG(P), TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
2432
2525
  bool result = maybe_implicitly_close_p_tag(parser, token);
2433
2526
  insert_element_from_token(parser, token);
2434
2527
  return result;
2435
- } else if (tag_in(token, kStartTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2436
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2528
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(H1), TAG(H2), TAG(H3),
2529
+ TAG(H4), TAG(H5), TAG(H6) })) {
2437
2530
  bool result = maybe_implicitly_close_p_tag(parser, token);
2438
- if (node_tag_in(get_current_node(parser), GUMBO_TAG_H1, GUMBO_TAG_H2,
2439
- GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6,
2440
- GUMBO_TAG_LAST)) {
2531
+ if (node_tag_in_set(get_current_node(parser), (gumbo_tagset) { TAG(H1), TAG(H2),
2532
+ TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
2441
2533
  parser_add_parse_error(parser, token);
2442
2534
  pop_current_node(parser);
2443
2535
  result = false;
2444
2536
  }
2445
2537
  insert_element_from_token(parser, token);
2446
2538
  return result;
2447
- } else if (tag_in(token, kStartTag, GUMBO_TAG_PRE, GUMBO_TAG_LISTING,
2448
- GUMBO_TAG_LAST)) {
2539
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(PRE), TAG(LISTING) })) {
2449
2540
  bool result = maybe_implicitly_close_p_tag(parser, token);
2450
2541
  insert_element_from_token(parser, token);
2451
2542
  state->_ignore_next_linefeed = true;
2452
2543
  state->_frameset_ok = false;
2453
2544
  return result;
2454
2545
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
2455
- if (state->_form_element != NULL) {
2546
+ if (state->_form_element != NULL && !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2456
2547
  gumbo_debug("Ignoring nested form.\n");
2457
2548
  parser_add_parse_error(parser, token);
2458
2549
  ignore_token(parser);
2459
2550
  return false;
2460
2551
  }
2461
2552
  bool result = maybe_implicitly_close_p_tag(parser, token);
2462
- state->_form_element =
2463
- insert_element_from_token(parser, token);
2553
+ GumboNode* form_element = insert_element_from_token(parser, token);
2554
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2555
+ state->_form_element = form_element;
2556
+ }
2464
2557
  return result;
2465
2558
  } else if (tag_is(token, kStartTag, GUMBO_TAG_LI)) {
2466
2559
  maybe_implicitly_close_list_tag(parser, token, true);
2467
2560
  bool result = maybe_implicitly_close_p_tag(parser, token);
2468
2561
  insert_element_from_token(parser, token);
2469
2562
  return result;
2470
- } else if (tag_in(token, kStartTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2471
- GUMBO_TAG_LAST)) {
2563
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
2472
2564
  maybe_implicitly_close_list_tag(parser, token, false);
2473
2565
  bool result = maybe_implicitly_close_p_tag(parser, token);
2474
2566
  insert_element_from_token(parser, token);
@@ -2481,7 +2573,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2481
2573
  } else if (tag_is(token, kStartTag, GUMBO_TAG_BUTTON)) {
2482
2574
  if (has_an_element_in_scope(parser, GUMBO_TAG_BUTTON)) {
2483
2575
  parser_add_parse_error(parser, token);
2484
- implicitly_close_tags(parser, token, GUMBO_TAG_BUTTON);
2576
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_BUTTON);
2485
2577
  state->_reprocess_current_token = true;
2486
2578
  return false;
2487
2579
  }
@@ -2489,67 +2581,78 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2489
2581
  insert_element_from_token(parser, token);
2490
2582
  state->_frameset_ok = false;
2491
2583
  return true;
2492
- } else if (tag_in(token, kEndTag, GUMBO_TAG_ADDRESS, GUMBO_TAG_ARTICLE,
2493
- GUMBO_TAG_ASIDE, GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BUTTON,
2494
- GUMBO_TAG_CENTER, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
2495
- GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_FIELDSET,
2496
- GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE, GUMBO_TAG_FOOTER,
2497
- GUMBO_TAG_HEADER, GUMBO_TAG_HGROUP, GUMBO_TAG_LISTING,
2498
- GUMBO_TAG_MENU, GUMBO_TAG_NAV, GUMBO_TAG_OL, GUMBO_TAG_PRE,
2499
- GUMBO_TAG_SECTION, GUMBO_TAG_SUMMARY, GUMBO_TAG_UL,
2500
- GUMBO_TAG_LAST)) {
2584
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(ADDRESS), TAG(ARTICLE),
2585
+ TAG(ASIDE), TAG(BLOCKQUOTE), TAG(BUTTON), TAG(CENTER), TAG(DETAILS),
2586
+ TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET), TAG(FIGCAPTION),
2587
+ TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP), TAG(LISTING),
2588
+ TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(PRE),
2589
+ TAG(SECTION), TAG(SUMMARY), TAG(UL) })) {
2501
2590
  GumboTag tag = token->v.end_tag;
2502
2591
  if (!has_an_element_in_scope(parser, tag)) {
2503
2592
  parser_add_parse_error(parser, token);
2504
2593
  ignore_token(parser);
2505
2594
  return false;
2506
2595
  }
2507
- implicitly_close_tags(parser, token, token->v.end_tag);
2596
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token->v.end_tag);
2508
2597
  return true;
2509
2598
  } else if (tag_is(token, kEndTag, GUMBO_TAG_FORM)) {
2510
- bool result = true;
2511
- const GumboNode* node = state->_form_element;
2512
- assert(!node || node->type == GUMBO_NODE_ELEMENT);
2513
- state->_form_element = NULL;
2514
- if (!node || !has_node_in_scope(parser, node)) {
2515
- gumbo_debug("Closing an unopened form.\n");
2516
- parser_add_parse_error(parser, token);
2517
- ignore_token(parser);
2518
- return false;
2519
- }
2520
- // This differs from implicitly_close_tags because we remove *only* the
2521
- // <form> element; other nodes are left in scope.
2522
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2523
- if (get_current_node(parser) != node) {
2524
- parser_add_parse_error(parser, token);
2525
- result = false;
2526
- }
2599
+ if (has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2600
+ if (!has_an_element_in_scope(parser, GUMBO_TAG_FORM)) {
2601
+ parser_add_parse_error(parser, token);
2602
+ ignore_token(parser);
2603
+ return false;
2604
+ }
2605
+ bool success = true;
2606
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2607
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_FORM)) {
2608
+ parser_add_parse_error(parser, token);
2609
+ return false;
2610
+ }
2611
+ while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_FORM));
2612
+ return success;
2613
+ } else {
2614
+ bool result = true;
2615
+ const GumboNode* node = state->_form_element;
2616
+ assert(!node || node->type == GUMBO_NODE_ELEMENT);
2617
+ state->_form_element = NULL;
2618
+ if (!node || !has_node_in_scope(parser, node)) {
2619
+ gumbo_debug("Closing an unopened form.\n");
2620
+ parser_add_parse_error(parser, token);
2621
+ ignore_token(parser);
2622
+ return false;
2623
+ }
2624
+ // This differs from implicitly_close_tags because we remove *only* the
2625
+ // <form> element; other nodes are left in scope.
2626
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2627
+ if (get_current_node(parser) != node) {
2628
+ parser_add_parse_error(parser, token);
2629
+ result = false;
2630
+ }
2527
2631
 
2528
- GumboVector* open_elements = &state->_open_elements;
2529
- int index = open_elements->length - 1;
2530
- for (; index >= 0 && open_elements->data[index] != node; --index);
2531
- assert(index >= 0);
2532
- gumbo_vector_remove_at(parser, index, open_elements);
2533
- return result;
2632
+ GumboVector* open_elements = &state->_open_elements;
2633
+ int index = gumbo_vector_index_of(open_elements, node);
2634
+ assert(index >= 0);
2635
+ gumbo_vector_remove_at(parser, index, open_elements);
2636
+ return result;
2637
+ }
2534
2638
  } else if (tag_is(token, kEndTag, GUMBO_TAG_P)) {
2535
2639
  if (!has_an_element_in_button_scope(parser, GUMBO_TAG_P)) {
2536
2640
  parser_add_parse_error(parser, token);
2537
- reconstruct_active_formatting_elements(parser);
2641
+ // reconstruct_active_formatting_elements(parser);
2538
2642
  insert_element_of_tag_type(
2539
2643
  parser, GUMBO_TAG_P, GUMBO_INSERTION_CONVERTED_FROM_END_TAG);
2540
2644
  state->_reprocess_current_token = true;
2541
2645
  return false;
2542
2646
  }
2543
- return implicitly_close_tags(parser, token, GUMBO_TAG_P);
2647
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_P);
2544
2648
  } else if (tag_is(token, kEndTag, GUMBO_TAG_LI)) {
2545
2649
  if (!has_an_element_in_list_scope(parser, GUMBO_TAG_LI)) {
2546
2650
  parser_add_parse_error(parser, token);
2547
2651
  ignore_token(parser);
2548
2652
  return false;
2549
2653
  }
2550
- return implicitly_close_tags(parser, token, GUMBO_TAG_LI);
2551
- } else if (tag_in(token, kEndTag, GUMBO_TAG_DD, GUMBO_TAG_DT,
2552
- GUMBO_TAG_LAST)) {
2654
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, GUMBO_TAG_LI);
2655
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(DD), TAG(DT) })) {
2553
2656
  assert(token->type == GUMBO_TOKEN_END_TAG);
2554
2657
  GumboTag token_tag = token->v.end_tag;
2555
2658
  if (!has_an_element_in_scope(parser, token_tag)) {
@@ -2557,12 +2660,12 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2557
2660
  ignore_token(parser);
2558
2661
  return false;
2559
2662
  }
2560
- return implicitly_close_tags(parser, token, token_tag);
2561
- } else if (tag_in(token, kEndTag, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2562
- GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2563
- if (!has_an_element_in_scope_with_tagname(
2564
- parser, GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
2565
- GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_LAST)) {
2663
+ return implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2664
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) {
2665
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4), TAG(H5), TAG(H6) })) {
2666
+ if (!has_an_element_in_scope_with_tagname(parser, 6, (GumboTag[]) {
2667
+ GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3,
2668
+ GUMBO_TAG_H4, GUMBO_TAG_H5, GUMBO_TAG_H6})) {
2566
2669
  // No heading open; ignore the token entirely.
2567
2670
  parser_add_parse_error(parser, token);
2568
2671
  ignore_token(parser);
@@ -2570,7 +2673,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2570
2673
  } else {
2571
2674
  generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2572
2675
  const GumboNode* current_node = get_current_node(parser);
2573
- bool success = node_tag_is(current_node, token->v.end_tag);
2676
+ bool success = node_html_tag_is(current_node, token->v.end_tag);
2574
2677
  if (!success) {
2575
2678
  // There're children of the heading currently open; close them below and
2576
2679
  // record a parse error.
@@ -2580,9 +2683,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2580
2683
  }
2581
2684
  do {
2582
2685
  current_node = pop_current_node(parser);
2583
- } while (!node_tag_in(current_node, GUMBO_TAG_H1, GUMBO_TAG_H2,
2584
- GUMBO_TAG_H3, GUMBO_TAG_H4, GUMBO_TAG_H5,
2585
- GUMBO_TAG_H6, GUMBO_TAG_LAST));
2686
+ } while (!node_tag_in_set(current_node, (gumbo_tagset) { TAG(H1), TAG(H2),
2687
+ TAG(H3), TAG(H4), TAG(H5), TAG(H6) } ));
2586
2688
  return success;
2587
2689
  }
2588
2690
  } else if (tag_is(token, kStartTag, GUMBO_TAG_A)) {
@@ -2608,11 +2710,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2608
2710
  reconstruct_active_formatting_elements(parser);
2609
2711
  add_formatting_element(parser, insert_element_from_token(parser, token));
2610
2712
  return success;
2611
- } else if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
2612
- GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2613
- GUMBO_TAG_S, GUMBO_TAG_SMALL, GUMBO_TAG_STRIKE,
2614
- GUMBO_TAG_STRONG, GUMBO_TAG_TT, GUMBO_TAG_U,
2615
- GUMBO_TAG_LAST)) {
2713
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(B), TAG(BIG),
2714
+ TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
2715
+ TAG(S), TAG(SMALL), TAG(STRIKE),
2716
+ TAG(STRONG), TAG(TT), TAG(U) })) {
2616
2717
  reconstruct_active_formatting_elements(parser);
2617
2718
  add_formatting_element(parser, insert_element_from_token(parser, token));
2618
2719
  return true;
@@ -2628,28 +2729,27 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2628
2729
  insert_element_from_token(parser, token);
2629
2730
  add_formatting_element(parser, get_current_node(parser));
2630
2731
  return result;
2631
- } else if (tag_in(token, kEndTag, GUMBO_TAG_A, GUMBO_TAG_B, GUMBO_TAG_BIG,
2632
- GUMBO_TAG_CODE, GUMBO_TAG_EM, GUMBO_TAG_FONT, GUMBO_TAG_I,
2633
- GUMBO_TAG_NOBR, GUMBO_TAG_S, GUMBO_TAG_SMALL,
2634
- GUMBO_TAG_STRIKE, GUMBO_TAG_STRONG, GUMBO_TAG_TT,
2635
- GUMBO_TAG_U, GUMBO_TAG_LAST)) {
2732
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(A), TAG(B), TAG(BIG),
2733
+ TAG(CODE), TAG(EM), TAG(FONT), TAG(I),
2734
+ TAG(NOBR), TAG(S), TAG(SMALL),
2735
+ TAG(STRIKE), TAG(STRONG), TAG(TT), TAG(U) })) {
2636
2736
  return adoption_agency_algorithm(parser, token, token->v.end_tag);
2637
- } else if (tag_in(token, kStartTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2638
- GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2737
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(APPLET), TAG(MARQUEE),
2738
+ TAG(OBJECT) })) {
2639
2739
  reconstruct_active_formatting_elements(parser);
2640
2740
  insert_element_from_token(parser, token);
2641
2741
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
2642
2742
  set_frameset_not_ok(parser);
2643
2743
  return true;
2644
- } else if (tag_in(token, kEndTag, GUMBO_TAG_APPLET, GUMBO_TAG_MARQUEE,
2645
- GUMBO_TAG_OBJECT, GUMBO_TAG_LAST)) {
2744
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(APPLET), TAG(MARQUEE),
2745
+ TAG(OBJECT) })) {
2646
2746
  GumboTag token_tag = token->v.end_tag;
2647
2747
  if (!has_an_element_in_table_scope(parser, token_tag)) {
2648
2748
  parser_add_parse_error(parser, token);
2649
2749
  ignore_token(parser);
2650
2750
  return false;
2651
2751
  }
2652
- implicitly_close_tags(parser, token, token_tag);
2752
+ implicitly_close_tags(parser, token, GUMBO_NAMESPACE_HTML, token_tag);
2653
2753
  clear_active_formatting_elements(parser);
2654
2754
  return true;
2655
2755
  } else if (tag_is(token, kStartTag, GUMBO_TAG_TABLE)) {
@@ -2661,9 +2761,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2661
2761
  set_frameset_not_ok(parser);
2662
2762
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
2663
2763
  return true;
2664
- } else if (tag_in(token, kStartTag, GUMBO_TAG_AREA, GUMBO_TAG_BR,
2665
- GUMBO_TAG_EMBED, GUMBO_TAG_IMG, GUMBO_TAG_IMAGE,
2666
- GUMBO_TAG_KEYGEN, GUMBO_TAG_WBR, GUMBO_TAG_LAST)) {
2764
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(AREA), TAG(BR),
2765
+ TAG(EMBED), TAG(IMG), TAG(IMAGE), TAG(KEYGEN), TAG(WBR) })) {
2667
2766
  bool success = true;
2668
2767
  if (tag_is(token, kStartTag, GUMBO_TAG_IMAGE)) {
2669
2768
  success = false;
@@ -2693,8 +2792,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2693
2792
  pop_current_node(parser);
2694
2793
  acknowledge_self_closing_tag(parser);
2695
2794
  return true;
2696
- } else if (tag_in(token, kStartTag, GUMBO_TAG_PARAM, GUMBO_TAG_SOURCE,
2697
- GUMBO_TAG_TRACK, GUMBO_TAG_LAST)) {
2795
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(PARAM), TAG(SOURCE), TAG(TRACK) })) {
2698
2796
  insert_element_from_token(parser, token);
2699
2797
  pop_current_node(parser);
2700
2798
  acknowledge_self_closing_tag(parser);
@@ -2708,7 +2806,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2708
2806
  return result;
2709
2807
  } else if (tag_is(token, kStartTag, GUMBO_TAG_ISINDEX)) {
2710
2808
  parser_add_parse_error(parser, token);
2711
- if (parser->_parser_state->_form_element != NULL) {
2809
+ if (parser->_parser_state->_form_element != NULL &&
2810
+ !has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2712
2811
  ignore_token(parser);
2713
2812
  return false;
2714
2813
  }
@@ -2723,6 +2822,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2723
2822
 
2724
2823
  GumboNode* form = insert_element_of_tag_type(
2725
2824
  parser, GUMBO_TAG_FORM, GUMBO_INSERTION_FROM_ISINDEX);
2825
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2826
+ parser->_parser_state->_form_element = form;
2827
+ }
2726
2828
  if (action_attr) {
2727
2829
  gumbo_vector_add(parser, action_attr, &form->v.element.attributes);
2728
2830
  }
@@ -2786,6 +2888,9 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2786
2888
  parser, GUMBO_TAG_HR, GUMBO_INSERTION_FROM_ISINDEX);
2787
2889
  pop_current_node(parser); // <hr>
2788
2890
  pop_current_node(parser); // <form>
2891
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
2892
+ parser->_parser_state->_form_element = NULL;
2893
+ }
2789
2894
  return false;
2790
2895
  } else if (tag_is(token, kStartTag, GUMBO_TAG_TEXTAREA)) {
2791
2896
  run_generic_parsing_algorithm(parser, token, GUMBO_LEX_RCDATA);
@@ -2820,21 +2925,24 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2820
2925
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_SELECT);
2821
2926
  }
2822
2927
  return true;
2823
- } else if (tag_in(token, kStartTag, GUMBO_TAG_OPTION, GUMBO_TAG_OPTGROUP,
2824
- GUMBO_TAG_LAST)) {
2825
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2928
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(OPTION), TAG(OPTGROUP) })) {
2929
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
2826
2930
  pop_current_node(parser);
2827
2931
  }
2828
2932
  reconstruct_active_formatting_elements(parser);
2829
2933
  insert_element_from_token(parser, token);
2830
2934
  return true;
2831
- } else if (tag_in(token, kStartTag, GUMBO_TAG_RP, GUMBO_TAG_RT,
2832
- GUMBO_TAG_LAST)) {
2935
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) {
2936
+ TAG(RB), TAG(RP), TAG(RT), TAG(RTC) })) {
2833
2937
  bool success = true;
2938
+ GumboTag exception = tag_in(token, kStartTag, (gumbo_tagset) {
2939
+ TAG(RT), TAG(RP) }) ? GUMBO_TAG_RTC : GUMBO_TAG_LAST;
2834
2940
  if (has_an_element_in_scope(parser, GUMBO_TAG_RUBY)) {
2835
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
2941
+ generate_implied_end_tags(parser, exception);
2836
2942
  }
2837
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_RUBY)) {
2943
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_RUBY) &&
2944
+ !(exception == GUMBO_TAG_LAST ||
2945
+ node_html_tag_is(get_current_node(parser), GUMBO_TAG_RTC))) {
2838
2946
  parser_add_parse_error(parser, token);
2839
2947
  success = false;
2840
2948
  }
@@ -2867,11 +2975,10 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2867
2975
  acknowledge_self_closing_tag(parser);
2868
2976
  }
2869
2977
  return true;
2870
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
2871
- GUMBO_TAG_COLGROUP, GUMBO_TAG_FRAME, GUMBO_TAG_HEAD,
2872
- GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
2873
- GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
2874
- GUMBO_TAG_LAST)) {
2978
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
2979
+ TAG(COLGROUP), TAG(FRAME), TAG(HEAD),
2980
+ TAG(TBODY), TAG(TD), TAG(TFOOT),
2981
+ TAG(TH), TAG(THEAD), TAG(TR) })) {
2875
2982
  parser_add_parse_error(parser, token);
2876
2983
  ignore_token(parser);
2877
2984
  return false;
@@ -2883,7 +2990,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2883
2990
  assert(token->type == GUMBO_TOKEN_END_TAG);
2884
2991
  GumboTag end_tag = token->v.end_tag;
2885
2992
  assert(state->_open_elements.length > 0);
2886
- assert(node_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2993
+ assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML));
2887
2994
  // Walk up the stack of open elements until we find one that either:
2888
2995
  // a) Matches the tag name we saw
2889
2996
  // b) Is in the "special" category.
@@ -2892,8 +2999,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
2892
2999
  // implied end tags) and ignore the end tag token.
2893
3000
  for (int i = state->_open_elements.length; --i >= 0; ) {
2894
3001
  const GumboNode* node = state->_open_elements.data[i];
2895
- if (node->v.element.tag_namespace == GUMBO_NAMESPACE_HTML &&
2896
- node_tag_is(node, end_tag)) {
3002
+ if (node_html_tag_is(node, end_tag)) {
2897
3003
  generate_implied_end_tags(parser, end_tag);
2898
3004
  // TODO(jdtang): Do I need to add a parse error here? The condition in
2899
3005
  // the spec seems like it's the inverse of the loop condition above, and
@@ -2974,13 +3080,11 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
2974
3080
  parser->_parser_state->_reprocess_current_token = true;
2975
3081
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
2976
3082
  return true;
2977
- } else if (tag_in(token, kStartTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
2978
- GUMBO_TAG_THEAD, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2979
- GUMBO_TAG_LAST)) {
3083
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT),
3084
+ TAG(THEAD), TAG(TD), TAG(TH), TAG(TR) })) {
2980
3085
  clear_stack_to_table_context(parser);
2981
3086
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
2982
- if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_TR,
2983
- GUMBO_TAG_LAST)) {
3087
+ if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH), TAG(TR) })) {
2984
3088
  insert_element_of_tag_type(
2985
3089
  parser, GUMBO_TAG_TBODY, GUMBO_INSERTION_IMPLIED);
2986
3090
  state->_reprocess_current_token = true;
@@ -3002,16 +3106,15 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3002
3106
  return false;
3003
3107
  }
3004
3108
  return true;
3005
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3006
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3007
- GUMBO_TAG_TBODY, GUMBO_TAG_TD, GUMBO_TAG_TFOOT,
3008
- GUMBO_TAG_TH, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3009
- GUMBO_TAG_LAST)) {
3109
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
3110
+ TAG(COL), TAG(COLGROUP), TAG(HTML),
3111
+ TAG(TBODY), TAG(TD), TAG(TFOOT),
3112
+ TAG(TH), TAG(THEAD), TAG(TR) })) {
3010
3113
  parser_add_parse_error(parser, token);
3011
3114
  ignore_token(parser);
3012
3115
  return false;
3013
- } else if (tag_in(token, kStartTag, GUMBO_TAG_STYLE, GUMBO_TAG_SCRIPT,
3014
- GUMBO_TAG_LAST)) {
3116
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(STYLE), TAG(SCRIPT), TAG(TEMPLATE) }) ||
3117
+ (tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE))) {
3015
3118
  return handle_in_head(parser, token);
3016
3119
  } else if (tag_is(token, kStartTag, GUMBO_TAG_INPUT) &&
3017
3120
  attribute_matches(&token->v.start_tag.attributes,
@@ -3022,7 +3125,7 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3022
3125
  return false;
3023
3126
  } else if (tag_is(token, kStartTag, GUMBO_TAG_FORM)) {
3024
3127
  parser_add_parse_error(parser, token);
3025
- if (state->_form_element) {
3128
+ if (state->_form_element || has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3026
3129
  ignore_token(parser);
3027
3130
  return false;
3028
3131
  }
@@ -3030,11 +3133,7 @@ static bool handle_in_table(GumboParser* parser, GumboToken* token) {
3030
3133
  pop_current_node(parser);
3031
3134
  return false;
3032
3135
  } else if (token->type == GUMBO_TOKEN_EOF) {
3033
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3034
- parser_add_parse_error(parser, token);
3035
- return false;
3036
- }
3037
- return true;
3136
+ return handle_in_body(parser, token);
3038
3137
  } else {
3039
3138
  parser_add_parse_error(parser, token);
3040
3139
  state->_foster_parent_insertions = true;
@@ -3063,7 +3162,7 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3063
3162
  // of any one byte that is not whitespace means we flip the flag, so this
3064
3163
  // loop is still valid.
3065
3164
  for (int i = 0; i < buffer->length; ++i) {
3066
- if (!isspace(buffer->data[i]) || buffer->data[i] == '\v') {
3165
+ if (!isspace((unsigned char)buffer->data[i]) || buffer->data[i] == '\v') {
3067
3166
  state->_foster_parent_insertions = true;
3068
3167
  reconstruct_active_formatting_elements(parser);
3069
3168
  break;
@@ -3079,38 +3178,37 @@ static bool handle_in_table_text(GumboParser* parser, GumboToken* token) {
3079
3178
 
3080
3179
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-incaption
3081
3180
  static bool handle_in_caption(GumboParser* parser, GumboToken* token) {
3082
- if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3083
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3084
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3085
- GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3086
- tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3087
- GUMBO_TAG_LAST)) {
3181
+ if (tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3088
3182
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3089
3183
  parser_add_parse_error(parser, token);
3090
3184
  ignore_token(parser);
3091
3185
  return false;
3186
+ } else {
3187
+ generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3188
+ bool result = true;
3189
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3190
+ parser_add_parse_error(parser, token);
3191
+ }
3192
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION));
3193
+ clear_active_formatting_elements(parser);
3194
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3195
+ return result;
3092
3196
  }
3093
- if (!tag_is(token, kEndTag, GUMBO_TAG_CAPTION)) {
3094
- parser_add_parse_error(parser, token);
3095
- parser->_parser_state->_reprocess_current_token = true;
3096
- }
3097
- generate_implied_end_tags(parser, GUMBO_TAG_LAST);
3098
- bool result = true;
3099
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3197
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
3198
+ TAG(COLGROUP), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) }) ||
3199
+ (tag_is(token, kEndTag, GUMBO_TAG_TABLE))) {
3200
+ if (!has_an_element_in_table_scope(parser, GUMBO_TAG_CAPTION)) {
3100
3201
  parser_add_parse_error(parser, token);
3101
- while (!node_tag_is(get_current_node(parser), GUMBO_TAG_CAPTION)) {
3102
- pop_current_node(parser);
3103
- }
3104
- result = false;
3202
+ ignore_token(parser);
3203
+ return false;
3105
3204
  }
3106
- pop_current_node(parser); // The <caption> itself.
3205
+ while (!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_CAPTION));
3107
3206
  clear_active_formatting_elements(parser);
3108
3207
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3109
- return result;
3110
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_COL,
3111
- GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML, GUMBO_TAG_TBODY,
3112
- GUMBO_TAG_TD, GUMBO_TAG_TFOOT, GUMBO_TAG_TH,
3113
- GUMBO_TAG_THEAD, GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3208
+ parser->_parser_state->_reprocess_current_token = true;
3209
+ return true;
3210
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(COL), TAG(COLGROUP),
3211
+ TAG(HTML), TAG(TBODY), TAG(TD), TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) } )) {
3114
3212
  parser_add_parse_error(parser, token);
3115
3213
  ignore_token(parser);
3116
3214
  return false;
@@ -3138,24 +3236,33 @@ static bool handle_in_column_group(GumboParser* parser, GumboToken* token) {
3138
3236
  pop_current_node(parser);
3139
3237
  acknowledge_self_closing_tag(parser);
3140
3238
  return true;
3239
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3240
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3241
+ parser_add_parse_error(parser, token);
3242
+ ignore_token(parser);
3243
+ return false;
3244
+ }
3245
+ pop_current_node(parser);
3246
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3247
+ return false;
3141
3248
  } else if (tag_is(token, kEndTag, GUMBO_TAG_COL)) {
3142
3249
  parser_add_parse_error(parser, token);
3143
3250
  ignore_token(parser);
3144
3251
  return false;
3145
- } else if (token->type == GUMBO_TOKEN_EOF &&
3146
- get_current_node(parser) == parser->_output->root) {
3147
- return true;
3252
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TEMPLATE) ||
3253
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3254
+ return handle_in_head(parser, token);
3255
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3256
+ return handle_in_body(parser, token);
3148
3257
  } else {
3149
- if (get_current_node(parser) == parser->_output->root) {
3258
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP)) {
3150
3259
  parser_add_parse_error(parser, token);
3260
+ ignore_token(parser);
3151
3261
  return false;
3152
3262
  }
3153
- assert(node_tag_is(get_current_node(parser), GUMBO_TAG_COLGROUP));
3154
3263
  pop_current_node(parser);
3155
3264
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3156
- if (!tag_is(token, kEndTag, GUMBO_TAG_COLGROUP)) {
3157
- parser->_parser_state->_reprocess_current_token = true;
3158
- }
3265
+ parser->_parser_state->_reprocess_current_token = true;
3159
3266
  return true;
3160
3267
  }
3161
3268
  }
@@ -3167,16 +3274,14 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3167
3274
  insert_element_from_token(parser, token);
3168
3275
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3169
3276
  return true;
3170
- } else if (tag_in(token, kStartTag, GUMBO_TAG_TD, GUMBO_TAG_TH,
3171
- GUMBO_TAG_LAST)) {
3277
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
3172
3278
  parser_add_parse_error(parser, token);
3173
3279
  clear_stack_to_table_body_context(parser);
3174
3280
  insert_element_of_tag_type(parser, GUMBO_TAG_TR, GUMBO_INSERTION_IMPLIED);
3175
3281
  parser->_parser_state->_reprocess_current_token = true;
3176
3282
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3177
3283
  return false;
3178
- } else if (tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3179
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST)) {
3284
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
3180
3285
  if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3181
3286
  parser_add_parse_error(parser, token);
3182
3287
  ignore_token(parser);
@@ -3186,9 +3291,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3186
3291
  pop_current_node(parser);
3187
3292
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3188
3293
  return true;
3189
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3190
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3191
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST) ||
3294
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
3295
+ TAG(COLGROUP), TAG(TBODY), TAG(TFOOT), TAG(THEAD) }) ||
3192
3296
  tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3193
3297
  if (!(has_an_element_in_table_scope(parser, GUMBO_TAG_TBODY) ||
3194
3298
  has_an_element_in_table_scope(parser, GUMBO_TAG_THEAD) ||
@@ -3202,9 +3306,8 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3202
3306
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3203
3307
  parser->_parser_state->_reprocess_current_token = true;
3204
3308
  return true;
3205
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3206
- GUMBO_TAG_COL, GUMBO_TAG_TR, GUMBO_TAG_COLGROUP,
3207
- GUMBO_TAG_HTML, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST))
3309
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
3310
+ TAG(COL), TAG(TR), TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH) }))
3208
3311
  {
3209
3312
  parser_add_parse_error(parser, token);
3210
3313
  ignore_token(parser);
@@ -3216,51 +3319,54 @@ static bool handle_in_table_body(GumboParser* parser, GumboToken* token) {
3216
3319
 
3217
3320
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intr
3218
3321
  static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3219
- if (tag_in(token, kStartTag, GUMBO_TAG_TH, GUMBO_TAG_TD, GUMBO_TAG_LAST)) {
3322
+ if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TH), TAG(TD) })) {
3220
3323
  clear_stack_to_table_row_context(parser);
3221
3324
  insert_element_from_token(parser, token);
3222
3325
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_CELL);
3223
3326
  add_formatting_element(parser, &kActiveFormattingScopeMarker);
3224
3327
  return true;
3225
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COLGROUP,
3226
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3227
- GUMBO_TAG_TR, GUMBO_TAG_LAST) ||
3228
- tag_in(token, kEndTag, GUMBO_TAG_TR, GUMBO_TAG_TABLE,
3229
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3230
- GUMBO_TAG_LAST)) {
3231
- // This case covers 4 clauses of the spec, each of which say "Otherwise, act
3232
- // as if an end tag with the tag name "tr" had been seen." The differences
3233
- // are in error handling and whether the current token is reprocessed.
3234
- GumboTag desired_tag =
3235
- tag_in(token, kEndTag, GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT,
3236
- GUMBO_TAG_THEAD, GUMBO_TAG_LAST)
3237
- ? token->v.end_tag : GUMBO_TAG_TR;
3238
- if (!has_an_element_in_table_scope(parser, desired_tag)) {
3239
- gumbo_debug("Bailing because there is no tag %s in table scope.\nOpen elements:",
3240
- gumbo_normalized_tagname(desired_tag));
3241
- for (int i = 0; i < parser->_parser_state->_open_elements.length; ++i) {
3242
- const GumboNode* node = parser->_parser_state->_open_elements.data[i];
3243
- gumbo_debug("%s\n", gumbo_normalized_tagname(node->v.element.tag));
3244
- }
3328
+ } else if (tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3329
+ if (!has_an_element_in_table_scope(parser,GUMBO_TAG_TR)) {
3245
3330
  parser_add_parse_error(parser, token);
3246
3331
  ignore_token(parser);
3247
3332
  return false;
3333
+ } else {
3334
+ clear_stack_to_table_row_context(parser);
3335
+ pop_current_node(parser);
3336
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3337
+ return true;
3248
3338
  }
3249
- clear_stack_to_table_row_context(parser);
3250
- GumboNode* last_element = pop_current_node(parser);
3251
- assert(node_tag_is(last_element, GUMBO_TAG_TR));
3252
- AVOID_UNUSED_VARIABLE_WARNING(last_element);
3253
- set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3254
- if (!tag_is(token, kEndTag, GUMBO_TAG_TR)) {
3339
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL), TAG(COLGROUP),
3340
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) }) || tag_is(token, kEndTag, GUMBO_TAG_TABLE)) {
3341
+ if (!has_an_element_in_table_scope(parser,GUMBO_TAG_TR)) {
3342
+ parser_add_parse_error(parser, token);
3343
+ ignore_token(parser);
3344
+ return false;
3345
+ } else {
3346
+ clear_stack_to_table_row_context(parser);
3347
+ pop_current_node(parser);
3348
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3255
3349
  parser->_parser_state->_reprocess_current_token = true;
3350
+ return true;
3256
3351
  }
3257
- return true;
3258
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3259
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3260
- GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3261
- parser_add_parse_error(parser, token);
3262
- ignore_token(parser);
3263
- return false;
3352
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
3353
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag) ||
3354
+ (!has_an_element_in_table_scope(parser, GUMBO_TAG_TR))) {
3355
+ parser_add_parse_error(parser, token);
3356
+ ignore_token(parser);
3357
+ return false;
3358
+ } else {
3359
+ clear_stack_to_table_row_context(parser);
3360
+ pop_current_node(parser);
3361
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3362
+ parser->_parser_state->_reprocess_current_token = true;
3363
+ return true;
3364
+ }
3365
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION), TAG(COL),
3366
+ TAG(COLGROUP), TAG(HTML), TAG(TD), TAG(TH) })) {
3367
+ parser_add_parse_error(parser, token);
3368
+ ignore_token(parser);
3369
+ return false;
3264
3370
  } else {
3265
3371
  return handle_in_table(parser, token);
3266
3372
  }
@@ -3268,17 +3374,17 @@ static bool handle_in_row(GumboParser* parser, GumboToken* token) {
3268
3374
 
3269
3375
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-intd
3270
3376
  static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3271
- if (tag_in(token, kEndTag, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3377
+ if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
3272
3378
  GumboTag token_tag = token->v.end_tag;
3273
3379
  if (!has_an_element_in_table_scope(parser, token_tag)) {
3274
3380
  parser_add_parse_error(parser, token);
3381
+ ignore_token(parser);
3275
3382
  return false;
3276
3383
  }
3277
3384
  return close_table_cell(parser, token, token_tag);
3278
- } else if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_COL,
3279
- GUMBO_TAG_COLGROUP, GUMBO_TAG_TBODY, GUMBO_TAG_TD,
3280
- GUMBO_TAG_TFOOT, GUMBO_TAG_TH, GUMBO_TAG_THEAD,
3281
- GUMBO_TAG_TR, GUMBO_TAG_LAST)) {
3385
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COL),
3386
+ TAG(COLGROUP), TAG(TBODY), TAG(TD),
3387
+ TAG(TFOOT), TAG(TH), TAG(THEAD), TAG(TR) })) {
3282
3388
  gumbo_debug("Handling <td> in cell.\n");
3283
3389
  if (!has_an_element_in_table_scope(parser, GUMBO_TAG_TH) &&
3284
3390
  !has_an_element_in_table_scope(parser, GUMBO_TAG_TD)) {
@@ -3289,15 +3395,13 @@ static bool handle_in_cell(GumboParser* parser, GumboToken* token) {
3289
3395
  }
3290
3396
  parser->_parser_state->_reprocess_current_token = true;
3291
3397
  return close_current_cell(parser, token);
3292
- } else if (tag_in(token, kEndTag, GUMBO_TAG_BODY, GUMBO_TAG_CAPTION,
3293
- GUMBO_TAG_COL, GUMBO_TAG_COLGROUP, GUMBO_TAG_HTML,
3294
- GUMBO_TAG_LAST)) {
3398
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(BODY), TAG(CAPTION),
3399
+ TAG(COL), TAG(COLGROUP), TAG(HTML) })) {
3295
3400
  parser_add_parse_error(parser, token);
3296
3401
  ignore_token(parser);
3297
3402
  return false;
3298
- } else if (tag_in(token, kEndTag, GUMBO_TAG_TABLE, GUMBO_TAG_TBODY,
3299
- GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3300
- GUMBO_TAG_LAST)) {
3403
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(TABLE), TAG(TBODY),
3404
+ TAG(TFOOT), TAG(THEAD), TAG(TR) })) {
3301
3405
  if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3302
3406
  parser_add_parse_error(parser, token);
3303
3407
  ignore_token(parser);
@@ -3330,28 +3434,28 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3330
3434
  } else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
3331
3435
  return handle_in_body(parser, token);
3332
3436
  } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTION)) {
3333
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3437
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3334
3438
  pop_current_node(parser);
3335
3439
  }
3336
3440
  insert_element_from_token(parser, token);
3337
3441
  return true;
3338
3442
  } else if (tag_is(token, kStartTag, GUMBO_TAG_OPTGROUP)) {
3339
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3443
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3340
3444
  pop_current_node(parser);
3341
3445
  }
3342
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3446
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3343
3447
  pop_current_node(parser);
3344
3448
  }
3345
3449
  insert_element_from_token(parser, token);
3346
3450
  return true;
3347
3451
  } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTGROUP)) {
3348
3452
  GumboVector* open_elements = &parser->_parser_state->_open_elements;
3349
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3350
- node_tag_is(open_elements->data[open_elements->length - 2],
3453
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION) &&
3454
+ node_html_tag_is(open_elements->data[open_elements->length - 2],
3351
3455
  GUMBO_TAG_OPTGROUP)) {
3352
3456
  pop_current_node(parser);
3353
3457
  }
3354
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3458
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTGROUP)) {
3355
3459
  pop_current_node(parser);
3356
3460
  return true;
3357
3461
  } else {
@@ -3360,7 +3464,7 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3360
3464
  return false;
3361
3465
  }
3362
3466
  } else if (tag_is(token, kEndTag, GUMBO_TAG_OPTION)) {
3363
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3467
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_OPTION)) {
3364
3468
  pop_current_node(parser);
3365
3469
  return true;
3366
3470
  } else {
@@ -3381,8 +3485,7 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3381
3485
  ignore_token(parser);
3382
3486
  close_current_select(parser);
3383
3487
  return false;
3384
- } else if (tag_in(token, kStartTag, GUMBO_TAG_INPUT, GUMBO_TAG_KEYGEN,
3385
- GUMBO_TAG_TEXTAREA, GUMBO_TAG_LAST)) {
3488
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(INPUT), TAG(KEYGEN), TAG(TEXTAREA) })) {
3386
3489
  parser_add_parse_error(parser, token);
3387
3490
  if (!has_an_element_in_select_scope(parser, GUMBO_TAG_SELECT)) {
3388
3491
  ignore_token(parser);
@@ -3391,14 +3494,11 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3391
3494
  parser->_parser_state->_reprocess_current_token = true;
3392
3495
  }
3393
3496
  return false;
3394
- } else if (tag_is(token, kStartTag, GUMBO_TAG_SCRIPT)) {
3497
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(SCRIPT) , TAG(TEMPLATE) }) ||
3498
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3395
3499
  return handle_in_head(parser, token);
3396
3500
  } else if (token->type == GUMBO_TOKEN_EOF) {
3397
- if (get_current_node(parser) != parser->_output->root) {
3398
- parser_add_parse_error(parser, token);
3399
- return false;
3400
- }
3401
- return true;
3501
+ return handle_in_body(parser, token);
3402
3502
  } else {
3403
3503
  parser_add_parse_error(parser, token);
3404
3504
  ignore_token(parser);
@@ -3408,25 +3508,25 @@ static bool handle_in_select(GumboParser* parser, GumboToken* token) {
3408
3508
 
3409
3509
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-inselectintable
3410
3510
  static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3411
- if (tag_in(token, kStartTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3412
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD, GUMBO_TAG_TR,
3413
- GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3511
+ if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(TABLE),
3512
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH) })) {
3414
3513
  parser_add_parse_error(parser, token);
3415
3514
  close_current_select(parser);
3416
3515
  parser->_parser_state->_reprocess_current_token = true;
3417
3516
  return false;
3418
- } else if (tag_in(token, kEndTag, GUMBO_TAG_CAPTION, GUMBO_TAG_TABLE,
3419
- GUMBO_TAG_TBODY, GUMBO_TAG_TFOOT, GUMBO_TAG_THEAD,
3420
- GUMBO_TAG_TR, GUMBO_TAG_TD, GUMBO_TAG_TH, GUMBO_TAG_LAST)) {
3517
+ } else if (tag_in(token, kEndTag, (gumbo_tagset) { TAG(CAPTION), TAG(TABLE),
3518
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR), TAG(TD), TAG(TH) })) {
3421
3519
  parser_add_parse_error(parser, token);
3422
- if (has_an_element_in_table_scope(parser, token->v.end_tag)) {
3520
+ if (!has_an_element_in_table_scope(parser, token->v.end_tag)) {
3521
+ ignore_token(parser);
3522
+ return false;
3523
+ } else {
3423
3524
  close_current_select(parser);
3424
- reset_insertion_mode_appropriately(parser);
3525
+ // close_current_select already does the reset_insertion_mode_appropriately
3526
+ // reset_insertion_mode_appropriately(parser);
3425
3527
  parser->_parser_state->_reprocess_current_token = true;
3426
- } else {
3427
- ignore_token(parser);
3528
+ return false;
3428
3529
  }
3429
- return false;
3430
3530
  } else {
3431
3531
  return handle_in_select(parser, token);
3432
3532
  }
@@ -3434,8 +3534,68 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
3434
3534
 
3435
3535
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
3436
3536
  static bool handle_in_template(GumboParser* parser, GumboToken* token) {
3437
- // TODO(jdtang): Implement this.
3438
- return true;
3537
+ GumboParserState* state = parser->_parser_state;
3538
+ if (token->type == GUMBO_TOKEN_WHITESPACE ||
3539
+ token->type == GUMBO_TOKEN_CHARACTER ||
3540
+ token->type == GUMBO_TOKEN_COMMENT ||
3541
+ token->type == GUMBO_TOKEN_DOCTYPE) {
3542
+ return handle_in_body(parser, token);
3543
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(BASE), TAG(BASEFONT), TAG(BGSOUND),
3544
+ TAG(LINK), TAG(META), TAG(NOFRAMES), TAG(SCRIPT), TAG(STYLE),
3545
+ TAG(TEMPLATE), TAG(TITLE) }) ||
3546
+ tag_is(token, kEndTag, GUMBO_TAG_TEMPLATE)) {
3547
+ return handle_in_head(parser, token);
3548
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(CAPTION), TAG(COLGROUP),
3549
+ TAG(TBODY), TAG(TFOOT), TAG(THEAD) })) {
3550
+ pop_template_insertion_mode(parser);
3551
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3552
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE);
3553
+ state->_reprocess_current_token = true;
3554
+ return true;
3555
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_COL)) {
3556
+ pop_template_insertion_mode(parser);
3557
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3558
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_COLUMN_GROUP);
3559
+ state->_reprocess_current_token = true;
3560
+ return true;
3561
+ } else if (tag_is(token, kStartTag, GUMBO_TAG_TR)) {
3562
+ pop_template_insertion_mode(parser);
3563
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3564
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TABLE_BODY);
3565
+ state->_reprocess_current_token = true;
3566
+ return true;
3567
+ } else if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(TD), TAG(TH) })) {
3568
+ pop_template_insertion_mode(parser);
3569
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3570
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_ROW);
3571
+ state->_reprocess_current_token = true;
3572
+ return true;
3573
+ } else if (token->type == GUMBO_TOKEN_START_TAG) {
3574
+ pop_template_insertion_mode(parser);
3575
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3576
+ set_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_BODY);
3577
+ state->_reprocess_current_token = true;
3578
+ return true;
3579
+ } else if (token->type == GUMBO_TOKEN_END_TAG) {
3580
+ parser_add_parse_error(parser, token);
3581
+ ignore_token(parser);
3582
+ return false;
3583
+ } else if (token->type == GUMBO_TOKEN_EOF) {
3584
+ if (!has_open_element(parser, GUMBO_TAG_TEMPLATE)) {
3585
+ // Stop parsing.
3586
+ return true;
3587
+ }
3588
+ parser_add_parse_error(parser, token);
3589
+ while(!node_html_tag_is(pop_current_node(parser), GUMBO_TAG_TEMPLATE));
3590
+ clear_active_formatting_elements(parser);
3591
+ pop_template_insertion_mode(parser);
3592
+ reset_insertion_mode_appropriately(parser);
3593
+ state->_reprocess_current_token = true;
3594
+ return false;
3595
+ } else {
3596
+ assert(0);
3597
+ return false;
3598
+ }
3439
3599
  }
3440
3600
 
3441
3601
  // http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
@@ -3453,10 +3613,15 @@ static bool handle_after_body(GumboParser* parser, GumboToken* token) {
3453
3613
  ignore_token(parser);
3454
3614
  return false;
3455
3615
  } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3456
- // TODO(jdtang): Handle fragment parsing algorithm case.
3616
+ /* fragment case: ignore the closing HTML token */
3617
+ if (is_fragment_parser(parser)) {
3618
+ parser_add_parse_error(parser, token);
3619
+ ignore_token(parser);
3620
+ return false;
3621
+ }
3457
3622
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_BODY);
3458
3623
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
3459
- assert(node_tag_is(html, GUMBO_TAG_HTML));
3624
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3460
3625
  record_end_of_element(
3461
3626
  parser->_parser_state->_current_token, &html->v.element);
3462
3627
  return true;
@@ -3488,15 +3653,14 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3488
3653
  insert_element_from_token(parser, token);
3489
3654
  return true;
3490
3655
  } else if (tag_is(token, kEndTag, GUMBO_TAG_FRAMESET)) {
3491
- if (node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3656
+ if (node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3492
3657
  parser_add_parse_error(parser, token);
3493
3658
  ignore_token(parser);
3494
3659
  return false;
3495
3660
  }
3496
3661
  pop_current_node(parser);
3497
- // TODO(jdtang): Add a condition to ignore this for the fragment parsing
3498
- // algorithm.
3499
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3662
+ if (!is_fragment_parser(parser) &&
3663
+ !node_html_tag_is(get_current_node(parser), GUMBO_TAG_FRAMESET)) {
3500
3664
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_FRAMESET);
3501
3665
  }
3502
3666
  return true;
@@ -3508,7 +3672,7 @@ static bool handle_in_frameset(GumboParser* parser, GumboToken* token) {
3508
3672
  } else if (tag_is(token, kStartTag, GUMBO_TAG_NOFRAMES)) {
3509
3673
  return handle_in_head(parser, token);
3510
3674
  } else if (token->type == GUMBO_TOKEN_EOF) {
3511
- if (!node_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3675
+ if (!node_html_tag_is(get_current_node(parser), GUMBO_TAG_HTML)) {
3512
3676
  parser_add_parse_error(parser, token);
3513
3677
  return false;
3514
3678
  }
@@ -3536,7 +3700,7 @@ static bool handle_after_frameset(GumboParser* parser, GumboToken* token) {
3536
3700
  return handle_in_body(parser, token);
3537
3701
  } else if (tag_is(token, kEndTag, GUMBO_TAG_HTML)) {
3538
3702
  GumboNode* html = parser->_parser_state->_open_elements.data[0];
3539
- assert(node_tag_is(html, GUMBO_TAG_HTML));
3703
+ assert(node_html_tag_is(html, GUMBO_TAG_HTML));
3540
3704
  record_end_of_element(
3541
3705
  parser->_parser_state->_current_token, &html->v.element);
3542
3706
  set_insertion_mode(parser, GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET);
@@ -3631,13 +3795,13 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3631
3795
  switch (token->type) {
3632
3796
  case GUMBO_TOKEN_NULL:
3633
3797
  parser_add_parse_error(parser, token);
3634
- token->type = GUMBO_TOKEN_CHARACTER;
3635
3798
  token->v.character = kUtf8ReplacementChar;
3636
3799
  insert_text_token(parser, token);
3637
3800
  return false;
3638
3801
  case GUMBO_TOKEN_WHITESPACE:
3639
3802
  insert_text_token(parser, token);
3640
3803
  return true;
3804
+ case GUMBO_TOKEN_CDATA:
3641
3805
  case GUMBO_TOKEN_CHARACTER:
3642
3806
  insert_text_token(parser, token);
3643
3807
  set_frameset_not_ok(parser);
@@ -3654,35 +3818,48 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
3654
3818
  break;
3655
3819
  }
3656
3820
  // Order matters for these clauses.
3657
- if (tag_in(token, kStartTag, GUMBO_TAG_B, GUMBO_TAG_BIG,
3658
- GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_BODY, GUMBO_TAG_BR,
3659
- GUMBO_TAG_CENTER, GUMBO_TAG_CODE, GUMBO_TAG_DD, GUMBO_TAG_DIV,
3660
- GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EM, GUMBO_TAG_EMBED,
3661
- GUMBO_TAG_H1, GUMBO_TAG_H2, GUMBO_TAG_H3, GUMBO_TAG_H4,
3662
- GUMBO_TAG_H5, GUMBO_TAG_H6, GUMBO_TAG_HEAD, GUMBO_TAG_HR,
3663
- GUMBO_TAG_I, GUMBO_TAG_IMG, GUMBO_TAG_LI, GUMBO_TAG_LISTING,
3664
- GUMBO_TAG_MENU, GUMBO_TAG_META, GUMBO_TAG_NOBR, GUMBO_TAG_OL,
3665
- GUMBO_TAG_P, GUMBO_TAG_PRE, GUMBO_TAG_RUBY, GUMBO_TAG_S,
3666
- GUMBO_TAG_SMALL, GUMBO_TAG_SPAN, GUMBO_TAG_STRONG,
3667
- GUMBO_TAG_STRIKE, GUMBO_TAG_SUB, GUMBO_TAG_SUP,
3668
- GUMBO_TAG_TABLE, GUMBO_TAG_TT, GUMBO_TAG_U, GUMBO_TAG_UL,
3669
- GUMBO_TAG_VAR, GUMBO_TAG_LAST) ||
3821
+ if (tag_in(token, kStartTag, (gumbo_tagset) { TAG(B), TAG(BIG),
3822
+ TAG(BLOCKQUOTE), TAG(BODY), TAG(BR),
3823
+ TAG(CENTER), TAG(CODE), TAG(DD), TAG(DIV),
3824
+ TAG(DL), TAG(DT), TAG(EM), TAG(EMBED),
3825
+ TAG(H1), TAG(H2), TAG(H3), TAG(H4),
3826
+ TAG(H5), TAG(H6), TAG(HEAD), TAG(HR),
3827
+ TAG(I), TAG(IMG), TAG(LI), TAG(LISTING),
3828
+ TAG(MENU), TAG(META), TAG(NOBR), TAG(OL),
3829
+ TAG(P), TAG(PRE), TAG(RUBY), TAG(S),
3830
+ TAG(SMALL), TAG(SPAN), TAG(STRONG),
3831
+ TAG(STRIKE), TAG(SUB), TAG(SUP),
3832
+ TAG(TABLE), TAG(TT), TAG(U), TAG(UL), TAG(VAR) }) ||
3670
3833
  (tag_is(token, kStartTag, GUMBO_TAG_FONT) && (
3671
3834
  token_has_attribute(token, "color") ||
3672
3835
  token_has_attribute(token, "face") ||
3673
3836
  token_has_attribute(token, "size")))) {
3837
+
3838
+ /* Parse error */
3674
3839
  parser_add_parse_error(parser, token);
3675
- do {
3676
- pop_current_node(parser);
3677
- } while(!(is_mathml_integration_point(get_current_node(parser)) ||
3678
- is_html_integration_point(get_current_node(parser)) ||
3679
- get_current_node(parser)->v.element.tag_namespace ==
3680
- GUMBO_NAMESPACE_HTML));
3681
- parser->_parser_state->_reprocess_current_token = true;
3682
- return false;
3683
- } else if (token->type == GUMBO_TOKEN_START_TAG) {
3840
+
3841
+ /*
3842
+ * Fragment case: If the parser was originally created for the HTML
3843
+ * fragment parsing algorithm, then act as described in the "any other
3844
+ * start tag" entry below.
3845
+ */
3846
+ if (!is_fragment_parser(parser)) {
3847
+ do {
3848
+ pop_current_node(parser);
3849
+ } while(!(is_mathml_integration_point(get_current_node(parser)) ||
3850
+ is_html_integration_point(get_current_node(parser)) ||
3851
+ get_current_node(parser)->v.element.tag_namespace ==
3852
+ GUMBO_NAMESPACE_HTML));
3853
+ parser->_parser_state->_reprocess_current_token = true;
3854
+ return false;
3855
+ }
3856
+
3857
+ assert(token->type == GUMBO_TOKEN_START_TAG);
3858
+ }
3859
+
3860
+ if (token->type == GUMBO_TOKEN_START_TAG) {
3684
3861
  const GumboNamespaceEnum current_namespace =
3685
- get_current_node(parser)->v.element.tag_namespace;
3862
+ get_adjusted_current_node(parser)->v.element.tag_namespace;
3686
3863
  if (current_namespace == GUMBO_NAMESPACE_MATHML) {
3687
3864
  adjust_mathml_attributes(parser, token);
3688
3865
  }
@@ -3771,8 +3948,10 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
3771
3948
  parser->_parser_state->_closed_html_tag = true;
3772
3949
  }
3773
3950
 
3774
- const GumboNode* current_node = get_current_node(parser);
3775
- assert(!current_node || current_node->type == GUMBO_NODE_ELEMENT);
3951
+ const GumboNode* current_node = get_adjusted_current_node(parser);
3952
+ assert(!current_node ||
3953
+ current_node->type == GUMBO_NODE_ELEMENT ||
3954
+ current_node->type == GUMBO_NODE_TEMPLATE);
3776
3955
  if (current_node) {
3777
3956
  gumbo_debug("Current node: <%s>.\n",
3778
3957
  gumbo_normalized_tagname(current_node->v.element.tag));
@@ -3784,10 +3963,9 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
3784
3963
  token->type == GUMBO_TOKEN_WHITESPACE ||
3785
3964
  token->type == GUMBO_TOKEN_NULL ||
3786
3965
  (token->type == GUMBO_TOKEN_START_TAG &&
3787
- !tag_in(token, kStartTag, GUMBO_TAG_MGLYPH, GUMBO_TAG_MALIGNMARK,
3788
- GUMBO_TAG_LAST)))) ||
3966
+ !tag_in(token, kStartTag, (gumbo_tagset) { TAG(MGLYPH), TAG(MALIGNMARK) })))) ||
3789
3967
  (current_node->v.element.tag_namespace == GUMBO_NAMESPACE_MATHML &&
3790
- node_tag_is(current_node, GUMBO_TAG_ANNOTATION_XML) &&
3968
+ node_qualified_tag_is(current_node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML) &&
3791
3969
  tag_is(token, kStartTag, GUMBO_TAG_SVG)) ||
3792
3970
  (is_html_integration_point(current_node) && (
3793
3971
  token->type == GUMBO_TOKEN_START_TAG ||
@@ -3801,6 +3979,66 @@ static bool handle_token(GumboParser* parser, GumboToken* token) {
3801
3979
  }
3802
3980
  }
3803
3981
 
3982
+ static void fragment_parser_init(
3983
+ GumboParser *parser, GumboTag fragment_ctx,
3984
+ GumboNamespaceEnum fragment_namespace) {
3985
+ GumboNode *root;
3986
+ assert(fragment_ctx != GUMBO_TAG_LAST);
3987
+
3988
+ // 3
3989
+ parser->_parser_state->_fragment_ctx = create_element(parser, fragment_ctx);
3990
+ parser->_parser_state->_fragment_ctx->v.element.tag_namespace =
3991
+ fragment_namespace;
3992
+
3993
+ // 4
3994
+ if (fragment_namespace == GUMBO_NAMESPACE_HTML) {
3995
+ // Non-HTML namespaces always start in the DATA state.
3996
+ switch (fragment_ctx) {
3997
+ case GUMBO_TAG_TITLE:
3998
+ case GUMBO_TAG_TEXTAREA:
3999
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
4000
+ break;
4001
+
4002
+ case GUMBO_TAG_STYLE:
4003
+ case GUMBO_TAG_XMP:
4004
+ case GUMBO_TAG_IFRAME:
4005
+ case GUMBO_TAG_NOEMBED:
4006
+ case GUMBO_TAG_NOFRAMES:
4007
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT);
4008
+ break;
4009
+
4010
+ case GUMBO_TAG_SCRIPT:
4011
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT);
4012
+ break;
4013
+
4014
+ case GUMBO_TAG_NOSCRIPT:
4015
+ /* scripting is disabled in Gumbo, so leave the tokenizer
4016
+ * in the default data state */
4017
+ break;
4018
+
4019
+ case GUMBO_TAG_PLAINTEXT:
4020
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_PLAINTEXT);
4021
+ break;
4022
+
4023
+ default:
4024
+ /* default data state */
4025
+ break;
4026
+ }
4027
+ }
4028
+
4029
+ // 5. 6. 7.
4030
+ root = insert_element_of_tag_type(parser, GUMBO_TAG_HTML, GUMBO_INSERTION_IMPLIED);
4031
+ parser->_output->root = root;
4032
+
4033
+ // 8.
4034
+ if (fragment_ctx == GUMBO_TAG_TEMPLATE) {
4035
+ push_template_insertion_mode(parser, GUMBO_INSERTION_MODE_IN_TEMPLATE);
4036
+ }
4037
+
4038
+ // 10.
4039
+ reset_insertion_mode_appropriately(parser);
4040
+ }
4041
+
3804
4042
  GumboOutput* gumbo_parse(const char* buffer) {
3805
4043
  return gumbo_parse_with_options(
3806
4044
  &kGumboDefaultOptions, buffer, strlen(buffer));
@@ -3808,11 +4046,27 @@ GumboOutput* gumbo_parse(const char* buffer) {
3808
4046
 
3809
4047
  GumboOutput* gumbo_parse_with_options(
3810
4048
  const GumboOptions* options, const char* buffer, size_t length) {
4049
+ return gumbo_parse_fragment(
4050
+ options, buffer, length, GUMBO_TAG_LAST, GUMBO_NAMESPACE_HTML);
4051
+ }
4052
+
4053
+ GumboOutput* gumbo_parse_fragment(
4054
+ const GumboOptions* options, const char* buffer, size_t length,
4055
+ const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace) {
3811
4056
  GumboParser parser;
3812
4057
  parser._options = options;
4058
+ parser_state_init(&parser);
4059
+ // Must come after parser_state_init, since creating the document node must
4060
+ // reference parser_state->_current_node.
3813
4061
  output_init(&parser);
4062
+ // And this must come after output_init, because initializing the tokenizer
4063
+ // reads the first character and that may cause a UTF-8 decode error
4064
+ // (inserting into output->errors) if that's invalid.
3814
4065
  gumbo_tokenizer_state_init(&parser, buffer, length);
3815
- parser_state_init(&parser);
4066
+
4067
+ if (fragment_ctx != GUMBO_TAG_LAST) {
4068
+ fragment_parser_init(&parser, fragment_ctx, fragment_namespace);
4069
+ }
3816
4070
 
3817
4071
  GumboParserState* state = parser._parser_state;
3818
4072
  gumbo_debug("Parsing %.*s.\n", length, buffer);
@@ -3823,6 +4077,7 @@ GumboOutput* gumbo_parse_with_options(
3823
4077
 
3824
4078
  GumboToken token;
3825
4079
  bool has_error = false;
4080
+
3826
4081
  do {
3827
4082
  if (state->_reprocess_current_token) {
3828
4083
  state->_reprocess_current_token = false;
@@ -3899,20 +4154,16 @@ GumboOutput* gumbo_parse_with_options(
3899
4154
  return parser._output;
3900
4155
  }
3901
4156
 
3902
- void gumbo_destroy_node(GumboOptions* options, GumboNode* node) {
3903
- // Need a dummy GumboParser because the allocator comes along with the
3904
- // options object.
3905
- GumboParser parser;
3906
- parser._options = options;
3907
- destroy_node(&parser, node);
3908
- }
3909
-
3910
4157
  void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) {
3911
4158
  // Need a dummy GumboParser because the allocator comes along with the
3912
4159
  // options object.
3913
4160
  GumboParser parser;
4161
+ parser._parser_state = NULL;
3914
4162
  parser._options = options;
3915
- destroy_node(&parser, output->document);
4163
+ GumboNode* current = output->document;
4164
+ while (current) {
4165
+ current = destroy_node(&parser, current);
4166
+ }
3916
4167
  for (int i = 0; i < output->errors.length; ++i) {
3917
4168
  gumbo_error_destroy(&parser, output->errors.data[i]);
3918
4169
  }