nokogumbo 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,153 @@
1
+ // Generated via `gentags.py src/tag.in`.
2
+ // Do not edit; edit src/tag.in instead.
3
+ // clang-format off
4
+ GUMBO_TAG_HTML,
5
+ GUMBO_TAG_HEAD,
6
+ GUMBO_TAG_TITLE,
7
+ GUMBO_TAG_BASE,
8
+ GUMBO_TAG_LINK,
9
+ GUMBO_TAG_META,
10
+ GUMBO_TAG_STYLE,
11
+ GUMBO_TAG_SCRIPT,
12
+ GUMBO_TAG_NOSCRIPT,
13
+ GUMBO_TAG_TEMPLATE,
14
+ GUMBO_TAG_BODY,
15
+ GUMBO_TAG_ARTICLE,
16
+ GUMBO_TAG_SECTION,
17
+ GUMBO_TAG_NAV,
18
+ GUMBO_TAG_ASIDE,
19
+ GUMBO_TAG_H1,
20
+ GUMBO_TAG_H2,
21
+ GUMBO_TAG_H3,
22
+ GUMBO_TAG_H4,
23
+ GUMBO_TAG_H5,
24
+ GUMBO_TAG_H6,
25
+ GUMBO_TAG_HGROUP,
26
+ GUMBO_TAG_HEADER,
27
+ GUMBO_TAG_FOOTER,
28
+ GUMBO_TAG_ADDRESS,
29
+ GUMBO_TAG_P,
30
+ GUMBO_TAG_HR,
31
+ GUMBO_TAG_PRE,
32
+ GUMBO_TAG_BLOCKQUOTE,
33
+ GUMBO_TAG_OL,
34
+ GUMBO_TAG_UL,
35
+ GUMBO_TAG_LI,
36
+ GUMBO_TAG_DL,
37
+ GUMBO_TAG_DT,
38
+ GUMBO_TAG_DD,
39
+ GUMBO_TAG_FIGURE,
40
+ GUMBO_TAG_FIGCAPTION,
41
+ GUMBO_TAG_MAIN,
42
+ GUMBO_TAG_DIV,
43
+ GUMBO_TAG_A,
44
+ GUMBO_TAG_EM,
45
+ GUMBO_TAG_STRONG,
46
+ GUMBO_TAG_SMALL,
47
+ GUMBO_TAG_S,
48
+ GUMBO_TAG_CITE,
49
+ GUMBO_TAG_Q,
50
+ GUMBO_TAG_DFN,
51
+ GUMBO_TAG_ABBR,
52
+ GUMBO_TAG_DATA,
53
+ GUMBO_TAG_TIME,
54
+ GUMBO_TAG_CODE,
55
+ GUMBO_TAG_VAR,
56
+ GUMBO_TAG_SAMP,
57
+ GUMBO_TAG_KBD,
58
+ GUMBO_TAG_SUB,
59
+ GUMBO_TAG_SUP,
60
+ GUMBO_TAG_I,
61
+ GUMBO_TAG_B,
62
+ GUMBO_TAG_U,
63
+ GUMBO_TAG_MARK,
64
+ GUMBO_TAG_RUBY,
65
+ GUMBO_TAG_RT,
66
+ GUMBO_TAG_RP,
67
+ GUMBO_TAG_BDI,
68
+ GUMBO_TAG_BDO,
69
+ GUMBO_TAG_SPAN,
70
+ GUMBO_TAG_BR,
71
+ GUMBO_TAG_WBR,
72
+ GUMBO_TAG_INS,
73
+ GUMBO_TAG_DEL,
74
+ GUMBO_TAG_IMAGE,
75
+ GUMBO_TAG_IMG,
76
+ GUMBO_TAG_IFRAME,
77
+ GUMBO_TAG_EMBED,
78
+ GUMBO_TAG_OBJECT,
79
+ GUMBO_TAG_PARAM,
80
+ GUMBO_TAG_VIDEO,
81
+ GUMBO_TAG_AUDIO,
82
+ GUMBO_TAG_SOURCE,
83
+ GUMBO_TAG_TRACK,
84
+ GUMBO_TAG_CANVAS,
85
+ GUMBO_TAG_MAP,
86
+ GUMBO_TAG_AREA,
87
+ GUMBO_TAG_MATH,
88
+ GUMBO_TAG_MI,
89
+ GUMBO_TAG_MO,
90
+ GUMBO_TAG_MN,
91
+ GUMBO_TAG_MS,
92
+ GUMBO_TAG_MTEXT,
93
+ GUMBO_TAG_MGLYPH,
94
+ GUMBO_TAG_MALIGNMARK,
95
+ GUMBO_TAG_ANNOTATION_XML,
96
+ GUMBO_TAG_SVG,
97
+ GUMBO_TAG_FOREIGNOBJECT,
98
+ GUMBO_TAG_DESC,
99
+ GUMBO_TAG_TABLE,
100
+ GUMBO_TAG_CAPTION,
101
+ GUMBO_TAG_COLGROUP,
102
+ GUMBO_TAG_COL,
103
+ GUMBO_TAG_TBODY,
104
+ GUMBO_TAG_THEAD,
105
+ GUMBO_TAG_TFOOT,
106
+ GUMBO_TAG_TR,
107
+ GUMBO_TAG_TD,
108
+ GUMBO_TAG_TH,
109
+ GUMBO_TAG_FORM,
110
+ GUMBO_TAG_FIELDSET,
111
+ GUMBO_TAG_LEGEND,
112
+ GUMBO_TAG_LABEL,
113
+ GUMBO_TAG_INPUT,
114
+ GUMBO_TAG_BUTTON,
115
+ GUMBO_TAG_SELECT,
116
+ GUMBO_TAG_DATALIST,
117
+ GUMBO_TAG_OPTGROUP,
118
+ GUMBO_TAG_OPTION,
119
+ GUMBO_TAG_TEXTAREA,
120
+ GUMBO_TAG_KEYGEN,
121
+ GUMBO_TAG_OUTPUT,
122
+ GUMBO_TAG_PROGRESS,
123
+ GUMBO_TAG_METER,
124
+ GUMBO_TAG_DETAILS,
125
+ GUMBO_TAG_SUMMARY,
126
+ GUMBO_TAG_MENU,
127
+ GUMBO_TAG_MENUITEM,
128
+ GUMBO_TAG_APPLET,
129
+ GUMBO_TAG_ACRONYM,
130
+ GUMBO_TAG_BGSOUND,
131
+ GUMBO_TAG_DIR,
132
+ GUMBO_TAG_FRAME,
133
+ GUMBO_TAG_FRAMESET,
134
+ GUMBO_TAG_NOFRAMES,
135
+ GUMBO_TAG_ISINDEX,
136
+ GUMBO_TAG_LISTING,
137
+ GUMBO_TAG_XMP,
138
+ GUMBO_TAG_NEXTID,
139
+ GUMBO_TAG_NOEMBED,
140
+ GUMBO_TAG_PLAINTEXT,
141
+ GUMBO_TAG_RB,
142
+ GUMBO_TAG_STRIKE,
143
+ GUMBO_TAG_BASEFONT,
144
+ GUMBO_TAG_BIG,
145
+ GUMBO_TAG_BLINK,
146
+ GUMBO_TAG_CENTER,
147
+ GUMBO_TAG_FONT,
148
+ GUMBO_TAG_MARQUEE,
149
+ GUMBO_TAG_MULTICOL,
150
+ GUMBO_TAG_NOBR,
151
+ GUMBO_TAG_SPACER,
152
+ GUMBO_TAG_TT,
153
+ GUMBO_TAG_RTC,
@@ -0,0 +1,105 @@
1
+ static unsigned int tag_hash(
2
+ register const char *str, register unsigned int len) {
3
+ static unsigned short asso_values[] = {296, 296, 296, 296, 296, 296, 296, 296,
4
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
5
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
6
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 6, 4, 3, 1, 1, 0,
7
+ 1, 0, 0, 296, 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2,
8
+ 69, 0, 134, 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296,
9
+ 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, 69, 0, 134,
10
+ 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, 296, 296,
11
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
12
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
13
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
14
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
15
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
16
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
17
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
18
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
19
+ 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296};
20
+ register unsigned int hval = len;
21
+
22
+ switch (hval) {
23
+ default:
24
+ hval += asso_values[(unsigned char) str[1] + 3];
25
+ /*FALLTHROUGH*/
26
+ case 1:
27
+ hval += asso_values[(unsigned char) str[0]];
28
+ break;
29
+ }
30
+ return hval + asso_values[(unsigned char) str[len - 1]];
31
+ }
32
+
33
+ static const unsigned char kGumboTagMap[] = {GUMBO_TAG_LAST, GUMBO_TAG_LAST,
34
+ GUMBO_TAG_LAST, GUMBO_TAG_S, GUMBO_TAG_H6, GUMBO_TAG_H5, GUMBO_TAG_H4,
35
+ GUMBO_TAG_H3, GUMBO_TAG_SPACER, GUMBO_TAG_H2, GUMBO_TAG_HEADER,
36
+ GUMBO_TAG_H1, GUMBO_TAG_HEAD, GUMBO_TAG_LAST, GUMBO_TAG_DETAILS,
37
+ GUMBO_TAG_SELECT, GUMBO_TAG_DIR, GUMBO_TAG_LAST, GUMBO_TAG_DEL,
38
+ GUMBO_TAG_LAST, GUMBO_TAG_SOURCE, GUMBO_TAG_LEGEND, GUMBO_TAG_DATALIST,
39
+ GUMBO_TAG_METER, GUMBO_TAG_MGLYPH, GUMBO_TAG_LAST, GUMBO_TAG_MATH,
40
+ GUMBO_TAG_LABEL, GUMBO_TAG_TABLE, GUMBO_TAG_TEMPLATE, GUMBO_TAG_LAST,
41
+ GUMBO_TAG_RP, GUMBO_TAG_TIME, GUMBO_TAG_TITLE, GUMBO_TAG_DATA,
42
+ GUMBO_TAG_APPLET, GUMBO_TAG_HGROUP, GUMBO_TAG_SAMP, GUMBO_TAG_TEXTAREA,
43
+ GUMBO_TAG_ABBR, GUMBO_TAG_MARQUEE, GUMBO_TAG_LAST, GUMBO_TAG_MENUITEM,
44
+ GUMBO_TAG_SMALL, GUMBO_TAG_META, GUMBO_TAG_A, GUMBO_TAG_LAST,
45
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_EMBED,
46
+ GUMBO_TAG_MAP, GUMBO_TAG_LAST, GUMBO_TAG_PARAM, GUMBO_TAG_LAST,
47
+ GUMBO_TAG_LAST, GUMBO_TAG_NOBR, GUMBO_TAG_P, GUMBO_TAG_SPAN, GUMBO_TAG_EM,
48
+ GUMBO_TAG_LAST, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SECTION, GUMBO_TAG_NOEMBED,
49
+ GUMBO_TAG_NEXTID, GUMBO_TAG_FOOTER, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_HR,
50
+ GUMBO_TAG_LAST, GUMBO_TAG_FONT, GUMBO_TAG_DL, GUMBO_TAG_TR,
51
+ GUMBO_TAG_SCRIPT, GUMBO_TAG_MO, GUMBO_TAG_LAST, GUMBO_TAG_DD,
52
+ GUMBO_TAG_MAIN, GUMBO_TAG_TD, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_FORM,
53
+ GUMBO_TAG_OBJECT, GUMBO_TAG_LAST, GUMBO_TAG_FIELDSET, GUMBO_TAG_LAST,
54
+ GUMBO_TAG_BGSOUND, GUMBO_TAG_MENU, GUMBO_TAG_TFOOT, GUMBO_TAG_FIGURE,
55
+ GUMBO_TAG_RB, GUMBO_TAG_LI, GUMBO_TAG_LISTING, GUMBO_TAG_BASEFONT,
56
+ GUMBO_TAG_OPTGROUP, GUMBO_TAG_LAST, GUMBO_TAG_BASE, GUMBO_TAG_ADDRESS,
57
+ GUMBO_TAG_MI, GUMBO_TAG_LAST, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_LAST,
58
+ GUMBO_TAG_PROGRESS, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
59
+ GUMBO_TAG_ACRONYM, GUMBO_TAG_ARTICLE, GUMBO_TAG_LAST, GUMBO_TAG_PRE,
60
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_AREA,
61
+ GUMBO_TAG_RT, GUMBO_TAG_LAST, GUMBO_TAG_OPTION, GUMBO_TAG_IMAGE,
62
+ GUMBO_TAG_DT, GUMBO_TAG_LAST, GUMBO_TAG_TT, GUMBO_TAG_HTML, GUMBO_TAG_WBR,
63
+ GUMBO_TAG_OL, GUMBO_TAG_LAST, GUMBO_TAG_STYLE, GUMBO_TAG_STRIKE,
64
+ GUMBO_TAG_SUP, GUMBO_TAG_MULTICOL, GUMBO_TAG_U, GUMBO_TAG_DFN, GUMBO_TAG_UL,
65
+ GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST, GUMBO_TAG_VAR,
66
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_FRAMESET, GUMBO_TAG_LAST,
67
+ GUMBO_TAG_BR, GUMBO_TAG_I, GUMBO_TAG_FRAME, GUMBO_TAG_LAST, GUMBO_TAG_DIV,
68
+ GUMBO_TAG_LAST, GUMBO_TAG_TH, GUMBO_TAG_MS, GUMBO_TAG_ANNOTATION_XML,
69
+ GUMBO_TAG_B, GUMBO_TAG_TBODY, GUMBO_TAG_THEAD, GUMBO_TAG_BIG,
70
+ GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_XMP, GUMBO_TAG_LAST, GUMBO_TAG_KBD,
71
+ GUMBO_TAG_LAST, GUMBO_TAG_LINK, GUMBO_TAG_IFRAME, GUMBO_TAG_MARK,
72
+ GUMBO_TAG_CENTER, GUMBO_TAG_OUTPUT, GUMBO_TAG_DESC, GUMBO_TAG_CANVAS,
73
+ GUMBO_TAG_COL, GUMBO_TAG_MALIGNMARK, GUMBO_TAG_IMG, GUMBO_TAG_ASIDE,
74
+ GUMBO_TAG_LAST, GUMBO_TAG_CODE, GUMBO_TAG_LAST, GUMBO_TAG_SUB, GUMBO_TAG_MN,
75
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_INS, GUMBO_TAG_AUDIO,
76
+ GUMBO_TAG_STRONG, GUMBO_TAG_CITE, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
77
+ GUMBO_TAG_LAST, GUMBO_TAG_INPUT, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
78
+ GUMBO_TAG_LAST, GUMBO_TAG_NAV, GUMBO_TAG_LAST, GUMBO_TAG_COLGROUP,
79
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
80
+ GUMBO_TAG_LAST, GUMBO_TAG_SVG, GUMBO_TAG_KEYGEN, GUMBO_TAG_VIDEO,
81
+ GUMBO_TAG_BDO, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
82
+ GUMBO_TAG_LAST, GUMBO_TAG_BODY, GUMBO_TAG_LAST, GUMBO_TAG_Q, GUMBO_TAG_LAST,
83
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_TRACK,
84
+ GUMBO_TAG_LAST, GUMBO_TAG_BDI, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
85
+ GUMBO_TAG_LAST, GUMBO_TAG_CAPTION, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
86
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
87
+ GUMBO_TAG_RUBY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BUTTON,
88
+ GUMBO_TAG_SUMMARY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
89
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
90
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
91
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
92
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
93
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
94
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
95
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
96
+ GUMBO_TAG_LAST, GUMBO_TAG_RTC, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
97
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
98
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BLINK, GUMBO_TAG_LAST,
99
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
100
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
101
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
102
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
103
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
104
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
105
+ GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_ISINDEX};
@@ -0,0 +1,4 @@
1
+ // Generated via `gentags.py src/tag.in`.
2
+ // Do not edit; edit src/tag.in instead.
3
+ // clang-format off
4
+ 4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
@@ -0,0 +1,153 @@
1
+ // Generated via `gentags.py src/tag.in`.
2
+ // Do not edit; edit src/tag.in instead.
3
+ // clang-format off
4
+ "html",
5
+ "head",
6
+ "title",
7
+ "base",
8
+ "link",
9
+ "meta",
10
+ "style",
11
+ "script",
12
+ "noscript",
13
+ "template",
14
+ "body",
15
+ "article",
16
+ "section",
17
+ "nav",
18
+ "aside",
19
+ "h1",
20
+ "h2",
21
+ "h3",
22
+ "h4",
23
+ "h5",
24
+ "h6",
25
+ "hgroup",
26
+ "header",
27
+ "footer",
28
+ "address",
29
+ "p",
30
+ "hr",
31
+ "pre",
32
+ "blockquote",
33
+ "ol",
34
+ "ul",
35
+ "li",
36
+ "dl",
37
+ "dt",
38
+ "dd",
39
+ "figure",
40
+ "figcaption",
41
+ "main",
42
+ "div",
43
+ "a",
44
+ "em",
45
+ "strong",
46
+ "small",
47
+ "s",
48
+ "cite",
49
+ "q",
50
+ "dfn",
51
+ "abbr",
52
+ "data",
53
+ "time",
54
+ "code",
55
+ "var",
56
+ "samp",
57
+ "kbd",
58
+ "sub",
59
+ "sup",
60
+ "i",
61
+ "b",
62
+ "u",
63
+ "mark",
64
+ "ruby",
65
+ "rt",
66
+ "rp",
67
+ "bdi",
68
+ "bdo",
69
+ "span",
70
+ "br",
71
+ "wbr",
72
+ "ins",
73
+ "del",
74
+ "image",
75
+ "img",
76
+ "iframe",
77
+ "embed",
78
+ "object",
79
+ "param",
80
+ "video",
81
+ "audio",
82
+ "source",
83
+ "track",
84
+ "canvas",
85
+ "map",
86
+ "area",
87
+ "math",
88
+ "mi",
89
+ "mo",
90
+ "mn",
91
+ "ms",
92
+ "mtext",
93
+ "mglyph",
94
+ "malignmark",
95
+ "annotation-xml",
96
+ "svg",
97
+ "foreignobject",
98
+ "desc",
99
+ "table",
100
+ "caption",
101
+ "colgroup",
102
+ "col",
103
+ "tbody",
104
+ "thead",
105
+ "tfoot",
106
+ "tr",
107
+ "td",
108
+ "th",
109
+ "form",
110
+ "fieldset",
111
+ "legend",
112
+ "label",
113
+ "input",
114
+ "button",
115
+ "select",
116
+ "datalist",
117
+ "optgroup",
118
+ "option",
119
+ "textarea",
120
+ "keygen",
121
+ "output",
122
+ "progress",
123
+ "meter",
124
+ "details",
125
+ "summary",
126
+ "menu",
127
+ "menuitem",
128
+ "applet",
129
+ "acronym",
130
+ "bgsound",
131
+ "dir",
132
+ "frame",
133
+ "frameset",
134
+ "noframes",
135
+ "isindex",
136
+ "listing",
137
+ "xmp",
138
+ "nextid",
139
+ "noembed",
140
+ "plaintext",
141
+ "rb",
142
+ "strike",
143
+ "basefont",
144
+ "big",
145
+ "blink",
146
+ "center",
147
+ "font",
148
+ "marquee",
149
+ "multicol",
150
+ "nobr",
151
+ "spacer",
152
+ "tt",
153
+ "rtc",
@@ -29,6 +29,7 @@ typedef enum {
29
29
  GUMBO_TOKEN_COMMENT,
30
30
  GUMBO_TOKEN_WHITESPACE,
31
31
  GUMBO_TOKEN_CHARACTER,
32
+ GUMBO_TOKEN_CDATA,
32
33
  GUMBO_TOKEN_NULL,
33
34
  GUMBO_TOKEN_EOF
34
35
  } GumboTokenType;
@@ -42,7 +42,6 @@
42
42
  // prevents parse error position from being messed up by possible mark/resets in
43
43
  // temporary buffer manipulation.
44
44
 
45
-
46
45
  #include "tokenizer.h"
47
46
 
48
47
  #include <assert.h>
@@ -64,13 +63,13 @@
64
63
 
65
64
  // Compared against _script_data_buffer to determine if we're in double-escaped
66
65
  // script mode.
67
- const GumboStringPiece kScriptTag = { "script", 6 };
66
+ const GumboStringPiece kScriptTag = {"script", 6};
68
67
 
69
68
  // An enum for the return value of each individual state.
70
69
  typedef enum {
71
- RETURN_ERROR, // Return false (error) from the tokenizer.
72
- RETURN_SUCCESS, // Return true (success) from the tokenizer.
73
- NEXT_CHAR // Proceed to the next character and continue lexing.
70
+ RETURN_ERROR, // Return false (error) from the tokenizer.
71
+ RETURN_SUCCESS, // Return true (success) from the tokenizer.
72
+ NEXT_CHAR // Proceed to the next character and continue lexing.
74
73
  } StateResult;
75
74
 
76
75
  // This is a struct containing state necessary to build up a tag token,
@@ -136,6 +135,10 @@ typedef struct GumboInternalTokenizerState {
136
135
  // markup declaration state.
137
136
  bool _is_current_node_foreign;
138
137
 
138
+ // A flag indicating whether the tokenizer is in a CDATA section. If so, then
139
+ // text tokens emitted will be GUMBO_TOKEN_CDATA.
140
+ bool _is_in_cdata;
141
+
139
142
  // Certain states (notably character references) may emit two character tokens
140
143
  // at once, but the contract for lex() fills in only one token at a time. The
141
144
  // extra character is buffered here, and then this is checked on entry to
@@ -196,7 +199,8 @@ typedef struct GumboInternalTokenizerState {
196
199
  } GumboTokenizerState;
197
200
 
198
201
  // Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
199
- static void tokenizer_add_parse_error(GumboParser* parser, GumboErrorType type) {
202
+ static void tokenizer_add_parse_error(
203
+ GumboParser* parser, GumboErrorType type) {
200
204
  GumboError* error = gumbo_add_error(parser);
201
205
  if (!error) {
202
206
  return;
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
315
319
  return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
316
320
  }
317
321
 
318
- static GumboTokenType get_char_token_type(int c) {
322
+ static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
323
+ if (is_in_cdata && c > 0) {
324
+ return GUMBO_TOKEN_CDATA;
325
+ }
326
+
319
327
  switch (c) {
320
328
  case '\t':
321
329
  case '\n':
@@ -348,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
348
356
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
349
357
  assert(!tokenizer->_temporary_buffer_emit);
350
358
  utf8iterator_mark(&tokenizer->_input);
351
- gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
352
- gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
359
+ gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
353
360
  // The temporary buffer and script data buffer are the same object in the
354
361
  // spec, so the script data buffer should be cleared as well.
355
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
356
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
362
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
357
363
  }
358
364
 
359
365
  // Appends a codepoint to the temporary buffer.
@@ -366,15 +372,14 @@ static void append_char_to_temporary_buffer(
366
372
  // Checks to see if the temporary buffer equals a certain string.
367
373
  // Make sure this remains side-effect free; it's used in assertions.
368
374
  #ifndef NDEBUG
369
- static bool temporary_buffer_equals(
370
- GumboParser* parser, const char* text) {
375
+ static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
371
376
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
372
377
  // TODO(jdtang): See if the extra strlen is a performance problem, and replace
373
378
  // it with an explicit sizeof(literal) if necessary. I don't think it will
374
379
  // be, as this is only used in a couple of rare states.
375
380
  int text_len = strlen(text);
376
381
  return text_len == buffer->length &&
377
- memcmp(buffer->data, text, text_len) == 0;
382
+ memcmp(buffer->data, text, text_len) == 0;
378
383
  }
379
384
  #endif
380
385
 
@@ -475,7 +480,7 @@ static void finish_doctype_system_id(GumboParser* parser) {
475
480
 
476
481
  // Writes a single specified character to the output token.
477
482
  static void emit_char(GumboParser* parser, int c, GumboToken* output) {
478
- output->type = get_char_token_type(c);
483
+ output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
479
484
  output->v.character = c;
480
485
  finish_token(parser, output);
481
486
  }
@@ -531,8 +536,8 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
531
536
  output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
532
537
  tag_state->_last_start_tag = tag_state->_tag;
533
538
  mark_tag_state_as_empty(tag_state);
534
- gumbo_debug("Emitted start tag %s.\n",
535
- gumbo_normalized_tagname(tag_state->_tag));
539
+ gumbo_debug(
540
+ "Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
536
541
  } else {
537
542
  output->type = GUMBO_TOKEN_END_TAG;
538
543
  output->v.end_tag = tag_state->_tag;
@@ -540,17 +545,18 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
540
545
  // token, but it's still initialized as normal, so it must be manually
541
546
  // deallocated. There may also be attributes to destroy, in certain broken
542
547
  // cases like </div</th> (the "th" is an attribute there).
543
- for (int i = 0; i < tag_state->_attributes.length; ++i) {
548
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
544
549
  gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
545
550
  }
546
551
  gumbo_parser_deallocate(parser, tag_state->_attributes.data);
547
552
  mark_tag_state_as_empty(tag_state);
548
- gumbo_debug("Emitted end tag %s.\n",
549
- gumbo_normalized_tagname(tag_state->_tag));
553
+ gumbo_debug(
554
+ "Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
550
555
  }
551
556
  gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
552
557
  finish_token(parser, output);
553
- gumbo_debug("Original text = %.*s.\n", output->original_text.length, output->original_text.data);
558
+ gumbo_debug("Original text = %.*s.\n", output->original_text.length,
559
+ output->original_text.data);
554
560
  assert(output->original_text.length >= 2);
555
561
  assert(output->original_text.data[0] == '<');
556
562
  assert(output->original_text.data[output->original_text.length - 1] == '>');
@@ -563,7 +569,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
563
569
  // avoid a memory leak.
564
570
  static void abandon_current_tag(GumboParser* parser) {
565
571
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
566
- for (int i = 0; i < tag_state->_attributes.length; ++i) {
572
+ for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
567
573
  gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
568
574
  }
569
575
  gumbo_parser_deallocate(parser, tag_state->_attributes.data);
@@ -575,9 +581,8 @@ static void abandon_current_tag(GumboParser* parser) {
575
581
  // Wraps the consume_char_ref function to handle its output and make the
576
582
  // appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
577
583
  // error occurred, RETURN_SUCCESS otherwise.
578
- static StateResult emit_char_ref(
579
- GumboParser* parser, int additional_allowed_char,
580
- bool is_in_attribute, GumboToken* output) {
584
+ static StateResult emit_char_ref(GumboParser* parser,
585
+ int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
581
586
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
582
587
  OneOrTwoCodepoints char_ref;
583
588
  bool status = consume_char_ref(
@@ -641,8 +646,7 @@ static bool maybe_emit_from_temporary_buffer(
641
646
  // _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
642
647
  // the first character in it. It returns true if a character was emitted, false
643
648
  // otherwise.
644
- static bool emit_temporary_buffer(
645
- GumboParser* parser, GumboToken* output) {
649
+ static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
646
650
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
647
651
  assert(tokenizer->_temporary_buffer.data);
648
652
  utf8iterator_reset(&tokenizer->_input);
@@ -655,8 +659,8 @@ static bool emit_temporary_buffer(
655
659
  // start point; the only time you would *not* want to pass true for this
656
660
  // parameter is if you want the original_text to include character (like an
657
661
  // opening quote) that doesn't appear in the value.
658
- static void append_char_to_tag_buffer(GumboParser* parser, int codepoint,
659
- bool reinitilize_position_on_first) {
662
+ static void append_char_to_tag_buffer(
663
+ GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
660
664
  GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
661
665
  if (buffer->length == 0 && reinitilize_position_on_first) {
662
666
  reset_tag_buffer_start_point(parser);
@@ -689,7 +693,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
689
693
  gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
690
694
 
691
695
  assert(tag_state->_attributes.data == NULL);
692
- gumbo_vector_init(parser, 4, &tag_state->_attributes);
696
+ // Initial size chosen by statistical analysis of a corpus of 60k webpages.
697
+ // 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
698
+ // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
699
+ // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
700
+ gumbo_vector_init(parser, 1, &tag_state->_attributes);
693
701
  tag_state->_drop_next_attr_value = false;
694
702
  tag_state->_is_start_tag = is_start_tag;
695
703
  tag_state->_is_self_closing = false;
@@ -709,16 +717,15 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
709
717
  // * The start_pos GumboSourcePosition with the start position of the tag
710
718
  // buffer.
711
719
  // * The end_pos GumboSourcePosition with the current source position.
712
- static void copy_over_original_tag_text(
713
- GumboParser* parser, GumboStringPiece* original_text,
714
- GumboSourcePosition* start_pos, GumboSourcePosition* end_pos) {
720
+ static void copy_over_original_tag_text(GumboParser* parser,
721
+ GumboStringPiece* original_text, GumboSourcePosition* start_pos,
722
+ GumboSourcePosition* end_pos) {
715
723
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
716
724
  GumboTagState* tag_state = &tokenizer->_tag_state;
717
725
 
718
726
  original_text->data = tag_state->_original_text;
719
- original_text->length =
720
- utf8iterator_get_char_pointer(&tokenizer->_input) -
721
- tag_state->_original_text;
727
+ original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
728
+ tag_state->_original_text;
722
729
  if (original_text->data[original_text->length - 1] == '\r') {
723
730
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
724
731
  // appended to the end of original text even when it's really the first part
@@ -743,16 +750,14 @@ static void finish_tag_name(GumboParser* parser) {
743
750
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
744
751
  GumboTagState* tag_state = &tokenizer->_tag_state;
745
752
 
746
- const char* temp;
747
- copy_over_tag_buffer(parser, &temp);
748
- tag_state->_tag = gumbo_tag_enum(temp);
753
+ tag_state->_tag =
754
+ gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
749
755
  reinitialize_tag_buffer(parser);
750
- gumbo_parser_deallocate(parser, (void*) temp);
751
756
  }
752
757
 
753
758
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
754
759
  static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
755
- int original_index, int new_index) {
760
+ int original_index, int new_index) {
756
761
  GumboError* error = gumbo_add_error(parser);
757
762
  if (!error) {
758
763
  return;
@@ -782,14 +787,13 @@ static bool finish_attribute_name(GumboParser* parser) {
782
787
  assert(tag_state->_attributes.capacity);
783
788
 
784
789
  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
785
- for (int i = 0; i < attributes->length; ++i) {
790
+ for (unsigned int i = 0; i < attributes->length; ++i) {
786
791
  GumboAttribute* attr = attributes->data[i];
787
792
  if (strlen(attr->name) == tag_state->_buffer.length &&
788
793
  memcmp(attr->name, tag_state->_buffer.data,
789
- tag_state->_buffer.length) == 0) {
794
+ tag_state->_buffer.length) == 0) {
790
795
  // Identical attribute; bail.
791
- add_duplicate_attr_error(
792
- parser, attr->name, i, attributes->length);
796
+ add_duplicate_attr_error(parser, attr->name, i, attributes->length);
793
797
  tag_state->_drop_next_attr_value = true;
794
798
  return false;
795
799
  }
@@ -798,11 +802,11 @@ static bool finish_attribute_name(GumboParser* parser) {
798
802
  GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
799
803
  attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
800
804
  copy_over_tag_buffer(parser, &attr->name);
801
- copy_over_original_tag_text(parser, &attr->original_name,
802
- &attr->name_start, &attr->name_end);
805
+ copy_over_original_tag_text(
806
+ parser, &attr->original_name, &attr->name_start, &attr->name_end);
803
807
  attr->value = gumbo_copy_stringz(parser, "");
804
- copy_over_original_tag_text(parser, &attr->original_value,
805
- &attr->name_start, &attr->name_end);
808
+ copy_over_original_tag_text(
809
+ parser, &attr->original_value, &attr->name_start, &attr->name_end);
806
810
  gumbo_vector_add(parser, attr, attributes);
807
811
  reinitialize_tag_buffer(parser);
808
812
  return true;
@@ -824,8 +828,8 @@ static void finish_attribute_value(GumboParser* parser) {
824
828
  tag_state->_attributes.data[tag_state->_attributes.length - 1];
825
829
  gumbo_parser_deallocate(parser, (void*) attr->value);
826
830
  copy_over_tag_buffer(parser, &attr->value);
827
- copy_over_original_tag_text(parser, &attr->original_value,
828
- &attr->value_start, &attr->value_end);
831
+ copy_over_original_tag_text(
832
+ parser, &attr->original_value, &attr->value_start, &attr->value_end);
829
833
  reinitialize_tag_buffer(parser);
830
834
  }
831
835
 
@@ -833,13 +837,9 @@ static void finish_attribute_value(GumboParser* parser) {
833
837
  static bool is_appropriate_end_tag(GumboParser* parser) {
834
838
  GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
835
839
  assert(!tag_state->_is_start_tag);
836
- // Null terminate the current string buffer, so it can be passed to
837
- // gumbo_tag_enum, but don't increment the length in case we need to dump the
838
- // buffer as character tokens.
839
- gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
840
- --tag_state->_buffer.length;
841
840
  return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
842
- tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
841
+ tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
842
+ tag_state->_buffer.length);
843
843
  }
844
844
 
845
845
  void gumbo_tokenizer_state_init(
@@ -850,6 +850,7 @@ void gumbo_tokenizer_state_init(
850
850
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
851
851
  tokenizer->_reconsume_current_input = false;
852
852
  tokenizer->_is_current_node_foreign = false;
853
+ tokenizer->_is_in_cdata = false;
853
854
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
854
855
 
855
856
  tokenizer->_buffered_emit_char = kGumboNoChar;
@@ -883,15 +884,14 @@ void gumbo_tokenizer_set_is_current_node_foreign(
883
884
  GumboParser* parser, bool is_foreign) {
884
885
  if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
885
886
  gumbo_debug("Toggling is_current_node_foreign to %s.\n",
886
- is_foreign ? "true" : "false");
887
+ is_foreign ? "true" : "false");
887
888
  }
888
889
  parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
889
890
  }
890
891
 
891
892
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
892
- static StateResult handle_data_state(
893
- GumboParser* parser, GumboTokenizerState* tokenizer,
894
- int c, GumboToken* output) {
893
+ static StateResult handle_data_state(GumboParser* parser,
894
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
895
895
  switch (c) {
896
896
  case '&':
897
897
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
@@ -915,17 +915,15 @@ static StateResult handle_data_state(
915
915
  }
916
916
 
917
917
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
918
- static StateResult handle_char_ref_in_data_state(
919
- GumboParser* parser, GumboTokenizerState* tokenizer,
920
- int c, GumboToken* output) {
918
+ static StateResult handle_char_ref_in_data_state(GumboParser* parser,
919
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
921
920
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
922
921
  return emit_char_ref(parser, ' ', false, output);
923
922
  }
924
923
 
925
924
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
926
- static StateResult handle_rcdata_state(
927
- GumboParser* parser, GumboTokenizerState* tokenizer,
928
- int c, GumboToken* output) {
925
+ static StateResult handle_rcdata_state(GumboParser* parser,
926
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
929
927
  switch (c) {
930
928
  case '&':
931
929
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
@@ -946,17 +944,15 @@ static StateResult handle_rcdata_state(
946
944
  }
947
945
 
948
946
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
949
- static StateResult handle_char_ref_in_rcdata_state(
950
- GumboParser* parser, GumboTokenizerState* tokenizer,
951
- int c, GumboToken* output) {
947
+ static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
948
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
952
949
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
953
950
  return emit_char_ref(parser, ' ', false, output);
954
951
  }
955
952
 
956
953
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
957
- static StateResult handle_rawtext_state(
958
- GumboParser* parser, GumboTokenizerState* tokenizer,
959
- int c, GumboToken* output) {
954
+ static StateResult handle_rawtext_state(GumboParser* parser,
955
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
960
956
  switch (c) {
961
957
  case '<':
962
958
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
@@ -973,9 +969,8 @@ static StateResult handle_rawtext_state(
973
969
  }
974
970
 
975
971
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
976
- static StateResult handle_script_state(
977
- GumboParser* parser, GumboTokenizerState* tokenizer,
978
- int c, GumboToken* output) {
972
+ static StateResult handle_script_state(GumboParser* parser,
973
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
979
974
  switch (c) {
980
975
  case '<':
981
976
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
@@ -992,9 +987,8 @@ static StateResult handle_script_state(
992
987
  }
993
988
 
994
989
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
995
- static StateResult handle_plaintext_state(
996
- GumboParser* parser, GumboTokenizerState* tokenizer,
997
- int c, GumboToken* output) {
990
+ static StateResult handle_plaintext_state(GumboParser* parser,
991
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
998
992
  switch (c) {
999
993
  case '\0':
1000
994
  return emit_replacement_char(parser, output);
@@ -1006,9 +1000,8 @@ static StateResult handle_plaintext_state(
1006
1000
  }
1007
1001
 
1008
1002
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
1009
- static StateResult handle_tag_open_state(
1010
- GumboParser* parser, GumboTokenizerState* tokenizer,
1011
- int c, GumboToken* output) {
1003
+ static StateResult handle_tag_open_state(GumboParser* parser,
1004
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1012
1005
  assert(temporary_buffer_equals(parser, "<"));
1013
1006
  switch (c) {
1014
1007
  case '!':
@@ -1040,9 +1033,8 @@ static StateResult handle_tag_open_state(
1040
1033
  }
1041
1034
 
1042
1035
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
1043
- static StateResult handle_end_tag_open_state(
1044
- GumboParser* parser, GumboTokenizerState* tokenizer,
1045
- int c, GumboToken* output) {
1036
+ static StateResult handle_end_tag_open_state(GumboParser* parser,
1037
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1046
1038
  assert(temporary_buffer_equals(parser, "</"));
1047
1039
  switch (c) {
1048
1040
  case '>':
@@ -1068,9 +1060,8 @@ static StateResult handle_end_tag_open_state(
1068
1060
  }
1069
1061
 
1070
1062
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
1071
- static StateResult handle_tag_name_state(
1072
- GumboParser* parser, GumboTokenizerState* tokenizer,
1073
- int c, GumboToken* output) {
1063
+ static StateResult handle_tag_name_state(GumboParser* parser,
1064
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1074
1065
  switch (c) {
1075
1066
  case '\t':
1076
1067
  case '\n':
@@ -1103,9 +1094,8 @@ static StateResult handle_tag_name_state(
1103
1094
  }
1104
1095
 
1105
1096
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
1106
- static StateResult handle_rcdata_lt_state(
1107
- GumboParser* parser, GumboTokenizerState* tokenizer,
1108
- int c, GumboToken* output) {
1097
+ static StateResult handle_rcdata_lt_state(GumboParser* parser,
1098
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1109
1099
  assert(temporary_buffer_equals(parser, "<"));
1110
1100
  if (c == '/') {
1111
1101
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
@@ -1119,9 +1109,8 @@ static StateResult handle_rcdata_lt_state(
1119
1109
  }
1120
1110
 
1121
1111
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
1122
- static StateResult handle_rcdata_end_tag_open_state(
1123
- GumboParser* parser, GumboTokenizerState* tokenizer,
1124
- int c, GumboToken* output) {
1112
+ static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
1113
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1125
1114
  assert(temporary_buffer_equals(parser, "</"));
1126
1115
  if (is_alpha(c)) {
1127
1116
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
@@ -1136,9 +1125,8 @@ static StateResult handle_rcdata_end_tag_open_state(
1136
1125
  }
1137
1126
 
1138
1127
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
1139
- static StateResult handle_rcdata_end_tag_name_state(
1140
- GumboParser* parser, GumboTokenizerState* tokenizer,
1141
- int c, GumboToken* output) {
1128
+ static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
1129
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1142
1130
  assert(tokenizer->_temporary_buffer.length >= 2);
1143
1131
  if (is_alpha(c)) {
1144
1132
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1169,9 +1157,8 @@ static StateResult handle_rcdata_end_tag_name_state(
1169
1157
  }
1170
1158
 
1171
1159
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
1172
- static StateResult handle_rawtext_lt_state(
1173
- GumboParser* parser, GumboTokenizerState* tokenizer,
1174
- int c, GumboToken* output) {
1160
+ static StateResult handle_rawtext_lt_state(GumboParser* parser,
1161
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1175
1162
  assert(temporary_buffer_equals(parser, "<"));
1176
1163
  if (c == '/') {
1177
1164
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
@@ -1185,9 +1172,8 @@ static StateResult handle_rawtext_lt_state(
1185
1172
  }
1186
1173
 
1187
1174
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
1188
- static StateResult handle_rawtext_end_tag_open_state(
1189
- GumboParser* parser, GumboTokenizerState* tokenizer,
1190
- int c, GumboToken* output) {
1175
+ static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
1176
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1191
1177
  assert(temporary_buffer_equals(parser, "</"));
1192
1178
  if (is_alpha(c)) {
1193
1179
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
@@ -1201,12 +1187,11 @@ static StateResult handle_rawtext_end_tag_open_state(
1201
1187
  }
1202
1188
 
1203
1189
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
1204
- static StateResult handle_rawtext_end_tag_name_state(
1205
- GumboParser* parser, GumboTokenizerState* tokenizer,
1206
- int c, GumboToken* output) {
1190
+ static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
1191
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1207
1192
  assert(tokenizer->_temporary_buffer.length >= 2);
1208
1193
  gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
1209
- tokenizer->_tag_state._buffer.data);
1194
+ tokenizer->_tag_state._buffer.data);
1210
1195
  if (is_alpha(c)) {
1211
1196
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1212
1197
  append_char_to_temporary_buffer(parser, c);
@@ -1237,9 +1222,8 @@ static StateResult handle_rawtext_end_tag_name_state(
1237
1222
  }
1238
1223
 
1239
1224
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
1240
- static StateResult handle_script_lt_state(
1241
- GumboParser* parser, GumboTokenizerState* tokenizer,
1242
- int c, GumboToken* output) {
1225
+ static StateResult handle_script_lt_state(GumboParser* parser,
1226
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1243
1227
  assert(temporary_buffer_equals(parser, "<"));
1244
1228
  if (c == '/') {
1245
1229
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
@@ -1257,9 +1241,8 @@ static StateResult handle_script_lt_state(
1257
1241
  }
1258
1242
 
1259
1243
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
1260
- static StateResult handle_script_end_tag_open_state(
1261
- GumboParser* parser, GumboTokenizerState* tokenizer,
1262
- int c, GumboToken* output) {
1244
+ static StateResult handle_script_end_tag_open_state(GumboParser* parser,
1245
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1263
1246
  assert(temporary_buffer_equals(parser, "</"));
1264
1247
  if (is_alpha(c)) {
1265
1248
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
@@ -1273,9 +1256,8 @@ static StateResult handle_script_end_tag_open_state(
1273
1256
  }
1274
1257
 
1275
1258
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
1276
- static StateResult handle_script_end_tag_name_state(
1277
- GumboParser* parser, GumboTokenizerState* tokenizer,
1278
- int c, GumboToken* output) {
1259
+ static StateResult handle_script_end_tag_name_state(GumboParser* parser,
1260
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1279
1261
  assert(tokenizer->_temporary_buffer.length >= 2);
1280
1262
  if (is_alpha(c)) {
1281
1263
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1306,9 +1288,8 @@ static StateResult handle_script_end_tag_name_state(
1306
1288
  }
1307
1289
 
1308
1290
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
1309
- static StateResult handle_script_escaped_start_state(
1310
- GumboParser* parser, GumboTokenizerState* tokenizer,
1311
- int c, GumboToken* output) {
1291
+ static StateResult handle_script_escaped_start_state(GumboParser* parser,
1292
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1312
1293
  if (c == '-') {
1313
1294
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
1314
1295
  return emit_current_char(parser, output);
@@ -1320,9 +1301,8 @@ static StateResult handle_script_escaped_start_state(
1320
1301
  }
1321
1302
 
1322
1303
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
1323
- static StateResult handle_script_escaped_start_dash_state(
1324
- GumboParser* parser, GumboTokenizerState* tokenizer,
1325
- int c, GumboToken* output) {
1304
+ static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
1305
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1326
1306
  if (c == '-') {
1327
1307
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
1328
1308
  return emit_current_char(parser, output);
@@ -1334,9 +1314,8 @@ static StateResult handle_script_escaped_start_dash_state(
1334
1314
  }
1335
1315
 
1336
1316
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
1337
- static StateResult handle_script_escaped_state(
1338
- GumboParser* parser, GumboTokenizerState* tokenizer,
1339
- int c, GumboToken* output) {
1317
+ static StateResult handle_script_escaped_state(GumboParser* parser,
1318
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1340
1319
  switch (c) {
1341
1320
  case '-':
1342
1321
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
@@ -1357,9 +1336,8 @@ static StateResult handle_script_escaped_state(
1357
1336
  }
1358
1337
 
1359
1338
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
1360
- static StateResult handle_script_escaped_dash_state(
1361
- GumboParser* parser, GumboTokenizerState* tokenizer,
1362
- int c, GumboToken* output) {
1339
+ static StateResult handle_script_escaped_dash_state(GumboParser* parser,
1340
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1363
1341
  switch (c) {
1364
1342
  case '-':
1365
1343
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
@@ -1383,9 +1361,8 @@ static StateResult handle_script_escaped_dash_state(
1383
1361
  }
1384
1362
 
1385
1363
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
1386
- static StateResult handle_script_escaped_dash_dash_state(
1387
- GumboParser* parser, GumboTokenizerState* tokenizer,
1388
- int c, GumboToken* output) {
1364
+ static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
1365
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1389
1366
  switch (c) {
1390
1367
  case '-':
1391
1368
  return emit_current_char(parser, output);
@@ -1411,9 +1388,8 @@ static StateResult handle_script_escaped_dash_dash_state(
1411
1388
  }
1412
1389
 
1413
1390
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
1414
- static StateResult handle_script_escaped_lt_state(
1415
- GumboParser* parser, GumboTokenizerState* tokenizer,
1416
- int c, GumboToken* output) {
1391
+ static StateResult handle_script_escaped_lt_state(GumboParser* parser,
1392
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1417
1393
  assert(temporary_buffer_equals(parser, "<"));
1418
1394
  assert(!tokenizer->_script_data_buffer.length);
1419
1395
  if (c == '/') {
@@ -1433,9 +1409,8 @@ static StateResult handle_script_escaped_lt_state(
1433
1409
  }
1434
1410
 
1435
1411
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
1436
- static StateResult handle_script_escaped_end_tag_open_state(
1437
- GumboParser* parser, GumboTokenizerState* tokenizer,
1438
- int c, GumboToken* output) {
1412
+ static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
1413
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1439
1414
  assert(temporary_buffer_equals(parser, "</"));
1440
1415
  if (is_alpha(c)) {
1441
1416
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
@@ -1449,9 +1424,8 @@ static StateResult handle_script_escaped_end_tag_open_state(
1449
1424
  }
1450
1425
 
1451
1426
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
1452
- static StateResult handle_script_escaped_end_tag_name_state(
1453
- GumboParser* parser, GumboTokenizerState* tokenizer,
1454
- int c, GumboToken* output) {
1427
+ static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
1428
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1455
1429
  assert(tokenizer->_temporary_buffer.length >= 2);
1456
1430
  if (is_alpha(c)) {
1457
1431
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1482,9 +1456,8 @@ static StateResult handle_script_escaped_end_tag_name_state(
1482
1456
  }
1483
1457
 
1484
1458
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
1485
- static StateResult handle_script_double_escaped_start_state(
1486
- GumboParser* parser, GumboTokenizerState* tokenizer,
1487
- int c, GumboToken* output) {
1459
+ static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
1460
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1488
1461
  switch (c) {
1489
1462
  case '\t':
1490
1463
  case '\n':
@@ -1492,9 +1465,11 @@ static StateResult handle_script_double_escaped_start_state(
1492
1465
  case ' ':
1493
1466
  case '/':
1494
1467
  case '>':
1495
- gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1496
- &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1497
- ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED : GUMBO_LEX_SCRIPT_ESCAPED);
1468
+ gumbo_tokenizer_set_state(
1469
+ parser, gumbo_string_equals(&kScriptTag,
1470
+ (GumboStringPiece*) &tokenizer->_script_data_buffer)
1471
+ ? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
1472
+ : GUMBO_LEX_SCRIPT_ESCAPED);
1498
1473
  return emit_current_char(parser, output);
1499
1474
  default:
1500
1475
  if (is_alpha(c)) {
@@ -1510,9 +1485,8 @@ static StateResult handle_script_double_escaped_start_state(
1510
1485
  }
1511
1486
 
1512
1487
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
1513
- static StateResult handle_script_double_escaped_state(
1514
- GumboParser* parser, GumboTokenizerState* tokenizer,
1515
- int c, GumboToken* output) {
1488
+ static StateResult handle_script_double_escaped_state(GumboParser* parser,
1489
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1516
1490
  switch (c) {
1517
1491
  case '-':
1518
1492
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
@@ -1532,9 +1506,8 @@ static StateResult handle_script_double_escaped_state(
1532
1506
  }
1533
1507
 
1534
1508
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
1535
- static StateResult handle_script_double_escaped_dash_state(
1536
- GumboParser* parser, GumboTokenizerState* tokenizer,
1537
- int c, GumboToken* output) {
1509
+ static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
1510
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1538
1511
  switch (c) {
1539
1512
  case '-':
1540
1513
  gumbo_tokenizer_set_state(
@@ -1558,8 +1531,8 @@ static StateResult handle_script_double_escaped_dash_state(
1558
1531
 
1559
1532
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
1560
1533
  static StateResult handle_script_double_escaped_dash_dash_state(
1561
- GumboParser* parser, GumboTokenizerState* tokenizer,
1562
- int c, GumboToken* output) {
1534
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
1535
+ GumboToken* output) {
1563
1536
  switch (c) {
1564
1537
  case '-':
1565
1538
  return emit_current_char(parser, output);
@@ -1583,26 +1556,22 @@ static StateResult handle_script_double_escaped_dash_dash_state(
1583
1556
  }
1584
1557
 
1585
1558
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
1586
- static StateResult handle_script_double_escaped_lt_state(
1587
- GumboParser* parser, GumboTokenizerState* tokenizer,
1588
- int c, GumboToken* output) {
1559
+ static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
1560
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1589
1561
  if (c == '/') {
1590
1562
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
1591
- gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
1592
- gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
1563
+ gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
1593
1564
  return emit_current_char(parser, output);
1594
1565
  } else {
1595
1566
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1596
1567
  tokenizer->_reconsume_current_input = true;
1597
1568
  return NEXT_CHAR;
1598
1569
  }
1599
-
1600
1570
  }
1601
1571
 
1602
1572
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
1603
- static StateResult handle_script_double_escaped_end_state(
1604
- GumboParser* parser, GumboTokenizerState* tokenizer,
1605
- int c, GumboToken* output) {
1573
+ static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
1574
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1606
1575
  switch (c) {
1607
1576
  case '\t':
1608
1577
  case '\n':
@@ -1610,9 +1579,11 @@ static StateResult handle_script_double_escaped_end_state(
1610
1579
  case ' ':
1611
1580
  case '/':
1612
1581
  case '>':
1613
- gumbo_tokenizer_set_state(parser, gumbo_string_equals(
1614
- &kScriptTag, (GumboStringPiece*) &tokenizer->_script_data_buffer)
1615
- ? GUMBO_LEX_SCRIPT_ESCAPED : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1582
+ gumbo_tokenizer_set_state(
1583
+ parser, gumbo_string_equals(&kScriptTag,
1584
+ (GumboStringPiece*) &tokenizer->_script_data_buffer)
1585
+ ? GUMBO_LEX_SCRIPT_ESCAPED
1586
+ : GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
1616
1587
  return emit_current_char(parser, output);
1617
1588
  default:
1618
1589
  if (is_alpha(c)) {
@@ -1628,9 +1599,8 @@ static StateResult handle_script_double_escaped_end_state(
1628
1599
  }
1629
1600
 
1630
1601
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
1631
- static StateResult handle_before_attr_name_state(
1632
- GumboParser* parser, GumboTokenizerState* tokenizer,
1633
- int c, GumboToken* output) {
1602
+ static StateResult handle_before_attr_name_state(GumboParser* parser,
1603
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1634
1604
  switch (c) {
1635
1605
  case '\t':
1636
1606
  case '\n':
@@ -1658,7 +1628,7 @@ static StateResult handle_before_attr_name_state(
1658
1628
  case '<':
1659
1629
  case '=':
1660
1630
  tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1661
- // Fall through.
1631
+ // Fall through.
1662
1632
  default:
1663
1633
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1664
1634
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1667,9 +1637,8 @@ static StateResult handle_before_attr_name_state(
1667
1637
  }
1668
1638
 
1669
1639
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
1670
- static StateResult handle_attr_name_state(
1671
- GumboParser* parser, GumboTokenizerState* tokenizer,
1672
- int c, GumboToken* output) {
1640
+ static StateResult handle_attr_name_state(GumboParser* parser,
1641
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1673
1642
  switch (c) {
1674
1643
  case '\t':
1675
1644
  case '\n':
@@ -1703,7 +1672,7 @@ static StateResult handle_attr_name_state(
1703
1672
  case '\'':
1704
1673
  case '<':
1705
1674
  tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1706
- // Fall through.
1675
+ // Fall through.
1707
1676
  default:
1708
1677
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
1709
1678
  return NEXT_CHAR;
@@ -1711,9 +1680,8 @@ static StateResult handle_attr_name_state(
1711
1680
  }
1712
1681
 
1713
1682
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
1714
- static StateResult handle_after_attr_name_state(
1715
- GumboParser* parser, GumboTokenizerState* tokenizer,
1716
- int c, GumboToken* output) {
1683
+ static StateResult handle_after_attr_name_state(GumboParser* parser,
1684
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1717
1685
  switch (c) {
1718
1686
  case '\t':
1719
1687
  case '\n':
@@ -1743,7 +1711,7 @@ static StateResult handle_after_attr_name_state(
1743
1711
  case '\'':
1744
1712
  case '<':
1745
1713
  tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
1746
- // Fall through.
1714
+ // Fall through.
1747
1715
  default:
1748
1716
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
1749
1717
  append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
@@ -1752,9 +1720,8 @@ static StateResult handle_after_attr_name_state(
1752
1720
  }
1753
1721
 
1754
1722
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
1755
- static StateResult handle_before_attr_value_state(
1756
- GumboParser* parser, GumboTokenizerState* tokenizer,
1757
- int c, GumboToken* output) {
1723
+ static StateResult handle_before_attr_value_state(GumboParser* parser,
1724
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1758
1725
  switch (c) {
1759
1726
  case '\t':
1760
1727
  case '\n':
@@ -1793,7 +1760,7 @@ static StateResult handle_before_attr_value_state(
1793
1760
  case '=':
1794
1761
  case '`':
1795
1762
  tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1796
- // Fall through.
1763
+ // Fall through.
1797
1764
  default:
1798
1765
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
1799
1766
  append_char_to_tag_buffer(parser, c, true);
@@ -1802,9 +1769,8 @@ static StateResult handle_before_attr_value_state(
1802
1769
  }
1803
1770
 
1804
1771
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
1805
- static StateResult handle_attr_value_double_quoted_state(
1806
- GumboParser* parser, GumboTokenizerState* tokenizer,
1807
- int c, GumboToken* output) {
1772
+ static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
1773
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1808
1774
  switch (c) {
1809
1775
  case '"':
1810
1776
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1831,9 +1797,8 @@ static StateResult handle_attr_value_double_quoted_state(
1831
1797
  }
1832
1798
 
1833
1799
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
1834
- static StateResult handle_attr_value_single_quoted_state(
1835
- GumboParser* parser, GumboTokenizerState* tokenizer,
1836
- int c, GumboToken* output) {
1800
+ static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
1801
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1837
1802
  switch (c) {
1838
1803
  case '\'':
1839
1804
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
@@ -1860,9 +1825,8 @@ static StateResult handle_attr_value_single_quoted_state(
1860
1825
  }
1861
1826
 
1862
1827
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
1863
- static StateResult handle_attr_value_unquoted_state(
1864
- GumboParser* parser, GumboTokenizerState* tokenizer,
1865
- int c, GumboToken* output) {
1828
+ static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
1829
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1866
1830
  switch (c) {
1867
1831
  case '\t':
1868
1832
  case '\n':
@@ -1896,7 +1860,7 @@ static StateResult handle_attr_value_unquoted_state(
1896
1860
  case '\'':
1897
1861
  case '`':
1898
1862
  tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
1899
- // Fall through.
1863
+ // Fall through.
1900
1864
  default:
1901
1865
  append_char_to_tag_buffer(parser, c, true);
1902
1866
  return NEXT_CHAR;
@@ -1904,9 +1868,8 @@ static StateResult handle_attr_value_unquoted_state(
1904
1868
  }
1905
1869
 
1906
1870
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
1907
- static StateResult handle_char_ref_in_attr_value_state(
1908
- GumboParser* parser, GumboTokenizerState* tokenizer,
1909
- int c, GumboToken* output) {
1871
+ static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
1872
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1910
1873
  OneOrTwoCodepoints char_ref;
1911
1874
  int allowed_char;
1912
1875
  bool is_unquoted = false;
@@ -1947,9 +1910,8 @@ static StateResult handle_char_ref_in_attr_value_state(
1947
1910
  }
1948
1911
 
1949
1912
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
1950
- static StateResult handle_after_attr_value_quoted_state(
1951
- GumboParser* parser, GumboTokenizerState* tokenizer,
1952
- int c, GumboToken* output) {
1913
+ static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
1914
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1953
1915
  finish_attribute_value(parser);
1954
1916
  switch (c) {
1955
1917
  case '\t':
@@ -1979,9 +1941,8 @@ static StateResult handle_after_attr_value_quoted_state(
1979
1941
  }
1980
1942
 
1981
1943
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
1982
- static StateResult handle_self_closing_start_tag_state(
1983
- GumboParser* parser, GumboTokenizerState* tokenizer,
1984
- int c, GumboToken* output) {
1944
+ static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
1945
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
1985
1946
  switch (c) {
1986
1947
  case '>':
1987
1948
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -2001,9 +1962,8 @@ static StateResult handle_self_closing_start_tag_state(
2001
1962
  }
2002
1963
 
2003
1964
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
2004
- static StateResult handle_bogus_comment_state(
2005
- GumboParser* parser, GumboTokenizerState* tokenizer,
2006
- int c, GumboToken* output) {
1965
+ static StateResult handle_bogus_comment_state(GumboParser* parser,
1966
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2007
1967
  while (c != '>' && c != -1) {
2008
1968
  if (c == '\0') {
2009
1969
  c = 0xFFFD;
@@ -2017,15 +1977,14 @@ static StateResult handle_bogus_comment_state(
2017
1977
  }
2018
1978
 
2019
1979
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
2020
- static StateResult handle_markup_declaration_state(
2021
- GumboParser* parser, GumboTokenizerState* tokenizer,
2022
- int c, GumboToken* output) {
1980
+ static StateResult handle_markup_declaration_state(GumboParser* parser,
1981
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2023
1982
  if (utf8iterator_maybe_consume_match(
2024
- &tokenizer->_input, "--", sizeof("--") - 1, true)) {
1983
+ &tokenizer->_input, "--", sizeof("--") - 1, true)) {
2025
1984
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
2026
1985
  tokenizer->_reconsume_current_input = true;
2027
1986
  } else if (utf8iterator_maybe_consume_match(
2028
- &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
1987
+ &tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
2029
1988
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
2030
1989
  tokenizer->_reconsume_current_input = true;
2031
1990
  // If we get here, we know we'll eventually emit a doctype token, so now is
@@ -2039,8 +1998,9 @@ static StateResult handle_markup_declaration_state(
2039
1998
  gumbo_copy_stringz(parser, "");
2040
1999
  } else if (tokenizer->_is_current_node_foreign &&
2041
2000
  utf8iterator_maybe_consume_match(
2042
- &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2001
+ &tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
2043
2002
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
2003
+ tokenizer->_is_in_cdata = true;
2044
2004
  tokenizer->_reconsume_current_input = true;
2045
2005
  } else {
2046
2006
  tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
@@ -2052,9 +2012,8 @@ static StateResult handle_markup_declaration_state(
2052
2012
  }
2053
2013
 
2054
2014
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
2055
- static StateResult handle_comment_start_state(
2056
- GumboParser* parser, GumboTokenizerState* tokenizer,
2057
- int c, GumboToken* output) {
2015
+ static StateResult handle_comment_start_state(GumboParser* parser,
2016
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2058
2017
  switch (c) {
2059
2018
  case '-':
2060
2019
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
@@ -2082,9 +2041,8 @@ static StateResult handle_comment_start_state(
2082
2041
  }
2083
2042
 
2084
2043
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
2085
- static StateResult handle_comment_start_dash_state(
2086
- GumboParser* parser, GumboTokenizerState* tokenizer,
2087
- int c, GumboToken* output) {
2044
+ static StateResult handle_comment_start_dash_state(GumboParser* parser,
2045
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2088
2046
  switch (c) {
2089
2047
  case '-':
2090
2048
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2114,9 +2072,8 @@ static StateResult handle_comment_start_dash_state(
2114
2072
  }
2115
2073
 
2116
2074
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
2117
- static StateResult handle_comment_state(
2118
- GumboParser* parser, GumboTokenizerState* tokenizer,
2119
- int c, GumboToken* output) {
2075
+ static StateResult handle_comment_state(GumboParser* parser,
2076
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2120
2077
  switch (c) {
2121
2078
  case '-':
2122
2079
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2137,9 +2094,8 @@ static StateResult handle_comment_state(
2137
2094
  }
2138
2095
 
2139
2096
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
2140
- static StateResult handle_comment_end_dash_state(
2141
- GumboParser* parser, GumboTokenizerState* tokenizer,
2142
- int c, GumboToken* output) {
2097
+ static StateResult handle_comment_end_dash_state(GumboParser* parser,
2098
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2143
2099
  switch (c) {
2144
2100
  case '-':
2145
2101
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
@@ -2164,9 +2120,8 @@ static StateResult handle_comment_end_dash_state(
2164
2120
  }
2165
2121
 
2166
2122
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
2167
- static StateResult handle_comment_end_state(
2168
- GumboParser* parser, GumboTokenizerState* tokenizer,
2169
- int c, GumboToken* output) {
2123
+ static StateResult handle_comment_end_state(GumboParser* parser,
2124
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2170
2125
  switch (c) {
2171
2126
  case '>':
2172
2127
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
@@ -2179,11 +2134,13 @@ static StateResult handle_comment_end_state(
2179
2134
  append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
2180
2135
  return NEXT_CHAR;
2181
2136
  case '!':
2182
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2137
+ tokenizer_add_parse_error(
2138
+ parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
2183
2139
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
2184
2140
  return NEXT_CHAR;
2185
2141
  case '-':
2186
- tokenizer_add_parse_error(parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2142
+ tokenizer_add_parse_error(
2143
+ parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
2187
2144
  append_char_to_temporary_buffer(parser, '-');
2188
2145
  return NEXT_CHAR;
2189
2146
  case -1:
@@ -2202,9 +2159,8 @@ static StateResult handle_comment_end_state(
2202
2159
  }
2203
2160
 
2204
2161
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
2205
- static StateResult handle_comment_end_bang_state(
2206
- GumboParser* parser, GumboTokenizerState* tokenizer,
2207
- int c, GumboToken* output) {
2162
+ static StateResult handle_comment_end_bang_state(GumboParser* parser,
2163
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2208
2164
  switch (c) {
2209
2165
  case '-':
2210
2166
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
@@ -2239,9 +2195,8 @@ static StateResult handle_comment_end_bang_state(
2239
2195
  }
2240
2196
 
2241
2197
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
2242
- static StateResult handle_doctype_state(
2243
- GumboParser* parser, GumboTokenizerState* tokenizer,
2244
- int c, GumboToken* output) {
2198
+ static StateResult handle_doctype_state(GumboParser* parser,
2199
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2245
2200
  assert(!tokenizer->_temporary_buffer.length);
2246
2201
  switch (c) {
2247
2202
  case '\t':
@@ -2266,9 +2221,8 @@ static StateResult handle_doctype_state(
2266
2221
  }
2267
2222
 
2268
2223
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
2269
- static StateResult handle_before_doctype_name_state(
2270
- GumboParser* parser, GumboTokenizerState* tokenizer,
2271
- int c, GumboToken* output) {
2224
+ static StateResult handle_before_doctype_name_state(GumboParser* parser,
2225
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2272
2226
  switch (c) {
2273
2227
  case '\t':
2274
2228
  case '\n':
@@ -2302,9 +2256,8 @@ static StateResult handle_before_doctype_name_state(
2302
2256
  }
2303
2257
 
2304
2258
  // http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
2305
- static StateResult handle_doctype_name_state(
2306
- GumboParser* parser, GumboTokenizerState* tokenizer,
2307
- int c, GumboToken* output) {
2259
+ static StateResult handle_doctype_name_state(GumboParser* parser,
2260
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2308
2261
  switch (c) {
2309
2262
  case '\t':
2310
2263
  case '\n':
@@ -2312,14 +2265,12 @@ static StateResult handle_doctype_name_state(
2312
2265
  case ' ':
2313
2266
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
2314
2267
  gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2315
- finish_temporary_buffer(
2316
- parser, &tokenizer->_doc_type_state.name);
2268
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2317
2269
  return NEXT_CHAR;
2318
2270
  case '>':
2319
2271
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2320
2272
  gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2321
- finish_temporary_buffer(
2322
- parser, &tokenizer->_doc_type_state.name);
2273
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2323
2274
  emit_doctype(parser, output);
2324
2275
  return RETURN_SUCCESS;
2325
2276
  case '\0':
@@ -2331,8 +2282,7 @@ static StateResult handle_doctype_name_state(
2331
2282
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2332
2283
  tokenizer->_doc_type_state.force_quirks = true;
2333
2284
  gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
2334
- finish_temporary_buffer(
2335
- parser, &tokenizer->_doc_type_state.name);
2285
+ finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
2336
2286
  emit_doctype(parser, output);
2337
2287
  return RETURN_ERROR;
2338
2288
  default:
@@ -2344,9 +2294,8 @@ static StateResult handle_doctype_name_state(
2344
2294
  }
2345
2295
 
2346
2296
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
2347
- static StateResult handle_after_doctype_name_state(
2348
- GumboParser* parser, GumboTokenizerState* tokenizer,
2349
- int c, GumboToken* output) {
2297
+ static StateResult handle_after_doctype_name_state(GumboParser* parser,
2298
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2350
2299
  switch (c) {
2351
2300
  case '\t':
2352
2301
  case '\n':
@@ -2365,17 +2314,18 @@ static StateResult handle_after_doctype_name_state(
2365
2314
  return RETURN_ERROR;
2366
2315
  default:
2367
2316
  if (utf8iterator_maybe_consume_match(
2368
- &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2317
+ &tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
2369
2318
  gumbo_tokenizer_set_state(
2370
2319
  parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
2371
2320
  tokenizer->_reconsume_current_input = true;
2372
- } else if (utf8iterator_maybe_consume_match(
2373
- &tokenizer->_input, "SYSTEM", sizeof("SYSTEM") - 1, false)) {
2321
+ } else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
2322
+ sizeof("SYSTEM") - 1, false)) {
2374
2323
  gumbo_tokenizer_set_state(
2375
2324
  parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
2376
2325
  tokenizer->_reconsume_current_input = true;
2377
2326
  } else {
2378
- tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2327
+ tokenizer_add_parse_error(
2328
+ parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
2379
2329
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
2380
2330
  tokenizer->_doc_type_state.force_quirks = true;
2381
2331
  }
@@ -2385,15 +2335,14 @@ static StateResult handle_after_doctype_name_state(
2385
2335
 
2386
2336
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
2387
2337
  static StateResult handle_after_doctype_public_keyword_state(
2388
- GumboParser* parser, GumboTokenizerState* tokenizer,
2389
- int c, GumboToken* output) {
2338
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2339
+ GumboToken* output) {
2390
2340
  switch (c) {
2391
2341
  case '\t':
2392
2342
  case '\n':
2393
2343
  case '\f':
2394
2344
  case ' ':
2395
- gumbo_tokenizer_set_state(
2396
- parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2345
+ gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
2397
2346
  return NEXT_CHAR;
2398
2347
  case '"':
2399
2348
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
@@ -2429,9 +2378,8 @@ static StateResult handle_after_doctype_public_keyword_state(
2429
2378
  }
2430
2379
 
2431
2380
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
2432
- static StateResult handle_before_doctype_public_id_state(
2433
- GumboParser* parser, GumboTokenizerState* tokenizer,
2434
- int c, GumboToken* output) {
2381
+ static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
2382
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2435
2383
  switch (c) {
2436
2384
  case '\t':
2437
2385
  case '\n':
@@ -2471,8 +2419,8 @@ static StateResult handle_before_doctype_public_id_state(
2471
2419
 
2472
2420
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
2473
2421
  static StateResult handle_doctype_public_id_double_quoted_state(
2474
- GumboParser* parser, GumboTokenizerState* tokenizer,
2475
- int c, GumboToken* output) {
2422
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2423
+ GumboToken* output) {
2476
2424
  switch (c) {
2477
2425
  case '"':
2478
2426
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2504,8 +2452,8 @@ static StateResult handle_doctype_public_id_double_quoted_state(
2504
2452
 
2505
2453
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
2506
2454
  static StateResult handle_doctype_public_id_single_quoted_state(
2507
- GumboParser* parser, GumboTokenizerState* tokenizer,
2508
- int c, GumboToken* output) {
2455
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2456
+ GumboToken* output) {
2509
2457
  switch (c) {
2510
2458
  case '\'':
2511
2459
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
@@ -2536,9 +2484,8 @@ static StateResult handle_doctype_public_id_single_quoted_state(
2536
2484
  }
2537
2485
 
2538
2486
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
2539
- static StateResult handle_after_doctype_public_id_state(
2540
- GumboParser* parser, GumboTokenizerState* tokenizer,
2541
- int c, GumboToken* output) {
2487
+ static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
2488
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2542
2489
  switch (c) {
2543
2490
  case '\t':
2544
2491
  case '\n':
@@ -2568,7 +2515,8 @@ static StateResult handle_after_doctype_public_id_state(
2568
2515
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2569
2516
  tokenizer->_reconsume_current_input = true;
2570
2517
  tokenizer->_doc_type_state.force_quirks = true;
2571
- return NEXT_CHAR;
2518
+ emit_doctype(parser, output);
2519
+ return RETURN_ERROR;
2572
2520
  default:
2573
2521
  tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
2574
2522
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
@@ -2579,8 +2527,8 @@ static StateResult handle_after_doctype_public_id_state(
2579
2527
 
2580
2528
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
2581
2529
  static StateResult handle_between_doctype_public_system_id_state(
2582
- GumboParser* parser, GumboTokenizerState* tokenizer,
2583
- int c, GumboToken* output) {
2530
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2531
+ GumboToken* output) {
2584
2532
  switch (c) {
2585
2533
  case '\t':
2586
2534
  case '\n':
@@ -2618,8 +2566,8 @@ static StateResult handle_between_doctype_public_system_id_state(
2618
2566
 
2619
2567
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
2620
2568
  static StateResult handle_after_doctype_system_keyword_state(
2621
- GumboParser* parser, GumboTokenizerState* tokenizer,
2622
- int c, GumboToken* output) {
2569
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2570
+ GumboToken* output) {
2623
2571
  switch (c) {
2624
2572
  case '\t':
2625
2573
  case '\n':
@@ -2660,9 +2608,8 @@ static StateResult handle_after_doctype_system_keyword_state(
2660
2608
  }
2661
2609
 
2662
2610
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
2663
- static StateResult handle_before_doctype_system_id_state(
2664
- GumboParser* parser, GumboTokenizerState* tokenizer,
2665
- int c, GumboToken* output) {
2611
+ static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
2612
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2666
2613
  switch (c) {
2667
2614
  case '\t':
2668
2615
  case '\n':
@@ -2701,8 +2648,8 @@ static StateResult handle_before_doctype_system_id_state(
2701
2648
 
2702
2649
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
2703
2650
  static StateResult handle_doctype_system_id_double_quoted_state(
2704
- GumboParser* parser, GumboTokenizerState* tokenizer,
2705
- int c, GumboToken* output) {
2651
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2652
+ GumboToken* output) {
2706
2653
  switch (c) {
2707
2654
  case '"':
2708
2655
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2734,8 +2681,8 @@ static StateResult handle_doctype_system_id_double_quoted_state(
2734
2681
 
2735
2682
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
2736
2683
  static StateResult handle_doctype_system_id_single_quoted_state(
2737
- GumboParser* parser, GumboTokenizerState* tokenizer,
2738
- int c, GumboToken* output) {
2684
+ GumboParser* parser, GumboTokenizerState* tokenizer, int c,
2685
+ GumboToken* output) {
2739
2686
  switch (c) {
2740
2687
  case '\'':
2741
2688
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
@@ -2766,9 +2713,8 @@ static StateResult handle_doctype_system_id_single_quoted_state(
2766
2713
  }
2767
2714
 
2768
2715
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
2769
- static StateResult handle_after_doctype_system_id_state(
2770
- GumboParser* parser, GumboTokenizerState* tokenizer,
2771
- int c, GumboToken* output) {
2716
+ static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
2717
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2772
2718
  switch (c) {
2773
2719
  case '\t':
2774
2720
  case '\n':
@@ -2793,9 +2739,8 @@ static StateResult handle_after_doctype_system_id_state(
2793
2739
  }
2794
2740
 
2795
2741
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
2796
- static StateResult handle_bogus_doctype_state(
2797
- GumboParser* parser, GumboTokenizerState* tokenizer,
2798
- int c, GumboToken* output) {
2742
+ static StateResult handle_bogus_doctype_state(GumboParser* parser,
2743
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2799
2744
  if (c == '>' || c == -1) {
2800
2745
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2801
2746
  emit_doctype(parser, output);
@@ -2805,14 +2750,14 @@ static StateResult handle_bogus_doctype_state(
2805
2750
  }
2806
2751
 
2807
2752
  // http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
2808
- static StateResult handle_cdata_state(
2809
- GumboParser* parser, GumboTokenizerState* tokenizer,
2810
- int c, GumboToken* output) {
2753
+ static StateResult handle_cdata_state(GumboParser* parser,
2754
+ GumboTokenizerState* tokenizer, int c, GumboToken* output) {
2811
2755
  if (c == -1 || utf8iterator_maybe_consume_match(
2812
- &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2756
+ &tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
2813
2757
  tokenizer->_reconsume_current_input = true;
2814
2758
  reset_token_start_point(tokenizer);
2815
2759
  gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
2760
+ tokenizer->_is_in_cdata = false;
2816
2761
  return NEXT_CHAR;
2817
2762
  } else {
2818
2763
  return emit_current_char(parser, output);
@@ -2822,76 +2767,47 @@ static StateResult handle_cdata_state(
2822
2767
  typedef StateResult (*GumboLexerStateFunction)(
2823
2768
  GumboParser*, GumboTokenizerState*, int, GumboToken*);
2824
2769
 
2825
- static GumboLexerStateFunction dispatch_table[] = {
2826
- handle_data_state,
2827
- handle_char_ref_in_data_state,
2828
- handle_rcdata_state,
2829
- handle_char_ref_in_rcdata_state,
2830
- handle_rawtext_state,
2831
- handle_script_state,
2832
- handle_plaintext_state,
2833
- handle_tag_open_state,
2834
- handle_end_tag_open_state,
2835
- handle_tag_name_state,
2836
- handle_rcdata_lt_state,
2837
- handle_rcdata_end_tag_open_state,
2838
- handle_rcdata_end_tag_name_state,
2839
- handle_rawtext_lt_state,
2840
- handle_rawtext_end_tag_open_state,
2841
- handle_rawtext_end_tag_name_state,
2842
- handle_script_lt_state,
2843
- handle_script_end_tag_open_state,
2844
- handle_script_end_tag_name_state,
2845
- handle_script_escaped_start_state,
2846
- handle_script_escaped_start_dash_state,
2847
- handle_script_escaped_state,
2848
- handle_script_escaped_dash_state,
2849
- handle_script_escaped_dash_dash_state,
2850
- handle_script_escaped_lt_state,
2851
- handle_script_escaped_end_tag_open_state,
2852
- handle_script_escaped_end_tag_name_state,
2853
- handle_script_double_escaped_start_state,
2854
- handle_script_double_escaped_state,
2855
- handle_script_double_escaped_dash_state,
2856
- handle_script_double_escaped_dash_dash_state,
2857
- handle_script_double_escaped_lt_state,
2858
- handle_script_double_escaped_end_state,
2859
- handle_before_attr_name_state,
2860
- handle_attr_name_state,
2861
- handle_after_attr_name_state,
2862
- handle_before_attr_value_state,
2863
- handle_attr_value_double_quoted_state,
2864
- handle_attr_value_single_quoted_state,
2865
- handle_attr_value_unquoted_state,
2866
- handle_char_ref_in_attr_value_state,
2867
- handle_after_attr_value_quoted_state,
2868
- handle_self_closing_start_tag_state,
2869
- handle_bogus_comment_state,
2870
- handle_markup_declaration_state,
2871
- handle_comment_start_state,
2872
- handle_comment_start_dash_state,
2873
- handle_comment_state,
2874
- handle_comment_end_dash_state,
2875
- handle_comment_end_state,
2876
- handle_comment_end_bang_state,
2877
- handle_doctype_state,
2878
- handle_before_doctype_name_state,
2879
- handle_doctype_name_state,
2880
- handle_after_doctype_name_state,
2881
- handle_after_doctype_public_keyword_state,
2882
- handle_before_doctype_public_id_state,
2883
- handle_doctype_public_id_double_quoted_state,
2884
- handle_doctype_public_id_single_quoted_state,
2885
- handle_after_doctype_public_id_state,
2886
- handle_between_doctype_public_system_id_state,
2887
- handle_after_doctype_system_keyword_state,
2888
- handle_before_doctype_system_id_state,
2889
- handle_doctype_system_id_double_quoted_state,
2890
- handle_doctype_system_id_single_quoted_state,
2891
- handle_after_doctype_system_id_state,
2892
- handle_bogus_doctype_state,
2893
- handle_cdata_state
2894
- };
2770
+ static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
2771
+ handle_char_ref_in_data_state, handle_rcdata_state,
2772
+ handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
2773
+ handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
2774
+ handle_tag_name_state, handle_rcdata_lt_state,
2775
+ handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
2776
+ handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
2777
+ handle_rawtext_end_tag_name_state, handle_script_lt_state,
2778
+ handle_script_end_tag_open_state, handle_script_end_tag_name_state,
2779
+ handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
2780
+ handle_script_escaped_state, handle_script_escaped_dash_state,
2781
+ handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
2782
+ handle_script_escaped_end_tag_open_state,
2783
+ handle_script_escaped_end_tag_name_state,
2784
+ handle_script_double_escaped_start_state,
2785
+ handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
2786
+ handle_script_double_escaped_dash_dash_state,
2787
+ handle_script_double_escaped_lt_state,
2788
+ handle_script_double_escaped_end_state, handle_before_attr_name_state,
2789
+ handle_attr_name_state, handle_after_attr_name_state,
2790
+ handle_before_attr_value_state, handle_attr_value_double_quoted_state,
2791
+ handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
2792
+ handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
2793
+ handle_self_closing_start_tag_state, handle_bogus_comment_state,
2794
+ handle_markup_declaration_state, handle_comment_start_state,
2795
+ handle_comment_start_dash_state, handle_comment_state,
2796
+ handle_comment_end_dash_state, handle_comment_end_state,
2797
+ handle_comment_end_bang_state, handle_doctype_state,
2798
+ handle_before_doctype_name_state, handle_doctype_name_state,
2799
+ handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
2800
+ handle_before_doctype_public_id_state,
2801
+ handle_doctype_public_id_double_quoted_state,
2802
+ handle_doctype_public_id_single_quoted_state,
2803
+ handle_after_doctype_public_id_state,
2804
+ handle_between_doctype_public_system_id_state,
2805
+ handle_after_doctype_system_keyword_state,
2806
+ handle_before_doctype_system_id_state,
2807
+ handle_doctype_system_id_double_quoted_state,
2808
+ handle_doctype_system_id_single_quoted_state,
2809
+ handle_after_doctype_system_id_state, handle_bogus_doctype_state,
2810
+ handle_cdata_state};
2895
2811
 
2896
2812
  bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2897
2813
  // Because of the spec requirements that...
@@ -2929,7 +2845,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2929
2845
  assert(!tokenizer->_temporary_buffer_emit);
2930
2846
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
2931
2847
  int c = utf8iterator_current(&tokenizer->_input);
2932
- gumbo_debug("Lexing character '%c' in state %d.\n", c, tokenizer->_state);
2848
+ gumbo_debug(
2849
+ "Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
2933
2850
  StateResult result =
2934
2851
  dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
2935
2852
  // We need to clear reconsume_current_input before returning to prevent
@@ -2939,7 +2856,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
2939
2856
 
2940
2857
  if (result == RETURN_SUCCESS) {
2941
2858
  return true;
2942
- } else if(result == RETURN_ERROR) {
2859
+ } else if (result == RETURN_ERROR) {
2943
2860
  return false;
2944
2861
  }
2945
2862
 
@@ -2961,7 +2878,7 @@ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
2961
2878
  parser, (void*) token->v.doc_type.system_identifier);
2962
2879
  return;
2963
2880
  case GUMBO_TOKEN_START_TAG:
2964
- for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2881
+ for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
2965
2882
  GumboAttribute* attr = token->v.start_tag.attributes.data[i];
2966
2883
  if (attr) {
2967
2884
  // May have been nulled out if this token was merged with another.