nokogumbo 1.4.2 → 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +50 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1397 -989
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/tokenizer.c +264 -360
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +44 -44
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- metadata +8 -3
@@ -0,0 +1,153 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
GUMBO_TAG_HTML,
|
5
|
+
GUMBO_TAG_HEAD,
|
6
|
+
GUMBO_TAG_TITLE,
|
7
|
+
GUMBO_TAG_BASE,
|
8
|
+
GUMBO_TAG_LINK,
|
9
|
+
GUMBO_TAG_META,
|
10
|
+
GUMBO_TAG_STYLE,
|
11
|
+
GUMBO_TAG_SCRIPT,
|
12
|
+
GUMBO_TAG_NOSCRIPT,
|
13
|
+
GUMBO_TAG_TEMPLATE,
|
14
|
+
GUMBO_TAG_BODY,
|
15
|
+
GUMBO_TAG_ARTICLE,
|
16
|
+
GUMBO_TAG_SECTION,
|
17
|
+
GUMBO_TAG_NAV,
|
18
|
+
GUMBO_TAG_ASIDE,
|
19
|
+
GUMBO_TAG_H1,
|
20
|
+
GUMBO_TAG_H2,
|
21
|
+
GUMBO_TAG_H3,
|
22
|
+
GUMBO_TAG_H4,
|
23
|
+
GUMBO_TAG_H5,
|
24
|
+
GUMBO_TAG_H6,
|
25
|
+
GUMBO_TAG_HGROUP,
|
26
|
+
GUMBO_TAG_HEADER,
|
27
|
+
GUMBO_TAG_FOOTER,
|
28
|
+
GUMBO_TAG_ADDRESS,
|
29
|
+
GUMBO_TAG_P,
|
30
|
+
GUMBO_TAG_HR,
|
31
|
+
GUMBO_TAG_PRE,
|
32
|
+
GUMBO_TAG_BLOCKQUOTE,
|
33
|
+
GUMBO_TAG_OL,
|
34
|
+
GUMBO_TAG_UL,
|
35
|
+
GUMBO_TAG_LI,
|
36
|
+
GUMBO_TAG_DL,
|
37
|
+
GUMBO_TAG_DT,
|
38
|
+
GUMBO_TAG_DD,
|
39
|
+
GUMBO_TAG_FIGURE,
|
40
|
+
GUMBO_TAG_FIGCAPTION,
|
41
|
+
GUMBO_TAG_MAIN,
|
42
|
+
GUMBO_TAG_DIV,
|
43
|
+
GUMBO_TAG_A,
|
44
|
+
GUMBO_TAG_EM,
|
45
|
+
GUMBO_TAG_STRONG,
|
46
|
+
GUMBO_TAG_SMALL,
|
47
|
+
GUMBO_TAG_S,
|
48
|
+
GUMBO_TAG_CITE,
|
49
|
+
GUMBO_TAG_Q,
|
50
|
+
GUMBO_TAG_DFN,
|
51
|
+
GUMBO_TAG_ABBR,
|
52
|
+
GUMBO_TAG_DATA,
|
53
|
+
GUMBO_TAG_TIME,
|
54
|
+
GUMBO_TAG_CODE,
|
55
|
+
GUMBO_TAG_VAR,
|
56
|
+
GUMBO_TAG_SAMP,
|
57
|
+
GUMBO_TAG_KBD,
|
58
|
+
GUMBO_TAG_SUB,
|
59
|
+
GUMBO_TAG_SUP,
|
60
|
+
GUMBO_TAG_I,
|
61
|
+
GUMBO_TAG_B,
|
62
|
+
GUMBO_TAG_U,
|
63
|
+
GUMBO_TAG_MARK,
|
64
|
+
GUMBO_TAG_RUBY,
|
65
|
+
GUMBO_TAG_RT,
|
66
|
+
GUMBO_TAG_RP,
|
67
|
+
GUMBO_TAG_BDI,
|
68
|
+
GUMBO_TAG_BDO,
|
69
|
+
GUMBO_TAG_SPAN,
|
70
|
+
GUMBO_TAG_BR,
|
71
|
+
GUMBO_TAG_WBR,
|
72
|
+
GUMBO_TAG_INS,
|
73
|
+
GUMBO_TAG_DEL,
|
74
|
+
GUMBO_TAG_IMAGE,
|
75
|
+
GUMBO_TAG_IMG,
|
76
|
+
GUMBO_TAG_IFRAME,
|
77
|
+
GUMBO_TAG_EMBED,
|
78
|
+
GUMBO_TAG_OBJECT,
|
79
|
+
GUMBO_TAG_PARAM,
|
80
|
+
GUMBO_TAG_VIDEO,
|
81
|
+
GUMBO_TAG_AUDIO,
|
82
|
+
GUMBO_TAG_SOURCE,
|
83
|
+
GUMBO_TAG_TRACK,
|
84
|
+
GUMBO_TAG_CANVAS,
|
85
|
+
GUMBO_TAG_MAP,
|
86
|
+
GUMBO_TAG_AREA,
|
87
|
+
GUMBO_TAG_MATH,
|
88
|
+
GUMBO_TAG_MI,
|
89
|
+
GUMBO_TAG_MO,
|
90
|
+
GUMBO_TAG_MN,
|
91
|
+
GUMBO_TAG_MS,
|
92
|
+
GUMBO_TAG_MTEXT,
|
93
|
+
GUMBO_TAG_MGLYPH,
|
94
|
+
GUMBO_TAG_MALIGNMARK,
|
95
|
+
GUMBO_TAG_ANNOTATION_XML,
|
96
|
+
GUMBO_TAG_SVG,
|
97
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
98
|
+
GUMBO_TAG_DESC,
|
99
|
+
GUMBO_TAG_TABLE,
|
100
|
+
GUMBO_TAG_CAPTION,
|
101
|
+
GUMBO_TAG_COLGROUP,
|
102
|
+
GUMBO_TAG_COL,
|
103
|
+
GUMBO_TAG_TBODY,
|
104
|
+
GUMBO_TAG_THEAD,
|
105
|
+
GUMBO_TAG_TFOOT,
|
106
|
+
GUMBO_TAG_TR,
|
107
|
+
GUMBO_TAG_TD,
|
108
|
+
GUMBO_TAG_TH,
|
109
|
+
GUMBO_TAG_FORM,
|
110
|
+
GUMBO_TAG_FIELDSET,
|
111
|
+
GUMBO_TAG_LEGEND,
|
112
|
+
GUMBO_TAG_LABEL,
|
113
|
+
GUMBO_TAG_INPUT,
|
114
|
+
GUMBO_TAG_BUTTON,
|
115
|
+
GUMBO_TAG_SELECT,
|
116
|
+
GUMBO_TAG_DATALIST,
|
117
|
+
GUMBO_TAG_OPTGROUP,
|
118
|
+
GUMBO_TAG_OPTION,
|
119
|
+
GUMBO_TAG_TEXTAREA,
|
120
|
+
GUMBO_TAG_KEYGEN,
|
121
|
+
GUMBO_TAG_OUTPUT,
|
122
|
+
GUMBO_TAG_PROGRESS,
|
123
|
+
GUMBO_TAG_METER,
|
124
|
+
GUMBO_TAG_DETAILS,
|
125
|
+
GUMBO_TAG_SUMMARY,
|
126
|
+
GUMBO_TAG_MENU,
|
127
|
+
GUMBO_TAG_MENUITEM,
|
128
|
+
GUMBO_TAG_APPLET,
|
129
|
+
GUMBO_TAG_ACRONYM,
|
130
|
+
GUMBO_TAG_BGSOUND,
|
131
|
+
GUMBO_TAG_DIR,
|
132
|
+
GUMBO_TAG_FRAME,
|
133
|
+
GUMBO_TAG_FRAMESET,
|
134
|
+
GUMBO_TAG_NOFRAMES,
|
135
|
+
GUMBO_TAG_ISINDEX,
|
136
|
+
GUMBO_TAG_LISTING,
|
137
|
+
GUMBO_TAG_XMP,
|
138
|
+
GUMBO_TAG_NEXTID,
|
139
|
+
GUMBO_TAG_NOEMBED,
|
140
|
+
GUMBO_TAG_PLAINTEXT,
|
141
|
+
GUMBO_TAG_RB,
|
142
|
+
GUMBO_TAG_STRIKE,
|
143
|
+
GUMBO_TAG_BASEFONT,
|
144
|
+
GUMBO_TAG_BIG,
|
145
|
+
GUMBO_TAG_BLINK,
|
146
|
+
GUMBO_TAG_CENTER,
|
147
|
+
GUMBO_TAG_FONT,
|
148
|
+
GUMBO_TAG_MARQUEE,
|
149
|
+
GUMBO_TAG_MULTICOL,
|
150
|
+
GUMBO_TAG_NOBR,
|
151
|
+
GUMBO_TAG_SPACER,
|
152
|
+
GUMBO_TAG_TT,
|
153
|
+
GUMBO_TAG_RTC,
|
@@ -0,0 +1,105 @@
|
|
1
|
+
static unsigned int tag_hash(
|
2
|
+
register const char *str, register unsigned int len) {
|
3
|
+
static unsigned short asso_values[] = {296, 296, 296, 296, 296, 296, 296, 296,
|
4
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
5
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
6
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 6, 4, 3, 1, 1, 0,
|
7
|
+
1, 0, 0, 296, 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2,
|
8
|
+
69, 0, 134, 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296,
|
9
|
+
296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, 69, 0, 134,
|
10
|
+
9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, 296, 296,
|
11
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
12
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
13
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
14
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
15
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
16
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
17
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
18
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
19
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296};
|
20
|
+
register unsigned int hval = len;
|
21
|
+
|
22
|
+
switch (hval) {
|
23
|
+
default:
|
24
|
+
hval += asso_values[(unsigned char) str[1] + 3];
|
25
|
+
/*FALLTHROUGH*/
|
26
|
+
case 1:
|
27
|
+
hval += asso_values[(unsigned char) str[0]];
|
28
|
+
break;
|
29
|
+
}
|
30
|
+
return hval + asso_values[(unsigned char) str[len - 1]];
|
31
|
+
}
|
32
|
+
|
33
|
+
static const unsigned char kGumboTagMap[] = {GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
34
|
+
GUMBO_TAG_LAST, GUMBO_TAG_S, GUMBO_TAG_H6, GUMBO_TAG_H5, GUMBO_TAG_H4,
|
35
|
+
GUMBO_TAG_H3, GUMBO_TAG_SPACER, GUMBO_TAG_H2, GUMBO_TAG_HEADER,
|
36
|
+
GUMBO_TAG_H1, GUMBO_TAG_HEAD, GUMBO_TAG_LAST, GUMBO_TAG_DETAILS,
|
37
|
+
GUMBO_TAG_SELECT, GUMBO_TAG_DIR, GUMBO_TAG_LAST, GUMBO_TAG_DEL,
|
38
|
+
GUMBO_TAG_LAST, GUMBO_TAG_SOURCE, GUMBO_TAG_LEGEND, GUMBO_TAG_DATALIST,
|
39
|
+
GUMBO_TAG_METER, GUMBO_TAG_MGLYPH, GUMBO_TAG_LAST, GUMBO_TAG_MATH,
|
40
|
+
GUMBO_TAG_LABEL, GUMBO_TAG_TABLE, GUMBO_TAG_TEMPLATE, GUMBO_TAG_LAST,
|
41
|
+
GUMBO_TAG_RP, GUMBO_TAG_TIME, GUMBO_TAG_TITLE, GUMBO_TAG_DATA,
|
42
|
+
GUMBO_TAG_APPLET, GUMBO_TAG_HGROUP, GUMBO_TAG_SAMP, GUMBO_TAG_TEXTAREA,
|
43
|
+
GUMBO_TAG_ABBR, GUMBO_TAG_MARQUEE, GUMBO_TAG_LAST, GUMBO_TAG_MENUITEM,
|
44
|
+
GUMBO_TAG_SMALL, GUMBO_TAG_META, GUMBO_TAG_A, GUMBO_TAG_LAST,
|
45
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_EMBED,
|
46
|
+
GUMBO_TAG_MAP, GUMBO_TAG_LAST, GUMBO_TAG_PARAM, GUMBO_TAG_LAST,
|
47
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NOBR, GUMBO_TAG_P, GUMBO_TAG_SPAN, GUMBO_TAG_EM,
|
48
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SECTION, GUMBO_TAG_NOEMBED,
|
49
|
+
GUMBO_TAG_NEXTID, GUMBO_TAG_FOOTER, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_HR,
|
50
|
+
GUMBO_TAG_LAST, GUMBO_TAG_FONT, GUMBO_TAG_DL, GUMBO_TAG_TR,
|
51
|
+
GUMBO_TAG_SCRIPT, GUMBO_TAG_MO, GUMBO_TAG_LAST, GUMBO_TAG_DD,
|
52
|
+
GUMBO_TAG_MAIN, GUMBO_TAG_TD, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_FORM,
|
53
|
+
GUMBO_TAG_OBJECT, GUMBO_TAG_LAST, GUMBO_TAG_FIELDSET, GUMBO_TAG_LAST,
|
54
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENU, GUMBO_TAG_TFOOT, GUMBO_TAG_FIGURE,
|
55
|
+
GUMBO_TAG_RB, GUMBO_TAG_LI, GUMBO_TAG_LISTING, GUMBO_TAG_BASEFONT,
|
56
|
+
GUMBO_TAG_OPTGROUP, GUMBO_TAG_LAST, GUMBO_TAG_BASE, GUMBO_TAG_ADDRESS,
|
57
|
+
GUMBO_TAG_MI, GUMBO_TAG_LAST, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_LAST,
|
58
|
+
GUMBO_TAG_PROGRESS, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
59
|
+
GUMBO_TAG_ACRONYM, GUMBO_TAG_ARTICLE, GUMBO_TAG_LAST, GUMBO_TAG_PRE,
|
60
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_AREA,
|
61
|
+
GUMBO_TAG_RT, GUMBO_TAG_LAST, GUMBO_TAG_OPTION, GUMBO_TAG_IMAGE,
|
62
|
+
GUMBO_TAG_DT, GUMBO_TAG_LAST, GUMBO_TAG_TT, GUMBO_TAG_HTML, GUMBO_TAG_WBR,
|
63
|
+
GUMBO_TAG_OL, GUMBO_TAG_LAST, GUMBO_TAG_STYLE, GUMBO_TAG_STRIKE,
|
64
|
+
GUMBO_TAG_SUP, GUMBO_TAG_MULTICOL, GUMBO_TAG_U, GUMBO_TAG_DFN, GUMBO_TAG_UL,
|
65
|
+
GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST, GUMBO_TAG_VAR,
|
66
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_FRAMESET, GUMBO_TAG_LAST,
|
67
|
+
GUMBO_TAG_BR, GUMBO_TAG_I, GUMBO_TAG_FRAME, GUMBO_TAG_LAST, GUMBO_TAG_DIV,
|
68
|
+
GUMBO_TAG_LAST, GUMBO_TAG_TH, GUMBO_TAG_MS, GUMBO_TAG_ANNOTATION_XML,
|
69
|
+
GUMBO_TAG_B, GUMBO_TAG_TBODY, GUMBO_TAG_THEAD, GUMBO_TAG_BIG,
|
70
|
+
GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_XMP, GUMBO_TAG_LAST, GUMBO_TAG_KBD,
|
71
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LINK, GUMBO_TAG_IFRAME, GUMBO_TAG_MARK,
|
72
|
+
GUMBO_TAG_CENTER, GUMBO_TAG_OUTPUT, GUMBO_TAG_DESC, GUMBO_TAG_CANVAS,
|
73
|
+
GUMBO_TAG_COL, GUMBO_TAG_MALIGNMARK, GUMBO_TAG_IMG, GUMBO_TAG_ASIDE,
|
74
|
+
GUMBO_TAG_LAST, GUMBO_TAG_CODE, GUMBO_TAG_LAST, GUMBO_TAG_SUB, GUMBO_TAG_MN,
|
75
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_INS, GUMBO_TAG_AUDIO,
|
76
|
+
GUMBO_TAG_STRONG, GUMBO_TAG_CITE, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
77
|
+
GUMBO_TAG_LAST, GUMBO_TAG_INPUT, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
78
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NAV, GUMBO_TAG_LAST, GUMBO_TAG_COLGROUP,
|
79
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
80
|
+
GUMBO_TAG_LAST, GUMBO_TAG_SVG, GUMBO_TAG_KEYGEN, GUMBO_TAG_VIDEO,
|
81
|
+
GUMBO_TAG_BDO, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
82
|
+
GUMBO_TAG_LAST, GUMBO_TAG_BODY, GUMBO_TAG_LAST, GUMBO_TAG_Q, GUMBO_TAG_LAST,
|
83
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_TRACK,
|
84
|
+
GUMBO_TAG_LAST, GUMBO_TAG_BDI, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
85
|
+
GUMBO_TAG_LAST, GUMBO_TAG_CAPTION, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
86
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
87
|
+
GUMBO_TAG_RUBY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BUTTON,
|
88
|
+
GUMBO_TAG_SUMMARY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
89
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
90
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
91
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
92
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
93
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
94
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
95
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
96
|
+
GUMBO_TAG_LAST, GUMBO_TAG_RTC, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
97
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
98
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BLINK, GUMBO_TAG_LAST,
|
99
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
100
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
101
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
102
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
103
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
104
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
105
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_ISINDEX};
|
@@ -0,0 +1,4 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
|
@@ -0,0 +1,153 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
"html",
|
5
|
+
"head",
|
6
|
+
"title",
|
7
|
+
"base",
|
8
|
+
"link",
|
9
|
+
"meta",
|
10
|
+
"style",
|
11
|
+
"script",
|
12
|
+
"noscript",
|
13
|
+
"template",
|
14
|
+
"body",
|
15
|
+
"article",
|
16
|
+
"section",
|
17
|
+
"nav",
|
18
|
+
"aside",
|
19
|
+
"h1",
|
20
|
+
"h2",
|
21
|
+
"h3",
|
22
|
+
"h4",
|
23
|
+
"h5",
|
24
|
+
"h6",
|
25
|
+
"hgroup",
|
26
|
+
"header",
|
27
|
+
"footer",
|
28
|
+
"address",
|
29
|
+
"p",
|
30
|
+
"hr",
|
31
|
+
"pre",
|
32
|
+
"blockquote",
|
33
|
+
"ol",
|
34
|
+
"ul",
|
35
|
+
"li",
|
36
|
+
"dl",
|
37
|
+
"dt",
|
38
|
+
"dd",
|
39
|
+
"figure",
|
40
|
+
"figcaption",
|
41
|
+
"main",
|
42
|
+
"div",
|
43
|
+
"a",
|
44
|
+
"em",
|
45
|
+
"strong",
|
46
|
+
"small",
|
47
|
+
"s",
|
48
|
+
"cite",
|
49
|
+
"q",
|
50
|
+
"dfn",
|
51
|
+
"abbr",
|
52
|
+
"data",
|
53
|
+
"time",
|
54
|
+
"code",
|
55
|
+
"var",
|
56
|
+
"samp",
|
57
|
+
"kbd",
|
58
|
+
"sub",
|
59
|
+
"sup",
|
60
|
+
"i",
|
61
|
+
"b",
|
62
|
+
"u",
|
63
|
+
"mark",
|
64
|
+
"ruby",
|
65
|
+
"rt",
|
66
|
+
"rp",
|
67
|
+
"bdi",
|
68
|
+
"bdo",
|
69
|
+
"span",
|
70
|
+
"br",
|
71
|
+
"wbr",
|
72
|
+
"ins",
|
73
|
+
"del",
|
74
|
+
"image",
|
75
|
+
"img",
|
76
|
+
"iframe",
|
77
|
+
"embed",
|
78
|
+
"object",
|
79
|
+
"param",
|
80
|
+
"video",
|
81
|
+
"audio",
|
82
|
+
"source",
|
83
|
+
"track",
|
84
|
+
"canvas",
|
85
|
+
"map",
|
86
|
+
"area",
|
87
|
+
"math",
|
88
|
+
"mi",
|
89
|
+
"mo",
|
90
|
+
"mn",
|
91
|
+
"ms",
|
92
|
+
"mtext",
|
93
|
+
"mglyph",
|
94
|
+
"malignmark",
|
95
|
+
"annotation-xml",
|
96
|
+
"svg",
|
97
|
+
"foreignobject",
|
98
|
+
"desc",
|
99
|
+
"table",
|
100
|
+
"caption",
|
101
|
+
"colgroup",
|
102
|
+
"col",
|
103
|
+
"tbody",
|
104
|
+
"thead",
|
105
|
+
"tfoot",
|
106
|
+
"tr",
|
107
|
+
"td",
|
108
|
+
"th",
|
109
|
+
"form",
|
110
|
+
"fieldset",
|
111
|
+
"legend",
|
112
|
+
"label",
|
113
|
+
"input",
|
114
|
+
"button",
|
115
|
+
"select",
|
116
|
+
"datalist",
|
117
|
+
"optgroup",
|
118
|
+
"option",
|
119
|
+
"textarea",
|
120
|
+
"keygen",
|
121
|
+
"output",
|
122
|
+
"progress",
|
123
|
+
"meter",
|
124
|
+
"details",
|
125
|
+
"summary",
|
126
|
+
"menu",
|
127
|
+
"menuitem",
|
128
|
+
"applet",
|
129
|
+
"acronym",
|
130
|
+
"bgsound",
|
131
|
+
"dir",
|
132
|
+
"frame",
|
133
|
+
"frameset",
|
134
|
+
"noframes",
|
135
|
+
"isindex",
|
136
|
+
"listing",
|
137
|
+
"xmp",
|
138
|
+
"nextid",
|
139
|
+
"noembed",
|
140
|
+
"plaintext",
|
141
|
+
"rb",
|
142
|
+
"strike",
|
143
|
+
"basefont",
|
144
|
+
"big",
|
145
|
+
"blink",
|
146
|
+
"center",
|
147
|
+
"font",
|
148
|
+
"marquee",
|
149
|
+
"multicol",
|
150
|
+
"nobr",
|
151
|
+
"spacer",
|
152
|
+
"tt",
|
153
|
+
"rtc",
|
@@ -42,7 +42,6 @@
|
|
42
42
|
// prevents parse error position from being messed up by possible mark/resets in
|
43
43
|
// temporary buffer manipulation.
|
44
44
|
|
45
|
-
|
46
45
|
#include "tokenizer.h"
|
47
46
|
|
48
47
|
#include <assert.h>
|
@@ -64,13 +63,13 @@
|
|
64
63
|
|
65
64
|
// Compared against _script_data_buffer to determine if we're in double-escaped
|
66
65
|
// script mode.
|
67
|
-
const GumboStringPiece kScriptTag = {
|
66
|
+
const GumboStringPiece kScriptTag = {"script", 6};
|
68
67
|
|
69
68
|
// An enum for the return value of each individual state.
|
70
69
|
typedef enum {
|
71
|
-
RETURN_ERROR,
|
72
|
-
RETURN_SUCCESS,
|
73
|
-
NEXT_CHAR
|
70
|
+
RETURN_ERROR, // Return false (error) from the tokenizer.
|
71
|
+
RETURN_SUCCESS, // Return true (success) from the tokenizer.
|
72
|
+
NEXT_CHAR // Proceed to the next character and continue lexing.
|
74
73
|
} StateResult;
|
75
74
|
|
76
75
|
// This is a struct containing state necessary to build up a tag token,
|
@@ -200,7 +199,8 @@ typedef struct GumboInternalTokenizerState {
|
|
200
199
|
} GumboTokenizerState;
|
201
200
|
|
202
201
|
// Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
|
203
|
-
static void tokenizer_add_parse_error(
|
202
|
+
static void tokenizer_add_parse_error(
|
203
|
+
GumboParser* parser, GumboErrorType type) {
|
204
204
|
GumboError* error = gumbo_add_error(parser);
|
205
205
|
if (!error) {
|
206
206
|
return;
|
@@ -356,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
|
|
356
356
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
357
357
|
assert(!tokenizer->_temporary_buffer_emit);
|
358
358
|
utf8iterator_mark(&tokenizer->_input);
|
359
|
-
|
360
|
-
gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
|
359
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
|
361
360
|
// The temporary buffer and script data buffer are the same object in the
|
362
361
|
// spec, so the script data buffer should be cleared as well.
|
363
|
-
|
364
|
-
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
362
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
|
365
363
|
}
|
366
364
|
|
367
365
|
// Appends a codepoint to the temporary buffer.
|
@@ -374,15 +372,14 @@ static void append_char_to_temporary_buffer(
|
|
374
372
|
// Checks to see if the temporary buffer equals a certain string.
|
375
373
|
// Make sure this remains side-effect free; it's used in assertions.
|
376
374
|
#ifndef NDEBUG
|
377
|
-
static bool temporary_buffer_equals(
|
378
|
-
GumboParser* parser, const char* text) {
|
375
|
+
static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
|
379
376
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
|
380
377
|
// TODO(jdtang): See if the extra strlen is a performance problem, and replace
|
381
378
|
// it with an explicit sizeof(literal) if necessary. I don't think it will
|
382
379
|
// be, as this is only used in a couple of rare states.
|
383
380
|
int text_len = strlen(text);
|
384
381
|
return text_len == buffer->length &&
|
385
|
-
|
382
|
+
memcmp(buffer->data, text, text_len) == 0;
|
386
383
|
}
|
387
384
|
#endif
|
388
385
|
|
@@ -539,8 +536,8 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
539
536
|
output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
|
540
537
|
tag_state->_last_start_tag = tag_state->_tag;
|
541
538
|
mark_tag_state_as_empty(tag_state);
|
542
|
-
gumbo_debug(
|
543
|
-
|
539
|
+
gumbo_debug(
|
540
|
+
"Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
544
541
|
} else {
|
545
542
|
output->type = GUMBO_TOKEN_END_TAG;
|
546
543
|
output->v.end_tag = tag_state->_tag;
|
@@ -548,17 +545,18 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
548
545
|
// token, but it's still initialized as normal, so it must be manually
|
549
546
|
// deallocated. There may also be attributes to destroy, in certain broken
|
550
547
|
// cases like </div</th> (the "th" is an attribute there).
|
551
|
-
for (int i = 0; i < tag_state->_attributes.length; ++i) {
|
548
|
+
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
552
549
|
gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
|
553
550
|
}
|
554
551
|
gumbo_parser_deallocate(parser, tag_state->_attributes.data);
|
555
552
|
mark_tag_state_as_empty(tag_state);
|
556
|
-
gumbo_debug(
|
557
|
-
|
553
|
+
gumbo_debug(
|
554
|
+
"Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
558
555
|
}
|
559
556
|
gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
|
560
557
|
finish_token(parser, output);
|
561
|
-
gumbo_debug("Original text = %.*s.\n", output->original_text.length,
|
558
|
+
gumbo_debug("Original text = %.*s.\n", output->original_text.length,
|
559
|
+
output->original_text.data);
|
562
560
|
assert(output->original_text.length >= 2);
|
563
561
|
assert(output->original_text.data[0] == '<');
|
564
562
|
assert(output->original_text.data[output->original_text.length - 1] == '>');
|
@@ -571,7 +569,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
571
569
|
// avoid a memory leak.
|
572
570
|
static void abandon_current_tag(GumboParser* parser) {
|
573
571
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
574
|
-
for (int i = 0; i < tag_state->_attributes.length; ++i) {
|
572
|
+
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
575
573
|
gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
|
576
574
|
}
|
577
575
|
gumbo_parser_deallocate(parser, tag_state->_attributes.data);
|
@@ -583,9 +581,8 @@ static void abandon_current_tag(GumboParser* parser) {
|
|
583
581
|
// Wraps the consume_char_ref function to handle its output and make the
|
584
582
|
// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
|
585
583
|
// error occurred, RETURN_SUCCESS otherwise.
|
586
|
-
static StateResult emit_char_ref(
|
587
|
-
|
588
|
-
bool is_in_attribute, GumboToken* output) {
|
584
|
+
static StateResult emit_char_ref(GumboParser* parser,
|
585
|
+
int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
|
589
586
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
590
587
|
OneOrTwoCodepoints char_ref;
|
591
588
|
bool status = consume_char_ref(
|
@@ -649,8 +646,7 @@ static bool maybe_emit_from_temporary_buffer(
|
|
649
646
|
// _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
|
650
647
|
// the first character in it. It returns true if a character was emitted, false
|
651
648
|
// otherwise.
|
652
|
-
static bool emit_temporary_buffer(
|
653
|
-
GumboParser* parser, GumboToken* output) {
|
649
|
+
static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
654
650
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
655
651
|
assert(tokenizer->_temporary_buffer.data);
|
656
652
|
utf8iterator_reset(&tokenizer->_input);
|
@@ -663,8 +659,8 @@ static bool emit_temporary_buffer(
|
|
663
659
|
// start point; the only time you would *not* want to pass true for this
|
664
660
|
// parameter is if you want the original_text to include character (like an
|
665
661
|
// opening quote) that doesn't appear in the value.
|
666
|
-
static void append_char_to_tag_buffer(
|
667
|
-
|
662
|
+
static void append_char_to_tag_buffer(
|
663
|
+
GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
|
668
664
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
669
665
|
if (buffer->length == 0 && reinitilize_position_on_first) {
|
670
666
|
reset_tag_buffer_start_point(parser);
|
@@ -697,7 +693,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
697
693
|
gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
|
698
694
|
|
699
695
|
assert(tag_state->_attributes.data == NULL);
|
700
|
-
|
696
|
+
// Initial size chosen by statistical analysis of a corpus of 60k webpages.
|
697
|
+
// 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
|
698
|
+
// numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
|
699
|
+
// for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
|
700
|
+
gumbo_vector_init(parser, 1, &tag_state->_attributes);
|
701
701
|
tag_state->_drop_next_attr_value = false;
|
702
702
|
tag_state->_is_start_tag = is_start_tag;
|
703
703
|
tag_state->_is_self_closing = false;
|
@@ -717,16 +717,15 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
|
|
717
717
|
// * The start_pos GumboSourcePosition with the start position of the tag
|
718
718
|
// buffer.
|
719
719
|
// * The end_pos GumboSourcePosition with the current source position.
|
720
|
-
static void copy_over_original_tag_text(
|
721
|
-
|
722
|
-
GumboSourcePosition*
|
720
|
+
static void copy_over_original_tag_text(GumboParser* parser,
|
721
|
+
GumboStringPiece* original_text, GumboSourcePosition* start_pos,
|
722
|
+
GumboSourcePosition* end_pos) {
|
723
723
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
724
724
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
725
725
|
|
726
726
|
original_text->data = tag_state->_original_text;
|
727
|
-
original_text->length =
|
728
|
-
|
729
|
-
tag_state->_original_text;
|
727
|
+
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
728
|
+
tag_state->_original_text;
|
730
729
|
if (original_text->data[original_text->length - 1] == '\r') {
|
731
730
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
732
731
|
// appended to the end of original text even when it's really the first part
|
@@ -751,16 +750,14 @@ static void finish_tag_name(GumboParser* parser) {
|
|
751
750
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
752
751
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
753
752
|
|
754
|
-
|
755
|
-
|
756
|
-
tag_state->_tag = gumbo_tag_enum(temp);
|
753
|
+
tag_state->_tag =
|
754
|
+
gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
|
757
755
|
reinitialize_tag_buffer(parser);
|
758
|
-
gumbo_parser_deallocate(parser, (void*) temp);
|
759
756
|
}
|
760
757
|
|
761
758
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
762
759
|
static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
|
763
|
-
|
760
|
+
int original_index, int new_index) {
|
764
761
|
GumboError* error = gumbo_add_error(parser);
|
765
762
|
if (!error) {
|
766
763
|
return;
|
@@ -790,14 +787,13 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
790
787
|
assert(tag_state->_attributes.capacity);
|
791
788
|
|
792
789
|
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
793
|
-
for (int i = 0; i < attributes->length; ++i) {
|
790
|
+
for (unsigned int i = 0; i < attributes->length; ++i) {
|
794
791
|
GumboAttribute* attr = attributes->data[i];
|
795
792
|
if (strlen(attr->name) == tag_state->_buffer.length &&
|
796
793
|
memcmp(attr->name, tag_state->_buffer.data,
|
797
|
-
|
794
|
+
tag_state->_buffer.length) == 0) {
|
798
795
|
// Identical attribute; bail.
|
799
|
-
add_duplicate_attr_error(
|
800
|
-
parser, attr->name, i, attributes->length);
|
796
|
+
add_duplicate_attr_error(parser, attr->name, i, attributes->length);
|
801
797
|
tag_state->_drop_next_attr_value = true;
|
802
798
|
return false;
|
803
799
|
}
|
@@ -806,11 +802,11 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
806
802
|
GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
|
807
803
|
attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
|
808
804
|
copy_over_tag_buffer(parser, &attr->name);
|
809
|
-
copy_over_original_tag_text(
|
810
|
-
|
805
|
+
copy_over_original_tag_text(
|
806
|
+
parser, &attr->original_name, &attr->name_start, &attr->name_end);
|
811
807
|
attr->value = gumbo_copy_stringz(parser, "");
|
812
|
-
copy_over_original_tag_text(
|
813
|
-
|
808
|
+
copy_over_original_tag_text(
|
809
|
+
parser, &attr->original_value, &attr->name_start, &attr->name_end);
|
814
810
|
gumbo_vector_add(parser, attr, attributes);
|
815
811
|
reinitialize_tag_buffer(parser);
|
816
812
|
return true;
|
@@ -832,8 +828,8 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
832
828
|
tag_state->_attributes.data[tag_state->_attributes.length - 1];
|
833
829
|
gumbo_parser_deallocate(parser, (void*) attr->value);
|
834
830
|
copy_over_tag_buffer(parser, &attr->value);
|
835
|
-
copy_over_original_tag_text(
|
836
|
-
|
831
|
+
copy_over_original_tag_text(
|
832
|
+
parser, &attr->original_value, &attr->value_start, &attr->value_end);
|
837
833
|
reinitialize_tag_buffer(parser);
|
838
834
|
}
|
839
835
|
|
@@ -841,13 +837,9 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
841
837
|
static bool is_appropriate_end_tag(GumboParser* parser) {
|
842
838
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
843
839
|
assert(!tag_state->_is_start_tag);
|
844
|
-
// Null terminate the current string buffer, so it can be passed to
|
845
|
-
// gumbo_tag_enum, but don't increment the length in case we need to dump the
|
846
|
-
// buffer as character tokens.
|
847
|
-
gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
|
848
|
-
--tag_state->_buffer.length;
|
849
840
|
return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
|
850
|
-
|
841
|
+
tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
|
842
|
+
tag_state->_buffer.length);
|
851
843
|
}
|
852
844
|
|
853
845
|
void gumbo_tokenizer_state_init(
|
@@ -892,15 +884,14 @@ void gumbo_tokenizer_set_is_current_node_foreign(
|
|
892
884
|
GumboParser* parser, bool is_foreign) {
|
893
885
|
if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
|
894
886
|
gumbo_debug("Toggling is_current_node_foreign to %s.\n",
|
895
|
-
|
887
|
+
is_foreign ? "true" : "false");
|
896
888
|
}
|
897
889
|
parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
|
898
890
|
}
|
899
891
|
|
900
892
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
|
901
|
-
static StateResult handle_data_state(
|
902
|
-
|
903
|
-
int c, GumboToken* output) {
|
893
|
+
static StateResult handle_data_state(GumboParser* parser,
|
894
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
904
895
|
switch (c) {
|
905
896
|
case '&':
|
906
897
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
|
@@ -924,17 +915,15 @@ static StateResult handle_data_state(
|
|
924
915
|
}
|
925
916
|
|
926
917
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
|
927
|
-
static StateResult handle_char_ref_in_data_state(
|
928
|
-
|
929
|
-
int c, GumboToken* output) {
|
918
|
+
static StateResult handle_char_ref_in_data_state(GumboParser* parser,
|
919
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
930
920
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
931
921
|
return emit_char_ref(parser, ' ', false, output);
|
932
922
|
}
|
933
923
|
|
934
924
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
|
935
|
-
static StateResult handle_rcdata_state(
|
936
|
-
|
937
|
-
int c, GumboToken* output) {
|
925
|
+
static StateResult handle_rcdata_state(GumboParser* parser,
|
926
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
938
927
|
switch (c) {
|
939
928
|
case '&':
|
940
929
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
|
@@ -955,17 +944,15 @@ static StateResult handle_rcdata_state(
|
|
955
944
|
}
|
956
945
|
|
957
946
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
|
958
|
-
static StateResult handle_char_ref_in_rcdata_state(
|
959
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
960
|
-
int c, GumboToken* output) {
|
947
|
+
static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
|
948
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
961
949
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
962
950
|
return emit_char_ref(parser, ' ', false, output);
|
963
951
|
}
|
964
952
|
|
965
953
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
|
966
|
-
static StateResult handle_rawtext_state(
|
967
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
968
|
-
int c, GumboToken* output) {
|
954
|
+
static StateResult handle_rawtext_state(GumboParser* parser,
|
955
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
969
956
|
switch (c) {
|
970
957
|
case '<':
|
971
958
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
|
@@ -982,9 +969,8 @@ static StateResult handle_rawtext_state(
|
|
982
969
|
}
|
983
970
|
|
984
971
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
|
985
|
-
static StateResult handle_script_state(
|
986
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
987
|
-
int c, GumboToken* output) {
|
972
|
+
static StateResult handle_script_state(GumboParser* parser,
|
973
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
988
974
|
switch (c) {
|
989
975
|
case '<':
|
990
976
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
|
@@ -1001,9 +987,8 @@ static StateResult handle_script_state(
|
|
1001
987
|
}
|
1002
988
|
|
1003
989
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
|
1004
|
-
static StateResult handle_plaintext_state(
|
1005
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1006
|
-
int c, GumboToken* output) {
|
990
|
+
static StateResult handle_plaintext_state(GumboParser* parser,
|
991
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1007
992
|
switch (c) {
|
1008
993
|
case '\0':
|
1009
994
|
return emit_replacement_char(parser, output);
|
@@ -1015,9 +1000,8 @@ static StateResult handle_plaintext_state(
|
|
1015
1000
|
}
|
1016
1001
|
|
1017
1002
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
|
1018
|
-
static StateResult handle_tag_open_state(
|
1019
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1020
|
-
int c, GumboToken* output) {
|
1003
|
+
static StateResult handle_tag_open_state(GumboParser* parser,
|
1004
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1021
1005
|
assert(temporary_buffer_equals(parser, "<"));
|
1022
1006
|
switch (c) {
|
1023
1007
|
case '!':
|
@@ -1049,9 +1033,8 @@ static StateResult handle_tag_open_state(
|
|
1049
1033
|
}
|
1050
1034
|
|
1051
1035
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
|
1052
|
-
static StateResult handle_end_tag_open_state(
|
1053
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1054
|
-
int c, GumboToken* output) {
|
1036
|
+
static StateResult handle_end_tag_open_state(GumboParser* parser,
|
1037
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1055
1038
|
assert(temporary_buffer_equals(parser, "</"));
|
1056
1039
|
switch (c) {
|
1057
1040
|
case '>':
|
@@ -1077,9 +1060,8 @@ static StateResult handle_end_tag_open_state(
|
|
1077
1060
|
}
|
1078
1061
|
|
1079
1062
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
|
1080
|
-
static StateResult handle_tag_name_state(
|
1081
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1082
|
-
int c, GumboToken* output) {
|
1063
|
+
static StateResult handle_tag_name_state(GumboParser* parser,
|
1064
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1083
1065
|
switch (c) {
|
1084
1066
|
case '\t':
|
1085
1067
|
case '\n':
|
@@ -1112,9 +1094,8 @@ static StateResult handle_tag_name_state(
|
|
1112
1094
|
}
|
1113
1095
|
|
1114
1096
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
|
1115
|
-
static StateResult handle_rcdata_lt_state(
|
1116
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1117
|
-
int c, GumboToken* output) {
|
1097
|
+
static StateResult handle_rcdata_lt_state(GumboParser* parser,
|
1098
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1118
1099
|
assert(temporary_buffer_equals(parser, "<"));
|
1119
1100
|
if (c == '/') {
|
1120
1101
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
|
@@ -1128,9 +1109,8 @@ static StateResult handle_rcdata_lt_state(
|
|
1128
1109
|
}
|
1129
1110
|
|
1130
1111
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
|
1131
|
-
static StateResult handle_rcdata_end_tag_open_state(
|
1132
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1133
|
-
int c, GumboToken* output) {
|
1112
|
+
static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
|
1113
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1134
1114
|
assert(temporary_buffer_equals(parser, "</"));
|
1135
1115
|
if (is_alpha(c)) {
|
1136
1116
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
|
@@ -1145,9 +1125,8 @@ static StateResult handle_rcdata_end_tag_open_state(
|
|
1145
1125
|
}
|
1146
1126
|
|
1147
1127
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
|
1148
|
-
static StateResult handle_rcdata_end_tag_name_state(
|
1149
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1150
|
-
int c, GumboToken* output) {
|
1128
|
+
static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
|
1129
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1151
1130
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1152
1131
|
if (is_alpha(c)) {
|
1153
1132
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1178,9 +1157,8 @@ static StateResult handle_rcdata_end_tag_name_state(
|
|
1178
1157
|
}
|
1179
1158
|
|
1180
1159
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
|
1181
|
-
static StateResult handle_rawtext_lt_state(
|
1182
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1183
|
-
int c, GumboToken* output) {
|
1160
|
+
static StateResult handle_rawtext_lt_state(GumboParser* parser,
|
1161
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1184
1162
|
assert(temporary_buffer_equals(parser, "<"));
|
1185
1163
|
if (c == '/') {
|
1186
1164
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
|
@@ -1194,9 +1172,8 @@ static StateResult handle_rawtext_lt_state(
|
|
1194
1172
|
}
|
1195
1173
|
|
1196
1174
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
|
1197
|
-
static StateResult handle_rawtext_end_tag_open_state(
|
1198
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1199
|
-
int c, GumboToken* output) {
|
1175
|
+
static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
|
1176
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1200
1177
|
assert(temporary_buffer_equals(parser, "</"));
|
1201
1178
|
if (is_alpha(c)) {
|
1202
1179
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
|
@@ -1210,12 +1187,11 @@ static StateResult handle_rawtext_end_tag_open_state(
|
|
1210
1187
|
}
|
1211
1188
|
|
1212
1189
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
|
1213
|
-
static StateResult handle_rawtext_end_tag_name_state(
|
1214
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1215
|
-
int c, GumboToken* output) {
|
1190
|
+
static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
|
1191
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1216
1192
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1217
1193
|
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
|
1218
|
-
|
1194
|
+
tokenizer->_tag_state._buffer.data);
|
1219
1195
|
if (is_alpha(c)) {
|
1220
1196
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1221
1197
|
append_char_to_temporary_buffer(parser, c);
|
@@ -1246,9 +1222,8 @@ static StateResult handle_rawtext_end_tag_name_state(
|
|
1246
1222
|
}
|
1247
1223
|
|
1248
1224
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
|
1249
|
-
static StateResult handle_script_lt_state(
|
1250
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1251
|
-
int c, GumboToken* output) {
|
1225
|
+
static StateResult handle_script_lt_state(GumboParser* parser,
|
1226
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1252
1227
|
assert(temporary_buffer_equals(parser, "<"));
|
1253
1228
|
if (c == '/') {
|
1254
1229
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
|
@@ -1266,9 +1241,8 @@ static StateResult handle_script_lt_state(
|
|
1266
1241
|
}
|
1267
1242
|
|
1268
1243
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
|
1269
|
-
static StateResult handle_script_end_tag_open_state(
|
1270
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1271
|
-
int c, GumboToken* output) {
|
1244
|
+
static StateResult handle_script_end_tag_open_state(GumboParser* parser,
|
1245
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1272
1246
|
assert(temporary_buffer_equals(parser, "</"));
|
1273
1247
|
if (is_alpha(c)) {
|
1274
1248
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
|
@@ -1282,9 +1256,8 @@ static StateResult handle_script_end_tag_open_state(
|
|
1282
1256
|
}
|
1283
1257
|
|
1284
1258
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
|
1285
|
-
static StateResult handle_script_end_tag_name_state(
|
1286
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1287
|
-
int c, GumboToken* output) {
|
1259
|
+
static StateResult handle_script_end_tag_name_state(GumboParser* parser,
|
1260
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1288
1261
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1289
1262
|
if (is_alpha(c)) {
|
1290
1263
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1315,9 +1288,8 @@ static StateResult handle_script_end_tag_name_state(
|
|
1315
1288
|
}
|
1316
1289
|
|
1317
1290
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
|
1318
|
-
static StateResult handle_script_escaped_start_state(
|
1319
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1320
|
-
int c, GumboToken* output) {
|
1291
|
+
static StateResult handle_script_escaped_start_state(GumboParser* parser,
|
1292
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1321
1293
|
if (c == '-') {
|
1322
1294
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
|
1323
1295
|
return emit_current_char(parser, output);
|
@@ -1329,9 +1301,8 @@ static StateResult handle_script_escaped_start_state(
|
|
1329
1301
|
}
|
1330
1302
|
|
1331
1303
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
|
1332
|
-
static StateResult handle_script_escaped_start_dash_state(
|
1333
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1334
|
-
int c, GumboToken* output) {
|
1304
|
+
static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
|
1305
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1335
1306
|
if (c == '-') {
|
1336
1307
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
1337
1308
|
return emit_current_char(parser, output);
|
@@ -1343,9 +1314,8 @@ static StateResult handle_script_escaped_start_dash_state(
|
|
1343
1314
|
}
|
1344
1315
|
|
1345
1316
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
|
1346
|
-
static StateResult handle_script_escaped_state(
|
1347
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1348
|
-
int c, GumboToken* output) {
|
1317
|
+
static StateResult handle_script_escaped_state(GumboParser* parser,
|
1318
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1349
1319
|
switch (c) {
|
1350
1320
|
case '-':
|
1351
1321
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
|
@@ -1366,9 +1336,8 @@ static StateResult handle_script_escaped_state(
|
|
1366
1336
|
}
|
1367
1337
|
|
1368
1338
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
|
1369
|
-
static StateResult handle_script_escaped_dash_state(
|
1370
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1371
|
-
int c, GumboToken* output) {
|
1339
|
+
static StateResult handle_script_escaped_dash_state(GumboParser* parser,
|
1340
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1372
1341
|
switch (c) {
|
1373
1342
|
case '-':
|
1374
1343
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
@@ -1392,9 +1361,8 @@ static StateResult handle_script_escaped_dash_state(
|
|
1392
1361
|
}
|
1393
1362
|
|
1394
1363
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
|
1395
|
-
static StateResult handle_script_escaped_dash_dash_state(
|
1396
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1397
|
-
int c, GumboToken* output) {
|
1364
|
+
static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
|
1365
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1398
1366
|
switch (c) {
|
1399
1367
|
case '-':
|
1400
1368
|
return emit_current_char(parser, output);
|
@@ -1420,9 +1388,8 @@ static StateResult handle_script_escaped_dash_dash_state(
|
|
1420
1388
|
}
|
1421
1389
|
|
1422
1390
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
|
1423
|
-
static StateResult handle_script_escaped_lt_state(
|
1424
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1425
|
-
int c, GumboToken* output) {
|
1391
|
+
static StateResult handle_script_escaped_lt_state(GumboParser* parser,
|
1392
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1426
1393
|
assert(temporary_buffer_equals(parser, "<"));
|
1427
1394
|
assert(!tokenizer->_script_data_buffer.length);
|
1428
1395
|
if (c == '/') {
|
@@ -1442,9 +1409,8 @@ static StateResult handle_script_escaped_lt_state(
|
|
1442
1409
|
}
|
1443
1410
|
|
1444
1411
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
|
1445
|
-
static StateResult handle_script_escaped_end_tag_open_state(
|
1446
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1447
|
-
int c, GumboToken* output) {
|
1412
|
+
static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
|
1413
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1448
1414
|
assert(temporary_buffer_equals(parser, "</"));
|
1449
1415
|
if (is_alpha(c)) {
|
1450
1416
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
|
@@ -1458,9 +1424,8 @@ static StateResult handle_script_escaped_end_tag_open_state(
|
|
1458
1424
|
}
|
1459
1425
|
|
1460
1426
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
|
1461
|
-
static StateResult handle_script_escaped_end_tag_name_state(
|
1462
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1463
|
-
int c, GumboToken* output) {
|
1427
|
+
static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
|
1428
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1464
1429
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1465
1430
|
if (is_alpha(c)) {
|
1466
1431
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1491,9 +1456,8 @@ static StateResult handle_script_escaped_end_tag_name_state(
|
|
1491
1456
|
}
|
1492
1457
|
|
1493
1458
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
|
1494
|
-
static StateResult handle_script_double_escaped_start_state(
|
1495
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1496
|
-
int c, GumboToken* output) {
|
1459
|
+
static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
|
1460
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1497
1461
|
switch (c) {
|
1498
1462
|
case '\t':
|
1499
1463
|
case '\n':
|
@@ -1501,9 +1465,11 @@ static StateResult handle_script_double_escaped_start_state(
|
|
1501
1465
|
case ' ':
|
1502
1466
|
case '/':
|
1503
1467
|
case '>':
|
1504
|
-
gumbo_tokenizer_set_state(
|
1505
|
-
|
1506
|
-
|
1468
|
+
gumbo_tokenizer_set_state(
|
1469
|
+
parser, gumbo_string_equals(&kScriptTag,
|
1470
|
+
(GumboStringPiece*) &tokenizer->_script_data_buffer)
|
1471
|
+
? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
|
1472
|
+
: GUMBO_LEX_SCRIPT_ESCAPED);
|
1507
1473
|
return emit_current_char(parser, output);
|
1508
1474
|
default:
|
1509
1475
|
if (is_alpha(c)) {
|
@@ -1519,9 +1485,8 @@ static StateResult handle_script_double_escaped_start_state(
|
|
1519
1485
|
}
|
1520
1486
|
|
1521
1487
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
|
1522
|
-
static StateResult handle_script_double_escaped_state(
|
1523
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1524
|
-
int c, GumboToken* output) {
|
1488
|
+
static StateResult handle_script_double_escaped_state(GumboParser* parser,
|
1489
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1525
1490
|
switch (c) {
|
1526
1491
|
case '-':
|
1527
1492
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
|
@@ -1541,9 +1506,8 @@ static StateResult handle_script_double_escaped_state(
|
|
1541
1506
|
}
|
1542
1507
|
|
1543
1508
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
|
1544
|
-
static StateResult handle_script_double_escaped_dash_state(
|
1545
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1546
|
-
int c, GumboToken* output) {
|
1509
|
+
static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
|
1510
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1547
1511
|
switch (c) {
|
1548
1512
|
case '-':
|
1549
1513
|
gumbo_tokenizer_set_state(
|
@@ -1567,8 +1531,8 @@ static StateResult handle_script_double_escaped_dash_state(
|
|
1567
1531
|
|
1568
1532
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
|
1569
1533
|
static StateResult handle_script_double_escaped_dash_dash_state(
|
1570
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1571
|
-
int c, GumboToken* output) {
|
1534
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
1535
|
+
GumboToken* output) {
|
1572
1536
|
switch (c) {
|
1573
1537
|
case '-':
|
1574
1538
|
return emit_current_char(parser, output);
|
@@ -1592,26 +1556,22 @@ static StateResult handle_script_double_escaped_dash_dash_state(
|
|
1592
1556
|
}
|
1593
1557
|
|
1594
1558
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
|
1595
|
-
static StateResult handle_script_double_escaped_lt_state(
|
1596
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1597
|
-
int c, GumboToken* output) {
|
1559
|
+
static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
|
1560
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1598
1561
|
if (c == '/') {
|
1599
1562
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
|
1600
|
-
|
1601
|
-
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
1563
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
|
1602
1564
|
return emit_current_char(parser, output);
|
1603
1565
|
} else {
|
1604
1566
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1605
1567
|
tokenizer->_reconsume_current_input = true;
|
1606
1568
|
return NEXT_CHAR;
|
1607
1569
|
}
|
1608
|
-
|
1609
1570
|
}
|
1610
1571
|
|
1611
1572
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
|
1612
|
-
static StateResult handle_script_double_escaped_end_state(
|
1613
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1614
|
-
int c, GumboToken* output) {
|
1573
|
+
static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
|
1574
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1615
1575
|
switch (c) {
|
1616
1576
|
case '\t':
|
1617
1577
|
case '\n':
|
@@ -1619,9 +1579,11 @@ static StateResult handle_script_double_escaped_end_state(
|
|
1619
1579
|
case ' ':
|
1620
1580
|
case '/':
|
1621
1581
|
case '>':
|
1622
|
-
gumbo_tokenizer_set_state(
|
1623
|
-
|
1624
|
-
|
1582
|
+
gumbo_tokenizer_set_state(
|
1583
|
+
parser, gumbo_string_equals(&kScriptTag,
|
1584
|
+
(GumboStringPiece*) &tokenizer->_script_data_buffer)
|
1585
|
+
? GUMBO_LEX_SCRIPT_ESCAPED
|
1586
|
+
: GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1625
1587
|
return emit_current_char(parser, output);
|
1626
1588
|
default:
|
1627
1589
|
if (is_alpha(c)) {
|
@@ -1637,9 +1599,8 @@ static StateResult handle_script_double_escaped_end_state(
|
|
1637
1599
|
}
|
1638
1600
|
|
1639
1601
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
|
1640
|
-
static StateResult handle_before_attr_name_state(
|
1641
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1642
|
-
int c, GumboToken* output) {
|
1602
|
+
static StateResult handle_before_attr_name_state(GumboParser* parser,
|
1603
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1643
1604
|
switch (c) {
|
1644
1605
|
case '\t':
|
1645
1606
|
case '\n':
|
@@ -1667,7 +1628,7 @@ static StateResult handle_before_attr_name_state(
|
|
1667
1628
|
case '<':
|
1668
1629
|
case '=':
|
1669
1630
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1670
|
-
|
1631
|
+
// Fall through.
|
1671
1632
|
default:
|
1672
1633
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1673
1634
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1676,9 +1637,8 @@ static StateResult handle_before_attr_name_state(
|
|
1676
1637
|
}
|
1677
1638
|
|
1678
1639
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
|
1679
|
-
static StateResult handle_attr_name_state(
|
1680
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1681
|
-
int c, GumboToken* output) {
|
1640
|
+
static StateResult handle_attr_name_state(GumboParser* parser,
|
1641
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1682
1642
|
switch (c) {
|
1683
1643
|
case '\t':
|
1684
1644
|
case '\n':
|
@@ -1712,7 +1672,7 @@ static StateResult handle_attr_name_state(
|
|
1712
1672
|
case '\'':
|
1713
1673
|
case '<':
|
1714
1674
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1715
|
-
|
1675
|
+
// Fall through.
|
1716
1676
|
default:
|
1717
1677
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1718
1678
|
return NEXT_CHAR;
|
@@ -1720,9 +1680,8 @@ static StateResult handle_attr_name_state(
|
|
1720
1680
|
}
|
1721
1681
|
|
1722
1682
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
|
1723
|
-
static StateResult handle_after_attr_name_state(
|
1724
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1725
|
-
int c, GumboToken* output) {
|
1683
|
+
static StateResult handle_after_attr_name_state(GumboParser* parser,
|
1684
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1726
1685
|
switch (c) {
|
1727
1686
|
case '\t':
|
1728
1687
|
case '\n':
|
@@ -1752,7 +1711,7 @@ static StateResult handle_after_attr_name_state(
|
|
1752
1711
|
case '\'':
|
1753
1712
|
case '<':
|
1754
1713
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1755
|
-
|
1714
|
+
// Fall through.
|
1756
1715
|
default:
|
1757
1716
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1758
1717
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1761,9 +1720,8 @@ static StateResult handle_after_attr_name_state(
|
|
1761
1720
|
}
|
1762
1721
|
|
1763
1722
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
|
1764
|
-
static StateResult handle_before_attr_value_state(
|
1765
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1766
|
-
int c, GumboToken* output) {
|
1723
|
+
static StateResult handle_before_attr_value_state(GumboParser* parser,
|
1724
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1767
1725
|
switch (c) {
|
1768
1726
|
case '\t':
|
1769
1727
|
case '\n':
|
@@ -1802,7 +1760,7 @@ static StateResult handle_before_attr_value_state(
|
|
1802
1760
|
case '=':
|
1803
1761
|
case '`':
|
1804
1762
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1805
|
-
|
1763
|
+
// Fall through.
|
1806
1764
|
default:
|
1807
1765
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1808
1766
|
append_char_to_tag_buffer(parser, c, true);
|
@@ -1811,9 +1769,8 @@ static StateResult handle_before_attr_value_state(
|
|
1811
1769
|
}
|
1812
1770
|
|
1813
1771
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
|
1814
|
-
static StateResult handle_attr_value_double_quoted_state(
|
1815
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1816
|
-
int c, GumboToken* output) {
|
1772
|
+
static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
|
1773
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1817
1774
|
switch (c) {
|
1818
1775
|
case '"':
|
1819
1776
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1840,9 +1797,8 @@ static StateResult handle_attr_value_double_quoted_state(
|
|
1840
1797
|
}
|
1841
1798
|
|
1842
1799
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
|
1843
|
-
static StateResult handle_attr_value_single_quoted_state(
|
1844
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1845
|
-
int c, GumboToken* output) {
|
1800
|
+
static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
|
1801
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1846
1802
|
switch (c) {
|
1847
1803
|
case '\'':
|
1848
1804
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1869,9 +1825,8 @@ static StateResult handle_attr_value_single_quoted_state(
|
|
1869
1825
|
}
|
1870
1826
|
|
1871
1827
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
|
1872
|
-
static StateResult handle_attr_value_unquoted_state(
|
1873
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1874
|
-
int c, GumboToken* output) {
|
1828
|
+
static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
|
1829
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1875
1830
|
switch (c) {
|
1876
1831
|
case '\t':
|
1877
1832
|
case '\n':
|
@@ -1905,7 +1860,7 @@ static StateResult handle_attr_value_unquoted_state(
|
|
1905
1860
|
case '\'':
|
1906
1861
|
case '`':
|
1907
1862
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1908
|
-
|
1863
|
+
// Fall through.
|
1909
1864
|
default:
|
1910
1865
|
append_char_to_tag_buffer(parser, c, true);
|
1911
1866
|
return NEXT_CHAR;
|
@@ -1913,9 +1868,8 @@ static StateResult handle_attr_value_unquoted_state(
|
|
1913
1868
|
}
|
1914
1869
|
|
1915
1870
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
|
1916
|
-
static StateResult handle_char_ref_in_attr_value_state(
|
1917
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1918
|
-
int c, GumboToken* output) {
|
1871
|
+
static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
|
1872
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1919
1873
|
OneOrTwoCodepoints char_ref;
|
1920
1874
|
int allowed_char;
|
1921
1875
|
bool is_unquoted = false;
|
@@ -1956,9 +1910,8 @@ static StateResult handle_char_ref_in_attr_value_state(
|
|
1956
1910
|
}
|
1957
1911
|
|
1958
1912
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
|
1959
|
-
static StateResult handle_after_attr_value_quoted_state(
|
1960
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1961
|
-
int c, GumboToken* output) {
|
1913
|
+
static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
|
1914
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1962
1915
|
finish_attribute_value(parser);
|
1963
1916
|
switch (c) {
|
1964
1917
|
case '\t':
|
@@ -1988,9 +1941,8 @@ static StateResult handle_after_attr_value_quoted_state(
|
|
1988
1941
|
}
|
1989
1942
|
|
1990
1943
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
|
1991
|
-
static StateResult handle_self_closing_start_tag_state(
|
1992
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1993
|
-
int c, GumboToken* output) {
|
1944
|
+
static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
|
1945
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1994
1946
|
switch (c) {
|
1995
1947
|
case '>':
|
1996
1948
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -2010,9 +1962,8 @@ static StateResult handle_self_closing_start_tag_state(
|
|
2010
1962
|
}
|
2011
1963
|
|
2012
1964
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
|
2013
|
-
static StateResult handle_bogus_comment_state(
|
2014
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2015
|
-
int c, GumboToken* output) {
|
1965
|
+
static StateResult handle_bogus_comment_state(GumboParser* parser,
|
1966
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2016
1967
|
while (c != '>' && c != -1) {
|
2017
1968
|
if (c == '\0') {
|
2018
1969
|
c = 0xFFFD;
|
@@ -2026,15 +1977,14 @@ static StateResult handle_bogus_comment_state(
|
|
2026
1977
|
}
|
2027
1978
|
|
2028
1979
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
|
2029
|
-
static StateResult handle_markup_declaration_state(
|
2030
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2031
|
-
int c, GumboToken* output) {
|
1980
|
+
static StateResult handle_markup_declaration_state(GumboParser* parser,
|
1981
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2032
1982
|
if (utf8iterator_maybe_consume_match(
|
2033
|
-
|
1983
|
+
&tokenizer->_input, "--", sizeof("--") - 1, true)) {
|
2034
1984
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
|
2035
1985
|
tokenizer->_reconsume_current_input = true;
|
2036
1986
|
} else if (utf8iterator_maybe_consume_match(
|
2037
|
-
|
1987
|
+
&tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
|
2038
1988
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
|
2039
1989
|
tokenizer->_reconsume_current_input = true;
|
2040
1990
|
// If we get here, we know we'll eventually emit a doctype token, so now is
|
@@ -2048,7 +1998,7 @@ static StateResult handle_markup_declaration_state(
|
|
2048
1998
|
gumbo_copy_stringz(parser, "");
|
2049
1999
|
} else if (tokenizer->_is_current_node_foreign &&
|
2050
2000
|
utf8iterator_maybe_consume_match(
|
2051
|
-
|
2001
|
+
&tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
|
2052
2002
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
|
2053
2003
|
tokenizer->_is_in_cdata = true;
|
2054
2004
|
tokenizer->_reconsume_current_input = true;
|
@@ -2062,9 +2012,8 @@ static StateResult handle_markup_declaration_state(
|
|
2062
2012
|
}
|
2063
2013
|
|
2064
2014
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
|
2065
|
-
static StateResult handle_comment_start_state(
|
2066
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2067
|
-
int c, GumboToken* output) {
|
2015
|
+
static StateResult handle_comment_start_state(GumboParser* parser,
|
2016
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2068
2017
|
switch (c) {
|
2069
2018
|
case '-':
|
2070
2019
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
|
@@ -2092,9 +2041,8 @@ static StateResult handle_comment_start_state(
|
|
2092
2041
|
}
|
2093
2042
|
|
2094
2043
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
|
2095
|
-
static StateResult handle_comment_start_dash_state(
|
2096
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2097
|
-
int c, GumboToken* output) {
|
2044
|
+
static StateResult handle_comment_start_dash_state(GumboParser* parser,
|
2045
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2098
2046
|
switch (c) {
|
2099
2047
|
case '-':
|
2100
2048
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2124,9 +2072,8 @@ static StateResult handle_comment_start_dash_state(
|
|
2124
2072
|
}
|
2125
2073
|
|
2126
2074
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
|
2127
|
-
static StateResult handle_comment_state(
|
2128
|
-
|
2129
|
-
int c, GumboToken* output) {
|
2075
|
+
static StateResult handle_comment_state(GumboParser* parser,
|
2076
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2130
2077
|
switch (c) {
|
2131
2078
|
case '-':
|
2132
2079
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2147,9 +2094,8 @@ static StateResult handle_comment_state(
|
|
2147
2094
|
}
|
2148
2095
|
|
2149
2096
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
|
2150
|
-
static StateResult handle_comment_end_dash_state(
|
2151
|
-
|
2152
|
-
int c, GumboToken* output) {
|
2097
|
+
static StateResult handle_comment_end_dash_state(GumboParser* parser,
|
2098
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2153
2099
|
switch (c) {
|
2154
2100
|
case '-':
|
2155
2101
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2174,9 +2120,8 @@ static StateResult handle_comment_end_dash_state(
|
|
2174
2120
|
}
|
2175
2121
|
|
2176
2122
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
|
2177
|
-
static StateResult handle_comment_end_state(
|
2178
|
-
|
2179
|
-
int c, GumboToken* output) {
|
2123
|
+
static StateResult handle_comment_end_state(GumboParser* parser,
|
2124
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2180
2125
|
switch (c) {
|
2181
2126
|
case '>':
|
2182
2127
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -2189,11 +2134,13 @@ static StateResult handle_comment_end_state(
|
|
2189
2134
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2190
2135
|
return NEXT_CHAR;
|
2191
2136
|
case '!':
|
2192
|
-
tokenizer_add_parse_error(
|
2137
|
+
tokenizer_add_parse_error(
|
2138
|
+
parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
|
2193
2139
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
|
2194
2140
|
return NEXT_CHAR;
|
2195
2141
|
case '-':
|
2196
|
-
tokenizer_add_parse_error(
|
2142
|
+
tokenizer_add_parse_error(
|
2143
|
+
parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
|
2197
2144
|
append_char_to_temporary_buffer(parser, '-');
|
2198
2145
|
return NEXT_CHAR;
|
2199
2146
|
case -1:
|
@@ -2212,9 +2159,8 @@ static StateResult handle_comment_end_state(
|
|
2212
2159
|
}
|
2213
2160
|
|
2214
2161
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
|
2215
|
-
static StateResult handle_comment_end_bang_state(
|
2216
|
-
|
2217
|
-
int c, GumboToken* output) {
|
2162
|
+
static StateResult handle_comment_end_bang_state(GumboParser* parser,
|
2163
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2218
2164
|
switch (c) {
|
2219
2165
|
case '-':
|
2220
2166
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2249,9 +2195,8 @@ static StateResult handle_comment_end_bang_state(
|
|
2249
2195
|
}
|
2250
2196
|
|
2251
2197
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
|
2252
|
-
static StateResult handle_doctype_state(
|
2253
|
-
|
2254
|
-
int c, GumboToken* output) {
|
2198
|
+
static StateResult handle_doctype_state(GumboParser* parser,
|
2199
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2255
2200
|
assert(!tokenizer->_temporary_buffer.length);
|
2256
2201
|
switch (c) {
|
2257
2202
|
case '\t':
|
@@ -2276,9 +2221,8 @@ static StateResult handle_doctype_state(
|
|
2276
2221
|
}
|
2277
2222
|
|
2278
2223
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
|
2279
|
-
static StateResult handle_before_doctype_name_state(
|
2280
|
-
|
2281
|
-
int c, GumboToken* output) {
|
2224
|
+
static StateResult handle_before_doctype_name_state(GumboParser* parser,
|
2225
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2282
2226
|
switch (c) {
|
2283
2227
|
case '\t':
|
2284
2228
|
case '\n':
|
@@ -2312,9 +2256,8 @@ static StateResult handle_before_doctype_name_state(
|
|
2312
2256
|
}
|
2313
2257
|
|
2314
2258
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
|
2315
|
-
static StateResult handle_doctype_name_state(
|
2316
|
-
|
2317
|
-
int c, GumboToken* output) {
|
2259
|
+
static StateResult handle_doctype_name_state(GumboParser* parser,
|
2260
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2318
2261
|
switch (c) {
|
2319
2262
|
case '\t':
|
2320
2263
|
case '\n':
|
@@ -2322,14 +2265,12 @@ static StateResult handle_doctype_name_state(
|
|
2322
2265
|
case ' ':
|
2323
2266
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
|
2324
2267
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2325
|
-
finish_temporary_buffer(
|
2326
|
-
parser, &tokenizer->_doc_type_state.name);
|
2268
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2327
2269
|
return NEXT_CHAR;
|
2328
2270
|
case '>':
|
2329
2271
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2330
2272
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2331
|
-
finish_temporary_buffer(
|
2332
|
-
parser, &tokenizer->_doc_type_state.name);
|
2273
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2333
2274
|
emit_doctype(parser, output);
|
2334
2275
|
return RETURN_SUCCESS;
|
2335
2276
|
case '\0':
|
@@ -2341,8 +2282,7 @@ static StateResult handle_doctype_name_state(
|
|
2341
2282
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2342
2283
|
tokenizer->_doc_type_state.force_quirks = true;
|
2343
2284
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2344
|
-
finish_temporary_buffer(
|
2345
|
-
parser, &tokenizer->_doc_type_state.name);
|
2285
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2346
2286
|
emit_doctype(parser, output);
|
2347
2287
|
return RETURN_ERROR;
|
2348
2288
|
default:
|
@@ -2354,9 +2294,8 @@ static StateResult handle_doctype_name_state(
|
|
2354
2294
|
}
|
2355
2295
|
|
2356
2296
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
|
2357
|
-
static StateResult handle_after_doctype_name_state(
|
2358
|
-
|
2359
|
-
int c, GumboToken* output) {
|
2297
|
+
static StateResult handle_after_doctype_name_state(GumboParser* parser,
|
2298
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2360
2299
|
switch (c) {
|
2361
2300
|
case '\t':
|
2362
2301
|
case '\n':
|
@@ -2375,17 +2314,18 @@ static StateResult handle_after_doctype_name_state(
|
|
2375
2314
|
return RETURN_ERROR;
|
2376
2315
|
default:
|
2377
2316
|
if (utf8iterator_maybe_consume_match(
|
2378
|
-
|
2317
|
+
&tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
|
2379
2318
|
gumbo_tokenizer_set_state(
|
2380
2319
|
parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2381
2320
|
tokenizer->_reconsume_current_input = true;
|
2382
|
-
} else if (utf8iterator_maybe_consume_match(
|
2383
|
-
|
2321
|
+
} else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
|
2322
|
+
sizeof("SYSTEM") - 1, false)) {
|
2384
2323
|
gumbo_tokenizer_set_state(
|
2385
2324
|
parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2386
2325
|
tokenizer->_reconsume_current_input = true;
|
2387
2326
|
} else {
|
2388
|
-
tokenizer_add_parse_error(
|
2327
|
+
tokenizer_add_parse_error(
|
2328
|
+
parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
|
2389
2329
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2390
2330
|
tokenizer->_doc_type_state.force_quirks = true;
|
2391
2331
|
}
|
@@ -2395,15 +2335,14 @@ static StateResult handle_after_doctype_name_state(
|
|
2395
2335
|
|
2396
2336
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
|
2397
2337
|
static StateResult handle_after_doctype_public_keyword_state(
|
2398
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2399
|
-
|
2338
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2339
|
+
GumboToken* output) {
|
2400
2340
|
switch (c) {
|
2401
2341
|
case '\t':
|
2402
2342
|
case '\n':
|
2403
2343
|
case '\f':
|
2404
2344
|
case ' ':
|
2405
|
-
gumbo_tokenizer_set_state(
|
2406
|
-
parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2345
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2407
2346
|
return NEXT_CHAR;
|
2408
2347
|
case '"':
|
2409
2348
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
@@ -2439,9 +2378,8 @@ static StateResult handle_after_doctype_public_keyword_state(
|
|
2439
2378
|
}
|
2440
2379
|
|
2441
2380
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
|
2442
|
-
static StateResult handle_before_doctype_public_id_state(
|
2443
|
-
|
2444
|
-
int c, GumboToken* output) {
|
2381
|
+
static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
|
2382
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2445
2383
|
switch (c) {
|
2446
2384
|
case '\t':
|
2447
2385
|
case '\n':
|
@@ -2481,8 +2419,8 @@ static StateResult handle_before_doctype_public_id_state(
|
|
2481
2419
|
|
2482
2420
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
|
2483
2421
|
static StateResult handle_doctype_public_id_double_quoted_state(
|
2484
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2485
|
-
|
2422
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2423
|
+
GumboToken* output) {
|
2486
2424
|
switch (c) {
|
2487
2425
|
case '"':
|
2488
2426
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2514,8 +2452,8 @@ static StateResult handle_doctype_public_id_double_quoted_state(
|
|
2514
2452
|
|
2515
2453
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
|
2516
2454
|
static StateResult handle_doctype_public_id_single_quoted_state(
|
2517
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2518
|
-
|
2455
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2456
|
+
GumboToken* output) {
|
2519
2457
|
switch (c) {
|
2520
2458
|
case '\'':
|
2521
2459
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2546,9 +2484,8 @@ static StateResult handle_doctype_public_id_single_quoted_state(
|
|
2546
2484
|
}
|
2547
2485
|
|
2548
2486
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
|
2549
|
-
static StateResult handle_after_doctype_public_id_state(
|
2550
|
-
|
2551
|
-
int c, GumboToken* output) {
|
2487
|
+
static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
|
2488
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2552
2489
|
switch (c) {
|
2553
2490
|
case '\t':
|
2554
2491
|
case '\n':
|
@@ -2590,8 +2527,8 @@ static StateResult handle_after_doctype_public_id_state(
|
|
2590
2527
|
|
2591
2528
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
|
2592
2529
|
static StateResult handle_between_doctype_public_system_id_state(
|
2593
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2594
|
-
|
2530
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2531
|
+
GumboToken* output) {
|
2595
2532
|
switch (c) {
|
2596
2533
|
case '\t':
|
2597
2534
|
case '\n':
|
@@ -2629,8 +2566,8 @@ static StateResult handle_between_doctype_public_system_id_state(
|
|
2629
2566
|
|
2630
2567
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
|
2631
2568
|
static StateResult handle_after_doctype_system_keyword_state(
|
2632
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2633
|
-
|
2569
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2570
|
+
GumboToken* output) {
|
2634
2571
|
switch (c) {
|
2635
2572
|
case '\t':
|
2636
2573
|
case '\n':
|
@@ -2671,9 +2608,8 @@ static StateResult handle_after_doctype_system_keyword_state(
|
|
2671
2608
|
}
|
2672
2609
|
|
2673
2610
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
|
2674
|
-
static StateResult handle_before_doctype_system_id_state(
|
2675
|
-
|
2676
|
-
int c, GumboToken* output) {
|
2611
|
+
static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
|
2612
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2677
2613
|
switch (c) {
|
2678
2614
|
case '\t':
|
2679
2615
|
case '\n':
|
@@ -2712,8 +2648,8 @@ static StateResult handle_before_doctype_system_id_state(
|
|
2712
2648
|
|
2713
2649
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
|
2714
2650
|
static StateResult handle_doctype_system_id_double_quoted_state(
|
2715
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2716
|
-
|
2651
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2652
|
+
GumboToken* output) {
|
2717
2653
|
switch (c) {
|
2718
2654
|
case '"':
|
2719
2655
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2745,8 +2681,8 @@ static StateResult handle_doctype_system_id_double_quoted_state(
|
|
2745
2681
|
|
2746
2682
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
|
2747
2683
|
static StateResult handle_doctype_system_id_single_quoted_state(
|
2748
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2749
|
-
|
2684
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2685
|
+
GumboToken* output) {
|
2750
2686
|
switch (c) {
|
2751
2687
|
case '\'':
|
2752
2688
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2777,9 +2713,8 @@ static StateResult handle_doctype_system_id_single_quoted_state(
|
|
2777
2713
|
}
|
2778
2714
|
|
2779
2715
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
|
2780
|
-
static StateResult handle_after_doctype_system_id_state(
|
2781
|
-
|
2782
|
-
int c, GumboToken* output) {
|
2716
|
+
static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
|
2717
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2783
2718
|
switch (c) {
|
2784
2719
|
case '\t':
|
2785
2720
|
case '\n':
|
@@ -2804,9 +2739,8 @@ static StateResult handle_after_doctype_system_id_state(
|
|
2804
2739
|
}
|
2805
2740
|
|
2806
2741
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
|
2807
|
-
static StateResult handle_bogus_doctype_state(
|
2808
|
-
|
2809
|
-
int c, GumboToken* output) {
|
2742
|
+
static StateResult handle_bogus_doctype_state(GumboParser* parser,
|
2743
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2810
2744
|
if (c == '>' || c == -1) {
|
2811
2745
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2812
2746
|
emit_doctype(parser, output);
|
@@ -2816,15 +2750,14 @@ static StateResult handle_bogus_doctype_state(
|
|
2816
2750
|
}
|
2817
2751
|
|
2818
2752
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
|
2819
|
-
static StateResult handle_cdata_state(
|
2820
|
-
|
2821
|
-
int c, GumboToken* output) {
|
2753
|
+
static StateResult handle_cdata_state(GumboParser* parser,
|
2754
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2822
2755
|
if (c == -1 || utf8iterator_maybe_consume_match(
|
2823
|
-
|
2756
|
+
&tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
|
2824
2757
|
tokenizer->_reconsume_current_input = true;
|
2825
2758
|
reset_token_start_point(tokenizer);
|
2826
2759
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2827
|
-
tokenizer->_is_in_cdata =
|
2760
|
+
tokenizer->_is_in_cdata = false;
|
2828
2761
|
return NEXT_CHAR;
|
2829
2762
|
} else {
|
2830
2763
|
return emit_current_char(parser, output);
|
@@ -2834,76 +2767,47 @@ static StateResult handle_cdata_state(
|
|
2834
2767
|
typedef StateResult (*GumboLexerStateFunction)(
|
2835
2768
|
GumboParser*, GumboTokenizerState*, int, GumboToken*);
|
2836
2769
|
|
2837
|
-
static GumboLexerStateFunction dispatch_table[] = {
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2845
|
-
|
2846
|
-
|
2847
|
-
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
|
2852
|
-
|
2853
|
-
|
2854
|
-
|
2855
|
-
|
2856
|
-
|
2857
|
-
|
2858
|
-
|
2859
|
-
|
2860
|
-
|
2861
|
-
|
2862
|
-
|
2863
|
-
|
2864
|
-
|
2865
|
-
|
2866
|
-
|
2867
|
-
|
2868
|
-
|
2869
|
-
|
2870
|
-
|
2871
|
-
|
2872
|
-
|
2873
|
-
|
2874
|
-
|
2875
|
-
|
2876
|
-
|
2877
|
-
|
2878
|
-
handle_char_ref_in_attr_value_state,
|
2879
|
-
handle_after_attr_value_quoted_state,
|
2880
|
-
handle_self_closing_start_tag_state,
|
2881
|
-
handle_bogus_comment_state,
|
2882
|
-
handle_markup_declaration_state,
|
2883
|
-
handle_comment_start_state,
|
2884
|
-
handle_comment_start_dash_state,
|
2885
|
-
handle_comment_state,
|
2886
|
-
handle_comment_end_dash_state,
|
2887
|
-
handle_comment_end_state,
|
2888
|
-
handle_comment_end_bang_state,
|
2889
|
-
handle_doctype_state,
|
2890
|
-
handle_before_doctype_name_state,
|
2891
|
-
handle_doctype_name_state,
|
2892
|
-
handle_after_doctype_name_state,
|
2893
|
-
handle_after_doctype_public_keyword_state,
|
2894
|
-
handle_before_doctype_public_id_state,
|
2895
|
-
handle_doctype_public_id_double_quoted_state,
|
2896
|
-
handle_doctype_public_id_single_quoted_state,
|
2897
|
-
handle_after_doctype_public_id_state,
|
2898
|
-
handle_between_doctype_public_system_id_state,
|
2899
|
-
handle_after_doctype_system_keyword_state,
|
2900
|
-
handle_before_doctype_system_id_state,
|
2901
|
-
handle_doctype_system_id_double_quoted_state,
|
2902
|
-
handle_doctype_system_id_single_quoted_state,
|
2903
|
-
handle_after_doctype_system_id_state,
|
2904
|
-
handle_bogus_doctype_state,
|
2905
|
-
handle_cdata_state
|
2906
|
-
};
|
2770
|
+
static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
|
2771
|
+
handle_char_ref_in_data_state, handle_rcdata_state,
|
2772
|
+
handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
|
2773
|
+
handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
|
2774
|
+
handle_tag_name_state, handle_rcdata_lt_state,
|
2775
|
+
handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
|
2776
|
+
handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
|
2777
|
+
handle_rawtext_end_tag_name_state, handle_script_lt_state,
|
2778
|
+
handle_script_end_tag_open_state, handle_script_end_tag_name_state,
|
2779
|
+
handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
|
2780
|
+
handle_script_escaped_state, handle_script_escaped_dash_state,
|
2781
|
+
handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
|
2782
|
+
handle_script_escaped_end_tag_open_state,
|
2783
|
+
handle_script_escaped_end_tag_name_state,
|
2784
|
+
handle_script_double_escaped_start_state,
|
2785
|
+
handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
|
2786
|
+
handle_script_double_escaped_dash_dash_state,
|
2787
|
+
handle_script_double_escaped_lt_state,
|
2788
|
+
handle_script_double_escaped_end_state, handle_before_attr_name_state,
|
2789
|
+
handle_attr_name_state, handle_after_attr_name_state,
|
2790
|
+
handle_before_attr_value_state, handle_attr_value_double_quoted_state,
|
2791
|
+
handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
|
2792
|
+
handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
|
2793
|
+
handle_self_closing_start_tag_state, handle_bogus_comment_state,
|
2794
|
+
handle_markup_declaration_state, handle_comment_start_state,
|
2795
|
+
handle_comment_start_dash_state, handle_comment_state,
|
2796
|
+
handle_comment_end_dash_state, handle_comment_end_state,
|
2797
|
+
handle_comment_end_bang_state, handle_doctype_state,
|
2798
|
+
handle_before_doctype_name_state, handle_doctype_name_state,
|
2799
|
+
handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
|
2800
|
+
handle_before_doctype_public_id_state,
|
2801
|
+
handle_doctype_public_id_double_quoted_state,
|
2802
|
+
handle_doctype_public_id_single_quoted_state,
|
2803
|
+
handle_after_doctype_public_id_state,
|
2804
|
+
handle_between_doctype_public_system_id_state,
|
2805
|
+
handle_after_doctype_system_keyword_state,
|
2806
|
+
handle_before_doctype_system_id_state,
|
2807
|
+
handle_doctype_system_id_double_quoted_state,
|
2808
|
+
handle_doctype_system_id_single_quoted_state,
|
2809
|
+
handle_after_doctype_system_id_state, handle_bogus_doctype_state,
|
2810
|
+
handle_cdata_state};
|
2907
2811
|
|
2908
2812
|
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
2909
2813
|
// Because of the spec requirements that...
|
@@ -2941,8 +2845,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2941
2845
|
assert(!tokenizer->_temporary_buffer_emit);
|
2942
2846
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
2943
2847
|
int c = utf8iterator_current(&tokenizer->_input);
|
2944
|
-
gumbo_debug(
|
2945
|
-
c, c, tokenizer->_state);
|
2848
|
+
gumbo_debug(
|
2849
|
+
"Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
|
2946
2850
|
StateResult result =
|
2947
2851
|
dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
|
2948
2852
|
// We need to clear reconsume_current_input before returning to prevent
|
@@ -2952,7 +2856,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2952
2856
|
|
2953
2857
|
if (result == RETURN_SUCCESS) {
|
2954
2858
|
return true;
|
2955
|
-
} else if(result == RETURN_ERROR) {
|
2859
|
+
} else if (result == RETURN_ERROR) {
|
2956
2860
|
return false;
|
2957
2861
|
}
|
2958
2862
|
|
@@ -2974,7 +2878,7 @@ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
|
|
2974
2878
|
parser, (void*) token->v.doc_type.system_identifier);
|
2975
2879
|
return;
|
2976
2880
|
case GUMBO_TOKEN_START_TAG:
|
2977
|
-
for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
|
2881
|
+
for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
|
2978
2882
|
GumboAttribute* attr = token->v.start_tag.attributes.data[i];
|
2979
2883
|
if (attr) {
|
2980
2884
|
// May have been nulled out if this token was merged with another.
|