nokogumbo 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
@@ -0,0 +1,153 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
GUMBO_TAG_HTML,
|
5
|
+
GUMBO_TAG_HEAD,
|
6
|
+
GUMBO_TAG_TITLE,
|
7
|
+
GUMBO_TAG_BASE,
|
8
|
+
GUMBO_TAG_LINK,
|
9
|
+
GUMBO_TAG_META,
|
10
|
+
GUMBO_TAG_STYLE,
|
11
|
+
GUMBO_TAG_SCRIPT,
|
12
|
+
GUMBO_TAG_NOSCRIPT,
|
13
|
+
GUMBO_TAG_TEMPLATE,
|
14
|
+
GUMBO_TAG_BODY,
|
15
|
+
GUMBO_TAG_ARTICLE,
|
16
|
+
GUMBO_TAG_SECTION,
|
17
|
+
GUMBO_TAG_NAV,
|
18
|
+
GUMBO_TAG_ASIDE,
|
19
|
+
GUMBO_TAG_H1,
|
20
|
+
GUMBO_TAG_H2,
|
21
|
+
GUMBO_TAG_H3,
|
22
|
+
GUMBO_TAG_H4,
|
23
|
+
GUMBO_TAG_H5,
|
24
|
+
GUMBO_TAG_H6,
|
25
|
+
GUMBO_TAG_HGROUP,
|
26
|
+
GUMBO_TAG_HEADER,
|
27
|
+
GUMBO_TAG_FOOTER,
|
28
|
+
GUMBO_TAG_ADDRESS,
|
29
|
+
GUMBO_TAG_P,
|
30
|
+
GUMBO_TAG_HR,
|
31
|
+
GUMBO_TAG_PRE,
|
32
|
+
GUMBO_TAG_BLOCKQUOTE,
|
33
|
+
GUMBO_TAG_OL,
|
34
|
+
GUMBO_TAG_UL,
|
35
|
+
GUMBO_TAG_LI,
|
36
|
+
GUMBO_TAG_DL,
|
37
|
+
GUMBO_TAG_DT,
|
38
|
+
GUMBO_TAG_DD,
|
39
|
+
GUMBO_TAG_FIGURE,
|
40
|
+
GUMBO_TAG_FIGCAPTION,
|
41
|
+
GUMBO_TAG_MAIN,
|
42
|
+
GUMBO_TAG_DIV,
|
43
|
+
GUMBO_TAG_A,
|
44
|
+
GUMBO_TAG_EM,
|
45
|
+
GUMBO_TAG_STRONG,
|
46
|
+
GUMBO_TAG_SMALL,
|
47
|
+
GUMBO_TAG_S,
|
48
|
+
GUMBO_TAG_CITE,
|
49
|
+
GUMBO_TAG_Q,
|
50
|
+
GUMBO_TAG_DFN,
|
51
|
+
GUMBO_TAG_ABBR,
|
52
|
+
GUMBO_TAG_DATA,
|
53
|
+
GUMBO_TAG_TIME,
|
54
|
+
GUMBO_TAG_CODE,
|
55
|
+
GUMBO_TAG_VAR,
|
56
|
+
GUMBO_TAG_SAMP,
|
57
|
+
GUMBO_TAG_KBD,
|
58
|
+
GUMBO_TAG_SUB,
|
59
|
+
GUMBO_TAG_SUP,
|
60
|
+
GUMBO_TAG_I,
|
61
|
+
GUMBO_TAG_B,
|
62
|
+
GUMBO_TAG_U,
|
63
|
+
GUMBO_TAG_MARK,
|
64
|
+
GUMBO_TAG_RUBY,
|
65
|
+
GUMBO_TAG_RT,
|
66
|
+
GUMBO_TAG_RP,
|
67
|
+
GUMBO_TAG_BDI,
|
68
|
+
GUMBO_TAG_BDO,
|
69
|
+
GUMBO_TAG_SPAN,
|
70
|
+
GUMBO_TAG_BR,
|
71
|
+
GUMBO_TAG_WBR,
|
72
|
+
GUMBO_TAG_INS,
|
73
|
+
GUMBO_TAG_DEL,
|
74
|
+
GUMBO_TAG_IMAGE,
|
75
|
+
GUMBO_TAG_IMG,
|
76
|
+
GUMBO_TAG_IFRAME,
|
77
|
+
GUMBO_TAG_EMBED,
|
78
|
+
GUMBO_TAG_OBJECT,
|
79
|
+
GUMBO_TAG_PARAM,
|
80
|
+
GUMBO_TAG_VIDEO,
|
81
|
+
GUMBO_TAG_AUDIO,
|
82
|
+
GUMBO_TAG_SOURCE,
|
83
|
+
GUMBO_TAG_TRACK,
|
84
|
+
GUMBO_TAG_CANVAS,
|
85
|
+
GUMBO_TAG_MAP,
|
86
|
+
GUMBO_TAG_AREA,
|
87
|
+
GUMBO_TAG_MATH,
|
88
|
+
GUMBO_TAG_MI,
|
89
|
+
GUMBO_TAG_MO,
|
90
|
+
GUMBO_TAG_MN,
|
91
|
+
GUMBO_TAG_MS,
|
92
|
+
GUMBO_TAG_MTEXT,
|
93
|
+
GUMBO_TAG_MGLYPH,
|
94
|
+
GUMBO_TAG_MALIGNMARK,
|
95
|
+
GUMBO_TAG_ANNOTATION_XML,
|
96
|
+
GUMBO_TAG_SVG,
|
97
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
98
|
+
GUMBO_TAG_DESC,
|
99
|
+
GUMBO_TAG_TABLE,
|
100
|
+
GUMBO_TAG_CAPTION,
|
101
|
+
GUMBO_TAG_COLGROUP,
|
102
|
+
GUMBO_TAG_COL,
|
103
|
+
GUMBO_TAG_TBODY,
|
104
|
+
GUMBO_TAG_THEAD,
|
105
|
+
GUMBO_TAG_TFOOT,
|
106
|
+
GUMBO_TAG_TR,
|
107
|
+
GUMBO_TAG_TD,
|
108
|
+
GUMBO_TAG_TH,
|
109
|
+
GUMBO_TAG_FORM,
|
110
|
+
GUMBO_TAG_FIELDSET,
|
111
|
+
GUMBO_TAG_LEGEND,
|
112
|
+
GUMBO_TAG_LABEL,
|
113
|
+
GUMBO_TAG_INPUT,
|
114
|
+
GUMBO_TAG_BUTTON,
|
115
|
+
GUMBO_TAG_SELECT,
|
116
|
+
GUMBO_TAG_DATALIST,
|
117
|
+
GUMBO_TAG_OPTGROUP,
|
118
|
+
GUMBO_TAG_OPTION,
|
119
|
+
GUMBO_TAG_TEXTAREA,
|
120
|
+
GUMBO_TAG_KEYGEN,
|
121
|
+
GUMBO_TAG_OUTPUT,
|
122
|
+
GUMBO_TAG_PROGRESS,
|
123
|
+
GUMBO_TAG_METER,
|
124
|
+
GUMBO_TAG_DETAILS,
|
125
|
+
GUMBO_TAG_SUMMARY,
|
126
|
+
GUMBO_TAG_MENU,
|
127
|
+
GUMBO_TAG_MENUITEM,
|
128
|
+
GUMBO_TAG_APPLET,
|
129
|
+
GUMBO_TAG_ACRONYM,
|
130
|
+
GUMBO_TAG_BGSOUND,
|
131
|
+
GUMBO_TAG_DIR,
|
132
|
+
GUMBO_TAG_FRAME,
|
133
|
+
GUMBO_TAG_FRAMESET,
|
134
|
+
GUMBO_TAG_NOFRAMES,
|
135
|
+
GUMBO_TAG_ISINDEX,
|
136
|
+
GUMBO_TAG_LISTING,
|
137
|
+
GUMBO_TAG_XMP,
|
138
|
+
GUMBO_TAG_NEXTID,
|
139
|
+
GUMBO_TAG_NOEMBED,
|
140
|
+
GUMBO_TAG_PLAINTEXT,
|
141
|
+
GUMBO_TAG_RB,
|
142
|
+
GUMBO_TAG_STRIKE,
|
143
|
+
GUMBO_TAG_BASEFONT,
|
144
|
+
GUMBO_TAG_BIG,
|
145
|
+
GUMBO_TAG_BLINK,
|
146
|
+
GUMBO_TAG_CENTER,
|
147
|
+
GUMBO_TAG_FONT,
|
148
|
+
GUMBO_TAG_MARQUEE,
|
149
|
+
GUMBO_TAG_MULTICOL,
|
150
|
+
GUMBO_TAG_NOBR,
|
151
|
+
GUMBO_TAG_SPACER,
|
152
|
+
GUMBO_TAG_TT,
|
153
|
+
GUMBO_TAG_RTC,
|
@@ -0,0 +1,105 @@
|
|
1
|
+
static unsigned int tag_hash(
|
2
|
+
register const char *str, register unsigned int len) {
|
3
|
+
static unsigned short asso_values[] = {296, 296, 296, 296, 296, 296, 296, 296,
|
4
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
5
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
6
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 6, 4, 3, 1, 1, 0,
|
7
|
+
1, 0, 0, 296, 296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2,
|
8
|
+
69, 0, 134, 9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296,
|
9
|
+
296, 296, 296, 296, 296, 296, 22, 73, 151, 4, 13, 59, 65, 2, 69, 0, 134,
|
10
|
+
9, 16, 52, 55, 28, 101, 0, 1, 6, 63, 126, 104, 93, 124, 296, 296, 296,
|
11
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
12
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
13
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
14
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
15
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
16
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
17
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
18
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296,
|
19
|
+
296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296, 296};
|
20
|
+
register unsigned int hval = len;
|
21
|
+
|
22
|
+
switch (hval) {
|
23
|
+
default:
|
24
|
+
hval += asso_values[(unsigned char) str[1] + 3];
|
25
|
+
/*FALLTHROUGH*/
|
26
|
+
case 1:
|
27
|
+
hval += asso_values[(unsigned char) str[0]];
|
28
|
+
break;
|
29
|
+
}
|
30
|
+
return hval + asso_values[(unsigned char) str[len - 1]];
|
31
|
+
}
|
32
|
+
|
33
|
+
static const unsigned char kGumboTagMap[] = {GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
34
|
+
GUMBO_TAG_LAST, GUMBO_TAG_S, GUMBO_TAG_H6, GUMBO_TAG_H5, GUMBO_TAG_H4,
|
35
|
+
GUMBO_TAG_H3, GUMBO_TAG_SPACER, GUMBO_TAG_H2, GUMBO_TAG_HEADER,
|
36
|
+
GUMBO_TAG_H1, GUMBO_TAG_HEAD, GUMBO_TAG_LAST, GUMBO_TAG_DETAILS,
|
37
|
+
GUMBO_TAG_SELECT, GUMBO_TAG_DIR, GUMBO_TAG_LAST, GUMBO_TAG_DEL,
|
38
|
+
GUMBO_TAG_LAST, GUMBO_TAG_SOURCE, GUMBO_TAG_LEGEND, GUMBO_TAG_DATALIST,
|
39
|
+
GUMBO_TAG_METER, GUMBO_TAG_MGLYPH, GUMBO_TAG_LAST, GUMBO_TAG_MATH,
|
40
|
+
GUMBO_TAG_LABEL, GUMBO_TAG_TABLE, GUMBO_TAG_TEMPLATE, GUMBO_TAG_LAST,
|
41
|
+
GUMBO_TAG_RP, GUMBO_TAG_TIME, GUMBO_TAG_TITLE, GUMBO_TAG_DATA,
|
42
|
+
GUMBO_TAG_APPLET, GUMBO_TAG_HGROUP, GUMBO_TAG_SAMP, GUMBO_TAG_TEXTAREA,
|
43
|
+
GUMBO_TAG_ABBR, GUMBO_TAG_MARQUEE, GUMBO_TAG_LAST, GUMBO_TAG_MENUITEM,
|
44
|
+
GUMBO_TAG_SMALL, GUMBO_TAG_META, GUMBO_TAG_A, GUMBO_TAG_LAST,
|
45
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_EMBED,
|
46
|
+
GUMBO_TAG_MAP, GUMBO_TAG_LAST, GUMBO_TAG_PARAM, GUMBO_TAG_LAST,
|
47
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NOBR, GUMBO_TAG_P, GUMBO_TAG_SPAN, GUMBO_TAG_EM,
|
48
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SECTION, GUMBO_TAG_NOEMBED,
|
49
|
+
GUMBO_TAG_NEXTID, GUMBO_TAG_FOOTER, GUMBO_TAG_NOSCRIPT, GUMBO_TAG_HR,
|
50
|
+
GUMBO_TAG_LAST, GUMBO_TAG_FONT, GUMBO_TAG_DL, GUMBO_TAG_TR,
|
51
|
+
GUMBO_TAG_SCRIPT, GUMBO_TAG_MO, GUMBO_TAG_LAST, GUMBO_TAG_DD,
|
52
|
+
GUMBO_TAG_MAIN, GUMBO_TAG_TD, GUMBO_TAG_FOREIGNOBJECT, GUMBO_TAG_FORM,
|
53
|
+
GUMBO_TAG_OBJECT, GUMBO_TAG_LAST, GUMBO_TAG_FIELDSET, GUMBO_TAG_LAST,
|
54
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENU, GUMBO_TAG_TFOOT, GUMBO_TAG_FIGURE,
|
55
|
+
GUMBO_TAG_RB, GUMBO_TAG_LI, GUMBO_TAG_LISTING, GUMBO_TAG_BASEFONT,
|
56
|
+
GUMBO_TAG_OPTGROUP, GUMBO_TAG_LAST, GUMBO_TAG_BASE, GUMBO_TAG_ADDRESS,
|
57
|
+
GUMBO_TAG_MI, GUMBO_TAG_LAST, GUMBO_TAG_PLAINTEXT, GUMBO_TAG_LAST,
|
58
|
+
GUMBO_TAG_PROGRESS, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
59
|
+
GUMBO_TAG_ACRONYM, GUMBO_TAG_ARTICLE, GUMBO_TAG_LAST, GUMBO_TAG_PRE,
|
60
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_AREA,
|
61
|
+
GUMBO_TAG_RT, GUMBO_TAG_LAST, GUMBO_TAG_OPTION, GUMBO_TAG_IMAGE,
|
62
|
+
GUMBO_TAG_DT, GUMBO_TAG_LAST, GUMBO_TAG_TT, GUMBO_TAG_HTML, GUMBO_TAG_WBR,
|
63
|
+
GUMBO_TAG_OL, GUMBO_TAG_LAST, GUMBO_TAG_STYLE, GUMBO_TAG_STRIKE,
|
64
|
+
GUMBO_TAG_SUP, GUMBO_TAG_MULTICOL, GUMBO_TAG_U, GUMBO_TAG_DFN, GUMBO_TAG_UL,
|
65
|
+
GUMBO_TAG_FIGCAPTION, GUMBO_TAG_MTEXT, GUMBO_TAG_LAST, GUMBO_TAG_VAR,
|
66
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_FRAMESET, GUMBO_TAG_LAST,
|
67
|
+
GUMBO_TAG_BR, GUMBO_TAG_I, GUMBO_TAG_FRAME, GUMBO_TAG_LAST, GUMBO_TAG_DIV,
|
68
|
+
GUMBO_TAG_LAST, GUMBO_TAG_TH, GUMBO_TAG_MS, GUMBO_TAG_ANNOTATION_XML,
|
69
|
+
GUMBO_TAG_B, GUMBO_TAG_TBODY, GUMBO_TAG_THEAD, GUMBO_TAG_BIG,
|
70
|
+
GUMBO_TAG_BLOCKQUOTE, GUMBO_TAG_XMP, GUMBO_TAG_LAST, GUMBO_TAG_KBD,
|
71
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LINK, GUMBO_TAG_IFRAME, GUMBO_TAG_MARK,
|
72
|
+
GUMBO_TAG_CENTER, GUMBO_TAG_OUTPUT, GUMBO_TAG_DESC, GUMBO_TAG_CANVAS,
|
73
|
+
GUMBO_TAG_COL, GUMBO_TAG_MALIGNMARK, GUMBO_TAG_IMG, GUMBO_TAG_ASIDE,
|
74
|
+
GUMBO_TAG_LAST, GUMBO_TAG_CODE, GUMBO_TAG_LAST, GUMBO_TAG_SUB, GUMBO_TAG_MN,
|
75
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_INS, GUMBO_TAG_AUDIO,
|
76
|
+
GUMBO_TAG_STRONG, GUMBO_TAG_CITE, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
77
|
+
GUMBO_TAG_LAST, GUMBO_TAG_INPUT, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
78
|
+
GUMBO_TAG_LAST, GUMBO_TAG_NAV, GUMBO_TAG_LAST, GUMBO_TAG_COLGROUP,
|
79
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
80
|
+
GUMBO_TAG_LAST, GUMBO_TAG_SVG, GUMBO_TAG_KEYGEN, GUMBO_TAG_VIDEO,
|
81
|
+
GUMBO_TAG_BDO, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
82
|
+
GUMBO_TAG_LAST, GUMBO_TAG_BODY, GUMBO_TAG_LAST, GUMBO_TAG_Q, GUMBO_TAG_LAST,
|
83
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_TRACK,
|
84
|
+
GUMBO_TAG_LAST, GUMBO_TAG_BDI, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
85
|
+
GUMBO_TAG_LAST, GUMBO_TAG_CAPTION, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
86
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
87
|
+
GUMBO_TAG_RUBY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BUTTON,
|
88
|
+
GUMBO_TAG_SUMMARY, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
89
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
90
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
91
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
92
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
93
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
94
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
95
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
96
|
+
GUMBO_TAG_LAST, GUMBO_TAG_RTC, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
97
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
98
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_BLINK, GUMBO_TAG_LAST,
|
99
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
100
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
101
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
102
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
103
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
104
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_LAST,
|
105
|
+
GUMBO_TAG_LAST, GUMBO_TAG_LAST, GUMBO_TAG_ISINDEX};
|
@@ -0,0 +1,4 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,
|
@@ -0,0 +1,153 @@
|
|
1
|
+
// Generated via `gentags.py src/tag.in`.
|
2
|
+
// Do not edit; edit src/tag.in instead.
|
3
|
+
// clang-format off
|
4
|
+
"html",
|
5
|
+
"head",
|
6
|
+
"title",
|
7
|
+
"base",
|
8
|
+
"link",
|
9
|
+
"meta",
|
10
|
+
"style",
|
11
|
+
"script",
|
12
|
+
"noscript",
|
13
|
+
"template",
|
14
|
+
"body",
|
15
|
+
"article",
|
16
|
+
"section",
|
17
|
+
"nav",
|
18
|
+
"aside",
|
19
|
+
"h1",
|
20
|
+
"h2",
|
21
|
+
"h3",
|
22
|
+
"h4",
|
23
|
+
"h5",
|
24
|
+
"h6",
|
25
|
+
"hgroup",
|
26
|
+
"header",
|
27
|
+
"footer",
|
28
|
+
"address",
|
29
|
+
"p",
|
30
|
+
"hr",
|
31
|
+
"pre",
|
32
|
+
"blockquote",
|
33
|
+
"ol",
|
34
|
+
"ul",
|
35
|
+
"li",
|
36
|
+
"dl",
|
37
|
+
"dt",
|
38
|
+
"dd",
|
39
|
+
"figure",
|
40
|
+
"figcaption",
|
41
|
+
"main",
|
42
|
+
"div",
|
43
|
+
"a",
|
44
|
+
"em",
|
45
|
+
"strong",
|
46
|
+
"small",
|
47
|
+
"s",
|
48
|
+
"cite",
|
49
|
+
"q",
|
50
|
+
"dfn",
|
51
|
+
"abbr",
|
52
|
+
"data",
|
53
|
+
"time",
|
54
|
+
"code",
|
55
|
+
"var",
|
56
|
+
"samp",
|
57
|
+
"kbd",
|
58
|
+
"sub",
|
59
|
+
"sup",
|
60
|
+
"i",
|
61
|
+
"b",
|
62
|
+
"u",
|
63
|
+
"mark",
|
64
|
+
"ruby",
|
65
|
+
"rt",
|
66
|
+
"rp",
|
67
|
+
"bdi",
|
68
|
+
"bdo",
|
69
|
+
"span",
|
70
|
+
"br",
|
71
|
+
"wbr",
|
72
|
+
"ins",
|
73
|
+
"del",
|
74
|
+
"image",
|
75
|
+
"img",
|
76
|
+
"iframe",
|
77
|
+
"embed",
|
78
|
+
"object",
|
79
|
+
"param",
|
80
|
+
"video",
|
81
|
+
"audio",
|
82
|
+
"source",
|
83
|
+
"track",
|
84
|
+
"canvas",
|
85
|
+
"map",
|
86
|
+
"area",
|
87
|
+
"math",
|
88
|
+
"mi",
|
89
|
+
"mo",
|
90
|
+
"mn",
|
91
|
+
"ms",
|
92
|
+
"mtext",
|
93
|
+
"mglyph",
|
94
|
+
"malignmark",
|
95
|
+
"annotation-xml",
|
96
|
+
"svg",
|
97
|
+
"foreignobject",
|
98
|
+
"desc",
|
99
|
+
"table",
|
100
|
+
"caption",
|
101
|
+
"colgroup",
|
102
|
+
"col",
|
103
|
+
"tbody",
|
104
|
+
"thead",
|
105
|
+
"tfoot",
|
106
|
+
"tr",
|
107
|
+
"td",
|
108
|
+
"th",
|
109
|
+
"form",
|
110
|
+
"fieldset",
|
111
|
+
"legend",
|
112
|
+
"label",
|
113
|
+
"input",
|
114
|
+
"button",
|
115
|
+
"select",
|
116
|
+
"datalist",
|
117
|
+
"optgroup",
|
118
|
+
"option",
|
119
|
+
"textarea",
|
120
|
+
"keygen",
|
121
|
+
"output",
|
122
|
+
"progress",
|
123
|
+
"meter",
|
124
|
+
"details",
|
125
|
+
"summary",
|
126
|
+
"menu",
|
127
|
+
"menuitem",
|
128
|
+
"applet",
|
129
|
+
"acronym",
|
130
|
+
"bgsound",
|
131
|
+
"dir",
|
132
|
+
"frame",
|
133
|
+
"frameset",
|
134
|
+
"noframes",
|
135
|
+
"isindex",
|
136
|
+
"listing",
|
137
|
+
"xmp",
|
138
|
+
"nextid",
|
139
|
+
"noembed",
|
140
|
+
"plaintext",
|
141
|
+
"rb",
|
142
|
+
"strike",
|
143
|
+
"basefont",
|
144
|
+
"big",
|
145
|
+
"blink",
|
146
|
+
"center",
|
147
|
+
"font",
|
148
|
+
"marquee",
|
149
|
+
"multicol",
|
150
|
+
"nobr",
|
151
|
+
"spacer",
|
152
|
+
"tt",
|
153
|
+
"rtc",
|
@@ -42,7 +42,6 @@
|
|
42
42
|
// prevents parse error position from being messed up by possible mark/resets in
|
43
43
|
// temporary buffer manipulation.
|
44
44
|
|
45
|
-
|
46
45
|
#include "tokenizer.h"
|
47
46
|
|
48
47
|
#include <assert.h>
|
@@ -64,13 +63,13 @@
|
|
64
63
|
|
65
64
|
// Compared against _script_data_buffer to determine if we're in double-escaped
|
66
65
|
// script mode.
|
67
|
-
const GumboStringPiece kScriptTag = {
|
66
|
+
const GumboStringPiece kScriptTag = {"script", 6};
|
68
67
|
|
69
68
|
// An enum for the return value of each individual state.
|
70
69
|
typedef enum {
|
71
|
-
RETURN_ERROR,
|
72
|
-
RETURN_SUCCESS,
|
73
|
-
NEXT_CHAR
|
70
|
+
RETURN_ERROR, // Return false (error) from the tokenizer.
|
71
|
+
RETURN_SUCCESS, // Return true (success) from the tokenizer.
|
72
|
+
NEXT_CHAR // Proceed to the next character and continue lexing.
|
74
73
|
} StateResult;
|
75
74
|
|
76
75
|
// This is a struct containing state necessary to build up a tag token,
|
@@ -136,6 +135,10 @@ typedef struct GumboInternalTokenizerState {
|
|
136
135
|
// markup declaration state.
|
137
136
|
bool _is_current_node_foreign;
|
138
137
|
|
138
|
+
// A flag indicating whether the tokenizer is in a CDATA section. If so, then
|
139
|
+
// text tokens emitted will be GUMBO_TOKEN_CDATA.
|
140
|
+
bool _is_in_cdata;
|
141
|
+
|
139
142
|
// Certain states (notably character references) may emit two character tokens
|
140
143
|
// at once, but the contract for lex() fills in only one token at a time. The
|
141
144
|
// extra character is buffered here, and then this is checked on entry to
|
@@ -196,7 +199,8 @@ typedef struct GumboInternalTokenizerState {
|
|
196
199
|
} GumboTokenizerState;
|
197
200
|
|
198
201
|
// Adds an ERR_UNEXPECTED_CODE_POINT parse error to the parser's error struct.
|
199
|
-
static void tokenizer_add_parse_error(
|
202
|
+
static void tokenizer_add_parse_error(
|
203
|
+
GumboParser* parser, GumboErrorType type) {
|
200
204
|
GumboError* error = gumbo_add_error(parser);
|
201
205
|
if (!error) {
|
202
206
|
return;
|
@@ -315,7 +319,11 @@ static int ensure_lowercase(int c) {
|
|
315
319
|
return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
|
316
320
|
}
|
317
321
|
|
318
|
-
static GumboTokenType get_char_token_type(int c) {
|
322
|
+
static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
|
323
|
+
if (is_in_cdata && c > 0) {
|
324
|
+
return GUMBO_TOKEN_CDATA;
|
325
|
+
}
|
326
|
+
|
319
327
|
switch (c) {
|
320
328
|
case '\t':
|
321
329
|
case '\n':
|
@@ -348,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) {
|
|
348
356
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
349
357
|
assert(!tokenizer->_temporary_buffer_emit);
|
350
358
|
utf8iterator_mark(&tokenizer->_input);
|
351
|
-
|
352
|
-
gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
|
359
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer);
|
353
360
|
// The temporary buffer and script data buffer are the same object in the
|
354
361
|
// spec, so the script data buffer should be cleared as well.
|
355
|
-
|
356
|
-
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
362
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
|
357
363
|
}
|
358
364
|
|
359
365
|
// Appends a codepoint to the temporary buffer.
|
@@ -366,15 +372,14 @@ static void append_char_to_temporary_buffer(
|
|
366
372
|
// Checks to see if the temporary buffer equals a certain string.
|
367
373
|
// Make sure this remains side-effect free; it's used in assertions.
|
368
374
|
#ifndef NDEBUG
|
369
|
-
static bool temporary_buffer_equals(
|
370
|
-
GumboParser* parser, const char* text) {
|
375
|
+
static bool temporary_buffer_equals(GumboParser* parser, const char* text) {
|
371
376
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_temporary_buffer;
|
372
377
|
// TODO(jdtang): See if the extra strlen is a performance problem, and replace
|
373
378
|
// it with an explicit sizeof(literal) if necessary. I don't think it will
|
374
379
|
// be, as this is only used in a couple of rare states.
|
375
380
|
int text_len = strlen(text);
|
376
381
|
return text_len == buffer->length &&
|
377
|
-
|
382
|
+
memcmp(buffer->data, text, text_len) == 0;
|
378
383
|
}
|
379
384
|
#endif
|
380
385
|
|
@@ -475,7 +480,7 @@ static void finish_doctype_system_id(GumboParser* parser) {
|
|
475
480
|
|
476
481
|
// Writes a single specified character to the output token.
|
477
482
|
static void emit_char(GumboParser* parser, int c, GumboToken* output) {
|
478
|
-
output->type = get_char_token_type(c);
|
483
|
+
output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
|
479
484
|
output->v.character = c;
|
480
485
|
finish_token(parser, output);
|
481
486
|
}
|
@@ -531,8 +536,8 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
531
536
|
output->v.start_tag.is_self_closing = tag_state->_is_self_closing;
|
532
537
|
tag_state->_last_start_tag = tag_state->_tag;
|
533
538
|
mark_tag_state_as_empty(tag_state);
|
534
|
-
gumbo_debug(
|
535
|
-
|
539
|
+
gumbo_debug(
|
540
|
+
"Emitted start tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
536
541
|
} else {
|
537
542
|
output->type = GUMBO_TOKEN_END_TAG;
|
538
543
|
output->v.end_tag = tag_state->_tag;
|
@@ -540,17 +545,18 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
540
545
|
// token, but it's still initialized as normal, so it must be manually
|
541
546
|
// deallocated. There may also be attributes to destroy, in certain broken
|
542
547
|
// cases like </div</th> (the "th" is an attribute there).
|
543
|
-
for (int i = 0; i < tag_state->_attributes.length; ++i) {
|
548
|
+
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
544
549
|
gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
|
545
550
|
}
|
546
551
|
gumbo_parser_deallocate(parser, tag_state->_attributes.data);
|
547
552
|
mark_tag_state_as_empty(tag_state);
|
548
|
-
gumbo_debug(
|
549
|
-
|
553
|
+
gumbo_debug(
|
554
|
+
"Emitted end tag %s.\n", gumbo_normalized_tagname(tag_state->_tag));
|
550
555
|
}
|
551
556
|
gumbo_string_buffer_destroy(parser, &tag_state->_buffer);
|
552
557
|
finish_token(parser, output);
|
553
|
-
gumbo_debug("Original text = %.*s.\n", output->original_text.length,
|
558
|
+
gumbo_debug("Original text = %.*s.\n", output->original_text.length,
|
559
|
+
output->original_text.data);
|
554
560
|
assert(output->original_text.length >= 2);
|
555
561
|
assert(output->original_text.data[0] == '<');
|
556
562
|
assert(output->original_text.data[output->original_text.length - 1] == '>');
|
@@ -563,7 +569,7 @@ static StateResult emit_current_tag(GumboParser* parser, GumboToken* output) {
|
|
563
569
|
// avoid a memory leak.
|
564
570
|
static void abandon_current_tag(GumboParser* parser) {
|
565
571
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
566
|
-
for (int i = 0; i < tag_state->_attributes.length; ++i) {
|
572
|
+
for (unsigned int i = 0; i < tag_state->_attributes.length; ++i) {
|
567
573
|
gumbo_destroy_attribute(parser, tag_state->_attributes.data[i]);
|
568
574
|
}
|
569
575
|
gumbo_parser_deallocate(parser, tag_state->_attributes.data);
|
@@ -575,9 +581,8 @@ static void abandon_current_tag(GumboParser* parser) {
|
|
575
581
|
// Wraps the consume_char_ref function to handle its output and make the
|
576
582
|
// appropriate TokenizerState modifications. Returns RETURN_ERROR if a parse
|
577
583
|
// error occurred, RETURN_SUCCESS otherwise.
|
578
|
-
static StateResult emit_char_ref(
|
579
|
-
|
580
|
-
bool is_in_attribute, GumboToken* output) {
|
584
|
+
static StateResult emit_char_ref(GumboParser* parser,
|
585
|
+
int additional_allowed_char, bool is_in_attribute, GumboToken* output) {
|
581
586
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
582
587
|
OneOrTwoCodepoints char_ref;
|
583
588
|
bool status = consume_char_ref(
|
@@ -641,8 +646,7 @@ static bool maybe_emit_from_temporary_buffer(
|
|
641
646
|
// _temporary_buffer_emit, and then (if the temporary buffer is non-empty) emits
|
642
647
|
// the first character in it. It returns true if a character was emitted, false
|
643
648
|
// otherwise.
|
644
|
-
static bool emit_temporary_buffer(
|
645
|
-
GumboParser* parser, GumboToken* output) {
|
649
|
+
static bool emit_temporary_buffer(GumboParser* parser, GumboToken* output) {
|
646
650
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
647
651
|
assert(tokenizer->_temporary_buffer.data);
|
648
652
|
utf8iterator_reset(&tokenizer->_input);
|
@@ -655,8 +659,8 @@ static bool emit_temporary_buffer(
|
|
655
659
|
// start point; the only time you would *not* want to pass true for this
|
656
660
|
// parameter is if you want the original_text to include character (like an
|
657
661
|
// opening quote) that doesn't appear in the value.
|
658
|
-
static void append_char_to_tag_buffer(
|
659
|
-
|
662
|
+
static void append_char_to_tag_buffer(
|
663
|
+
GumboParser* parser, int codepoint, bool reinitilize_position_on_first) {
|
660
664
|
GumboStringBuffer* buffer = &parser->_tokenizer_state->_tag_state._buffer;
|
661
665
|
if (buffer->length == 0 && reinitilize_position_on_first) {
|
662
666
|
reset_tag_buffer_start_point(parser);
|
@@ -689,7 +693,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
689
693
|
gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
|
690
694
|
|
691
695
|
assert(tag_state->_attributes.data == NULL);
|
692
|
-
|
696
|
+
// Initial size chosen by statistical analysis of a corpus of 60k webpages.
|
697
|
+
// 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
|
698
|
+
// numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
|
699
|
+
// for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
|
700
|
+
gumbo_vector_init(parser, 1, &tag_state->_attributes);
|
693
701
|
tag_state->_drop_next_attr_value = false;
|
694
702
|
tag_state->_is_start_tag = is_start_tag;
|
695
703
|
tag_state->_is_self_closing = false;
|
@@ -709,16 +717,15 @@ static void copy_over_tag_buffer(GumboParser* parser, const char** output) {
|
|
709
717
|
// * The start_pos GumboSourcePosition with the start position of the tag
|
710
718
|
// buffer.
|
711
719
|
// * The end_pos GumboSourcePosition with the current source position.
|
712
|
-
static void copy_over_original_tag_text(
|
713
|
-
|
714
|
-
GumboSourcePosition*
|
720
|
+
static void copy_over_original_tag_text(GumboParser* parser,
|
721
|
+
GumboStringPiece* original_text, GumboSourcePosition* start_pos,
|
722
|
+
GumboSourcePosition* end_pos) {
|
715
723
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
716
724
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
717
725
|
|
718
726
|
original_text->data = tag_state->_original_text;
|
719
|
-
original_text->length =
|
720
|
-
|
721
|
-
tag_state->_original_text;
|
727
|
+
original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
|
728
|
+
tag_state->_original_text;
|
722
729
|
if (original_text->data[original_text->length - 1] == '\r') {
|
723
730
|
// Since \r is skipped by the UTF-8 iterator, it can sometimes end up
|
724
731
|
// appended to the end of original text even when it's really the first part
|
@@ -743,16 +750,14 @@ static void finish_tag_name(GumboParser* parser) {
|
|
743
750
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
744
751
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
745
752
|
|
746
|
-
|
747
|
-
|
748
|
-
tag_state->_tag = gumbo_tag_enum(temp);
|
753
|
+
tag_state->_tag =
|
754
|
+
gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
|
749
755
|
reinitialize_tag_buffer(parser);
|
750
|
-
gumbo_parser_deallocate(parser, (void*) temp);
|
751
756
|
}
|
752
757
|
|
753
758
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
754
759
|
static void add_duplicate_attr_error(GumboParser* parser, const char* attr_name,
|
755
|
-
|
760
|
+
int original_index, int new_index) {
|
756
761
|
GumboError* error = gumbo_add_error(parser);
|
757
762
|
if (!error) {
|
758
763
|
return;
|
@@ -782,14 +787,13 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
782
787
|
assert(tag_state->_attributes.capacity);
|
783
788
|
|
784
789
|
GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
|
785
|
-
for (int i = 0; i < attributes->length; ++i) {
|
790
|
+
for (unsigned int i = 0; i < attributes->length; ++i) {
|
786
791
|
GumboAttribute* attr = attributes->data[i];
|
787
792
|
if (strlen(attr->name) == tag_state->_buffer.length &&
|
788
793
|
memcmp(attr->name, tag_state->_buffer.data,
|
789
|
-
|
794
|
+
tag_state->_buffer.length) == 0) {
|
790
795
|
// Identical attribute; bail.
|
791
|
-
add_duplicate_attr_error(
|
792
|
-
parser, attr->name, i, attributes->length);
|
796
|
+
add_duplicate_attr_error(parser, attr->name, i, attributes->length);
|
793
797
|
tag_state->_drop_next_attr_value = true;
|
794
798
|
return false;
|
795
799
|
}
|
@@ -798,11 +802,11 @@ static bool finish_attribute_name(GumboParser* parser) {
|
|
798
802
|
GumboAttribute* attr = gumbo_parser_allocate(parser, sizeof(GumboAttribute));
|
799
803
|
attr->attr_namespace = GUMBO_ATTR_NAMESPACE_NONE;
|
800
804
|
copy_over_tag_buffer(parser, &attr->name);
|
801
|
-
copy_over_original_tag_text(
|
802
|
-
|
805
|
+
copy_over_original_tag_text(
|
806
|
+
parser, &attr->original_name, &attr->name_start, &attr->name_end);
|
803
807
|
attr->value = gumbo_copy_stringz(parser, "");
|
804
|
-
copy_over_original_tag_text(
|
805
|
-
|
808
|
+
copy_over_original_tag_text(
|
809
|
+
parser, &attr->original_value, &attr->name_start, &attr->name_end);
|
806
810
|
gumbo_vector_add(parser, attr, attributes);
|
807
811
|
reinitialize_tag_buffer(parser);
|
808
812
|
return true;
|
@@ -824,8 +828,8 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
824
828
|
tag_state->_attributes.data[tag_state->_attributes.length - 1];
|
825
829
|
gumbo_parser_deallocate(parser, (void*) attr->value);
|
826
830
|
copy_over_tag_buffer(parser, &attr->value);
|
827
|
-
copy_over_original_tag_text(
|
828
|
-
|
831
|
+
copy_over_original_tag_text(
|
832
|
+
parser, &attr->original_value, &attr->value_start, &attr->value_end);
|
829
833
|
reinitialize_tag_buffer(parser);
|
830
834
|
}
|
831
835
|
|
@@ -833,13 +837,9 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
833
837
|
static bool is_appropriate_end_tag(GumboParser* parser) {
|
834
838
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
835
839
|
assert(!tag_state->_is_start_tag);
|
836
|
-
// Null terminate the current string buffer, so it can be passed to
|
837
|
-
// gumbo_tag_enum, but don't increment the length in case we need to dump the
|
838
|
-
// buffer as character tokens.
|
839
|
-
gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
|
840
|
-
--tag_state->_buffer.length;
|
841
840
|
return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
|
842
|
-
|
841
|
+
tag_state->_last_start_tag == gumbo_tagn_enum(tag_state->_buffer.data,
|
842
|
+
tag_state->_buffer.length);
|
843
843
|
}
|
844
844
|
|
845
845
|
void gumbo_tokenizer_state_init(
|
@@ -850,6 +850,7 @@ void gumbo_tokenizer_state_init(
|
|
850
850
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
851
851
|
tokenizer->_reconsume_current_input = false;
|
852
852
|
tokenizer->_is_current_node_foreign = false;
|
853
|
+
tokenizer->_is_in_cdata = false;
|
853
854
|
tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
|
854
855
|
|
855
856
|
tokenizer->_buffered_emit_char = kGumboNoChar;
|
@@ -883,15 +884,14 @@ void gumbo_tokenizer_set_is_current_node_foreign(
|
|
883
884
|
GumboParser* parser, bool is_foreign) {
|
884
885
|
if (is_foreign != parser->_tokenizer_state->_is_current_node_foreign) {
|
885
886
|
gumbo_debug("Toggling is_current_node_foreign to %s.\n",
|
886
|
-
|
887
|
+
is_foreign ? "true" : "false");
|
887
888
|
}
|
888
889
|
parser->_tokenizer_state->_is_current_node_foreign = is_foreign;
|
889
890
|
}
|
890
891
|
|
891
892
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#data-state
|
892
|
-
static StateResult handle_data_state(
|
893
|
-
|
894
|
-
int c, GumboToken* output) {
|
893
|
+
static StateResult handle_data_state(GumboParser* parser,
|
894
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
895
895
|
switch (c) {
|
896
896
|
case '&':
|
897
897
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_DATA);
|
@@ -915,17 +915,15 @@ static StateResult handle_data_state(
|
|
915
915
|
}
|
916
916
|
|
917
917
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-data-state
|
918
|
-
static StateResult handle_char_ref_in_data_state(
|
919
|
-
|
920
|
-
int c, GumboToken* output) {
|
918
|
+
static StateResult handle_char_ref_in_data_state(GumboParser* parser,
|
919
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
921
920
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
922
921
|
return emit_char_ref(parser, ' ', false, output);
|
923
922
|
}
|
924
923
|
|
925
924
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rcdata-state
|
926
|
-
static StateResult handle_rcdata_state(
|
927
|
-
|
928
|
-
int c, GumboToken* output) {
|
925
|
+
static StateResult handle_rcdata_state(GumboParser* parser,
|
926
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
929
927
|
switch (c) {
|
930
928
|
case '&':
|
931
929
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CHAR_REF_IN_RCDATA);
|
@@ -946,17 +944,15 @@ static StateResult handle_rcdata_state(
|
|
946
944
|
}
|
947
945
|
|
948
946
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-rcdata-state
|
949
|
-
static StateResult handle_char_ref_in_rcdata_state(
|
950
|
-
|
951
|
-
int c, GumboToken* output) {
|
947
|
+
static StateResult handle_char_ref_in_rcdata_state(GumboParser* parser,
|
948
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
952
949
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA);
|
953
950
|
return emit_char_ref(parser, ' ', false, output);
|
954
951
|
}
|
955
952
|
|
956
953
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-state
|
957
|
-
static StateResult handle_rawtext_state(
|
958
|
-
|
959
|
-
int c, GumboToken* output) {
|
954
|
+
static StateResult handle_rawtext_state(GumboParser* parser,
|
955
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
960
956
|
switch (c) {
|
961
957
|
case '<':
|
962
958
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_LT);
|
@@ -973,9 +969,8 @@ static StateResult handle_rawtext_state(
|
|
973
969
|
}
|
974
970
|
|
975
971
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-state
|
976
|
-
static StateResult handle_script_state(
|
977
|
-
|
978
|
-
int c, GumboToken* output) {
|
972
|
+
static StateResult handle_script_state(GumboParser* parser,
|
973
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
979
974
|
switch (c) {
|
980
975
|
case '<':
|
981
976
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_LT);
|
@@ -992,9 +987,8 @@ static StateResult handle_script_state(
|
|
992
987
|
}
|
993
988
|
|
994
989
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#plaintext-state
|
995
|
-
static StateResult handle_plaintext_state(
|
996
|
-
|
997
|
-
int c, GumboToken* output) {
|
990
|
+
static StateResult handle_plaintext_state(GumboParser* parser,
|
991
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
998
992
|
switch (c) {
|
999
993
|
case '\0':
|
1000
994
|
return emit_replacement_char(parser, output);
|
@@ -1006,9 +1000,8 @@ static StateResult handle_plaintext_state(
|
|
1006
1000
|
}
|
1007
1001
|
|
1008
1002
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-open-state
|
1009
|
-
static StateResult handle_tag_open_state(
|
1010
|
-
|
1011
|
-
int c, GumboToken* output) {
|
1003
|
+
static StateResult handle_tag_open_state(GumboParser* parser,
|
1004
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1012
1005
|
assert(temporary_buffer_equals(parser, "<"));
|
1013
1006
|
switch (c) {
|
1014
1007
|
case '!':
|
@@ -1040,9 +1033,8 @@ static StateResult handle_tag_open_state(
|
|
1040
1033
|
}
|
1041
1034
|
|
1042
1035
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#end-tag-open-state
|
1043
|
-
static StateResult handle_end_tag_open_state(
|
1044
|
-
|
1045
|
-
int c, GumboToken* output) {
|
1036
|
+
static StateResult handle_end_tag_open_state(GumboParser* parser,
|
1037
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1046
1038
|
assert(temporary_buffer_equals(parser, "</"));
|
1047
1039
|
switch (c) {
|
1048
1040
|
case '>':
|
@@ -1068,9 +1060,8 @@ static StateResult handle_end_tag_open_state(
|
|
1068
1060
|
}
|
1069
1061
|
|
1070
1062
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#tag-name-state
|
1071
|
-
static StateResult handle_tag_name_state(
|
1072
|
-
|
1073
|
-
int c, GumboToken* output) {
|
1063
|
+
static StateResult handle_tag_name_state(GumboParser* parser,
|
1064
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1074
1065
|
switch (c) {
|
1075
1066
|
case '\t':
|
1076
1067
|
case '\n':
|
@@ -1103,9 +1094,8 @@ static StateResult handle_tag_name_state(
|
|
1103
1094
|
}
|
1104
1095
|
|
1105
1096
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-less-than-sign-state
|
1106
|
-
static StateResult handle_rcdata_lt_state(
|
1107
|
-
|
1108
|
-
int c, GumboToken* output) {
|
1097
|
+
static StateResult handle_rcdata_lt_state(GumboParser* parser,
|
1098
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1109
1099
|
assert(temporary_buffer_equals(parser, "<"));
|
1110
1100
|
if (c == '/') {
|
1111
1101
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_OPEN);
|
@@ -1119,9 +1109,8 @@ static StateResult handle_rcdata_lt_state(
|
|
1119
1109
|
}
|
1120
1110
|
|
1121
1111
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-open-state
|
1122
|
-
static StateResult handle_rcdata_end_tag_open_state(
|
1123
|
-
|
1124
|
-
int c, GumboToken* output) {
|
1112
|
+
static StateResult handle_rcdata_end_tag_open_state(GumboParser* parser,
|
1113
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1125
1114
|
assert(temporary_buffer_equals(parser, "</"));
|
1126
1115
|
if (is_alpha(c)) {
|
1127
1116
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RCDATA_END_TAG_NAME);
|
@@ -1136,9 +1125,8 @@ static StateResult handle_rcdata_end_tag_open_state(
|
|
1136
1125
|
}
|
1137
1126
|
|
1138
1127
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#rcdata-end-tag-name-state
|
1139
|
-
static StateResult handle_rcdata_end_tag_name_state(
|
1140
|
-
|
1141
|
-
int c, GumboToken* output) {
|
1128
|
+
static StateResult handle_rcdata_end_tag_name_state(GumboParser* parser,
|
1129
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1142
1130
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1143
1131
|
if (is_alpha(c)) {
|
1144
1132
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1169,9 +1157,8 @@ static StateResult handle_rcdata_end_tag_name_state(
|
|
1169
1157
|
}
|
1170
1158
|
|
1171
1159
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-less-than-sign-state
|
1172
|
-
static StateResult handle_rawtext_lt_state(
|
1173
|
-
|
1174
|
-
int c, GumboToken* output) {
|
1160
|
+
static StateResult handle_rawtext_lt_state(GumboParser* parser,
|
1161
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1175
1162
|
assert(temporary_buffer_equals(parser, "<"));
|
1176
1163
|
if (c == '/') {
|
1177
1164
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_OPEN);
|
@@ -1185,9 +1172,8 @@ static StateResult handle_rawtext_lt_state(
|
|
1185
1172
|
}
|
1186
1173
|
|
1187
1174
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-open-state
|
1188
|
-
static StateResult handle_rawtext_end_tag_open_state(
|
1189
|
-
|
1190
|
-
int c, GumboToken* output) {
|
1175
|
+
static StateResult handle_rawtext_end_tag_open_state(GumboParser* parser,
|
1176
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1191
1177
|
assert(temporary_buffer_equals(parser, "</"));
|
1192
1178
|
if (is_alpha(c)) {
|
1193
1179
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_RAWTEXT_END_TAG_NAME);
|
@@ -1201,12 +1187,11 @@ static StateResult handle_rawtext_end_tag_open_state(
|
|
1201
1187
|
}
|
1202
1188
|
|
1203
1189
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#rawtext-end-tag-name-state
|
1204
|
-
static StateResult handle_rawtext_end_tag_name_state(
|
1205
|
-
|
1206
|
-
int c, GumboToken* output) {
|
1190
|
+
static StateResult handle_rawtext_end_tag_name_state(GumboParser* parser,
|
1191
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1207
1192
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1208
1193
|
gumbo_debug("Last end tag: %*s\n", (int) tokenizer->_tag_state._buffer.length,
|
1209
|
-
|
1194
|
+
tokenizer->_tag_state._buffer.data);
|
1210
1195
|
if (is_alpha(c)) {
|
1211
1196
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1212
1197
|
append_char_to_temporary_buffer(parser, c);
|
@@ -1237,9 +1222,8 @@ static StateResult handle_rawtext_end_tag_name_state(
|
|
1237
1222
|
}
|
1238
1223
|
|
1239
1224
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-less-than-sign-state
|
1240
|
-
static StateResult handle_script_lt_state(
|
1241
|
-
|
1242
|
-
int c, GumboToken* output) {
|
1225
|
+
static StateResult handle_script_lt_state(GumboParser* parser,
|
1226
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1243
1227
|
assert(temporary_buffer_equals(parser, "<"));
|
1244
1228
|
if (c == '/') {
|
1245
1229
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_OPEN);
|
@@ -1257,9 +1241,8 @@ static StateResult handle_script_lt_state(
|
|
1257
1241
|
}
|
1258
1242
|
|
1259
1243
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-open-state
|
1260
|
-
static StateResult handle_script_end_tag_open_state(
|
1261
|
-
|
1262
|
-
int c, GumboToken* output) {
|
1244
|
+
static StateResult handle_script_end_tag_open_state(GumboParser* parser,
|
1245
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1263
1246
|
assert(temporary_buffer_equals(parser, "</"));
|
1264
1247
|
if (is_alpha(c)) {
|
1265
1248
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_END_TAG_NAME);
|
@@ -1273,9 +1256,8 @@ static StateResult handle_script_end_tag_open_state(
|
|
1273
1256
|
}
|
1274
1257
|
|
1275
1258
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-end-tag-name-state
|
1276
|
-
static StateResult handle_script_end_tag_name_state(
|
1277
|
-
|
1278
|
-
int c, GumboToken* output) {
|
1259
|
+
static StateResult handle_script_end_tag_name_state(GumboParser* parser,
|
1260
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1279
1261
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1280
1262
|
if (is_alpha(c)) {
|
1281
1263
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1306,9 +1288,8 @@ static StateResult handle_script_end_tag_name_state(
|
|
1306
1288
|
}
|
1307
1289
|
|
1308
1290
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-state
|
1309
|
-
static StateResult handle_script_escaped_start_state(
|
1310
|
-
|
1311
|
-
int c, GumboToken* output) {
|
1291
|
+
static StateResult handle_script_escaped_start_state(GumboParser* parser,
|
1292
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1312
1293
|
if (c == '-') {
|
1313
1294
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_START_DASH);
|
1314
1295
|
return emit_current_char(parser, output);
|
@@ -1320,9 +1301,8 @@ static StateResult handle_script_escaped_start_state(
|
|
1320
1301
|
}
|
1321
1302
|
|
1322
1303
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escape-start-dash-state
|
1323
|
-
static StateResult handle_script_escaped_start_dash_state(
|
1324
|
-
|
1325
|
-
int c, GumboToken* output) {
|
1304
|
+
static StateResult handle_script_escaped_start_dash_state(GumboParser* parser,
|
1305
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1326
1306
|
if (c == '-') {
|
1327
1307
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
1328
1308
|
return emit_current_char(parser, output);
|
@@ -1334,9 +1314,8 @@ static StateResult handle_script_escaped_start_dash_state(
|
|
1334
1314
|
}
|
1335
1315
|
|
1336
1316
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-state
|
1337
|
-
static StateResult handle_script_escaped_state(
|
1338
|
-
|
1339
|
-
int c, GumboToken* output) {
|
1317
|
+
static StateResult handle_script_escaped_state(GumboParser* parser,
|
1318
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1340
1319
|
switch (c) {
|
1341
1320
|
case '-':
|
1342
1321
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH);
|
@@ -1357,9 +1336,8 @@ static StateResult handle_script_escaped_state(
|
|
1357
1336
|
}
|
1358
1337
|
|
1359
1338
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-state
|
1360
|
-
static StateResult handle_script_escaped_dash_state(
|
1361
|
-
|
1362
|
-
int c, GumboToken* output) {
|
1339
|
+
static StateResult handle_script_escaped_dash_state(GumboParser* parser,
|
1340
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1363
1341
|
switch (c) {
|
1364
1342
|
case '-':
|
1365
1343
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_DASH_DASH);
|
@@ -1383,9 +1361,8 @@ static StateResult handle_script_escaped_dash_state(
|
|
1383
1361
|
}
|
1384
1362
|
|
1385
1363
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-dash-dash-state
|
1386
|
-
static StateResult handle_script_escaped_dash_dash_state(
|
1387
|
-
|
1388
|
-
int c, GumboToken* output) {
|
1364
|
+
static StateResult handle_script_escaped_dash_dash_state(GumboParser* parser,
|
1365
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1389
1366
|
switch (c) {
|
1390
1367
|
case '-':
|
1391
1368
|
return emit_current_char(parser, output);
|
@@ -1411,9 +1388,8 @@ static StateResult handle_script_escaped_dash_dash_state(
|
|
1411
1388
|
}
|
1412
1389
|
|
1413
1390
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-less-than-sign-state
|
1414
|
-
static StateResult handle_script_escaped_lt_state(
|
1415
|
-
|
1416
|
-
int c, GumboToken* output) {
|
1391
|
+
static StateResult handle_script_escaped_lt_state(GumboParser* parser,
|
1392
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1417
1393
|
assert(temporary_buffer_equals(parser, "<"));
|
1418
1394
|
assert(!tokenizer->_script_data_buffer.length);
|
1419
1395
|
if (c == '/') {
|
@@ -1433,9 +1409,8 @@ static StateResult handle_script_escaped_lt_state(
|
|
1433
1409
|
}
|
1434
1410
|
|
1435
1411
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-open-state
|
1436
|
-
static StateResult handle_script_escaped_end_tag_open_state(
|
1437
|
-
|
1438
|
-
int c, GumboToken* output) {
|
1412
|
+
static StateResult handle_script_escaped_end_tag_open_state(GumboParser* parser,
|
1413
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1439
1414
|
assert(temporary_buffer_equals(parser, "</"));
|
1440
1415
|
if (is_alpha(c)) {
|
1441
1416
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_ESCAPED_END_TAG_NAME);
|
@@ -1449,9 +1424,8 @@ static StateResult handle_script_escaped_end_tag_open_state(
|
|
1449
1424
|
}
|
1450
1425
|
|
1451
1426
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-escaped-end-tag-name-state
|
1452
|
-
static StateResult handle_script_escaped_end_tag_name_state(
|
1453
|
-
|
1454
|
-
int c, GumboToken* output) {
|
1427
|
+
static StateResult handle_script_escaped_end_tag_name_state(GumboParser* parser,
|
1428
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1455
1429
|
assert(tokenizer->_temporary_buffer.length >= 2);
|
1456
1430
|
if (is_alpha(c)) {
|
1457
1431
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1482,9 +1456,8 @@ static StateResult handle_script_escaped_end_tag_name_state(
|
|
1482
1456
|
}
|
1483
1457
|
|
1484
1458
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-start-state
|
1485
|
-
static StateResult handle_script_double_escaped_start_state(
|
1486
|
-
|
1487
|
-
int c, GumboToken* output) {
|
1459
|
+
static StateResult handle_script_double_escaped_start_state(GumboParser* parser,
|
1460
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1488
1461
|
switch (c) {
|
1489
1462
|
case '\t':
|
1490
1463
|
case '\n':
|
@@ -1492,9 +1465,11 @@ static StateResult handle_script_double_escaped_start_state(
|
|
1492
1465
|
case ' ':
|
1493
1466
|
case '/':
|
1494
1467
|
case '>':
|
1495
|
-
gumbo_tokenizer_set_state(
|
1496
|
-
|
1497
|
-
|
1468
|
+
gumbo_tokenizer_set_state(
|
1469
|
+
parser, gumbo_string_equals(&kScriptTag,
|
1470
|
+
(GumboStringPiece*) &tokenizer->_script_data_buffer)
|
1471
|
+
? GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED
|
1472
|
+
: GUMBO_LEX_SCRIPT_ESCAPED);
|
1498
1473
|
return emit_current_char(parser, output);
|
1499
1474
|
default:
|
1500
1475
|
if (is_alpha(c)) {
|
@@ -1510,9 +1485,8 @@ static StateResult handle_script_double_escaped_start_state(
|
|
1510
1485
|
}
|
1511
1486
|
|
1512
1487
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-state
|
1513
|
-
static StateResult handle_script_double_escaped_state(
|
1514
|
-
|
1515
|
-
int c, GumboToken* output) {
|
1488
|
+
static StateResult handle_script_double_escaped_state(GumboParser* parser,
|
1489
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1516
1490
|
switch (c) {
|
1517
1491
|
case '-':
|
1518
1492
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_DASH);
|
@@ -1532,9 +1506,8 @@ static StateResult handle_script_double_escaped_state(
|
|
1532
1506
|
}
|
1533
1507
|
|
1534
1508
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-state
|
1535
|
-
static StateResult handle_script_double_escaped_dash_state(
|
1536
|
-
|
1537
|
-
int c, GumboToken* output) {
|
1509
|
+
static StateResult handle_script_double_escaped_dash_state(GumboParser* parser,
|
1510
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1538
1511
|
switch (c) {
|
1539
1512
|
case '-':
|
1540
1513
|
gumbo_tokenizer_set_state(
|
@@ -1558,8 +1531,8 @@ static StateResult handle_script_double_escaped_dash_state(
|
|
1558
1531
|
|
1559
1532
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-dash-dash-state
|
1560
1533
|
static StateResult handle_script_double_escaped_dash_dash_state(
|
1561
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
1562
|
-
|
1534
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
1535
|
+
GumboToken* output) {
|
1563
1536
|
switch (c) {
|
1564
1537
|
case '-':
|
1565
1538
|
return emit_current_char(parser, output);
|
@@ -1583,26 +1556,22 @@ static StateResult handle_script_double_escaped_dash_dash_state(
|
|
1583
1556
|
}
|
1584
1557
|
|
1585
1558
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escaped-less-than-sign-state
|
1586
|
-
static StateResult handle_script_double_escaped_lt_state(
|
1587
|
-
|
1588
|
-
int c, GumboToken* output) {
|
1559
|
+
static StateResult handle_script_double_escaped_lt_state(GumboParser* parser,
|
1560
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1589
1561
|
if (c == '/') {
|
1590
1562
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
|
1591
|
-
|
1592
|
-
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
1563
|
+
gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer);
|
1593
1564
|
return emit_current_char(parser, output);
|
1594
1565
|
} else {
|
1595
1566
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1596
1567
|
tokenizer->_reconsume_current_input = true;
|
1597
1568
|
return NEXT_CHAR;
|
1598
1569
|
}
|
1599
|
-
|
1600
1570
|
}
|
1601
1571
|
|
1602
1572
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#script-data-double-escape-end-state
|
1603
|
-
static StateResult handle_script_double_escaped_end_state(
|
1604
|
-
|
1605
|
-
int c, GumboToken* output) {
|
1573
|
+
static StateResult handle_script_double_escaped_end_state(GumboParser* parser,
|
1574
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1606
1575
|
switch (c) {
|
1607
1576
|
case '\t':
|
1608
1577
|
case '\n':
|
@@ -1610,9 +1579,11 @@ static StateResult handle_script_double_escaped_end_state(
|
|
1610
1579
|
case ' ':
|
1611
1580
|
case '/':
|
1612
1581
|
case '>':
|
1613
|
-
gumbo_tokenizer_set_state(
|
1614
|
-
|
1615
|
-
|
1582
|
+
gumbo_tokenizer_set_state(
|
1583
|
+
parser, gumbo_string_equals(&kScriptTag,
|
1584
|
+
(GumboStringPiece*) &tokenizer->_script_data_buffer)
|
1585
|
+
? GUMBO_LEX_SCRIPT_ESCAPED
|
1586
|
+
: GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
1616
1587
|
return emit_current_char(parser, output);
|
1617
1588
|
default:
|
1618
1589
|
if (is_alpha(c)) {
|
@@ -1628,9 +1599,8 @@ static StateResult handle_script_double_escaped_end_state(
|
|
1628
1599
|
}
|
1629
1600
|
|
1630
1601
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-name-state
|
1631
|
-
static StateResult handle_before_attr_name_state(
|
1632
|
-
|
1633
|
-
int c, GumboToken* output) {
|
1602
|
+
static StateResult handle_before_attr_name_state(GumboParser* parser,
|
1603
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1634
1604
|
switch (c) {
|
1635
1605
|
case '\t':
|
1636
1606
|
case '\n':
|
@@ -1658,7 +1628,7 @@ static StateResult handle_before_attr_name_state(
|
|
1658
1628
|
case '<':
|
1659
1629
|
case '=':
|
1660
1630
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1661
|
-
|
1631
|
+
// Fall through.
|
1662
1632
|
default:
|
1663
1633
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1664
1634
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1667,9 +1637,8 @@ static StateResult handle_before_attr_name_state(
|
|
1667
1637
|
}
|
1668
1638
|
|
1669
1639
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-name-state
|
1670
|
-
static StateResult handle_attr_name_state(
|
1671
|
-
|
1672
|
-
int c, GumboToken* output) {
|
1640
|
+
static StateResult handle_attr_name_state(GumboParser* parser,
|
1641
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1673
1642
|
switch (c) {
|
1674
1643
|
case '\t':
|
1675
1644
|
case '\n':
|
@@ -1703,7 +1672,7 @@ static StateResult handle_attr_name_state(
|
|
1703
1672
|
case '\'':
|
1704
1673
|
case '<':
|
1705
1674
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1706
|
-
|
1675
|
+
// Fall through.
|
1707
1676
|
default:
|
1708
1677
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
1709
1678
|
return NEXT_CHAR;
|
@@ -1711,9 +1680,8 @@ static StateResult handle_attr_name_state(
|
|
1711
1680
|
}
|
1712
1681
|
|
1713
1682
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-name-state
|
1714
|
-
static StateResult handle_after_attr_name_state(
|
1715
|
-
|
1716
|
-
int c, GumboToken* output) {
|
1683
|
+
static StateResult handle_after_attr_name_state(GumboParser* parser,
|
1684
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1717
1685
|
switch (c) {
|
1718
1686
|
case '\t':
|
1719
1687
|
case '\n':
|
@@ -1743,7 +1711,7 @@ static StateResult handle_after_attr_name_state(
|
|
1743
1711
|
case '\'':
|
1744
1712
|
case '<':
|
1745
1713
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_NAME_INVALID);
|
1746
|
-
|
1714
|
+
// Fall through.
|
1747
1715
|
default:
|
1748
1716
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_NAME);
|
1749
1717
|
append_char_to_tag_buffer(parser, ensure_lowercase(c), true);
|
@@ -1752,9 +1720,8 @@ static StateResult handle_after_attr_name_state(
|
|
1752
1720
|
}
|
1753
1721
|
|
1754
1722
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-attribute-value-state
|
1755
|
-
static StateResult handle_before_attr_value_state(
|
1756
|
-
|
1757
|
-
int c, GumboToken* output) {
|
1723
|
+
static StateResult handle_before_attr_value_state(GumboParser* parser,
|
1724
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1758
1725
|
switch (c) {
|
1759
1726
|
case '\t':
|
1760
1727
|
case '\n':
|
@@ -1793,7 +1760,7 @@ static StateResult handle_before_attr_value_state(
|
|
1793
1760
|
case '=':
|
1794
1761
|
case '`':
|
1795
1762
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1796
|
-
|
1763
|
+
// Fall through.
|
1797
1764
|
default:
|
1798
1765
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_ATTR_VALUE_UNQUOTED);
|
1799
1766
|
append_char_to_tag_buffer(parser, c, true);
|
@@ -1802,9 +1769,8 @@ static StateResult handle_before_attr_value_state(
|
|
1802
1769
|
}
|
1803
1770
|
|
1804
1771
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-double-quoted-state
|
1805
|
-
static StateResult handle_attr_value_double_quoted_state(
|
1806
|
-
|
1807
|
-
int c, GumboToken* output) {
|
1772
|
+
static StateResult handle_attr_value_double_quoted_state(GumboParser* parser,
|
1773
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1808
1774
|
switch (c) {
|
1809
1775
|
case '"':
|
1810
1776
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1831,9 +1797,8 @@ static StateResult handle_attr_value_double_quoted_state(
|
|
1831
1797
|
}
|
1832
1798
|
|
1833
1799
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-single-quoted-state
|
1834
|
-
static StateResult handle_attr_value_single_quoted_state(
|
1835
|
-
|
1836
|
-
int c, GumboToken* output) {
|
1800
|
+
static StateResult handle_attr_value_single_quoted_state(GumboParser* parser,
|
1801
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1837
1802
|
switch (c) {
|
1838
1803
|
case '\'':
|
1839
1804
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_ATTR_VALUE_QUOTED);
|
@@ -1860,9 +1825,8 @@ static StateResult handle_attr_value_single_quoted_state(
|
|
1860
1825
|
}
|
1861
1826
|
|
1862
1827
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#attribute-value-unquoted-state
|
1863
|
-
static StateResult handle_attr_value_unquoted_state(
|
1864
|
-
|
1865
|
-
int c, GumboToken* output) {
|
1828
|
+
static StateResult handle_attr_value_unquoted_state(GumboParser* parser,
|
1829
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1866
1830
|
switch (c) {
|
1867
1831
|
case '\t':
|
1868
1832
|
case '\n':
|
@@ -1896,7 +1860,7 @@ static StateResult handle_attr_value_unquoted_state(
|
|
1896
1860
|
case '\'':
|
1897
1861
|
case '`':
|
1898
1862
|
tokenizer_add_parse_error(parser, GUMBO_ERR_ATTR_UNQUOTED_EQUALS);
|
1899
|
-
|
1863
|
+
// Fall through.
|
1900
1864
|
default:
|
1901
1865
|
append_char_to_tag_buffer(parser, c, true);
|
1902
1866
|
return NEXT_CHAR;
|
@@ -1904,9 +1868,8 @@ static StateResult handle_attr_value_unquoted_state(
|
|
1904
1868
|
}
|
1905
1869
|
|
1906
1870
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#character-reference-in-attribute-value-state
|
1907
|
-
static StateResult handle_char_ref_in_attr_value_state(
|
1908
|
-
|
1909
|
-
int c, GumboToken* output) {
|
1871
|
+
static StateResult handle_char_ref_in_attr_value_state(GumboParser* parser,
|
1872
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1910
1873
|
OneOrTwoCodepoints char_ref;
|
1911
1874
|
int allowed_char;
|
1912
1875
|
bool is_unquoted = false;
|
@@ -1947,9 +1910,8 @@ static StateResult handle_char_ref_in_attr_value_state(
|
|
1947
1910
|
}
|
1948
1911
|
|
1949
1912
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#after-attribute-value-quoted-state
|
1950
|
-
static StateResult handle_after_attr_value_quoted_state(
|
1951
|
-
|
1952
|
-
int c, GumboToken* output) {
|
1913
|
+
static StateResult handle_after_attr_value_quoted_state(GumboParser* parser,
|
1914
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1953
1915
|
finish_attribute_value(parser);
|
1954
1916
|
switch (c) {
|
1955
1917
|
case '\t':
|
@@ -1979,9 +1941,8 @@ static StateResult handle_after_attr_value_quoted_state(
|
|
1979
1941
|
}
|
1980
1942
|
|
1981
1943
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#self-closing-start-tag-state
|
1982
|
-
static StateResult handle_self_closing_start_tag_state(
|
1983
|
-
|
1984
|
-
int c, GumboToken* output) {
|
1944
|
+
static StateResult handle_self_closing_start_tag_state(GumboParser* parser,
|
1945
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
1985
1946
|
switch (c) {
|
1986
1947
|
case '>':
|
1987
1948
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -2001,9 +1962,8 @@ static StateResult handle_self_closing_start_tag_state(
|
|
2001
1962
|
}
|
2002
1963
|
|
2003
1964
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-comment-state
|
2004
|
-
static StateResult handle_bogus_comment_state(
|
2005
|
-
|
2006
|
-
int c, GumboToken* output) {
|
1965
|
+
static StateResult handle_bogus_comment_state(GumboParser* parser,
|
1966
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2007
1967
|
while (c != '>' && c != -1) {
|
2008
1968
|
if (c == '\0') {
|
2009
1969
|
c = 0xFFFD;
|
@@ -2017,15 +1977,14 @@ static StateResult handle_bogus_comment_state(
|
|
2017
1977
|
}
|
2018
1978
|
|
2019
1979
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#markup-declaration-open-state
|
2020
|
-
static StateResult handle_markup_declaration_state(
|
2021
|
-
|
2022
|
-
int c, GumboToken* output) {
|
1980
|
+
static StateResult handle_markup_declaration_state(GumboParser* parser,
|
1981
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2023
1982
|
if (utf8iterator_maybe_consume_match(
|
2024
|
-
|
1983
|
+
&tokenizer->_input, "--", sizeof("--") - 1, true)) {
|
2025
1984
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START);
|
2026
1985
|
tokenizer->_reconsume_current_input = true;
|
2027
1986
|
} else if (utf8iterator_maybe_consume_match(
|
2028
|
-
|
1987
|
+
&tokenizer->_input, "DOCTYPE", sizeof("DOCTYPE") - 1, false)) {
|
2029
1988
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DOCTYPE);
|
2030
1989
|
tokenizer->_reconsume_current_input = true;
|
2031
1990
|
// If we get here, we know we'll eventually emit a doctype token, so now is
|
@@ -2039,8 +1998,9 @@ static StateResult handle_markup_declaration_state(
|
|
2039
1998
|
gumbo_copy_stringz(parser, "");
|
2040
1999
|
} else if (tokenizer->_is_current_node_foreign &&
|
2041
2000
|
utf8iterator_maybe_consume_match(
|
2042
|
-
|
2001
|
+
&tokenizer->_input, "[CDATA[", sizeof("[CDATA[") - 1, true)) {
|
2043
2002
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_CDATA);
|
2003
|
+
tokenizer->_is_in_cdata = true;
|
2044
2004
|
tokenizer->_reconsume_current_input = true;
|
2045
2005
|
} else {
|
2046
2006
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DASHES_OR_DOCTYPE);
|
@@ -2052,9 +2012,8 @@ static StateResult handle_markup_declaration_state(
|
|
2052
2012
|
}
|
2053
2013
|
|
2054
2014
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-state
|
2055
|
-
static StateResult handle_comment_start_state(
|
2056
|
-
|
2057
|
-
int c, GumboToken* output) {
|
2015
|
+
static StateResult handle_comment_start_state(GumboParser* parser,
|
2016
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2058
2017
|
switch (c) {
|
2059
2018
|
case '-':
|
2060
2019
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_START_DASH);
|
@@ -2082,9 +2041,8 @@ static StateResult handle_comment_start_state(
|
|
2082
2041
|
}
|
2083
2042
|
|
2084
2043
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-start-dash-state
|
2085
|
-
static StateResult handle_comment_start_dash_state(
|
2086
|
-
|
2087
|
-
int c, GumboToken* output) {
|
2044
|
+
static StateResult handle_comment_start_dash_state(GumboParser* parser,
|
2045
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2088
2046
|
switch (c) {
|
2089
2047
|
case '-':
|
2090
2048
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2114,9 +2072,8 @@ static StateResult handle_comment_start_dash_state(
|
|
2114
2072
|
}
|
2115
2073
|
|
2116
2074
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-state
|
2117
|
-
static StateResult handle_comment_state(
|
2118
|
-
|
2119
|
-
int c, GumboToken* output) {
|
2075
|
+
static StateResult handle_comment_state(GumboParser* parser,
|
2076
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2120
2077
|
switch (c) {
|
2121
2078
|
case '-':
|
2122
2079
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2137,9 +2094,8 @@ static StateResult handle_comment_state(
|
|
2137
2094
|
}
|
2138
2095
|
|
2139
2096
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-dash-state
|
2140
|
-
static StateResult handle_comment_end_dash_state(
|
2141
|
-
|
2142
|
-
int c, GumboToken* output) {
|
2097
|
+
static StateResult handle_comment_end_dash_state(GumboParser* parser,
|
2098
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2143
2099
|
switch (c) {
|
2144
2100
|
case '-':
|
2145
2101
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END);
|
@@ -2164,9 +2120,8 @@ static StateResult handle_comment_end_dash_state(
|
|
2164
2120
|
}
|
2165
2121
|
|
2166
2122
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-state
|
2167
|
-
static StateResult handle_comment_end_state(
|
2168
|
-
|
2169
|
-
int c, GumboToken* output) {
|
2123
|
+
static StateResult handle_comment_end_state(GumboParser* parser,
|
2124
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2170
2125
|
switch (c) {
|
2171
2126
|
case '>':
|
2172
2127
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
@@ -2179,11 +2134,13 @@ static StateResult handle_comment_end_state(
|
|
2179
2134
|
append_char_to_temporary_buffer(parser, kUtf8ReplacementChar);
|
2180
2135
|
return NEXT_CHAR;
|
2181
2136
|
case '!':
|
2182
|
-
tokenizer_add_parse_error(
|
2137
|
+
tokenizer_add_parse_error(
|
2138
|
+
parser, GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH);
|
2183
2139
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_BANG);
|
2184
2140
|
return NEXT_CHAR;
|
2185
2141
|
case '-':
|
2186
|
-
tokenizer_add_parse_error(
|
2142
|
+
tokenizer_add_parse_error(
|
2143
|
+
parser, GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH);
|
2187
2144
|
append_char_to_temporary_buffer(parser, '-');
|
2188
2145
|
return NEXT_CHAR;
|
2189
2146
|
case -1:
|
@@ -2202,9 +2159,8 @@ static StateResult handle_comment_end_state(
|
|
2202
2159
|
}
|
2203
2160
|
|
2204
2161
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#comment-end-bang-state
|
2205
|
-
static StateResult handle_comment_end_bang_state(
|
2206
|
-
|
2207
|
-
int c, GumboToken* output) {
|
2162
|
+
static StateResult handle_comment_end_bang_state(GumboParser* parser,
|
2163
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2208
2164
|
switch (c) {
|
2209
2165
|
case '-':
|
2210
2166
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_COMMENT_END_DASH);
|
@@ -2239,9 +2195,8 @@ static StateResult handle_comment_end_bang_state(
|
|
2239
2195
|
}
|
2240
2196
|
|
2241
2197
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-state
|
2242
|
-
static StateResult handle_doctype_state(
|
2243
|
-
|
2244
|
-
int c, GumboToken* output) {
|
2198
|
+
static StateResult handle_doctype_state(GumboParser* parser,
|
2199
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2245
2200
|
assert(!tokenizer->_temporary_buffer.length);
|
2246
2201
|
switch (c) {
|
2247
2202
|
case '\t':
|
@@ -2266,9 +2221,8 @@ static StateResult handle_doctype_state(
|
|
2266
2221
|
}
|
2267
2222
|
|
2268
2223
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#before-doctype-name-state
|
2269
|
-
static StateResult handle_before_doctype_name_state(
|
2270
|
-
|
2271
|
-
int c, GumboToken* output) {
|
2224
|
+
static StateResult handle_before_doctype_name_state(GumboParser* parser,
|
2225
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2272
2226
|
switch (c) {
|
2273
2227
|
case '\t':
|
2274
2228
|
case '\n':
|
@@ -2302,9 +2256,8 @@ static StateResult handle_before_doctype_name_state(
|
|
2302
2256
|
}
|
2303
2257
|
|
2304
2258
|
// http://www.whatwg.org/specs/web-apps/current-work/complete5/tokenization.html#doctype-name-state
|
2305
|
-
static StateResult handle_doctype_name_state(
|
2306
|
-
|
2307
|
-
int c, GumboToken* output) {
|
2259
|
+
static StateResult handle_doctype_name_state(GumboParser* parser,
|
2260
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2308
2261
|
switch (c) {
|
2309
2262
|
case '\t':
|
2310
2263
|
case '\n':
|
@@ -2312,14 +2265,12 @@ static StateResult handle_doctype_name_state(
|
|
2312
2265
|
case ' ':
|
2313
2266
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_NAME);
|
2314
2267
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2315
|
-
finish_temporary_buffer(
|
2316
|
-
parser, &tokenizer->_doc_type_state.name);
|
2268
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2317
2269
|
return NEXT_CHAR;
|
2318
2270
|
case '>':
|
2319
2271
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2320
2272
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2321
|
-
finish_temporary_buffer(
|
2322
|
-
parser, &tokenizer->_doc_type_state.name);
|
2273
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2323
2274
|
emit_doctype(parser, output);
|
2324
2275
|
return RETURN_SUCCESS;
|
2325
2276
|
case '\0':
|
@@ -2331,8 +2282,7 @@ static StateResult handle_doctype_name_state(
|
|
2331
2282
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2332
2283
|
tokenizer->_doc_type_state.force_quirks = true;
|
2333
2284
|
gumbo_parser_deallocate(parser, (void*) tokenizer->_doc_type_state.name);
|
2334
|
-
finish_temporary_buffer(
|
2335
|
-
parser, &tokenizer->_doc_type_state.name);
|
2285
|
+
finish_temporary_buffer(parser, &tokenizer->_doc_type_state.name);
|
2336
2286
|
emit_doctype(parser, output);
|
2337
2287
|
return RETURN_ERROR;
|
2338
2288
|
default:
|
@@ -2344,9 +2294,8 @@ static StateResult handle_doctype_name_state(
|
|
2344
2294
|
}
|
2345
2295
|
|
2346
2296
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-name-state
|
2347
|
-
static StateResult handle_after_doctype_name_state(
|
2348
|
-
|
2349
|
-
int c, GumboToken* output) {
|
2297
|
+
static StateResult handle_after_doctype_name_state(GumboParser* parser,
|
2298
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2350
2299
|
switch (c) {
|
2351
2300
|
case '\t':
|
2352
2301
|
case '\n':
|
@@ -2365,17 +2314,18 @@ static StateResult handle_after_doctype_name_state(
|
|
2365
2314
|
return RETURN_ERROR;
|
2366
2315
|
default:
|
2367
2316
|
if (utf8iterator_maybe_consume_match(
|
2368
|
-
|
2317
|
+
&tokenizer->_input, "PUBLIC", sizeof("PUBLIC") - 1, false)) {
|
2369
2318
|
gumbo_tokenizer_set_state(
|
2370
2319
|
parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_KEYWORD);
|
2371
2320
|
tokenizer->_reconsume_current_input = true;
|
2372
|
-
} else if (utf8iterator_maybe_consume_match(
|
2373
|
-
|
2321
|
+
} else if (utf8iterator_maybe_consume_match(&tokenizer->_input, "SYSTEM",
|
2322
|
+
sizeof("SYSTEM") - 1, false)) {
|
2374
2323
|
gumbo_tokenizer_set_state(
|
2375
2324
|
parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_KEYWORD);
|
2376
2325
|
tokenizer->_reconsume_current_input = true;
|
2377
2326
|
} else {
|
2378
|
-
tokenizer_add_parse_error(
|
2327
|
+
tokenizer_add_parse_error(
|
2328
|
+
parser, GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET);
|
2379
2329
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
2380
2330
|
tokenizer->_doc_type_state.force_quirks = true;
|
2381
2331
|
}
|
@@ -2385,15 +2335,14 @@ static StateResult handle_after_doctype_name_state(
|
|
2385
2335
|
|
2386
2336
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-keyword-state
|
2387
2337
|
static StateResult handle_after_doctype_public_keyword_state(
|
2388
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2389
|
-
|
2338
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2339
|
+
GumboToken* output) {
|
2390
2340
|
switch (c) {
|
2391
2341
|
case '\t':
|
2392
2342
|
case '\n':
|
2393
2343
|
case '\f':
|
2394
2344
|
case ' ':
|
2395
|
-
gumbo_tokenizer_set_state(
|
2396
|
-
parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2345
|
+
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BEFORE_DOCTYPE_PUBLIC_ID);
|
2397
2346
|
return NEXT_CHAR;
|
2398
2347
|
case '"':
|
2399
2348
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
@@ -2429,9 +2378,8 @@ static StateResult handle_after_doctype_public_keyword_state(
|
|
2429
2378
|
}
|
2430
2379
|
|
2431
2380
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-public-identifier-state
|
2432
|
-
static StateResult handle_before_doctype_public_id_state(
|
2433
|
-
|
2434
|
-
int c, GumboToken* output) {
|
2381
|
+
static StateResult handle_before_doctype_public_id_state(GumboParser* parser,
|
2382
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2435
2383
|
switch (c) {
|
2436
2384
|
case '\t':
|
2437
2385
|
case '\n':
|
@@ -2471,8 +2419,8 @@ static StateResult handle_before_doctype_public_id_state(
|
|
2471
2419
|
|
2472
2420
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(double-quoted)-state
|
2473
2421
|
static StateResult handle_doctype_public_id_double_quoted_state(
|
2474
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2475
|
-
|
2422
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2423
|
+
GumboToken* output) {
|
2476
2424
|
switch (c) {
|
2477
2425
|
case '"':
|
2478
2426
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2504,8 +2452,8 @@ static StateResult handle_doctype_public_id_double_quoted_state(
|
|
2504
2452
|
|
2505
2453
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-public-identifier-(single-quoted)-state
|
2506
2454
|
static StateResult handle_doctype_public_id_single_quoted_state(
|
2507
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2508
|
-
|
2455
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2456
|
+
GumboToken* output) {
|
2509
2457
|
switch (c) {
|
2510
2458
|
case '\'':
|
2511
2459
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_PUBLIC_ID);
|
@@ -2536,9 +2484,8 @@ static StateResult handle_doctype_public_id_single_quoted_state(
|
|
2536
2484
|
}
|
2537
2485
|
|
2538
2486
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-public-identifier-state
|
2539
|
-
static StateResult handle_after_doctype_public_id_state(
|
2540
|
-
|
2541
|
-
int c, GumboToken* output) {
|
2487
|
+
static StateResult handle_after_doctype_public_id_state(GumboParser* parser,
|
2488
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2542
2489
|
switch (c) {
|
2543
2490
|
case '\t':
|
2544
2491
|
case '\n':
|
@@ -2568,7 +2515,8 @@ static StateResult handle_after_doctype_public_id_state(
|
|
2568
2515
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2569
2516
|
tokenizer->_reconsume_current_input = true;
|
2570
2517
|
tokenizer->_doc_type_state.force_quirks = true;
|
2571
|
-
|
2518
|
+
emit_doctype(parser, output);
|
2519
|
+
return RETURN_ERROR;
|
2572
2520
|
default:
|
2573
2521
|
tokenizer_add_parse_error(parser, GUMBO_ERR_DOCTYPE_INVALID);
|
2574
2522
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_BOGUS_DOCTYPE);
|
@@ -2579,8 +2527,8 @@ static StateResult handle_after_doctype_public_id_state(
|
|
2579
2527
|
|
2580
2528
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#between-doctype-public-and-system-identifiers-state
|
2581
2529
|
static StateResult handle_between_doctype_public_system_id_state(
|
2582
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2583
|
-
|
2530
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2531
|
+
GumboToken* output) {
|
2584
2532
|
switch (c) {
|
2585
2533
|
case '\t':
|
2586
2534
|
case '\n':
|
@@ -2618,8 +2566,8 @@ static StateResult handle_between_doctype_public_system_id_state(
|
|
2618
2566
|
|
2619
2567
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-keyword-state
|
2620
2568
|
static StateResult handle_after_doctype_system_keyword_state(
|
2621
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2622
|
-
|
2569
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2570
|
+
GumboToken* output) {
|
2623
2571
|
switch (c) {
|
2624
2572
|
case '\t':
|
2625
2573
|
case '\n':
|
@@ -2660,9 +2608,8 @@ static StateResult handle_after_doctype_system_keyword_state(
|
|
2660
2608
|
}
|
2661
2609
|
|
2662
2610
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#before-doctype-system-identifier-state
|
2663
|
-
static StateResult handle_before_doctype_system_id_state(
|
2664
|
-
|
2665
|
-
int c, GumboToken* output) {
|
2611
|
+
static StateResult handle_before_doctype_system_id_state(GumboParser* parser,
|
2612
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2666
2613
|
switch (c) {
|
2667
2614
|
case '\t':
|
2668
2615
|
case '\n':
|
@@ -2701,8 +2648,8 @@ static StateResult handle_before_doctype_system_id_state(
|
|
2701
2648
|
|
2702
2649
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(double-quoted)-state
|
2703
2650
|
static StateResult handle_doctype_system_id_double_quoted_state(
|
2704
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2705
|
-
|
2651
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2652
|
+
GumboToken* output) {
|
2706
2653
|
switch (c) {
|
2707
2654
|
case '"':
|
2708
2655
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2734,8 +2681,8 @@ static StateResult handle_doctype_system_id_double_quoted_state(
|
|
2734
2681
|
|
2735
2682
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#doctype-system-identifier-(single-quoted)-state
|
2736
2683
|
static StateResult handle_doctype_system_id_single_quoted_state(
|
2737
|
-
GumboParser* parser, GumboTokenizerState* tokenizer,
|
2738
|
-
|
2684
|
+
GumboParser* parser, GumboTokenizerState* tokenizer, int c,
|
2685
|
+
GumboToken* output) {
|
2739
2686
|
switch (c) {
|
2740
2687
|
case '\'':
|
2741
2688
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_AFTER_DOCTYPE_SYSTEM_ID);
|
@@ -2766,9 +2713,8 @@ static StateResult handle_doctype_system_id_single_quoted_state(
|
|
2766
2713
|
}
|
2767
2714
|
|
2768
2715
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#after-doctype-system-identifier-state
|
2769
|
-
static StateResult handle_after_doctype_system_id_state(
|
2770
|
-
|
2771
|
-
int c, GumboToken* output) {
|
2716
|
+
static StateResult handle_after_doctype_system_id_state(GumboParser* parser,
|
2717
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2772
2718
|
switch (c) {
|
2773
2719
|
case '\t':
|
2774
2720
|
case '\n':
|
@@ -2793,9 +2739,8 @@ static StateResult handle_after_doctype_system_id_state(
|
|
2793
2739
|
}
|
2794
2740
|
|
2795
2741
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#bogus-doctype-state
|
2796
|
-
static StateResult handle_bogus_doctype_state(
|
2797
|
-
|
2798
|
-
int c, GumboToken* output) {
|
2742
|
+
static StateResult handle_bogus_doctype_state(GumboParser* parser,
|
2743
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2799
2744
|
if (c == '>' || c == -1) {
|
2800
2745
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2801
2746
|
emit_doctype(parser, output);
|
@@ -2805,14 +2750,14 @@ static StateResult handle_bogus_doctype_state(
|
|
2805
2750
|
}
|
2806
2751
|
|
2807
2752
|
// http://www.whatwg.org/specs/web-apps/current-work/complete.html#cdata-section-state
|
2808
|
-
static StateResult handle_cdata_state(
|
2809
|
-
|
2810
|
-
int c, GumboToken* output) {
|
2753
|
+
static StateResult handle_cdata_state(GumboParser* parser,
|
2754
|
+
GumboTokenizerState* tokenizer, int c, GumboToken* output) {
|
2811
2755
|
if (c == -1 || utf8iterator_maybe_consume_match(
|
2812
|
-
|
2756
|
+
&tokenizer->_input, "]]>", sizeof("]]>") - 1, true)) {
|
2813
2757
|
tokenizer->_reconsume_current_input = true;
|
2814
2758
|
reset_token_start_point(tokenizer);
|
2815
2759
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2760
|
+
tokenizer->_is_in_cdata = false;
|
2816
2761
|
return NEXT_CHAR;
|
2817
2762
|
} else {
|
2818
2763
|
return emit_current_char(parser, output);
|
@@ -2822,76 +2767,47 @@ static StateResult handle_cdata_state(
|
|
2822
2767
|
typedef StateResult (*GumboLexerStateFunction)(
|
2823
2768
|
GumboParser*, GumboTokenizerState*, int, GumboToken*);
|
2824
2769
|
|
2825
|
-
static GumboLexerStateFunction dispatch_table[] = {
|
2826
|
-
|
2827
|
-
|
2828
|
-
|
2829
|
-
|
2830
|
-
|
2831
|
-
|
2832
|
-
|
2833
|
-
|
2834
|
-
|
2835
|
-
|
2836
|
-
|
2837
|
-
|
2838
|
-
|
2839
|
-
|
2840
|
-
|
2841
|
-
|
2842
|
-
|
2843
|
-
|
2844
|
-
|
2845
|
-
|
2846
|
-
|
2847
|
-
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
|
2852
|
-
|
2853
|
-
|
2854
|
-
|
2855
|
-
|
2856
|
-
|
2857
|
-
|
2858
|
-
|
2859
|
-
|
2860
|
-
|
2861
|
-
|
2862
|
-
|
2863
|
-
|
2864
|
-
|
2865
|
-
|
2866
|
-
handle_char_ref_in_attr_value_state,
|
2867
|
-
handle_after_attr_value_quoted_state,
|
2868
|
-
handle_self_closing_start_tag_state,
|
2869
|
-
handle_bogus_comment_state,
|
2870
|
-
handle_markup_declaration_state,
|
2871
|
-
handle_comment_start_state,
|
2872
|
-
handle_comment_start_dash_state,
|
2873
|
-
handle_comment_state,
|
2874
|
-
handle_comment_end_dash_state,
|
2875
|
-
handle_comment_end_state,
|
2876
|
-
handle_comment_end_bang_state,
|
2877
|
-
handle_doctype_state,
|
2878
|
-
handle_before_doctype_name_state,
|
2879
|
-
handle_doctype_name_state,
|
2880
|
-
handle_after_doctype_name_state,
|
2881
|
-
handle_after_doctype_public_keyword_state,
|
2882
|
-
handle_before_doctype_public_id_state,
|
2883
|
-
handle_doctype_public_id_double_quoted_state,
|
2884
|
-
handle_doctype_public_id_single_quoted_state,
|
2885
|
-
handle_after_doctype_public_id_state,
|
2886
|
-
handle_between_doctype_public_system_id_state,
|
2887
|
-
handle_after_doctype_system_keyword_state,
|
2888
|
-
handle_before_doctype_system_id_state,
|
2889
|
-
handle_doctype_system_id_double_quoted_state,
|
2890
|
-
handle_doctype_system_id_single_quoted_state,
|
2891
|
-
handle_after_doctype_system_id_state,
|
2892
|
-
handle_bogus_doctype_state,
|
2893
|
-
handle_cdata_state
|
2894
|
-
};
|
2770
|
+
static GumboLexerStateFunction dispatch_table[] = {handle_data_state,
|
2771
|
+
handle_char_ref_in_data_state, handle_rcdata_state,
|
2772
|
+
handle_char_ref_in_rcdata_state, handle_rawtext_state, handle_script_state,
|
2773
|
+
handle_plaintext_state, handle_tag_open_state, handle_end_tag_open_state,
|
2774
|
+
handle_tag_name_state, handle_rcdata_lt_state,
|
2775
|
+
handle_rcdata_end_tag_open_state, handle_rcdata_end_tag_name_state,
|
2776
|
+
handle_rawtext_lt_state, handle_rawtext_end_tag_open_state,
|
2777
|
+
handle_rawtext_end_tag_name_state, handle_script_lt_state,
|
2778
|
+
handle_script_end_tag_open_state, handle_script_end_tag_name_state,
|
2779
|
+
handle_script_escaped_start_state, handle_script_escaped_start_dash_state,
|
2780
|
+
handle_script_escaped_state, handle_script_escaped_dash_state,
|
2781
|
+
handle_script_escaped_dash_dash_state, handle_script_escaped_lt_state,
|
2782
|
+
handle_script_escaped_end_tag_open_state,
|
2783
|
+
handle_script_escaped_end_tag_name_state,
|
2784
|
+
handle_script_double_escaped_start_state,
|
2785
|
+
handle_script_double_escaped_state, handle_script_double_escaped_dash_state,
|
2786
|
+
handle_script_double_escaped_dash_dash_state,
|
2787
|
+
handle_script_double_escaped_lt_state,
|
2788
|
+
handle_script_double_escaped_end_state, handle_before_attr_name_state,
|
2789
|
+
handle_attr_name_state, handle_after_attr_name_state,
|
2790
|
+
handle_before_attr_value_state, handle_attr_value_double_quoted_state,
|
2791
|
+
handle_attr_value_single_quoted_state, handle_attr_value_unquoted_state,
|
2792
|
+
handle_char_ref_in_attr_value_state, handle_after_attr_value_quoted_state,
|
2793
|
+
handle_self_closing_start_tag_state, handle_bogus_comment_state,
|
2794
|
+
handle_markup_declaration_state, handle_comment_start_state,
|
2795
|
+
handle_comment_start_dash_state, handle_comment_state,
|
2796
|
+
handle_comment_end_dash_state, handle_comment_end_state,
|
2797
|
+
handle_comment_end_bang_state, handle_doctype_state,
|
2798
|
+
handle_before_doctype_name_state, handle_doctype_name_state,
|
2799
|
+
handle_after_doctype_name_state, handle_after_doctype_public_keyword_state,
|
2800
|
+
handle_before_doctype_public_id_state,
|
2801
|
+
handle_doctype_public_id_double_quoted_state,
|
2802
|
+
handle_doctype_public_id_single_quoted_state,
|
2803
|
+
handle_after_doctype_public_id_state,
|
2804
|
+
handle_between_doctype_public_system_id_state,
|
2805
|
+
handle_after_doctype_system_keyword_state,
|
2806
|
+
handle_before_doctype_system_id_state,
|
2807
|
+
handle_doctype_system_id_double_quoted_state,
|
2808
|
+
handle_doctype_system_id_single_quoted_state,
|
2809
|
+
handle_after_doctype_system_id_state, handle_bogus_doctype_state,
|
2810
|
+
handle_cdata_state};
|
2895
2811
|
|
2896
2812
|
bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
2897
2813
|
// Because of the spec requirements that...
|
@@ -2929,7 +2845,8 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2929
2845
|
assert(!tokenizer->_temporary_buffer_emit);
|
2930
2846
|
assert(tokenizer->_buffered_emit_char == kGumboNoChar);
|
2931
2847
|
int c = utf8iterator_current(&tokenizer->_input);
|
2932
|
-
gumbo_debug(
|
2848
|
+
gumbo_debug(
|
2849
|
+
"Lexing character '%c' (%d) in state %d.\n", c, c, tokenizer->_state);
|
2933
2850
|
StateResult result =
|
2934
2851
|
dispatch_table[tokenizer->_state](parser, tokenizer, c, output);
|
2935
2852
|
// We need to clear reconsume_current_input before returning to prevent
|
@@ -2939,7 +2856,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
|
|
2939
2856
|
|
2940
2857
|
if (result == RETURN_SUCCESS) {
|
2941
2858
|
return true;
|
2942
|
-
} else if(result == RETURN_ERROR) {
|
2859
|
+
} else if (result == RETURN_ERROR) {
|
2943
2860
|
return false;
|
2944
2861
|
}
|
2945
2862
|
|
@@ -2961,7 +2878,7 @@ void gumbo_token_destroy(GumboParser* parser, GumboToken* token) {
|
|
2961
2878
|
parser, (void*) token->v.doc_type.system_identifier);
|
2962
2879
|
return;
|
2963
2880
|
case GUMBO_TOKEN_START_TAG:
|
2964
|
-
for (int i = 0; i < token->v.start_tag.attributes.length; ++i) {
|
2881
|
+
for (unsigned int i = 0; i < token->v.start_tag.attributes.length; ++i) {
|
2965
2882
|
GumboAttribute* attr = token->v.start_tag.attributes.data[i];
|
2966
2883
|
if (attr) {
|
2967
2884
|
// May have been nulled out if this token was merged with another.
|