@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
import { serializeAttributes } from "./attributes.js";
|
|
2
|
+
import { escapeText } from "./escape.js";
|
|
3
|
+
|
|
4
|
+
/**
 * End tags in this list are dropped when they are followed by another end
 * tag or by the end of the token stream.
 */
const END_TAGS_OMITTED_BEFORE_CLOSE = [
  "p",
  "li",
  "option",
  "optgroup",
  "tbody",
  "tfoot",
  "tr",
  "td",
  "th",
  "colgroup",
  "dd",
];

/** Start tags that allow a directly preceding `</p>` to be dropped. */
const START_TAGS_CLOSING_P = [
  "address",
  "article",
  "aside",
  "blockquote",
  "datagrid",
  "dialog",
  "dir",
  "div",
  "dl",
  "fieldset",
  "footer",
  "form",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "header",
  "hr",
  "menu",
  "nav",
  "ol",
  "p",
  "pre",
  "section",
  "table",
  "ul",
];

/** True for a `Characters` token whose text is empty or whitespace only. */
const isBlankCharactersToken = (token: any): boolean =>
  token[0] === "Characters" && token[1].trim() === "";

/**
 * Returns the first token after `index` that is not a blank `Characters`
 * token, or `null` when there is none.
 */
const firstSignificantTokenAfter = (tokens: any[], index: number): any => {
  for (let j = index + 1; j < tokens.length; j++) {
    if (!isBlankCharactersToken(tokens[j])) return tokens[j];
  }
  return null;
};

/** Attributes may arrive as an array or as a plain object; both are accepted. */
const hasAnyAttributes = (attrs: any): boolean => {
  if (Array.isArray(attrs)) return attrs.length > 0;
  return attrs ? Object.keys(attrs).length > 0 : false;
};

/**
 * Implements the `inject_meta_charset` option: rewrites `<meta>` tags found
 * between the `head` start and end tags so they declare `encoding`, and
 * inserts `<meta charset=...>` right after the `head` start tag when no
 * charset declaration exists. When `encoding` is empty/undefined the tokens
 * are copied without modification.
 */
const rewriteMetaCharset = (tokens: any[], encoding: string | undefined): any[] => {
  const isContentTypeAttr = (attr: any): boolean =>
    attr.name === "http-equiv" && attr.value === "content-type";

  // Pass 1: find out whether the head already declares a charset.
  let hasCharset = false;
  let inHead = false;
  for (const token of tokens) {
    const type = token[0];
    if (type === "StartTag" && token[2] === "head") {
      inHead = true;
    } else if (type === "EndTag" && token[2] === "head") {
      inHead = false;
    } else if (inHead && type === "EmptyTag" && token[1] === "meta") {
      const attrs: any[] = token[2];
      if (attrs.some((attr) => attr.name === "charset")) {
        hasCharset = true;
      }
      if (attrs.some(isContentTypeAttr)) {
        const contentAttr = attrs.find((attr) => attr.name === "content");
        if (contentAttr && contentAttr.value.includes("charset=")) {
          hasCharset = true;
        }
      }
    }
  }

  // Pass 2: copy the tokens, injecting/rewriting charset declarations.
  const output: any[] = [];
  inHead = false;
  for (const token of tokens) {
    const type = token[0];
    if (type === "StartTag" && token[2] === "head") {
      inHead = true;
      output.push(token);
      if (!hasCharset && encoding) {
        output.push(["EmptyTag", "meta", [{ name: "charset", value: encoding }]]);
      }
    } else if (type === "EndTag" && token[2] === "head") {
      inHead = false;
      output.push(token);
    } else if (inHead && type === "EmptyTag" && token[1] === "meta") {
      const attrs: any[] = token[2];
      // Decided once per tag so the rewrite does not depend on whether
      // `http-equiv` is listed before or after `content`.
      const isContentTypeMeta = attrs.some(isContentTypeAttr);
      const newAttrs = attrs.map((attr) => {
        if (!encoding) return attr;
        if (attr.name === "charset") {
          return { name: "charset", value: encoding };
        }
        if (attr.name === "content" && isContentTypeMeta) {
          return {
            name: "content",
            value: attr.value.replace(/charset=[^;]*/, "charset=" + encoding),
          };
        }
        return attr;
      });
      output.push([type, token[1], newAttrs]);
    } else {
      output.push(token);
    }
  }
  return output;
};

/**
 * Decides whether the `tbody` start tag at `index` can be left out: it has
 * no attributes, the first significant token inside it is a `tr` start tag,
 * and the nearest preceding significant token is not a `tbody`/`thead`/
 * `tfoot` end tag (that end tag would itself be omitted, so dropping both
 * would merge the two sections).
 */
const canOmitTbodyStartTag = (tokens: any[], index: number): boolean => {
  if (hasAnyAttributes(tokens[index][3])) return false;

  const first = firstSignificantTokenAfter(tokens, index);
  const startsWithRow =
    first &&
    (first[0] === "StartTag" || first[0] === "EmptyTag") &&
    first[2] === "tr";
  if (!startsWithRow) return false;

  // Walk backwards to the token that immediately precedes this start tag.
  for (let j = index - 1; j >= 0; j--) {
    const previous = tokens[j];
    if (isBlankCharactersToken(previous)) continue;
    return !(
      previous[0] === "EndTag" &&
      ["tbody", "thead", "tfoot"].includes(previous[2])
    );
  }
  return true;
};

/**
 * Serializes a flat list of tokens to an HTML string, omitting optional
 * start/end tags where the rules below allow it.
 *
 * Token shapes, as read by this function:
 * - `["StartTag", _, name, attrs]` (index 1 is never read here; presumably a
 *   namespace - TODO confirm against the token producer)
 * - `["EndTag", _, name]`
 * - `["EmptyTag", name, attrs]`
 * - `["Characters", text]`
 * - `["Doctype", name, publicId?, systemId?]`
 * - `["Comment", text]`
 *
 * @param tokens Tokens to serialize; the array is not mutated.
 * @param options Serializer options. The whole object is forwarded to
 *   `serializeAttributes`; this function itself reads `inject_meta_charset`,
 *   `encoding`, `use_trailing_solidus`, `escape_rcdata` and
 *   `strip_whitespace`.
 * @returns The serialized markup.
 */
export const serializeTokens = (
  tokens: any[],
  options?: {
    inject_meta_charset?: boolean;
    encoding?: string;
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    use_trailing_solidus?: boolean;
    escape_rcdata?: boolean;
    strip_whitespace?: boolean;
  },
): string => {
  const processedTokens = options?.inject_meta_charset
    ? rewriteMetaCharset(tokens, options?.encoding)
    : tokens;

  // Remember the LAST start tag of each structural element; the omission
  // decision derived from it is applied to every start tag with that name.
  let htmlStartIndex = -1;
  let headStartIndex = -1;
  let bodyStartIndex = -1;
  let colgroupStartIndex = -1;
  for (let i = 0; i < processedTokens.length; i++) {
    const token = processedTokens[i];
    if (token[0] !== "StartTag") continue;
    const name = token[2];
    if (name === "html") htmlStartIndex = i;
    if (name === "head") headStartIndex = i;
    if (name === "body") bodyStartIndex = i;
    if (name === "colgroup") colgroupStartIndex = i;
  }

  // <html>: omitted when it has no attributes and is not followed by a
  // comment or by text that starts with whitespace.
  let omitHtml = false;
  if (
    htmlStartIndex >= 0 &&
    !hasAnyAttributes(processedTokens[htmlStartIndex][3])
  ) {
    const first = firstSignificantTokenAfter(processedTokens, htmlStartIndex);
    if (!first) {
      omitHtml = true;
    } else if (first[0] === "Comment") {
      omitHtml = false;
    } else if (first[0] === "Characters") {
      omitHtml = !/^\s/.test(first[1]);
    } else {
      omitHtml = true;
    }
  }

  // <head>: omitted when followed by an element or by its own end tag.
  let omitHead = false;
  if (headStartIndex >= 0) {
    const first = firstSignificantTokenAfter(processedTokens, headStartIndex);
    omitHead =
      !!first &&
      (first[0] === "StartTag" ||
        first[0] === "EmptyTag" ||
        (first[0] === "EndTag" && first[2] === "head"));
  }

  // <body>: omitted when empty, or followed by a start tag, an end tag, or
  // text that does not start with whitespace.
  let omitBody = false;
  if (bodyStartIndex >= 0) {
    const first = firstSignificantTokenAfter(processedTokens, bodyStartIndex);
    omitBody =
      !first ||
      first[0] === "StartTag" ||
      first[0] === "EndTag" ||
      (first[0] === "Characters" && !/^\s/.test(first[1]));
  }

  // <colgroup>: omitted when it has no attributes and starts with a <col>.
  let omitColgroup = false;
  if (colgroupStartIndex >= 0) {
    const first = firstSignificantTokenAfter(processedTokens, colgroupStartIndex);
    omitColgroup =
      !hasAnyAttributes(processedTokens[colgroupStartIndex][3]) &&
      !!first &&
      (first[0] === "StartTag" || first[0] === "EmptyTag") &&
      (first[0] === "StartTag" ? first[2] : first[1]) === "col";
  }

  let result = "";
  let inScript = false;
  let inPre = false;
  let inTextarea = false;
  let inStyle = false;
  // NOTE(review): with `inject_meta_charset` the head start/end tags are
  // never written, and once a non-omitted `</head>` is reached this flag
  // turns off, suppressing every later token except EmptyTag. That looks
  // tailored to serializing head fragments - confirm before relying on it
  // for whole documents.
  let serializingHead = true;

  for (let i = 0; i < processedTokens.length; i++) {
    const token = processedTokens[i];
    const nextToken = processedTokens[i + 1];

    switch (token[0]) {
      case "StartTag": {
        const [, , name, attrs] = token;

        if (name === "colgroup" && omitColgroup) continue;
        if (name === "tbody" && canOmitTbodyStartTag(processedTokens, i)) continue;
        if (name === "head" && omitHead) continue;
        if (name === "body" && omitBody) continue;
        if (name === "html" && omitHtml) continue;

        if (name === "pre") inPre = true;
        if (name === "textarea") inTextarea = true;
        if (name === "script") inScript = true;
        if (name === "style") inStyle = true;

        if (name === "head") {
          if (options?.inject_meta_charset) {
            serializingHead = true;
          } else {
            result += "<" + name + serializeAttributes(attrs, options) + ">";
          }
        } else if (serializingHead) {
          result += "<" + name + serializeAttributes(attrs, options) + ">";
        }
        break;
      }

      case "EmptyTag": {
        const [, name, attrs] = token;
        result +=
          "<" +
          name +
          serializeAttributes(attrs, options) +
          (options?.use_trailing_solidus ? " />" : ">");
        break;
      }

      case "EndTag": {
        const name = token[2];
        let omitEndTag = false;

        if (["html", "head", "body"].includes(name)) {
          omitEndTag =
            !nextToken ||
            nextToken[0] === "StartTag" ||
            nextToken[0] === "EndTag" ||
            (nextToken[0] === "Characters" && !/^\s/.test(nextToken[1]));
        } else if (nextToken) {
          // Only the immediately following token is considered here;
          // whitespace between tags therefore keeps the end tag.
          const nextType = nextToken[0];
          let nextName = null;
          if (nextType === "StartTag" || nextType === "EndTag") {
            nextName = nextToken[2];
          } else if (nextType === "EmptyTag") {
            nextName = nextToken[1];
          }

          if (nextType === "EndTag") {
            omitEndTag = END_TAGS_OMITTED_BEFORE_CLOSE.includes(name);
          } else if (nextType === "StartTag") {
            omitEndTag =
              (name === "p" && START_TAGS_CLOSING_P.includes(nextName)) ||
              (name === "li" && nextName === "li") ||
              ((name === "dt" || name === "dd") &&
                (nextName === "dt" || nextName === "dd")) ||
              (name === "option" &&
                (nextName === "option" || nextName === "optgroup")) ||
              (name === "optgroup" && nextName === "optgroup") ||
              ((name === "tbody" || name === "tfoot") &&
                (nextName === "tbody" || nextName === "tfoot")) ||
              (name === "thead" &&
                (nextName === "tbody" || nextName === "tfoot")) ||
              (name === "tr" && nextName === "tr") ||
              ((name === "td" || name === "th") &&
                (nextName === "td" || nextName === "th")) ||
              (name === "colgroup" && nextName !== "colgroup");
          } else if (nextType === "EmptyTag") {
            omitEndTag = name === "p" && nextName === "hr";
          }

          if (
            name === "colgroup" &&
            nextType === "Characters" &&
            !/^\s/.test(nextToken[1])
          ) {
            omitEndTag = true;
          }
        } else {
          omitEndTag = END_TAGS_OMITTED_BEFORE_CLOSE.includes(name);
        }

        if (omitEndTag) continue;

        if (name === "script") inScript = false;
        if (name === "pre") inPre = false;
        if (name === "textarea") inTextarea = false;
        if (name === "style") inStyle = false;

        if (name === "head") {
          if (options?.inject_meta_charset) {
            serializingHead = false;
          } else {
            result += "</" + name + ">";
          }
        } else if (serializingHead) {
          result += "</" + name + ">";
        }
        break;
      }

      case "Characters": {
        if (!serializingHead) break;
        let text = token[1];
        if (
          options?.strip_whitespace &&
          !inPre &&
          !inTextarea &&
          !inScript &&
          !inStyle
        ) {
          text = text.replace(/\s+/g, " ");
        }
        // Script and style contents are raw text: escaping them would
        // change the program/stylesheet (e.g. `a > b` in CSS). Textarea
        // keeps its existing unescaped-by-default treatment.
        const isRawText = inScript || inStyle || inTextarea;
        result += isRawText && !options?.escape_rcdata ? text : escapeText(text);
        break;
      }

      case "Doctype": {
        if (!serializingHead) break;
        result += "<!DOCTYPE " + token[1];
        if (token[2]) {
          result += ' PUBLIC "' + token[2] + '"';
          if (token[3]) result += ' "' + token[3] + '"';
        } else if (token[3]) {
          result += ' SYSTEM "' + token[3] + '"';
        }
        result += ">";
        break;
      }

      case "Comment": {
        if (serializingHead) {
          result += "<!--" + token[1] + "-->";
        }
        break;
      }

      default:
        break;
    }
  }

  return result;
};
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { Position } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Translates a character offset into a line/column position.
 *
 * Only the part of `text` before `offset` is inspected (`text.slice(0, offset)`).
 * `line` is 1 plus the number of "\n" characters in that prefix, and
 * `column` is the number of characters after the last "\n" in the prefix
 * (so it is 0-based).
 *
 * @param text Full source text.
 * @param offset Offset into `text`; returned unchanged in the result.
 * @returns The computed `line` and `column` together with the given `offset`.
 */
export const calculatePosition = (text: string, offset: number): Position => {
  const consumed = text.slice(0, offset);

  let line = 1;
  let lineStart = 0;
  for (
    let at = consumed.indexOf("\n");
    at !== -1;
    at = consumed.indexOf("\n", at + 1)
  ) {
    line += 1;
    lineStart = at + 1;
  }

  return { line, column: consumed.length - lineStart, offset };
};
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { HTML_ENTITIES } from "../html-entities.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Replaces character references in `text` with the characters they denote.
 *
 * - Numeric references (`&#65;`, `&#x41;`, `&#X41;`) are decoded only when
 *   they contain at least one digit and end with `;`. Code points above
 *   U+10FFFF decode to U+FFFD. Anything else is left as literal text.
 * - Named references are looked up in `HTML_ENTITIES` by the name between
 *   `&` and the optional `;`. Without a `;` the reference is decoded only
 *   when more text follows it (a name running up to the end of the input is
 *   left untouched).
 * - Finally every U+0000 in the result is replaced with U+FFFD.
 *
 * @param text Raw text that may contain character references.
 * @returns The decoded text.
 */
export const decodeEntities = (text: string): string => {
  const hexDigit = /[0-9a-fA-F]/;
  const decimalDigit = /[0-9]/;
  const alphanumeric = /[a-zA-Z0-9]/;

  let result = "";
  let i = 0;
  while (i < text.length) {
    if (text[i] !== "&") {
      result += text[i];
      i++;
      continue;
    }

    let j = i + 1;
    if (text[j] === "#") {
      j++;
      // Both `x` and `X` introduce a hexadecimal reference.
      const isHex = text[j] === "x" || text[j] === "X";
      if (isHex) j++;

      const digitsStart = j;
      const digit = isHex ? hexDigit : decimalDigit;
      while (j < text.length && digit.test(text[j])) {
        j++;
      }
      const digits = text.substring(digitsStart, j);

      if (digits !== "" && text[j] === ";") {
        const codePoint = parseInt(digits, isHex ? 16 : 10);
        // fromCodePoint (unlike fromCharCode) handles astral code points
        // but throws above U+10FFFF, so those are mapped explicitly.
        result +=
          codePoint > 0x10ffff ? "\uFFFD" : String.fromCodePoint(codePoint);
        i = j + 1;
        continue;
      }
    } else {
      while (j < text.length && alphanumeric.test(text[j])) {
        j++;
      }
      const named = text.substring(i + 1, j);
      const hasSemi = text[j] === ";";
      if (hasSemi) {
        j++;
      }
      // Own-property check so that names inherited from Object.prototype
      // (e.g. `constructor`) are not mistaken for entities when
      // HTML_ENTITIES is a plain object.
      if (
        Object.prototype.hasOwnProperty.call(HTML_ENTITIES, named) &&
        HTML_ENTITIES[named]
      ) {
        if (hasSemi || (j < text.length && !alphanumeric.test(text[j]))) {
          result += HTML_ENTITIES[named];
          i = j;
          continue;
        }
      }
    }

    // Not a recognised reference: emit the ampersand literally and resume
    // scanning right after it.
    result += text[i];
    i++;
  }
  return result.replace(/\u0000/g, "\uFFFD");
};
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { decodeEntities } from "./decode-entities.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Parses the attribute portion of a tag (the text after the tag name) into
 * a name -> value map.
 *
 * - Names are lower-cased; values are passed through `decodeEntities`.
 * - Values may be double-quoted, single-quoted or unquoted; an attribute
 *   without `=` gets the empty string.
 * - Parsing stops at the first `/` or `>` found where an attribute name
 *   would start.
 * - When the same name occurs more than once, the first occurrence wins
 *   (later duplicates are ignored), matching HTML tokenization.
 *
 * @param attributeString Raw attribute text.
 * @returns Map of lower-cased attribute names to decoded values.
 */
export const parseAttributes = (
  attributeString: string,
): Record<string, string> => {
  const attributes: Record<string, string> = {};
  const length = attributeString.length;
  const whitespace = /\s/;
  const nameTerminator = /[\s=\/>]/;
  const unquotedTerminator = /[\s>]/;
  let i = 0;

  const skipWhitespace = (): void => {
    while (i < length && whitespace.test(attributeString[i])) {
      i++;
    }
  };

  while (i < length) {
    skipWhitespace();
    if (
      i >= length ||
      attributeString[i] === "/" ||
      attributeString[i] === ">"
    ) {
      break;
    }

    const nameStart = i;
    while (i < length && !nameTerminator.test(attributeString[i])) {
      i++;
    }
    const name = attributeString.slice(nameStart, i);

    if (!name) {
      // A stray "=" with no name in front of it: step over it.
      i++;
      continue;
    }

    skipWhitespace();

    let value = "";
    if (i < length && attributeString[i] === "=") {
      i++;
      skipWhitespace();

      if (i < length) {
        const opener = attributeString[i];
        if (opener === '"' || opener === "'") {
          const valueStart = i + 1;
          const closing = attributeString.indexOf(opener, valueStart);
          // An unterminated quote consumes the rest of the input.
          const valueEnd = closing === -1 ? length : closing;
          value = attributeString.slice(valueStart, valueEnd);
          i = valueEnd + 1;
        } else {
          const valueStart = i;
          while (i < length && !unquotedTerminator.test(attributeString[i])) {
            i++;
          }
          value = attributeString.slice(valueStart, i);
        }
      }
    }

    // Keep the first occurrence of a duplicated attribute; the own-property
    // test avoids false positives from inherited keys such as `constructor`.
    const key = name.toLowerCase();
    if (!Object.prototype.hasOwnProperty.call(attributes, key)) {
      attributes[key] = decodeEntities(value);
    }
  }

  return attributes;
};
|