@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131):
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
@@ -0,0 +1,511 @@
1
+ import { serializeAttributes } from "./attributes.js";
2
+ import { escapeText } from "./escape.js";
3
+
4
/** End tags that are dropped when the next token is an end tag or the stream ends. */
const END_TAGS_OMITTED_AT_CLOSE = new Set([
  "p",
  "li",
  "option",
  "optgroup",
  "tbody",
  "tfoot",
  "tr",
  "td",
  "th",
  "colgroup",
  "dd",
]);

/** Start tags that allow a directly preceding `</p>` to be dropped. */
const START_TAGS_THAT_CLOSE_P = new Set([
  "address",
  "article",
  "aside",
  "blockquote",
  "datagrid",
  "dialog",
  "dir",
  "div",
  "dl",
  "fieldset",
  "footer",
  "form",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "header",
  "hr",
  "menu",
  "nav",
  "ol",
  "p",
  "pre",
  "section",
  "table",
  "ul",
]);

/** Table sections whose end tag forces a following `<tbody>` to stay explicit. */
const TABLE_SECTION_NAMES = new Set(["tbody", "thead", "tfoot"]);

const LEADING_WHITESPACE = /^\s/;

/** True when an attribute collection (array or plain object) has at least one entry. */
const hasAnyAttributes = (attrs: any): boolean => {
  if (Array.isArray(attrs)) return attrs.length > 0;
  return attrs ? Object.keys(attrs).length > 0 : false;
};

/** True for a `Characters` token that consists only of whitespace. */
const isBlankCharacters = (token: any): boolean =>
  token[0] === "Characters" && token[1].trim() === "";

/** First token after `index` that is not whitespace-only text, or `null`. */
const firstSignificantTokenAfter = (tokens: any[], index: number): any => {
  for (let j = index + 1; j < tokens.length; j++) {
    if (!isBlankCharacters(tokens[j])) return tokens[j];
  }
  return null;
};

/** Tag name of a tag token: index 2 for StartTag/EndTag, index 1 for EmptyTag. */
const tagNameOf = (token: any): any => {
  switch (token[0]) {
    case "StartTag":
    case "EndTag":
      return token[2];
    case "EmptyTag":
      return token[1];
    default:
      return null;
  }
};

/**
 * Returns a token list in which `<meta>` tags inside `<head>` carry the
 * requested encoding. When no charset declaration exists and an encoding was
 * given, a `<meta charset>` EmptyTag is inserted right after the head start tag.
 */
const injectMetaCharset = (
  tokens: any[],
  explicitEncoding: string | undefined,
): any[] => {
  // Pass 1: find out whether the head already declares a charset.
  let hasCharset = false;
  let inHead = false;
  for (const token of tokens) {
    const type = token[0];
    if (type === "StartTag" && token[2] === "head") {
      inHead = true;
    } else if (type === "EndTag" && token[2] === "head") {
      inHead = false;
    } else if (inHead && type === "EmptyTag" && token[1] === "meta") {
      const attrs = token[2];
      if (attrs.some((attr: any) => attr.name === "charset")) {
        hasCharset = true;
      }
      const isContentType = attrs.some(
        (attr: any) =>
          attr.name === "http-equiv" && attr.value === "content-type",
      );
      if (isContentType) {
        const content = attrs.find((attr: any) => attr.name === "content");
        if (content && content.value.includes("charset=")) {
          hasCharset = true;
        }
      }
    }
  }

  // Pass 2: rewrite existing declarations / insert a new one.
  const rewritten: any[] = [];
  inHead = false;
  for (const token of tokens) {
    const type = token[0];
    if (type === "StartTag" && token[2] === "head") {
      inHead = true;
      rewritten.push(token);
      if (!hasCharset && explicitEncoding) {
        rewritten.push([
          "EmptyTag",
          "meta",
          [{ name: "charset", value: explicitEncoding }],
        ]);
      }
    } else if (type === "EndTag" && token[2] === "head") {
      inHead = false;
      rewritten.push(token);
    } else if (inHead && type === "EmptyTag" && token[1] === "meta") {
      const newAttrs = token[2].slice();
      // `content` is only rewritten when `http-equiv` appeared earlier in the
      // attribute list, so this depends on attribute order.
      let sawHttpEquiv = false;
      for (let k = 0; k < newAttrs.length; k++) {
        const attr = newAttrs[k];
        if (attr.name === "charset" && explicitEncoding) {
          newAttrs[k] = { name: "charset", value: explicitEncoding };
        } else if (
          attr.name === "http-equiv" &&
          attr.value === "content-type"
        ) {
          sawHttpEquiv = true;
        } else if (attr.name === "content" && sawHttpEquiv && explicitEncoding) {
          newAttrs[k] = {
            name: "content",
            value: attr.value.replace(
              /charset=[^;]*/,
              "charset=" + explicitEncoding,
            ),
          };
        }
      }
      rewritten.push([type, token[1], newAttrs]);
    } else {
      rewritten.push(token);
    }
  }
  return rewritten;
};

/**
 * Decides whether the html/head/body/colgroup start tags are omitted.
 * Only the LAST start tag of each name in the stream is inspected, and the
 * resulting flag is applied to every start tag of that name.
 */
const planStartTagOmissions = (
  tokens: any[],
): { html: boolean; head: boolean; body: boolean; colgroup: boolean } => {
  let htmlIndex = -1;
  let headIndex = -1;
  let bodyIndex = -1;
  let colgroupIndex = -1;
  tokens.forEach((token, index) => {
    if (token[0] !== "StartTag") return;
    const name = token[2];
    if (name === "html") htmlIndex = index;
    if (name === "head") headIndex = index;
    if (name === "body") bodyIndex = index;
    if (name === "colgroup") colgroupIndex = index;
  });

  const plan = { html: false, head: false, body: false, colgroup: false };

  if (htmlIndex >= 0 && !hasAnyAttributes(tokens[htmlIndex][3])) {
    const first = firstSignificantTokenAfter(tokens, htmlIndex);
    // Kept only when followed by a comment or by text that starts with whitespace.
    const mustKeep =
      first !== null &&
      (first[0] === "Comment" ||
        (first[0] === "Characters" && LEADING_WHITESPACE.test(first[1])));
    plan.html = !mustKeep;
  }

  if (headIndex >= 0) {
    const first = firstSignificantTokenAfter(tokens, headIndex);
    plan.head =
      first !== null &&
      (first[0] === "StartTag" ||
        first[0] === "EmptyTag" ||
        (first[0] === "EndTag" && first[2] === "head"));
  }

  if (bodyIndex >= 0) {
    const first = firstSignificantTokenAfter(tokens, bodyIndex);
    plan.body =
      first === null ||
      first[0] === "StartTag" ||
      first[0] === "EndTag" ||
      (first[0] === "Characters" && !LEADING_WHITESPACE.test(first[1]));
  }

  if (colgroupIndex >= 0) {
    const first = firstSignificantTokenAfter(tokens, colgroupIndex);
    plan.colgroup =
      !hasAnyAttributes(tokens[colgroupIndex][3]) &&
      first !== null &&
      (first[0] === "StartTag" || first[0] === "EmptyTag") &&
      tagNameOf(first) === "col";
  }

  return plan;
};

/**
 * A `<tbody>` start tag is omitted when it has no attributes, its first
 * significant child is a `tr`, and the token directly before it (ignoring
 * whitespace-only text) is not a tbody/thead/tfoot end tag.
 */
const shouldOmitTbodyStart = (
  tokens: any[],
  index: number,
  attrs: any,
): boolean => {
  if (hasAnyAttributes(attrs)) return false;

  const first = firstSignificantTokenAfter(tokens, index);
  const startsWithRow =
    first !== null &&
    (first[0] === "StartTag" || first[0] === "EmptyTag") &&
    tagNameOf(first) === "tr";
  if (!startsWithRow) return false;

  // Walk BACKWARDS from the tbody: the relevant token is the nearest
  // preceding significant one, not the first token of the whole stream.
  for (let j = index - 1; j >= 0; j--) {
    const previous = tokens[j];
    if (isBlankCharacters(previous)) continue;
    const closesSection =
      previous[0] === "EndTag" && TABLE_SECTION_NAMES.has(previous[2]);
    return !closesSection;
  }
  return true;
};

/** Decides whether the end tag `name` may be dropped given the token that follows it. */
const shouldOmitEndTag = (name: any, nextToken: any): boolean => {
  if (name === "html" || name === "head" || name === "body") {
    return (
      !nextToken ||
      nextToken[0] === "StartTag" ||
      nextToken[0] === "EndTag" ||
      (nextToken[0] === "Characters" && !LEADING_WHITESPACE.test(nextToken[1]))
    );
  }
  if (!nextToken) return END_TAGS_OMITTED_AT_CLOSE.has(name);

  const nextName = tagNameOf(nextToken);
  switch (nextToken[0]) {
    case "EndTag":
      return END_TAGS_OMITTED_AT_CLOSE.has(name);
    case "StartTag":
      switch (name) {
        case "p":
          return START_TAGS_THAT_CLOSE_P.has(nextName);
        case "li":
          return nextName === "li";
        case "dt":
        case "dd":
          return nextName === "dt" || nextName === "dd";
        case "option":
          return nextName === "option" || nextName === "optgroup";
        case "optgroup":
          return nextName === "optgroup";
        case "tbody":
        case "tfoot":
        case "thead":
          return nextName === "tbody" || nextName === "tfoot";
        case "tr":
          return nextName === "tr";
        case "td":
        case "th":
          return nextName === "td" || nextName === "th";
        case "colgroup":
          return nextName !== "colgroup";
        default:
          return false;
      }
    case "EmptyTag":
      return name === "p" && nextName === "hr";
    case "Characters":
      return name === "colgroup" && !LEADING_WHITESPACE.test(nextToken[1]);
    default:
      return false;
  }
};

/**
 * Serializes a flat token stream to an HTML string, omitting optional start
 * and end tags where the rules implemented here allow it.
 *
 * Token shapes, as read by this function:
 * - `["StartTag", _, name, attrs]` (index 1 is never read here — presumably a namespace; confirm)
 * - `["EndTag", _, name]`
 * - `["EmptyTag", name, attrs]`
 * - `["Characters", text]`, `["Comment", text]`
 * - `["Doctype", name, publicId, systemId]`
 *
 * @param tokens  Tokens to serialize; the array is not mutated.
 * @param options Serializer switches. `inject_meta_charset`, `encoding`,
 *   `use_trailing_solidus`, `escape_rcdata` and `strip_whitespace` are handled
 *   here; the whole object is also forwarded to `serializeAttributes`.
 * @returns The serialized markup.
 *
 * NOTE(review): with `inject_meta_charset` the `<head>` tags themselves are not
 * written, and once a non-omitted `</head>` is seen every later token except
 * `EmptyTag` is suppressed. This is preserved from the original implementation.
 */
export const serializeTokens = (
  tokens: any[],
  options?: {
    inject_meta_charset?: boolean;
    encoding?: string;
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    use_trailing_solidus?: boolean;
    escape_rcdata?: boolean;
    strip_whitespace?: boolean;
  },
): string => {
  const processedTokens = options?.inject_meta_charset
    ? injectMetaCharset(tokens, options.encoding)
    : tokens;
  const omit = planStartTagOmissions(processedTokens);

  let result = "";
  let inScript = false;
  let inPre = false;
  let inTextarea = false;
  let inStyle = false;
  let serializingHead = true;

  for (let i = 0; i < processedTokens.length; i++) {
    const token = processedTokens[i];
    switch (token[0]) {
      case "StartTag": {
        const name = token[2];
        const attrs = token[3];

        if (name === "colgroup" && omit.colgroup) continue;
        if (name === "tbody" && shouldOmitTbodyStart(processedTokens, i, attrs)) {
          continue;
        }
        if (name === "head" && omit.head) continue;
        if (name === "body" && omit.body) continue;
        if (name === "html" && omit.html) continue;

        if (name === "pre") inPre = true;
        if (name === "textarea") inTextarea = true;
        if (name === "script") inScript = true;
        if (name === "style") inStyle = true;

        if (name === "head" && options?.inject_meta_charset) {
          serializingHead = true;
        } else if (name === "head" || serializingHead) {
          result += "<" + name + serializeAttributes(attrs, options) + ">";
        }
        break;
      }
      case "EmptyTag": {
        const name = token[1];
        const attrs = token[2];
        result +=
          "<" +
          name +
          serializeAttributes(attrs, options) +
          (options?.use_trailing_solidus ? " />" : ">");
        break;
      }
      case "EndTag": {
        const name = token[2];
        if (shouldOmitEndTag(name, processedTokens[i + 1])) continue;

        if (name === "script") inScript = false;
        if (name === "pre") inPre = false;
        if (name === "textarea") inTextarea = false;
        if (name === "style") inStyle = false;

        if (name === "head" && options?.inject_meta_charset) {
          serializingHead = false;
        } else if (name === "head" || serializingHead) {
          result += "</" + name + ">";
        }
        break;
      }
      case "Characters": {
        if (!serializingHead) break;
        let text = token[1];
        const whitespaceSensitive = inPre || inTextarea || inScript || inStyle;
        if (options?.strip_whitespace && !whitespaceSensitive) {
          text = text.replace(/\s+/g, " ");
        }
        // Script and textarea content is written verbatim unless escape_rcdata is set.
        const writeVerbatim =
          (inScript || inTextarea) && !options?.escape_rcdata;
        result += writeVerbatim ? text : escapeText(text);
        break;
      }
      case "Doctype": {
        if (!serializingHead) break;
        result += "<!DOCTYPE " + token[1];
        if (token[2]) {
          result += ' PUBLIC "' + token[2] + '"';
          if (token[3]) result += ' "' + token[3] + '"';
        } else if (token[3]) {
          result += ' SYSTEM "' + token[3] + '"';
        }
        result += ">";
        break;
      }
      case "Comment": {
        if (serializingHead) {
          result += "<!--" + token[1] + "-->";
        }
        break;
      }
      default:
        break;
    }
  }

  return result;
};
@@ -0,0 +1,10 @@
1
+ import { Position } from "./types.js";
2
+
3
/**
 * Converts a character offset into a line/column position.
 *
 * Lines are 1-based and separated by "\n"; the column is the 0-based number of
 * characters between the start of that line and the offset.
 *
 * @param text   The full source text.
 * @param offset Character index into `text`. Values below 0 (or NaN) are
 *   treated as 0 and values past the end as `text.length` when locating the
 *   line and column, so a bad offset can no longer be interpreted as
 *   "count from the end" by `slice`.
 * @returns The position; `offset` is echoed back exactly as it was passed in.
 */
export const calculatePosition = (text: string, offset: number): Position => {
  const end = offset > 0 ? Math.min(Math.floor(offset), text.length) : 0;

  // Walk the newlines before `end` instead of materialising every line.
  let line = 1;
  let lineStart = 0;
  for (
    let newline = text.indexOf("\n");
    newline !== -1 && newline < end;
    newline = text.indexOf("\n", newline + 1)
  ) {
    line++;
    lineStart = newline + 1;
  }

  return {
    line,
    column: end - lineStart,
    offset,
  };
};
@@ -0,0 +1,11 @@
1
// Lowercase tag names grouped under the "raw text" label.
// NOTE(review): no consumer is visible in this file; presumably the tokenizer
// reads the content of these elements as plain text instead of markup — confirm
// against the tokenizer before relying on that.
+ export const RAW_TEXT_ELEMENTS = new Set([
2
+ "script",
3
+ "style",
4
+ "xmp",
5
+ "iframe",
6
+ "noembed",
7
+ "noframes",
8
+ "noscript",
9
+ ]);
10
+
// Lowercase tag names grouped under the "RCDATA" label.
// NOTE(review): presumably text-only content in which character references are
// still decoded — confirm against the tokenizer.
11
+ export const RCDATA_ELEMENTS = new Set(["textarea", "title"]);
@@ -0,0 +1,64 @@
1
+ import { HTML_ENTITIES } from "../html-entities.js";
2
+
3
const HEX_DIGIT = /[0-9a-fA-F]/;
const DECIMAL_DIGIT = /[0-9]/;
const ENTITY_NAME_CHAR = /[a-zA-Z0-9]/;
const REPLACEMENT_CHARACTER = "\uFFFD";

/**
 * Turns the numeric value of a character reference into text.
 * Zero, surrogates and values above U+10FFFF become U+FFFD.
 */
const numericReferenceToString = (codePoint: number): string => {
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (
    !Number.isFinite(codePoint) ||
    codePoint <= 0 ||
    codePoint > 0x10ffff ||
    isSurrogate
  ) {
    return REPLACEMENT_CHARACTER;
  }
  // fromCodePoint (not fromCharCode) so astral characters such as emoji survive.
  return String.fromCodePoint(codePoint);
};

/**
 * Decodes HTML character references in `text`.
 *
 * - Numeric references (`&#65;`, `&#x41;`, `&#X41;`) are decoded when they
 *   contain at least one digit and end with `;`. A reference without digits
 *   (`&#;`, `&#x;`) is left as literal text.
 * - Named references are looked up in `HTML_ENTITIES` by their bare name.
 *   Without a trailing `;` a name is only decoded when at least one more
 *   character follows it in `text`.
 * - Anything that is not recognised is copied through unchanged.
 * - Every U+0000 in the result is replaced with U+FFFD.
 *
 * NOTE(review): numeric references without a terminating `;` are still left
 * undecoded, as before.
 *
 * @param text Text that may contain character references.
 * @returns The decoded text.
 */
export const decodeEntities = (text: string): string => {
  let result = "";
  let i = 0;

  while (i < text.length) {
    if (text[i] !== "&") {
      result += text[i];
      i++;
      continue;
    }

    let j = i + 1;
    if (text[j] === "#") {
      j++;
      const isHex = text[j] === "x" || text[j] === "X";
      if (isHex) j++;

      const digitPattern = isHex ? HEX_DIGIT : DECIMAL_DIGIT;
      const digitsStart = j;
      while (j < text.length && digitPattern.test(text[j])) {
        j++;
      }
      const digits = text.slice(digitsStart, j);

      if (digits !== "" && text[j] === ";") {
        result += numericReferenceToString(parseInt(digits, isHex ? 16 : 10));
        i = j + 1;
        continue;
      }
    } else {
      while (j < text.length && ENTITY_NAME_CHAR.test(text[j])) {
        j++;
      }
      const hasSemi = text[j] === ";";
      const named = text.slice(i + 1, j);

      // Own-property check: names such as "constructor" or "toString" must not
      // resolve to members inherited from Object.prototype.
      const isKnown =
        Object.prototype.hasOwnProperty.call(HTML_ENTITIES, named) &&
        Boolean(HTML_ENTITIES[named]);

      // Without ";" the scan stopped on a non-alphanumeric character, so the
      // only remaining requirement is that such a character actually exists.
      if (isKnown && (hasSemi || j < text.length)) {
        result += HTML_ENTITIES[named];
        i = hasSemi ? j + 1 : j;
        continue;
      }
    }

    // Not a reference we can decode: emit the "&" and carry on after it.
    result += text[i];
    i++;
  }

  return result.replace(/\u0000/g, REPLACEMENT_CHARACTER);
};
@@ -0,0 +1,2 @@
1
+ export { TokenType, type Position, type Token } from "./types.js";
2
+ export { tokenize } from "./tokenize.js";
@@ -0,0 +1,74 @@
1
+ import { decodeEntities } from "./decode-entities.js";
2
+
3
const ATTRIBUTE_WHITESPACE = /\s/;
const ATTRIBUTE_NAME_END = /[\s=\/>]/;
const UNQUOTED_VALUE_END = /[\s>]/;

/**
 * Parses the attribute portion of a tag into a name → value map.
 *
 * - Names are lower-cased; values are passed through `decodeEntities`.
 * - Values may be double-quoted, single-quoted or unquoted; an attribute
 *   without `=` gets the empty string.
 * - When a name occurs more than once, the FIRST occurrence wins and later
 *   ones are ignored, as the HTML standard requires.
 * - A `/` between attributes is skipped like whitespace, so `a/b` yields both
 *   `a` and `b`. Parsing stops at `>`.
 *
 * @param attributeString The raw text following the tag name.
 * @returns A plain object holding the parsed attributes.
 */
export const parseAttributes = (
  attributeString: string,
): Record<string, string> => {
  const attributes: Record<string, string> = {};
  const length = attributeString.length;
  let i = 0;

  const skipWhitespace = (): void => {
    while (i < length && ATTRIBUTE_WHITESPACE.test(attributeString[i])) {
      i++;
    }
  };

  // Collects characters up to (not including) `quote` and steps past the quote.
  const readQuoted = (quote: string): string => {
    i++;
    const start = i;
    while (i < length && attributeString[i] !== quote) {
      i++;
    }
    const quoted = attributeString.slice(start, i);
    i++;
    return quoted;
  };

  while (i < length) {
    skipWhitespace();
    if (i >= length || attributeString[i] === ">") {
      break;
    }
    if (attributeString[i] === "/") {
      // A stray solidus separates attributes; it does not end the tag by itself.
      i++;
      continue;
    }

    const nameStart = i;
    while (i < length && !ATTRIBUTE_NAME_END.test(attributeString[i])) {
      i++;
    }
    const name = attributeString.slice(nameStart, i);

    if (!name) {
      // Only a leading "=" can get here; step over it.
      i++;
      continue;
    }

    skipWhitespace();

    let value = "";
    if (i < length && attributeString[i] === "=") {
      i++;
      skipWhitespace();

      if (i < length) {
        const opener = attributeString[i];
        if (opener === '"' || opener === "'") {
          value = readQuoted(opener);
        } else {
          const valueStart = i;
          while (i < length && !UNQUOTED_VALUE_END.test(attributeString[i])) {
            i++;
          }
          value = attributeString.slice(valueStart, i);
        }
      }
    }

    const key = name.toLowerCase();
    if (!Object.prototype.hasOwnProperty.call(attributes, key)) {
      attributes[key] = decodeEntities(value);
    }
  }

  return attributes;
};