@tkeron/html-parser 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/README.md +1 -7
  2. package/bun.lock +8 -3
  3. package/index.ts +4 -0
  4. package/package.json +13 -6
  5. package/src/css-selector.ts +45 -27
  6. package/src/dom-simulator.ts +162 -20
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +478 -183
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +59 -139
  12. package/tests/advanced.test.ts +119 -106
  13. package/tests/custom-elements.test.ts +172 -162
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +637 -0
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +43 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +172 -193
  45. package/tests/selectors.test.ts +64 -1
  46. package/tests/serializer-core.test.ts +16 -0
  47. package/tests/serializer-data/core.test +125 -0
  48. package/tests/serializer-data/injectmeta.test +66 -0
  49. package/tests/serializer-data/optionaltags.test +965 -0
  50. package/tests/serializer-data/options.test +60 -0
  51. package/tests/serializer-data/whitespace.test +51 -0
  52. package/tests/serializer-injectmeta.test.ts +16 -0
  53. package/tests/serializer-optionaltags.test.ts +16 -0
  54. package/tests/serializer-options.test.ts +16 -0
  55. package/tests/serializer-whitespace.test.ts +16 -0
  56. package/tests/tokenizer-namedEntities.test.ts +20 -0
  57. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  58. package/tests/tokenizer.test.ts +83 -0
  59. package/tests/tree-construction-adoption01.test.ts +37 -0
  60. package/tests/tree-construction-adoption02.test.ts +34 -0
  61. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  62. package/tests/tree-construction-entities02.test.ts +33 -0
  63. package/tests/tree-construction-html5test-com.test.ts +24 -0
  64. package/tests/tree-construction-math.test.ts +18 -0
  65. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  66. package/tests/tree-construction-noscript01.test.ts +18 -0
  67. package/tests/tree-construction-ruby.test.ts +21 -0
  68. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  69. package/tests/tree-construction-svg.test.ts +21 -0
  70. package/tests/tree-construction-template.test.ts +21 -0
  71. package/tests/tree-construction-tests10.test.ts +21 -0
  72. package/tests/tree-construction-tests11.test.ts +21 -0
  73. package/tests/tree-construction-tests20.test.ts +18 -0
  74. package/tests/tree-construction-tests21.test.ts +18 -0
  75. package/tests/tree-construction-tests23.test.ts +18 -0
  76. package/tests/tree-construction-tests24.test.ts +18 -0
  77. package/tests/tree-construction-tests5.test.ts +21 -0
  78. package/tests/tree-construction-tests6.test.ts +21 -0
  79. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  80. package/tests/void-elements.test.ts +471 -0
  81. package/tests/official/README.md +0 -87
  82. package/tests/official/acid/acid-tests.test.ts +0 -309
  83. package/tests/official/final-output/final-output.test.ts +0 -361
  84. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  85. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  86. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  87. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  88. package/tests/official/validator/validator-tests.test.ts +0 -237
  89. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  90. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  91. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/serializer.ts ADDED
@@ -0,0 +1,450 @@
+ /**
+  * Serializes a list of HTML5 tokens to an HTML string.
+  * Based on HTML5 serialization algorithm.
+  */
+
+ function escapeText(text: string): string {
+   return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+ }
+
+ function escapeAttributeValue(value: string): string {
+   return value.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+ }
+
+ function needsQuotes(value: string): boolean {
+   return value === '' || /[\t\n\r\f "'=`>]/.test(value);
+ }
+
+ function serializeAttribute(name: string, value: string, options?: { quote_char?: string; quote_attr_values?: boolean; minimize_boolean_attributes?: boolean; escape_lt_in_attrs?: boolean; escape_rcdata?: boolean }): string {
+   if ((options?.minimize_boolean_attributes !== false) && value === name) {
+     return name;
+   }
+   const needsQuote = needsQuotes(value) || options?.quote_attr_values;
+   if (!needsQuote) {
+     return `${name}=${value}`;
+   }
+   let escaped = value.replace(/&/g, '&amp;');
+   if (options?.escape_lt_in_attrs) {
+     escaped = escaped.replace(/</g, '&lt;');
+   }
+   const forcedQuote = options?.quote_char;
+   if (forcedQuote) {
+     if (forcedQuote === "'") {
+       escaped = escaped.replace(/'/g, '&#39;');
+     } else {
+       escaped = escaped.replace(/"/g, '&quot;');
+     }
+     return `${name}=${forcedQuote}${escaped}${forcedQuote}`;
+   } else {
+     // Auto choose quote
+     if (value.includes('"') && value.includes("'")) {
+       escaped = escaped.replace(/"/g, '&quot;');
+       return `${name}="${escaped}"`;
+     } else if (value.includes('"')) {
+       return `${name}='${escaped}'`;
+     } else {
+       escaped = escaped.replace(/"/g, '&quot;');
+       return `${name}="${escaped}"`;
+     }
+   }
+ }
+
+ function serializeAttributes(attrs: any, options?: { quote_char?: string; quote_attr_values?: boolean; minimize_boolean_attributes?: boolean; escape_lt_in_attrs?: boolean; use_trailing_solidus?: boolean; escape_rcdata?: boolean }): string {
+   let attrList: [string, string][];
+   if (Array.isArray(attrs)) {
+     attrList = attrs.map((attr: any) => [attr.name, attr.value]);
+   } else {
+     attrList = attrs ? Object.entries(attrs) : [];
+   }
+   attrList.sort(([a], [b]) => a.localeCompare(b));
+   return attrList.map(([name, value]) => ' ' + serializeAttribute(name, value, options)).join('');
+ }
+
+ export function serializeTokens(tokens: any[], options?: { inject_meta_charset?: boolean; encoding?: string; quote_char?: string; quote_attr_values?: boolean; minimize_boolean_attributes?: boolean; escape_lt_in_attrs?: boolean; use_trailing_solidus?: boolean; escape_rcdata?: boolean; strip_whitespace?: boolean }): string {
+   const encoding = options?.encoding || 'utf-8';
+   let result = '';
+   let inScript = false;
+   let inPre = false;
+   let inTextarea = false;
+   let inStyle = false;
+   let serializingHead = true;
+
+   // If inject_meta_charset, modify tokens
+   let processedTokens = tokens;
+   if (options?.inject_meta_charset) {
+     let hasCharset = false;
+     let modifiedTokens: any[] = [];
+     let inHead = false;
+
+     // First pass: check if has charset
+     for (const token of tokens) {
+       const type = token[0];
+       if (type === 'StartTag' && token[2] === 'head') {
+         inHead = true;
+       } else if (type === 'EndTag' && token[2] === 'head') {
+         inHead = false;
+       } else if (inHead && type === 'EmptyTag' && token[1] === 'meta') {
+         const attrs = token[2];
+         if (attrs.some((attr: any) => attr.name === 'charset')) {
+           hasCharset = true;
+         }
+         const hasHttpEquiv = attrs.some((attr: any) => attr.name === 'http-equiv' && attr.value === 'content-type');
+         if (hasHttpEquiv) {
+           const contentAttr = attrs.find((attr: any) => attr.name === 'content');
+           if (contentAttr && contentAttr.value.includes('charset=')) {
+             hasCharset = true;
+           }
+         }
+       }
+     }
+
+     // Second pass: modify
+     inHead = false;
+     for (const token of tokens) {
+       const type = token[0];
+       if (type === 'StartTag' && token[2] === 'head') {
+         inHead = true;
+         modifiedTokens.push(token);
+         if (!hasCharset && options?.encoding) {
+           modifiedTokens.push(['EmptyTag', 'meta', [{ name: 'charset', value: encoding }]]);
+         }
+       } else if (type === 'EndTag' && token[2] === 'head') {
+         inHead = false;
+         modifiedTokens.push(token);
+       } else if (inHead && type === 'EmptyTag' && token[1] === 'meta') {
+         let newAttrs = token[2].slice();
+         let isHttpEquiv = false;
+         for (let i = 0; i < newAttrs.length; i++) {
+           const attr = newAttrs[i];
+           if (attr.name === 'charset' && options?.encoding) {
+             newAttrs[i] = { name: 'charset', value: encoding };
+           } else if (attr.name === 'http-equiv' && attr.value === 'content-type') {
+             isHttpEquiv = true;
+           } else if (attr.name === 'content' && isHttpEquiv && options?.encoding) {
+             newAttrs[i] = { name: 'content', value: attr.value.replace(/charset=[^;]*/, 'charset=' + encoding) };
+           }
+         }
+         modifiedTokens.push([type, token[1], newAttrs]);
+       } else {
+         modifiedTokens.push(token);
+       }
+     }
+     processedTokens = modifiedTokens;
+   }
+
+   // Serialize
+   let omitHtml = false;
+   let omitHead = false;
+   let omitBody = false;
+   let omitColgroup = false;
+   let omitTbody = false;
+   let headHasContent = false;
+   let inHead = false;
+   // First pass to detect optional tags
+   let htmlStartIndex = -1;
+   let headStartIndex = -1;
+   let bodyStartIndex = -1;
+   let colgroupStartIndex = -1;
+   let tbodyStartIndex = -1;
+   let tbodyCount = 0;
+   let colgroupCount = 0;
+   for (let i = 0; i < processedTokens.length; i++) {
+     const token = processedTokens[i];
+     const type = token[0];
+     if (type === 'StartTag') {
+       const name = token[2];
+       if (name === 'html') {
+         htmlStartIndex = i;
+       }
+       if (name === 'head') {
+         headStartIndex = i;
+       }
+       if (name === 'body') {
+         bodyStartIndex = i;
+       }
+       if (name === 'colgroup') {
+         colgroupStartIndex = i;
+         colgroupCount++;
+       }
+       if (name === 'tbody') {
+         tbodyStartIndex = i;
+         tbodyCount++;
+       }
+     }
+   }
+   // Check if html should be omitted
+   if (htmlStartIndex >= 0) {
+     const htmlToken = processedTokens[htmlStartIndex];
+     const attrs = htmlToken[3];
+     const hasAttributes = Array.isArray(attrs) ? attrs.length > 0 : (attrs ? Object.keys(attrs).length > 0 : false);
+     if (hasAttributes) {
+       omitHtml = false;
+     } else {
+       let firstToken = null;
+       for (let j = htmlStartIndex + 1; j < processedTokens.length; j++) {
+         const t = processedTokens[j];
+         if (t[0] !== 'Characters' || t[1].trim() !== '') {
+           firstToken = t;
+           break;
+         }
+       }
+       if (!firstToken) {
+         omitHtml = true;
+       } else if (firstToken[0] === 'Comment') {
+         omitHtml = false;
+       } else if (firstToken[0] === 'Characters') {
+         if (/^\s/.test(firstToken[1])) {
+           omitHtml = false;
+         } else {
+           omitHtml = true;
+         }
+       } else {
+         omitHtml = true;
+       }
+     }
+   }
+   // Check if head should be omitted
+   if (headStartIndex >= 0) {
+     let firstToken = null;
+     for (let j = headStartIndex + 1; j < processedTokens.length; j++) {
+       const t = processedTokens[j];
+       if (t[0] !== 'Characters' || t[1].trim() !== '') {
+         firstToken = t;
+         break;
+       }
+     }
+     omitHead = false;
+     if (firstToken) {
+       if (firstToken[0] === 'StartTag') {
+         omitHead = true;
+       } else if (firstToken[0] === 'EndTag' && firstToken[2] === 'head') {
+         omitHead = true;
+       } else if (firstToken[0] === 'EmptyTag') {
+         omitHead = true;
+       }
+     }
+   }
+   // Check if body should be omitted
+   if (bodyStartIndex >= 0) {
+     let firstToken = null;
+     for (let j = bodyStartIndex + 1; j < processedTokens.length; j++) {
+       const t = processedTokens[j];
+       if (t[0] !== 'Characters' || t[1].trim() !== '') {
+         firstToken = t;
+         break;
+       }
+     }
+     omitBody = false;
+     if (firstToken) {
+       if (firstToken[0] === 'StartTag') {
+         omitBody = true;
+       } else if (firstToken[0] === 'EndTag') {
+         omitBody = true;
+       } else if (firstToken[0] === 'Characters' && !/^\s/.test(firstToken[1])) {
+         omitBody = true;
+       }
+     } else {
+       omitBody = true;
+     }
+   }
+   // Check if colgroup should be omitted
+   if (colgroupStartIndex >= 0) {
+     const colgroupToken = processedTokens[colgroupStartIndex];
+     const attrs = colgroupToken[3];
+     const hasAttributes = Array.isArray(attrs) ? attrs.length > 0 : (attrs ? Object.keys(attrs).length > 0 : false);
+     let firstToken = null;
+     for (let j = colgroupStartIndex + 1; j < processedTokens.length; j++) {
+       const t = processedTokens[j];
+       if (t[0] !== 'Characters' || t[1].trim() !== '') {
+         firstToken = t;
+         break;
+       }
+     }
+     omitColgroup = !hasAttributes && firstToken && (firstToken[0] === 'StartTag' || firstToken[0] === 'EmptyTag') && ((firstToken[0] === 'StartTag' ? firstToken[2] : firstToken[1]) === 'col');
+   }
+   // Check if tbody should be omitted - we'll check this per tbody in the loop
+   // omitTbody is now calculated per element
+
+   for (let i = 0; i < processedTokens.length; i++) {
+     const token = processedTokens[i];
+     const nextToken = processedTokens[i + 1];
+     const type = token[0];
+     switch (type) {
+       case 'StartTag':
+         const [, , name, attrs] = token;
+         const attrCount = Array.isArray(attrs) ? attrs.length : (attrs ? Object.keys(attrs).length : 0);
+
+         // Check if tbody should be omitted for this specific tbody
+         let omitThisTbody = false;
+         if (name === 'tbody') {
+           const hasAttributes = Array.isArray(attrs) ? attrs.length > 0 : (attrs ? Object.keys(attrs).length > 0 : false);
+           if (!hasAttributes) {
+             // Check if first significant token after tbody is a tr
+             let firstToken = null;
+             for (let j = i + 1; j < processedTokens.length; j++) {
+               const t = processedTokens[j];
+               if (t[0] !== 'Characters' || t[1].trim() !== '') {
+                 firstToken = t;
+                 break;
+               }
+             }
+             const hasTrChild = firstToken && (firstToken[0] === 'StartTag' || firstToken[0] === 'EmptyTag') && firstToken[2] === 'tr';
+
+             if (hasTrChild) {
+               // Check if not preceded by tbody, thead, or tfoot
+               // This is indicated by whether the fragment starts with EndTag of those elements
+               let isPreceded = false;
+               for (let j = 0; j < i; j++) {
+                 const t = processedTokens[j];
+                 if (t[0] === 'Characters' && t[1].trim() === '') continue;
+                 if (t[0] === 'EndTag' && ['tbody', 'thead', 'tfoot'].includes(t[2])) {
+                   isPreceded = true;
+                 }
+                 break; // Only check the first significant token
+               }
+               omitThisTbody = !isPreceded;
+             }
+           }
+         }
+
+         if (name === 'colgroup' && omitColgroup) continue;
+         if (name === 'tbody' && omitThisTbody) continue;
+         if (name === 'head' && omitHead) continue;
+         if (name === 'body' && omitBody) continue;
+         if (name === 'html' && omitHtml) continue;
+         if (name === 'pre') inPre = true;
+         if (name === 'textarea') inTextarea = true;
+         if (name === 'script') inScript = true;
+         if (name === 'style') inStyle = true;
+         if (name === 'head') {
+           if (options?.inject_meta_charset) {
+             serializingHead = true;
+           } else {
+             result += '<' + name + serializeAttributes(attrs, options) + '>';
+           }
+         } else if (serializingHead) {
+           result += '<' + name + serializeAttributes(attrs, options) + '>';
+         }
+         break;
+       case 'EmptyTag':
+         const [, name2, attrs2] = token;
+         result += '<' + name2 + serializeAttributes(attrs2, options) + (options?.use_trailing_solidus ? ' />' : '>');
+         break;
+       case 'EndTag':
+         const [, , name3] = token;
+         // Check if end-tag should be omitted
+         let omitEndTag = false;
+         if (['html', 'head', 'body'].includes(name3)) {
+           if (!nextToken || nextToken[0] === 'StartTag' || nextToken[0] === 'EndTag' || (nextToken[0] === 'Characters' && !/^\s/.test(nextToken[1]))) {
+             omitEndTag = true;
+           }
+         } else if (nextToken) {
+           const nextType = nextToken[0];
+           let nextName = null;
+           if (nextType === 'StartTag' || nextType === 'EndTag') {
+             nextName = nextToken[2];
+           } else if (nextType === 'EmptyTag') {
+             nextName = nextToken[1];
+           }
+           if (nextType === 'EndTag') {
+             omitEndTag = ['p', 'li', 'option', 'optgroup', 'tbody', 'tfoot', 'tr', 'td', 'th', 'colgroup', 'dd'].includes(name3);
+           } else if (nextType === 'StartTag') {
+             if (name3 === 'p' && ['address', 'article', 'aside', 'blockquote', 'datagrid', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'].includes(nextName)) {
+               omitEndTag = true;
+             } else if (name3 === 'li' && nextName === 'li') {
+               omitEndTag = true;
+             } else if ((name3 === 'dt' || name3 === 'dd') && (nextName === 'dt' || nextName === 'dd')) {
+               omitEndTag = true;
+             } else if (name3 === 'option' && (nextName === 'option' || nextName === 'optgroup')) {
+               omitEndTag = true;
+             } else if (name3 === 'optgroup' && nextName === 'optgroup') {
+               omitEndTag = true;
+             } else if ((name3 === 'tbody' || name3 === 'tfoot') && (nextName === 'tbody' || nextName === 'tfoot')) {
+               omitEndTag = true;
+             } else if (name3 === 'thead' && (nextName === 'tbody' || nextName === 'tfoot')) {
+               omitEndTag = true;
+             } else if (name3 === 'tr' && nextName === 'tr') {
+               omitEndTag = true;
+             } else if ((name3 === 'td' || name3 === 'th') && (nextName === 'td' || nextName === 'th')) {
+               omitEndTag = true;
+             } else if (name3 === 'colgroup' && nextName !== 'colgroup') {
+               omitEndTag = true;
+             }
+             if (name3 === 'p' && nextName === 'hr') {
+               omitEndTag = true;
+             }
+           } else if (nextType === 'EmptyTag') {
+             if (name3 === 'p' && nextName === 'hr') {
+               omitEndTag = true;
+             }
+           }
+           if (name3 === 'colgroup' && nextType === 'Characters' && !/^\s/.test(nextToken[1])) {
+             omitEndTag = true;
+           }
+         } else {
+           // At EOF, omit certain end-tags
+           omitEndTag = ['p', 'li', 'option', 'optgroup', 'tbody', 'tfoot', 'tr', 'td', 'th', 'colgroup', 'dd'].includes(name3);
+         }
+         if (omitEndTag) continue;
+         if (name3 === 'script') inScript = false;
+         if (name3 === 'pre') inPre = false;
+         if (name3 === 'textarea') inTextarea = false;
+         if (name3 === 'style') inStyle = false;
+         if (name3 === 'head') {
+           if (options?.inject_meta_charset) {
+             serializingHead = false;
+           } else {
+             result += '</' + name3 + '>';
+           }
+         } else if (serializingHead) {
+           result += '</' + name3 + '>';
+         }
+         break;
+       case 'Characters':
+         if (serializingHead) {
+           let text = token[1];
+           if (options?.strip_whitespace && !inPre && !inTextarea && !inScript && !inStyle) {
+             text = text.replace(/\s+/g, ' ');
+           }
+           if (inScript) {
+             if (options?.escape_rcdata) {
+               result += escapeText(text);
+             } else {
+               result += text;
+             }
+           } else if (inTextarea) {
+             if (options?.escape_rcdata) {
+               result += escapeText(text);
+             } else {
+               result += text;
+             }
+           } else {
+             result += escapeText(text);
+           }
+         }
+         break;
+       case 'Doctype':
+         if (serializingHead) {
+           result += '<!DOCTYPE ' + token[1];
+           if (token[2]) {
+             result += ' PUBLIC "' + token[2] + '"';
+             if (token[3]) result += ' "' + token[3] + '"';
+           } else if (token[3]) {
+             result += ' SYSTEM "' + token[3] + '"';
+           }
+           result += '>';
+         }
+         break;
+       case 'Comment':
+         if (serializingHead) {
+           result += '<!--' + token[1] + '-->';
+         }
+         break;
+       default:
+         // Ignore unknown tokens
+         break;
+     }
+   }
+
+   return result;
+ }
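
A minimal usage sketch of the new serializer, assuming html5lib-style token tuples as implied by the code above (['StartTag', namespace, name, attrs], ['EmptyTag', name, attrs], ['EndTag', namespace, name], ['Characters', text]); the import path and the namespace value are assumptions, not part of the published API surface:

// sketch.ts — illustrative only
import { serializeTokens } from "./src/serializer"; // path assumed

const tokens: any[] = [
  ["StartTag", "http://www.w3.org/1999/xhtml", "p", [{ name: "class", value: "note" }]],
  ["Characters", "Fish & chips"],
  ["EndTag", "http://www.w3.org/1999/xhtml", "p"],
];

console.log(serializeTokens(tokens));
// -> <p class=note>Fish &amp; chips    (unquoted attribute, escaped text, trailing </p> omitted)
console.log(serializeTokens(tokens, { quote_attr_values: true }));
// -> <p class="note">Fish &amp; chips
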
package/src/tokenizer.ts CHANGED
@@ -24,55 +24,71 @@ export interface Token {
    isClosing?: boolean;
  }

- const HTML_ENTITIES: Record<string, string> = {
-   '&amp;': '&',
-   '&lt;': '<',
-   '&gt;': '>',
-   '&quot;': '"',
-   '&apos;': "'",
-   '&nbsp;': '\u00A0',
-   '&copy;': '©',
-   '&reg;': '®',
-   '&trade;': '™',
-   '&hellip;': '…',
-   '&mdash;': '—',
-   '&ndash;': '–',
-   '&lsquo;': '\u2018',
-   '&rsquo;': '\u2019',
-   '&ldquo;': '\u201C',
-   '&rdquo;': '\u201D',
-   '&not;': '¬'
- };
+ import { allNamedEntities } from 'all-named-html-entities';
+
+ const HTML_ENTITIES: Record<string, string> = allNamedEntities;

  function decodeEntities(text: string): string {
-   let result = text.replace(/\u0000/g, '\uFFFD');
-
-   return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
-     if (hex) {
-       return String.fromCharCode(parseInt(hex, 16));
-     }
-     if (decimal) {
-       return String.fromCharCode(parseInt(decimal, 10));
-     }
-     if (named) {
-       if (HTML_ENTITIES[`&${named};`]) {
-         return HTML_ENTITIES[`&${named};`];
-       }
-
-       if (!match.endsWith(';')) {
-         for (let i = named.length; i > 0; i--) {
-           const prefix = named.substring(0, i);
-           if (HTML_ENTITIES[`&${prefix};`]) {
-             const remainder = named.substring(i);
-             return HTML_ENTITIES[`&${prefix};`] + remainder;
+   let result = '';
+   let i = 0;
+   while (i < text.length) {
+     if (text[i] === '&') {
+       let match = '';
+       let j = i + 1;
+       if (text[j] === '#') {
+         j++;
+         if (text[j] === 'x' || text[j] === 'X') {
+           j++;
+           while (j < text.length && /[0-9a-fA-F]/.test(text[j])) {
+             j++;
+           }
+         } else {
+           while (j < text.length && /[0-9]/.test(text[j])) {
+             j++;
+           }
+         }
+         if (text[j] === ';') {
+           j++;
+         }
+         match = text.substring(i, j);
+         const entity = match;
+         if (entity.startsWith('&#x') && entity.endsWith(';')) {
+           const hex = entity.slice(3, -1);
+           result += String.fromCharCode(parseInt(hex, 16));
+           i = j;
+           continue;
+         } else if (entity.startsWith('&#') && entity.endsWith(';')) {
+           const decimal = entity.slice(2, -1);
+           result += String.fromCharCode(parseInt(decimal, 10));
+           i = j;
+           continue;
+         }
+       } else {
+         while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) {
+           j++;
+         }
+         const hasSemi = text[j] === ';';
+         if (hasSemi) {
+           j++;
+         }
+         match = text.substring(i, j);
+         const named = match.slice(1, hasSemi ? -1 : undefined);
+         if (HTML_ENTITIES[named]) {
+           if (hasSemi || (j < text.length && !/[a-zA-Z0-9]/.test(text[j]))) {
+             result += HTML_ENTITIES[named];
+             i = j;
+             continue;
            }
          }
        }
-
-     return match;
+       result += text[i];
+       i++;
+     } else {
+       result += text[i];
+       i++;
      }
-     return match;
-   });
+   }
+   return result.replace(/\u0000/g, '\uFFFD');
  }

  function parseAttributes(attributeString: string): Record<string, string> {
@@ -251,99 +267,3 @@ export function tokenize(html: string): Token[] {

    return tokens;
  }
-
- export function tokenizeWithRewriter(html: string): Token[] {
-   const tokens: Token[] = [];
-   let textBuffer = '';
-   let position = 0;
-
-   const rewriter = new HTMLRewriter();
-
-   rewriter.on('*', {
-     element(element) {
-       if (textBuffer.trim()) {
-         tokens.push({
-           type: TokenType.TEXT,
-           value: decodeEntities(textBuffer),
-           position: calculatePosition(html, position - textBuffer.length)
-         });
-         textBuffer = '';
-       }
-
-       const attributes: Record<string, string> = {};
-       for (const [name, value] of element.attributes) {
-         attributes[name] = value;
-       }
-
-       tokens.push({
-         type: TokenType.TAG_OPEN,
-         value: element.tagName.toLowerCase(),
-         position: calculatePosition(html, position),
-         attributes,
-         isSelfClosing: element.selfClosing
-       });
-
-       if (!element.selfClosing) {
-         element.onEndTag((endTag) => {
-           tokens.push({
-             type: TokenType.TAG_CLOSE,
-             value: endTag.name.toLowerCase(),
-             position: calculatePosition(html, position),
-             isClosing: true
-           });
-         });
-       }
-     },
-
-     text(text) {
-       textBuffer += text.text;
-     },
-
-     comments(comment) {
-       tokens.push({
-         type: TokenType.COMMENT,
-         value: comment.text,
-         position: calculatePosition(html, position)
-       });
-     }
-   });
-
-   try {
-     const response = new Response(html, {
-       headers: { 'Content-Type': 'text/html' }
-     });
-
-     rewriter.transform(response);
-
-     if (textBuffer.trim()) {
-       tokens.push({
-         type: TokenType.TEXT,
-         value: decodeEntities(textBuffer),
-         position: calculatePosition(html, position - textBuffer.length)
-       });
-     }
-
-   } catch (error) {
-     console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
-     return tokenize(html);
-   }
-
-   tokens.sort((a, b) => a.position.offset - b.position.offset);
-   tokens.push({
-     type: TokenType.EOF,
-     value: '',
-     position: calculatePosition(html, html.length)
-   });
-
-   return tokens;
- }
-
- export function smartTokenize(html: string): Token[] {
-   const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
-
-   if (hasSpecialContent || html.length < 1000) {
-     return tokenize(html);
-   } else {
-     return tokenizeWithRewriter(html);
-   }
- }
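
A rough sketch of what the rewritten entity decoding accepts, assuming the all-named-html-entities map keyed by bare entity names as it is used in the diff above; decodeEntities itself remains module-internal and is not exported:

// illustrative only — mirrors the scanner logic shown in the diff above
import { allNamedEntities } from "all-named-html-entities";

console.log(allNamedEntities["copy"]); // "©"

// Expected decoding behavior per the new scanner:
//   "&lt;b&gt;"   -> "<b>"          named references with semicolons
//   "&#x41;&#66;" -> "AB"           hex and decimal numeric references
//   "&copy 2024"  -> "© 2024"       missing ";" accepted when a non-alphanumeric follows
//   "&copyright"  -> "&copyright"   unknown names are left as literal text
//   "\u0000"      -> "\uFFFD"       NUL is still mapped to the replacement character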