@tkeron/html-parser 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/bun.lock +3 -3
- package/package.json +7 -6
- package/src/css-selector.ts +45 -27
- package/src/dom-simulator.ts +124 -4
- package/src/parser.ts +0 -39
- package/src/tokenizer.ts +0 -96
- package/tests/dom-manipulation.test.ts +638 -0
- package/tests/selectors.test.ts +64 -1
- package/tests/tokenizer.test.ts +86 -0
- package/tests/void-elements.test.ts +471 -0
package/README.md
CHANGED
|
@@ -11,7 +11,7 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
11
11
|
- 🪶 **Lightweight**: Minimal dependencies, native implementation
|
|
12
12
|
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
13
13
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
14
|
-
- ✅ **Well Tested**: Comprehensive unit test suite (
|
|
14
|
+
- ✅ **Well Tested**: Comprehensive unit test suite (569 tests passing)
|
|
15
15
|
- 🔄 **100% Compatible**: Drop-in replacement, same API
|
|
16
16
|
|
|
17
17
|
## Installation
|
package/bun.lock
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"": {
|
|
6
6
|
"name": "@tkeron/html-parser",
|
|
7
7
|
"devDependencies": {
|
|
8
|
-
"@types/bun": "^1.3.
|
|
8
|
+
"@types/bun": "^1.3.6",
|
|
9
9
|
},
|
|
10
10
|
"peerDependencies": {
|
|
11
11
|
"typescript": "^5.9.3",
|
|
@@ -13,11 +13,11 @@
|
|
|
13
13
|
},
|
|
14
14
|
},
|
|
15
15
|
"packages": {
|
|
16
|
-
"@types/bun": ["@types/bun@1.3.
|
|
16
|
+
"@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
|
|
17
17
|
|
|
18
18
|
"@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
|
|
19
19
|
|
|
20
|
-
"bun-types": ["bun-types@1.3.
|
|
20
|
+
"bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
|
|
21
21
|
|
|
22
22
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
|
23
23
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tkeron/html-parser",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "A fast and lightweight HTML parser for Bun",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"module": "index.ts",
|
|
@@ -8,16 +8,17 @@
|
|
|
8
8
|
"author": "tkeron",
|
|
9
9
|
"license": "MIT",
|
|
10
10
|
"devDependencies": {
|
|
11
|
-
"@types/bun": "^1.3.
|
|
11
|
+
"@types/bun": "^1.3.6"
|
|
12
12
|
},
|
|
13
13
|
"peerDependencies": {
|
|
14
14
|
"typescript": "^5.9.3"
|
|
15
15
|
},
|
|
16
16
|
"keywords": [
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
17
|
+
"html",
|
|
18
|
+
"parser",
|
|
19
|
+
"dom",
|
|
20
|
+
"bun",
|
|
21
|
+
"tokenizer"
|
|
21
22
|
],
|
|
22
23
|
"repository": {
|
|
23
24
|
"url": "git@github.com:tkeron/html-parser.git"
|
package/src/css-selector.ts
CHANGED
|
@@ -14,33 +14,47 @@ function parseSelector(selector: string): SelectorGroup[] {
|
|
|
14
14
|
|
|
15
15
|
return parts.map((part) => {
|
|
16
16
|
const trimmed = part.trim();
|
|
17
|
-
let tokens: SelectorToken[];
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
17
|
+
let tokens: SelectorToken[] = [];
|
|
18
|
+
|
|
19
|
+
// Handle universal selector
|
|
20
|
+
if (trimmed === '*') {
|
|
21
|
+
// Match any element - we'll handle this specially
|
|
22
|
+
return { tokens: [] };
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Parse complex selectors like p#intro.first or .foo.bar.baz
|
|
26
|
+
let remaining = trimmed;
|
|
27
|
+
|
|
28
|
+
// Extract tag name first if present
|
|
29
|
+
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9]*)/);
|
|
30
|
+
if (tagMatch) {
|
|
31
|
+
tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
|
|
32
|
+
remaining = remaining.slice(tagMatch[1].length);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Extract all IDs (HTML5 allows IDs starting with digits)
|
|
36
|
+
const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
|
|
37
|
+
for (const match of idMatches) {
|
|
38
|
+
tokens.push({ type: "id", value: match[1] });
|
|
39
|
+
}
|
|
40
|
+
remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
|
|
41
|
+
|
|
42
|
+
// Extract all classes
|
|
43
|
+
const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
|
|
44
|
+
for (const match of classMatches) {
|
|
45
|
+
tokens.push({ type: "class", value: match[1] });
|
|
46
|
+
}
|
|
47
|
+
remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
|
|
48
|
+
|
|
49
|
+
// Extract attributes
|
|
50
|
+
const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
|
|
51
|
+
for (const match of attrMatches) {
|
|
52
|
+
tokens.push({
|
|
53
|
+
type: "attribute",
|
|
54
|
+
value: match[1].trim(),
|
|
55
|
+
attributeName: match[1].trim(),
|
|
56
|
+
attributeValue: match[2] ? match[2].trim() : undefined
|
|
57
|
+
});
|
|
44
58
|
}
|
|
45
59
|
|
|
46
60
|
return { tokens };
|
|
@@ -74,6 +88,10 @@ function matchesToken(element: any, token: SelectorToken): boolean {
|
|
|
74
88
|
}
|
|
75
89
|
|
|
76
90
|
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
|
|
91
|
+
// Universal selector - matches any element
|
|
92
|
+
if (tokens.length === 0) {
|
|
93
|
+
return true;
|
|
94
|
+
}
|
|
77
95
|
return tokens.every((token) => matchesToken(element, token));
|
|
78
96
|
}
|
|
79
97
|
|
package/src/dom-simulator.ts
CHANGED
|
@@ -6,6 +6,11 @@ import {
|
|
|
6
6
|
querySelectorAll as querySelectorAllFunction,
|
|
7
7
|
} from "./css-selector.js";
|
|
8
8
|
|
|
9
|
+
const VOID_ELEMENTS = new Set([
|
|
10
|
+
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
|
11
|
+
'link', 'meta', 'param', 'source', 'track', 'wbr'
|
|
12
|
+
]);
|
|
13
|
+
|
|
9
14
|
export const enum NodeType {
|
|
10
15
|
ELEMENT_NODE = 1,
|
|
11
16
|
TEXT_NODE = 3,
|
|
@@ -22,9 +27,13 @@ export function createElement(
|
|
|
22
27
|
): any {
|
|
23
28
|
const innerHTML = "";
|
|
24
29
|
const tagNameLower = tagName.toLowerCase();
|
|
25
|
-
const
|
|
30
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
31
|
+
const attrsStr = Object.entries(attributes)
|
|
26
32
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
27
|
-
.join("")
|
|
33
|
+
.join("");
|
|
34
|
+
const initialOuterHTML = isVoid
|
|
35
|
+
? `<${tagNameLower}${attrsStr}>`
|
|
36
|
+
: `<${tagNameLower}${attrsStr}></${tagNameLower}>`;
|
|
28
37
|
const textContent = "";
|
|
29
38
|
|
|
30
39
|
const element: any = {
|
|
@@ -54,6 +63,18 @@ export function createElement(
|
|
|
54
63
|
return child;
|
|
55
64
|
},
|
|
56
65
|
|
|
66
|
+
prepend(...nodes: any[]): void {
|
|
67
|
+
prepend(element, ...nodes);
|
|
68
|
+
},
|
|
69
|
+
|
|
70
|
+
append(...nodes: any[]): void {
|
|
71
|
+
append(element, ...nodes);
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
remove(): void {
|
|
75
|
+
remove(element);
|
|
76
|
+
},
|
|
77
|
+
|
|
57
78
|
removeChild(child: any): any {
|
|
58
79
|
return removeChild(element, child);
|
|
59
80
|
},
|
|
@@ -96,6 +117,10 @@ export function createElement(
|
|
|
96
117
|
return querySelectorAllFunction(element, selector);
|
|
97
118
|
},
|
|
98
119
|
|
|
120
|
+
matches(selector: string): boolean {
|
|
121
|
+
return matches(element, selector);
|
|
122
|
+
},
|
|
123
|
+
|
|
99
124
|
cloneNode(deep: boolean = false): any {
|
|
100
125
|
return cloneNode(element, deep);
|
|
101
126
|
},
|
|
@@ -172,6 +197,10 @@ export function createTextNode(content: string): any {
|
|
|
172
197
|
lastChild: null,
|
|
173
198
|
nextSibling: null,
|
|
174
199
|
previousSibling: null,
|
|
200
|
+
|
|
201
|
+
remove(): void {
|
|
202
|
+
remove(textNode);
|
|
203
|
+
},
|
|
175
204
|
};
|
|
176
205
|
return textNode;
|
|
177
206
|
}
|
|
@@ -189,6 +218,10 @@ export function createComment(content: string): any {
|
|
|
189
218
|
lastChild: null,
|
|
190
219
|
nextSibling: null,
|
|
191
220
|
previousSibling: null,
|
|
221
|
+
|
|
222
|
+
remove(): void {
|
|
223
|
+
remove(commentNode);
|
|
224
|
+
},
|
|
192
225
|
};
|
|
193
226
|
return commentNode;
|
|
194
227
|
}
|
|
@@ -222,6 +255,14 @@ export function createDocument(): any {
|
|
|
222
255
|
return child;
|
|
223
256
|
},
|
|
224
257
|
|
|
258
|
+
prepend(...nodes: any[]): void {
|
|
259
|
+
prepend(document, ...nodes);
|
|
260
|
+
},
|
|
261
|
+
|
|
262
|
+
append(...nodes: any[]): void {
|
|
263
|
+
append(document, ...nodes);
|
|
264
|
+
},
|
|
265
|
+
|
|
225
266
|
removeChild(child: any): any {
|
|
226
267
|
return removeChild(document, child);
|
|
227
268
|
},
|
|
@@ -395,6 +436,83 @@ function appendChild(parent: any, child: any): void {
|
|
|
395
436
|
}
|
|
396
437
|
}
|
|
397
438
|
|
|
439
|
+
function prepend(parent: any, ...nodes: any[]): void {
|
|
440
|
+
if (nodes.length === 0) return;
|
|
441
|
+
|
|
442
|
+
for (let i = nodes.length - 1; i >= 0; i--) {
|
|
443
|
+
const node = nodes[i];
|
|
444
|
+
let childNode: any;
|
|
445
|
+
|
|
446
|
+
if (typeof node === 'string') {
|
|
447
|
+
childNode = createTextNode(node);
|
|
448
|
+
} else {
|
|
449
|
+
childNode = node;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
if (parent.firstChild) {
|
|
453
|
+
insertBefore(parent, childNode, parent.firstChild);
|
|
454
|
+
} else {
|
|
455
|
+
appendChild(parent, childNode);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
function append(parent: any, ...nodes: any[]): void {
|
|
461
|
+
if (nodes.length === 0) return;
|
|
462
|
+
|
|
463
|
+
for (const node of nodes) {
|
|
464
|
+
let childNode: any;
|
|
465
|
+
|
|
466
|
+
if (typeof node === 'string') {
|
|
467
|
+
childNode = createTextNode(node);
|
|
468
|
+
} else {
|
|
469
|
+
childNode = node;
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
appendChild(parent, childNode);
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
function remove(node: any): void {
|
|
477
|
+
if (node.parentNode) {
|
|
478
|
+
removeChild(node.parentNode, node);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
function matches(element: any, selector: string): boolean {
|
|
483
|
+
if (!selector || element.nodeType !== NodeType.ELEMENT_NODE) {
|
|
484
|
+
return false;
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
try {
|
|
488
|
+
// Para selectores complejos con descendientes, necesitamos buscar desde un ancestro
|
|
489
|
+
if (selector.includes(' ') || selector.includes('>')) {
|
|
490
|
+
// Buscar desde la raíz del documento
|
|
491
|
+
let root = element;
|
|
492
|
+
while (root.parentNode) {
|
|
493
|
+
root = root.parentNode;
|
|
494
|
+
}
|
|
495
|
+
const results = querySelectorAllFunction(root, selector);
|
|
496
|
+
return results.includes(element);
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
// Para selectores simples, usar el padre o crear uno temporal
|
|
500
|
+
const parent = element.parentNode || createTempParent(element);
|
|
501
|
+
const results = querySelectorAllFunction(parent, selector);
|
|
502
|
+
return results.includes(element);
|
|
503
|
+
} catch (error) {
|
|
504
|
+
return false;
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
|
|
508
|
+
function createTempParent(element: any): any {
|
|
509
|
+
const temp = createElement('div');
|
|
510
|
+
temp.childNodes.push(element);
|
|
511
|
+
temp.children.push(element);
|
|
512
|
+
element._tempParent = temp;
|
|
513
|
+
return temp;
|
|
514
|
+
}
|
|
515
|
+
|
|
398
516
|
function removeChild(parent: any, child: any): any {
|
|
399
517
|
const index = parent.childNodes.indexOf(child);
|
|
400
518
|
if (index === -1) {
|
|
@@ -733,9 +851,10 @@ function updateElementContent(element: any): void {
|
|
|
733
851
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
734
852
|
.join("");
|
|
735
853
|
const tagNameLower = element.tagName.toLowerCase();
|
|
854
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
736
855
|
|
|
737
856
|
Object.defineProperty(element, "_internalOuterHTML", {
|
|
738
|
-
value: `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
|
|
857
|
+
value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
|
|
739
858
|
writable: true,
|
|
740
859
|
enumerable: false,
|
|
741
860
|
configurable: true,
|
|
@@ -830,9 +949,10 @@ export function setInnerHTML(element: any, html: string): void {
|
|
|
830
949
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
831
950
|
.join("");
|
|
832
951
|
const tagNameLower = element.tagName.toLowerCase();
|
|
952
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
833
953
|
|
|
834
954
|
Object.defineProperty(element, "_internalOuterHTML", {
|
|
835
|
-
value: `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
|
|
955
|
+
value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
|
|
836
956
|
writable: true,
|
|
837
957
|
enumerable: false,
|
|
838
958
|
configurable: true,
|
package/src/parser.ts
CHANGED
|
@@ -314,42 +314,3 @@ function shouldSkipWhitespace(parent: ASTNode): boolean {
|
|
|
314
314
|
|
|
315
315
|
return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
|
|
316
316
|
}
|
|
317
|
-
|
|
318
|
-
export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
|
|
319
|
-
callback(node);
|
|
320
|
-
|
|
321
|
-
if (node.children) {
|
|
322
|
-
for (const child of node.children) {
|
|
323
|
-
traverseAST(child, callback);
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
|
|
329
|
-
const results: ASTNode[] = [];
|
|
330
|
-
|
|
331
|
-
traverseAST(root, (node) => {
|
|
332
|
-
if (node.type === ASTNodeType.ELEMENT && node.tagName === tagName.toLowerCase()) {
|
|
333
|
-
results.push(node);
|
|
334
|
-
}
|
|
335
|
-
});
|
|
336
|
-
|
|
337
|
-
return results;
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
|
|
341
|
-
const results: ASTNode[] = [];
|
|
342
|
-
|
|
343
|
-
traverseAST(root, (node) => {
|
|
344
|
-
if (node.type === ASTNodeType.ELEMENT && node.attributes) {
|
|
345
|
-
const hasAttr = attrName in node.attributes;
|
|
346
|
-
const valueMatches = attrValue === undefined || node.attributes[attrName] === attrValue;
|
|
347
|
-
|
|
348
|
-
if (hasAttr && valueMatches) {
|
|
349
|
-
results.push(node);
|
|
350
|
-
}
|
|
351
|
-
}
|
|
352
|
-
});
|
|
353
|
-
|
|
354
|
-
return results;
|
|
355
|
-
}
|
package/src/tokenizer.ts
CHANGED
|
@@ -251,99 +251,3 @@ export function tokenize(html: string): Token[] {
|
|
|
251
251
|
|
|
252
252
|
return tokens;
|
|
253
253
|
}
|
|
254
|
-
|
|
255
|
-
export function tokenizeWithRewriter(html: string): Token[] {
|
|
256
|
-
const tokens: Token[] = [];
|
|
257
|
-
let textBuffer = '';
|
|
258
|
-
let position = 0;
|
|
259
|
-
|
|
260
|
-
const rewriter = new HTMLRewriter();
|
|
261
|
-
|
|
262
|
-
rewriter.on('*', {
|
|
263
|
-
element(element) {
|
|
264
|
-
if (textBuffer.trim()) {
|
|
265
|
-
tokens.push({
|
|
266
|
-
type: TokenType.TEXT,
|
|
267
|
-
value: decodeEntities(textBuffer),
|
|
268
|
-
position: calculatePosition(html, position - textBuffer.length)
|
|
269
|
-
});
|
|
270
|
-
textBuffer = '';
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
const attributes: Record<string, string> = {};
|
|
274
|
-
for (const [name, value] of element.attributes) {
|
|
275
|
-
attributes[name] = value;
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
tokens.push({
|
|
279
|
-
type: TokenType.TAG_OPEN,
|
|
280
|
-
value: element.tagName.toLowerCase(),
|
|
281
|
-
position: calculatePosition(html, position),
|
|
282
|
-
attributes,
|
|
283
|
-
isSelfClosing: element.selfClosing
|
|
284
|
-
});
|
|
285
|
-
|
|
286
|
-
if (!element.selfClosing) {
|
|
287
|
-
element.onEndTag((endTag) => {
|
|
288
|
-
tokens.push({
|
|
289
|
-
type: TokenType.TAG_CLOSE,
|
|
290
|
-
value: endTag.name.toLowerCase(),
|
|
291
|
-
position: calculatePosition(html, position),
|
|
292
|
-
isClosing: true
|
|
293
|
-
});
|
|
294
|
-
});
|
|
295
|
-
}
|
|
296
|
-
},
|
|
297
|
-
|
|
298
|
-
text(text) {
|
|
299
|
-
textBuffer += text.text;
|
|
300
|
-
},
|
|
301
|
-
|
|
302
|
-
comments(comment) {
|
|
303
|
-
tokens.push({
|
|
304
|
-
type: TokenType.COMMENT,
|
|
305
|
-
value: comment.text,
|
|
306
|
-
position: calculatePosition(html, position)
|
|
307
|
-
});
|
|
308
|
-
}
|
|
309
|
-
});
|
|
310
|
-
|
|
311
|
-
try {
|
|
312
|
-
const response = new Response(html, {
|
|
313
|
-
headers: { 'Content-Type': 'text/html' }
|
|
314
|
-
});
|
|
315
|
-
|
|
316
|
-
rewriter.transform(response);
|
|
317
|
-
|
|
318
|
-
if (textBuffer.trim()) {
|
|
319
|
-
tokens.push({
|
|
320
|
-
type: TokenType.TEXT,
|
|
321
|
-
value: decodeEntities(textBuffer),
|
|
322
|
-
position: calculatePosition(html, position - textBuffer.length)
|
|
323
|
-
});
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
} catch (error) {
|
|
327
|
-
console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
|
|
328
|
-
return tokenize(html);
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
tokens.sort((a, b) => a.position.offset - b.position.offset);
|
|
332
|
-
tokens.push({
|
|
333
|
-
type: TokenType.EOF,
|
|
334
|
-
value: '',
|
|
335
|
-
position: calculatePosition(html, html.length)
|
|
336
|
-
});
|
|
337
|
-
|
|
338
|
-
return tokens;
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
export function smartTokenize(html: string): Token[] {
|
|
342
|
-
const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
|
|
343
|
-
|
|
344
|
-
if (hasSpecialContent || html.length < 1000) {
|
|
345
|
-
return tokenize(html);
|
|
346
|
-
} else {
|
|
347
|
-
return tokenizeWithRewriter(html);
|
|
348
|
-
}
|
|
349
|
-
}
|