@tkeron/html-parser 0.1.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +8 -3
- package/index.ts +4 -0
- package/package.json +13 -6
- package/src/css-selector.ts +45 -27
- package/src/dom-simulator.ts +162 -20
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +478 -183
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +59 -139
- package/tests/advanced.test.ts +119 -106
- package/tests/custom-elements.test.ts +172 -162
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +637 -0
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +43 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +172 -193
- package/tests/selectors.test.ts +64 -1
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +83 -0
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +24 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/void-elements.test.ts +471 -0
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/README.md
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# HTML Parser - Powered by Bun Native Tokenizer
|
|
2
2
|
|
|
3
|
-
> ⚠️ **Work in Progress** - This package is currently under active development.
|
|
4
|
-
|
|
5
3
|
A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
|
|
6
4
|
|
|
7
5
|
## Features
|
|
@@ -11,15 +9,11 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
11
9
|
- 🪶 **Lightweight**: Minimal dependencies, native implementation
|
|
12
10
|
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
13
11
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
14
|
-
- ✅ **Well Tested**: Comprehensive
|
|
12
|
+
- ✅ **Well Tested**: Comprehensive test suite (5200+ tests passing)
|
|
15
13
|
- 🔄 **100% Compatible**: Drop-in replacement, same API
|
|
16
14
|
|
|
17
15
|
## Installation
|
|
18
16
|
|
|
19
|
-
> **Note**: This package is not yet published to npm. For now, you can clone and build locally.
|
|
20
|
-
|
|
21
|
-
Once published, it will be available as:
|
|
22
|
-
|
|
23
17
|
```bash
|
|
24
18
|
npm install @tkeron/html-parser
|
|
25
19
|
```
|
package/bun.lock
CHANGED
|
@@ -4,8 +4,11 @@
|
|
|
4
4
|
"workspaces": {
|
|
5
5
|
"": {
|
|
6
6
|
"name": "@tkeron/html-parser",
|
|
7
|
+
"dependencies": {
|
|
8
|
+
"all-named-html-entities": "^3.1.3",
|
|
9
|
+
},
|
|
7
10
|
"devDependencies": {
|
|
8
|
-
"@types/bun": "^1.3.
|
|
11
|
+
"@types/bun": "^1.3.6",
|
|
9
12
|
},
|
|
10
13
|
"peerDependencies": {
|
|
11
14
|
"typescript": "^5.9.3",
|
|
@@ -13,11 +16,13 @@
|
|
|
13
16
|
},
|
|
14
17
|
},
|
|
15
18
|
"packages": {
|
|
16
|
-
"@types/bun": ["@types/bun@1.3.
|
|
19
|
+
"@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
|
|
17
20
|
|
|
18
21
|
"@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
|
|
19
22
|
|
|
20
|
-
"
|
|
23
|
+
"all-named-html-entities": ["all-named-html-entities@3.1.3", "", {}, "sha512-eG7/XkhxyIUWApWvhVPcusxZ3PTebJo1AvkFkQj7MDSkBYmzXZsNadKZWuo1UxEX6QrE7y7JQx7G3Fx0YjVtnA=="],
|
|
24
|
+
|
|
25
|
+
"bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
|
|
21
26
|
|
|
22
27
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
|
23
28
|
|
package/index.ts
CHANGED
|
@@ -7,6 +7,10 @@ import {
|
|
|
7
7
|
export function parseHTML(html: string = ""): Document {
|
|
8
8
|
const tokens = tokenize(html);
|
|
9
9
|
const ast = parse(tokens);
|
|
10
|
+
// If parse already returns a DOM document, return it directly
|
|
11
|
+
if (ast && typeof ast.nodeType === 'number' && ast.nodeType === 9) {
|
|
12
|
+
return ast;
|
|
13
|
+
}
|
|
10
14
|
return astToDOM(ast);
|
|
11
15
|
}
|
|
12
16
|
|
package/package.json
CHANGED
|
@@ -1,25 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tkeron/html-parser",
|
|
3
|
-
"version": "0.1.5",
|
|
3
|
+
"version": "1.0.0",
|
|
4
4
|
"description": "A fast and lightweight HTML parser for Bun",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"module": "index.ts",
|
|
7
7
|
"type": "module",
|
|
8
8
|
"author": "tkeron",
|
|
9
9
|
"license": "MIT",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"test": "bun test --concurrent"
|
|
12
|
+
},
|
|
10
13
|
"devDependencies": {
|
|
11
|
-
"@types/bun": "^1.3.
|
|
14
|
+
"@types/bun": "^1.3.6"
|
|
12
15
|
},
|
|
13
16
|
"peerDependencies": {
|
|
14
17
|
"typescript": "^5.9.3"
|
|
15
18
|
},
|
|
16
19
|
"keywords": [
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
20
|
+
"html",
|
|
21
|
+
"parser",
|
|
22
|
+
"dom",
|
|
23
|
+
"bun",
|
|
24
|
+
"tokenizer"
|
|
21
25
|
],
|
|
22
26
|
"repository": {
|
|
23
27
|
"url": "git@github.com:tkeron/html-parser.git"
|
|
28
|
+
},
|
|
29
|
+
"dependencies": {
|
|
30
|
+
"all-named-html-entities": "^3.1.3"
|
|
24
31
|
}
|
|
25
32
|
}
|
package/src/css-selector.ts
CHANGED
|
@@ -14,33 +14,47 @@ function parseSelector(selector: string): SelectorGroup[] {
|
|
|
14
14
|
|
|
15
15
|
return parts.map((part) => {
|
|
16
16
|
const trimmed = part.trim();
|
|
17
|
-
let tokens: SelectorToken[];
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
17
|
+
let tokens: SelectorToken[] = [];
|
|
18
|
+
|
|
19
|
+
// Handle universal selector
|
|
20
|
+
if (trimmed === '*') {
|
|
21
|
+
// Match any element - we'll handle this specially
|
|
22
|
+
return { tokens: [] };
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Parse complex selectors like p#intro.first or .foo.bar.baz
|
|
26
|
+
let remaining = trimmed;
|
|
27
|
+
|
|
28
|
+
// Extract tag name first if present
|
|
29
|
+
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
|
|
30
|
+
if (tagMatch) {
|
|
31
|
+
tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
|
|
32
|
+
remaining = remaining.slice(tagMatch[1].length);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Extract all IDs (HTML5 allows IDs starting with digits)
|
|
36
|
+
const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
|
|
37
|
+
for (const match of idMatches) {
|
|
38
|
+
tokens.push({ type: "id", value: match[1] });
|
|
39
|
+
}
|
|
40
|
+
remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
|
|
41
|
+
|
|
42
|
+
// Extract all classes
|
|
43
|
+
const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
|
|
44
|
+
for (const match of classMatches) {
|
|
45
|
+
tokens.push({ type: "class", value: match[1] });
|
|
46
|
+
}
|
|
47
|
+
remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
|
|
48
|
+
|
|
49
|
+
// Extract attributes
|
|
50
|
+
const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
|
|
51
|
+
for (const match of attrMatches) {
|
|
52
|
+
tokens.push({
|
|
53
|
+
type: "attribute",
|
|
54
|
+
value: match[1].trim(),
|
|
55
|
+
attributeName: match[1].trim(),
|
|
56
|
+
attributeValue: match[2] ? match[2].trim() : undefined
|
|
57
|
+
});
|
|
44
58
|
}
|
|
45
59
|
|
|
46
60
|
return { tokens };
|
|
@@ -74,6 +88,10 @@ function matchesToken(element: any, token: SelectorToken): boolean {
|
|
|
74
88
|
}
|
|
75
89
|
|
|
76
90
|
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
|
|
91
|
+
// Universal selector - matches any element
|
|
92
|
+
if (tokens.length === 0) {
|
|
93
|
+
return true;
|
|
94
|
+
}
|
|
77
95
|
return tokens.every((token) => matchesToken(element, token));
|
|
78
96
|
}
|
|
79
97
|
|
package/src/dom-simulator.ts
CHANGED
|
@@ -6,6 +6,16 @@ import {
|
|
|
6
6
|
querySelectorAll as querySelectorAllFunction,
|
|
7
7
|
} from "./css-selector.js";
|
|
8
8
|
|
|
9
|
+
// Escape special HTML characters in text content
|
|
10
|
+
function escapeTextContent(text: string): string {
|
|
11
|
+
return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
const VOID_ELEMENTS = new Set([
|
|
15
|
+
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
|
16
|
+
'link', 'meta', 'param', 'source', 'track', 'wbr'
|
|
17
|
+
]);
|
|
18
|
+
|
|
9
19
|
export const enum NodeType {
|
|
10
20
|
ELEMENT_NODE = 1,
|
|
11
21
|
TEXT_NODE = 3,
|
|
@@ -22,9 +32,13 @@ export function createElement(
|
|
|
22
32
|
): any {
|
|
23
33
|
const innerHTML = "";
|
|
24
34
|
const tagNameLower = tagName.toLowerCase();
|
|
25
|
-
const
|
|
35
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
36
|
+
const attrsStr = Object.entries(attributes)
|
|
26
37
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
27
|
-
.join("")
|
|
38
|
+
.join("");
|
|
39
|
+
const initialOuterHTML = isVoid
|
|
40
|
+
? `<${tagNameLower}${attrsStr}>`
|
|
41
|
+
: `<${tagNameLower}${attrsStr}></${tagNameLower}>`;
|
|
28
42
|
const textContent = "";
|
|
29
43
|
|
|
30
44
|
const element: any = {
|
|
@@ -54,6 +68,18 @@ export function createElement(
|
|
|
54
68
|
return child;
|
|
55
69
|
},
|
|
56
70
|
|
|
71
|
+
prepend(...nodes: any[]): void {
|
|
72
|
+
prepend(element, ...nodes);
|
|
73
|
+
},
|
|
74
|
+
|
|
75
|
+
append(...nodes: any[]): void {
|
|
76
|
+
append(element, ...nodes);
|
|
77
|
+
},
|
|
78
|
+
|
|
79
|
+
remove(): void {
|
|
80
|
+
remove(element);
|
|
81
|
+
},
|
|
82
|
+
|
|
57
83
|
removeChild(child: any): any {
|
|
58
84
|
return removeChild(element, child);
|
|
59
85
|
},
|
|
@@ -96,6 +122,10 @@ export function createElement(
|
|
|
96
122
|
return querySelectorAllFunction(element, selector);
|
|
97
123
|
},
|
|
98
124
|
|
|
125
|
+
matches(selector: string): boolean {
|
|
126
|
+
return matches(element, selector);
|
|
127
|
+
},
|
|
128
|
+
|
|
99
129
|
cloneNode(deep: boolean = false): any {
|
|
100
130
|
return cloneNode(element, deep);
|
|
101
131
|
},
|
|
@@ -172,6 +202,10 @@ export function createTextNode(content: string): any {
|
|
|
172
202
|
lastChild: null,
|
|
173
203
|
nextSibling: null,
|
|
174
204
|
previousSibling: null,
|
|
205
|
+
|
|
206
|
+
remove(): void {
|
|
207
|
+
remove(textNode);
|
|
208
|
+
},
|
|
175
209
|
};
|
|
176
210
|
return textNode;
|
|
177
211
|
}
|
|
@@ -189,10 +223,33 @@ export function createComment(content: string): any {
|
|
|
189
223
|
lastChild: null,
|
|
190
224
|
nextSibling: null,
|
|
191
225
|
previousSibling: null,
|
|
226
|
+
|
|
227
|
+
remove(): void {
|
|
228
|
+
remove(commentNode);
|
|
229
|
+
},
|
|
192
230
|
};
|
|
193
231
|
return commentNode;
|
|
194
232
|
}
|
|
195
233
|
|
|
234
|
+
export function createDoctype(name: string = 'html'): any {
|
|
235
|
+
const doctypeNode: any = {
|
|
236
|
+
nodeType: NodeType.DOCUMENT_TYPE_NODE,
|
|
237
|
+
nodeName: name.toUpperCase(),
|
|
238
|
+
name: name.toLowerCase(),
|
|
239
|
+
nodeValue: null,
|
|
240
|
+
textContent: "",
|
|
241
|
+
publicId: null,
|
|
242
|
+
systemId: null,
|
|
243
|
+
childNodes: [],
|
|
244
|
+
parentNode: null,
|
|
245
|
+
firstChild: null,
|
|
246
|
+
lastChild: null,
|
|
247
|
+
nextSibling: null,
|
|
248
|
+
previousSibling: null,
|
|
249
|
+
};
|
|
250
|
+
return doctypeNode;
|
|
251
|
+
}
|
|
252
|
+
|
|
196
253
|
export function createDocument(): any {
|
|
197
254
|
const document: any = {
|
|
198
255
|
nodeType: NodeType.DOCUMENT_NODE,
|
|
@@ -222,6 +279,14 @@ export function createDocument(): any {
|
|
|
222
279
|
return child;
|
|
223
280
|
},
|
|
224
281
|
|
|
282
|
+
prepend(...nodes: any[]): void {
|
|
283
|
+
prepend(document, ...nodes);
|
|
284
|
+
},
|
|
285
|
+
|
|
286
|
+
append(...nodes: any[]): void {
|
|
287
|
+
append(document, ...nodes);
|
|
288
|
+
},
|
|
289
|
+
|
|
225
290
|
removeChild(child: any): any {
|
|
226
291
|
return removeChild(document, child);
|
|
227
292
|
},
|
|
@@ -334,7 +399,7 @@ function convertASTNodeToDOM(astNode: ASTNode): any {
|
|
|
334
399
|
}
|
|
335
400
|
}
|
|
336
401
|
|
|
337
|
-
function appendChild(parent: any, child: any): void {
|
|
402
|
+
export function appendChild(parent: any, child: any): void {
|
|
338
403
|
if (child.nodeType === NodeType.ELEMENT_NODE || child.nodeType === NodeType.DOCUMENT_NODE) {
|
|
339
404
|
let ancestor = parent;
|
|
340
405
|
while (ancestor) {
|
|
@@ -395,6 +460,83 @@ function appendChild(parent: any, child: any): void {
|
|
|
395
460
|
}
|
|
396
461
|
}
|
|
397
462
|
|
|
463
|
+
function prepend(parent: any, ...nodes: any[]): void {
|
|
464
|
+
if (nodes.length === 0) return;
|
|
465
|
+
|
|
466
|
+
for (let i = nodes.length - 1; i >= 0; i--) {
|
|
467
|
+
const node = nodes[i];
|
|
468
|
+
let childNode: any;
|
|
469
|
+
|
|
470
|
+
if (typeof node === 'string') {
|
|
471
|
+
childNode = createTextNode(node);
|
|
472
|
+
} else {
|
|
473
|
+
childNode = node;
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
if (parent.firstChild) {
|
|
477
|
+
insertBefore(parent, childNode, parent.firstChild);
|
|
478
|
+
} else {
|
|
479
|
+
appendChild(parent, childNode);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
function append(parent: any, ...nodes: any[]): void {
|
|
485
|
+
if (nodes.length === 0) return;
|
|
486
|
+
|
|
487
|
+
for (const node of nodes) {
|
|
488
|
+
let childNode: any;
|
|
489
|
+
|
|
490
|
+
if (typeof node === 'string') {
|
|
491
|
+
childNode = createTextNode(node);
|
|
492
|
+
} else {
|
|
493
|
+
childNode = node;
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
appendChild(parent, childNode);
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
function remove(node: any): void {
|
|
501
|
+
if (node.parentNode) {
|
|
502
|
+
removeChild(node.parentNode, node);
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
function matches(element: any, selector: string): boolean {
|
|
507
|
+
if (!selector || element.nodeType !== NodeType.ELEMENT_NODE) {
|
|
508
|
+
return false;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
try {
|
|
512
|
+
// Para selectores complejos con descendientes, necesitamos buscar desde un ancestro
|
|
513
|
+
if (selector.includes(' ') || selector.includes('>')) {
|
|
514
|
+
// Buscar desde la raíz del documento
|
|
515
|
+
let root = element;
|
|
516
|
+
while (root.parentNode) {
|
|
517
|
+
root = root.parentNode;
|
|
518
|
+
}
|
|
519
|
+
const results = querySelectorAllFunction(root, selector);
|
|
520
|
+
return results.includes(element);
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
// Para selectores simples, usar el padre o crear uno temporal
|
|
524
|
+
const parent = element.parentNode || createTempParent(element);
|
|
525
|
+
const results = querySelectorAllFunction(parent, selector);
|
|
526
|
+
return results.includes(element);
|
|
527
|
+
} catch (error) {
|
|
528
|
+
return false;
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
function createTempParent(element: any): any {
|
|
533
|
+
const temp = createElement('div');
|
|
534
|
+
temp.childNodes.push(element);
|
|
535
|
+
temp.children.push(element);
|
|
536
|
+
element._tempParent = temp;
|
|
537
|
+
return temp;
|
|
538
|
+
}
|
|
539
|
+
|
|
398
540
|
function removeChild(parent: any, child: any): any {
|
|
399
541
|
const index = parent.childNodes.indexOf(child);
|
|
400
542
|
if (index === -1) {
|
|
@@ -733,9 +875,10 @@ function updateElementContent(element: any): void {
|
|
|
733
875
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
734
876
|
.join("");
|
|
735
877
|
const tagNameLower = element.tagName.toLowerCase();
|
|
878
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
736
879
|
|
|
737
880
|
Object.defineProperty(element, "_internalOuterHTML", {
|
|
738
|
-
value: `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
|
|
881
|
+
value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
|
|
739
882
|
writable: true,
|
|
740
883
|
enumerable: false,
|
|
741
884
|
configurable: true,
|
|
@@ -799,13 +942,13 @@ export function setInnerHTML(element: any, html: string): void {
|
|
|
799
942
|
|
|
800
943
|
if (html.trim()) {
|
|
801
944
|
const tokens = tokenize(html);
|
|
802
|
-
const
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
945
|
+
const doc = parse(tokens);
|
|
946
|
+
const body = doc.body;
|
|
947
|
+
if (body && body.childNodes) {
|
|
948
|
+
const nodesToMove = [...body.childNodes];
|
|
949
|
+
for (const child of nodesToMove) {
|
|
950
|
+
child.parentNode = null;
|
|
951
|
+
appendChild(element, child);
|
|
809
952
|
}
|
|
810
953
|
}
|
|
811
954
|
}
|
|
@@ -830,9 +973,10 @@ export function setInnerHTML(element: any, html: string): void {
|
|
|
830
973
|
.map(([k, v]) => ` ${k}="${v}"`)
|
|
831
974
|
.join("");
|
|
832
975
|
const tagNameLower = element.tagName.toLowerCase();
|
|
976
|
+
const isVoid = VOID_ELEMENTS.has(tagNameLower);
|
|
833
977
|
|
|
834
978
|
Object.defineProperty(element, "_internalOuterHTML", {
|
|
835
|
-
value: `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
|
|
979
|
+
value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
|
|
836
980
|
writable: true,
|
|
837
981
|
enumerable: false,
|
|
838
982
|
configurable: true,
|
|
@@ -855,14 +999,12 @@ export function setOuterHTML(element: any, html: string): void {
|
|
|
855
999
|
|
|
856
1000
|
if (html.trim()) {
|
|
857
1001
|
const tokens = tokenize(html);
|
|
858
|
-
const
|
|
859
|
-
|
|
860
|
-
if (
|
|
861
|
-
for (const child of
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
newNodes.push(domChild);
|
|
865
|
-
}
|
|
1002
|
+
const doc = parse(tokens);
|
|
1003
|
+
const body = doc.body;
|
|
1004
|
+
if (body && body.childNodes) {
|
|
1005
|
+
for (const child of body.childNodes) {
|
|
1006
|
+
child.parentNode = null;
|
|
1007
|
+
newNodes.push(child);
|
|
866
1008
|
}
|
|
867
1009
|
}
|
|
868
1010
|
}
|
package/src/encoding.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Detects the character encoding of an HTML document.
|
|
3
|
+
* Based on HTML5 specification for encoding detection.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
const encodingAliases: Record<string, string> = {
|
|
7
|
+
'iso-8859-1': 'windows-1252',
|
|
8
|
+
'iso8859-1': 'windows-1252',
|
|
9
|
+
'iso-8859-2': 'iso-8859-2',
|
|
10
|
+
'iso8859-2': 'iso-8859-2',
|
|
11
|
+
'utf-8': 'utf-8',
|
|
12
|
+
'utf8': 'utf-8',
|
|
13
|
+
// Add more as needed
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
function normalizeEncoding(name: string): string | null {
|
|
17
|
+
const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
|
|
18
|
+
return encodingAliases[lower] || lower;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
export function detectEncoding(html: string): string | null {
|
|
22
|
+
// Limit to first 1024 characters for performance
|
|
23
|
+
const prefix = html.substring(0, 1024);
|
|
24
|
+
|
|
25
|
+
// Look for <meta charset="...">
|
|
26
|
+
const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
|
|
27
|
+
if (charsetMatch) {
|
|
28
|
+
return normalizeEncoding(charsetMatch[1]);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
// Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
|
|
32
|
+
const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
|
|
33
|
+
if (contentTypeMatch) {
|
|
34
|
+
return normalizeEncoding(contentTypeMatch[1]);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Default to Windows-1252 if no encoding found (as per HTML5 spec)
|
|
38
|
+
return 'windows-1252';
|
|
39
|
+
}
|