@tkeron/html-parser 0.1.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +5 -0
- package/index.ts +4 -0
- package/package.json +7 -1
- package/src/css-selector.ts +1 -1
- package/src/dom-simulator.ts +38 -16
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +478 -144
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +59 -43
- package/tests/advanced.test.ts +119 -106
- package/tests/custom-elements.test.ts +172 -162
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +9 -10
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +43 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +172 -193
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +3 -6
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +24 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/README.md
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# HTML Parser - Powered by Bun Native Tokenizer
|
|
2
2
|
|
|
3
|
-
> ⚠️ **Work in Progress** - This package is currently under active development.
|
|
4
|
-
|
|
5
3
|
A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
|
|
6
4
|
|
|
7
5
|
## Features
|
|
@@ -11,15 +9,11 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
11
9
|
- 🪶 **Lightweight**: Minimal dependencies, native implementation
|
|
12
10
|
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
13
11
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
14
|
-
- ✅ **Well Tested**: Comprehensive
|
|
12
|
+
- ✅ **Well Tested**: Comprehensive test suite (5200+ tests passing)
|
|
15
13
|
- 🔄 **100% Compatible**: Drop-in replacement, same API
|
|
16
14
|
|
|
17
15
|
## Installation
|
|
18
16
|
|
|
19
|
-
> **Note**: This package is not yet published to npm. For now, you can clone and build locally.
|
|
20
|
-
|
|
21
|
-
Once published, it will be available as:
|
|
22
|
-
|
|
23
17
|
```bash
|
|
24
18
|
npm install @tkeron/html-parser
|
|
25
19
|
```
|
package/bun.lock
CHANGED
|
@@ -4,6 +4,9 @@
|
|
|
4
4
|
"workspaces": {
|
|
5
5
|
"": {
|
|
6
6
|
"name": "@tkeron/html-parser",
|
|
7
|
+
"dependencies": {
|
|
8
|
+
"all-named-html-entities": "^3.1.3",
|
|
9
|
+
},
|
|
7
10
|
"devDependencies": {
|
|
8
11
|
"@types/bun": "^1.3.6",
|
|
9
12
|
},
|
|
@@ -17,6 +20,8 @@
|
|
|
17
20
|
|
|
18
21
|
"@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
|
|
19
22
|
|
|
23
|
+
"all-named-html-entities": ["all-named-html-entities@3.1.3", "", {}, "sha512-eG7/XkhxyIUWApWvhVPcusxZ3PTebJo1AvkFkQj7MDSkBYmzXZsNadKZWuo1UxEX6QrE7y7JQx7G3Fx0YjVtnA=="],
|
|
24
|
+
|
|
20
25
|
"bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
|
|
21
26
|
|
|
22
27
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
package/index.ts
CHANGED
|
@@ -7,6 +7,10 @@ import {
|
|
|
7
7
|
export function parseHTML(html: string = ""): Document {
|
|
8
8
|
const tokens = tokenize(html);
|
|
9
9
|
const ast = parse(tokens);
|
|
10
|
+
// If parse already returns a DOM document, return it directly
|
|
11
|
+
if (ast && typeof ast.nodeType === 'number' && ast.nodeType === 9) {
|
|
12
|
+
return ast;
|
|
13
|
+
}
|
|
10
14
|
return astToDOM(ast);
|
|
11
15
|
}
|
|
12
16
|
|
package/package.json
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tkeron/html-parser",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "1.0.0",
|
|
4
4
|
"description": "A fast and lightweight HTML parser for Bun",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"module": "index.ts",
|
|
7
7
|
"type": "module",
|
|
8
8
|
"author": "tkeron",
|
|
9
9
|
"license": "MIT",
|
|
10
|
+
"scripts": {
|
|
11
|
+
"test": "bun test --concurrent"
|
|
12
|
+
},
|
|
10
13
|
"devDependencies": {
|
|
11
14
|
"@types/bun": "^1.3.6"
|
|
12
15
|
},
|
|
@@ -22,5 +25,8 @@
|
|
|
22
25
|
],
|
|
23
26
|
"repository": {
|
|
24
27
|
"url": "git@github.com:tkeron/html-parser.git"
|
|
28
|
+
},
|
|
29
|
+
"dependencies": {
|
|
30
|
+
"all-named-html-entities": "^3.1.3"
|
|
25
31
|
}
|
|
26
32
|
}
|
package/src/css-selector.ts
CHANGED
|
@@ -26,7 +26,7 @@ function parseSelector(selector: string): SelectorGroup[] {
|
|
|
26
26
|
let remaining = trimmed;
|
|
27
27
|
|
|
28
28
|
// Extract tag name first if present
|
|
29
|
-
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9]*)/);
|
|
29
|
+
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
|
|
30
30
|
if (tagMatch) {
|
|
31
31
|
tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
|
|
32
32
|
remaining = remaining.slice(tagMatch[1].length);
|
package/src/dom-simulator.ts
CHANGED
|
@@ -6,6 +6,11 @@ import {
|
|
|
6
6
|
querySelectorAll as querySelectorAllFunction,
|
|
7
7
|
} from "./css-selector.js";
|
|
8
8
|
|
|
9
|
+
// Escape special HTML characters in text content
|
|
10
|
+
function escapeTextContent(text: string): string {
|
|
11
|
+
return text.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
|
|
12
|
+
}
|
|
13
|
+
|
|
9
14
|
const VOID_ELEMENTS = new Set([
|
|
10
15
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
|
11
16
|
'link', 'meta', 'param', 'source', 'track', 'wbr'
|
|
@@ -226,6 +231,25 @@ export function createComment(content: string): any {
|
|
|
226
231
|
return commentNode;
|
|
227
232
|
}
|
|
228
233
|
|
|
234
|
+
export function createDoctype(name: string = 'html'): any {
|
|
235
|
+
const doctypeNode: any = {
|
|
236
|
+
nodeType: NodeType.DOCUMENT_TYPE_NODE,
|
|
237
|
+
nodeName: name.toUpperCase(),
|
|
238
|
+
name: name.toLowerCase(),
|
|
239
|
+
nodeValue: null,
|
|
240
|
+
textContent: "",
|
|
241
|
+
publicId: null,
|
|
242
|
+
systemId: null,
|
|
243
|
+
childNodes: [],
|
|
244
|
+
parentNode: null,
|
|
245
|
+
firstChild: null,
|
|
246
|
+
lastChild: null,
|
|
247
|
+
nextSibling: null,
|
|
248
|
+
previousSibling: null,
|
|
249
|
+
};
|
|
250
|
+
return doctypeNode;
|
|
251
|
+
}
|
|
252
|
+
|
|
229
253
|
export function createDocument(): any {
|
|
230
254
|
const document: any = {
|
|
231
255
|
nodeType: NodeType.DOCUMENT_NODE,
|
|
@@ -375,7 +399,7 @@ function convertASTNodeToDOM(astNode: ASTNode): any {
|
|
|
375
399
|
}
|
|
376
400
|
}
|
|
377
401
|
|
|
378
|
-
function appendChild(parent: any, child: any): void {
|
|
402
|
+
export function appendChild(parent: any, child: any): void {
|
|
379
403
|
if (child.nodeType === NodeType.ELEMENT_NODE || child.nodeType === NodeType.DOCUMENT_NODE) {
|
|
380
404
|
let ancestor = parent;
|
|
381
405
|
while (ancestor) {
|
|
@@ -918,13 +942,13 @@ export function setInnerHTML(element: any, html: string): void {
|
|
|
918
942
|
|
|
919
943
|
if (html.trim()) {
|
|
920
944
|
const tokens = tokenize(html);
|
|
921
|
-
const
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
945
|
+
const doc = parse(tokens);
|
|
946
|
+
const body = doc.body;
|
|
947
|
+
if (body && body.childNodes) {
|
|
948
|
+
const nodesToMove = [...body.childNodes];
|
|
949
|
+
for (const child of nodesToMove) {
|
|
950
|
+
child.parentNode = null;
|
|
951
|
+
appendChild(element, child);
|
|
928
952
|
}
|
|
929
953
|
}
|
|
930
954
|
}
|
|
@@ -975,14 +999,12 @@ export function setOuterHTML(element: any, html: string): void {
|
|
|
975
999
|
|
|
976
1000
|
if (html.trim()) {
|
|
977
1001
|
const tokens = tokenize(html);
|
|
978
|
-
const
|
|
979
|
-
|
|
980
|
-
if (
|
|
981
|
-
for (const child of
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
newNodes.push(domChild);
|
|
985
|
-
}
|
|
1002
|
+
const doc = parse(tokens);
|
|
1003
|
+
const body = doc.body;
|
|
1004
|
+
if (body && body.childNodes) {
|
|
1005
|
+
for (const child of body.childNodes) {
|
|
1006
|
+
child.parentNode = null;
|
|
1007
|
+
newNodes.push(child);
|
|
986
1008
|
}
|
|
987
1009
|
}
|
|
988
1010
|
}
|
package/src/encoding.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Detects the character encoding of an HTML document.
|
|
3
|
+
* Based on HTML5 specification for encoding detection.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
const encodingAliases: Record<string, string> = {
|
|
7
|
+
'iso-8859-1': 'windows-1252',
|
|
8
|
+
'iso8859-1': 'windows-1252',
|
|
9
|
+
'iso-8859-2': 'iso-8859-2',
|
|
10
|
+
'iso8859-2': 'iso-8859-2',
|
|
11
|
+
'utf-8': 'utf-8',
|
|
12
|
+
'utf8': 'utf-8',
|
|
13
|
+
// Add more as needed
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
function normalizeEncoding(name: string): string | null {
|
|
17
|
+
const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
|
|
18
|
+
return encodingAliases[lower] || lower;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
export function detectEncoding(html: string): string | null {
|
|
22
|
+
// Limit to first 1024 characters for performance
|
|
23
|
+
const prefix = html.substring(0, 1024);
|
|
24
|
+
|
|
25
|
+
// Look for <meta charset="...">
|
|
26
|
+
const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
|
|
27
|
+
if (charsetMatch) {
|
|
28
|
+
return normalizeEncoding(charsetMatch[1]);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
// Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
|
|
32
|
+
const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
|
|
33
|
+
if (contentTypeMatch) {
|
|
34
|
+
return normalizeEncoding(contentTypeMatch[1]);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Default to Windows-1252 if no encoding found (as per HTML5 spec)
|
|
38
|
+
return 'windows-1252';
|
|
39
|
+
}
|