@tkeron/html-parser 0.1.4 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/bun.lock +3 -3
- package/index.ts +0 -5
- package/package.json +7 -6
- package/src/css-selector.ts +45 -32
- package/src/dom-simulator.ts +243 -46
- package/src/parser.ts +0 -39
- package/src/tokenizer.ts +0 -116
- package/tests/advanced.test.ts +2 -2
- package/tests/cloneNode.test.ts +50 -50
- package/tests/custom-elements.test.ts +8 -8
- package/tests/dom-manipulation.test.ts +638 -0
- package/tests/official/acid/acid-tests.test.ts +6 -6
- package/tests/official/final-output/final-output.test.ts +15 -15
- package/tests/official/html5lib/tokenizer-utils.ts +19 -31
- package/tests/official/html5lib/tokenizer.test.ts +4 -4
- package/tests/official/html5lib/tree-construction-utils.ts +20 -34
- package/tests/official/html5lib/tree-construction.test.ts +5 -5
- package/tests/official/validator/validator-tests.test.ts +11 -11
- package/tests/official/wpt/wpt-tests.test.ts +5 -5
- package/tests/outerHTML-replacement.test.ts +208 -0
- package/tests/parser.test.ts +1 -1
- package/tests/selectors.test.ts +64 -1
- package/tests/test-page-0.txt +12 -355
- package/tests/tokenizer.test.ts +86 -0
- package/tests/void-elements.test.ts +471 -0
- package/tests/api-integration.test.ts +0 -114
- package/tests/cloneNode-bug-reproduction.test.ts +0 -325
- package/tests/cloneNode-interactive.ts +0 -235
- package/tests/dom-adoption.test.ts +0 -363
- package/tests/dom-synchronization.test.ts +0 -675
- package/tests/setAttribute-outerHTML.test.ts +0 -102
package/README.md
CHANGED
|
@@ -11,7 +11,7 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
11
11
|
- 🪶 **Lightweight**: Minimal dependencies, native implementation
|
|
12
12
|
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
13
13
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
14
|
-
- ✅ **Well Tested**: Comprehensive unit test suite (
|
|
14
|
+
- ✅ **Well Tested**: Comprehensive unit test suite (569 tests passing)
|
|
15
15
|
- 🔄 **100% Compatible**: Drop-in replacement, same API
|
|
16
16
|
|
|
17
17
|
## Installation
|
|
@@ -21,19 +21,19 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
21
21
|
Once published, it will be available as:
|
|
22
22
|
|
|
23
23
|
```bash
|
|
24
|
-
npm install html-parser
|
|
24
|
+
npm install @tkeron/html-parser
|
|
25
25
|
```
|
|
26
26
|
|
|
27
27
|
Or with Bun:
|
|
28
28
|
|
|
29
29
|
```bash
|
|
30
|
-
bun add html-parser
|
|
30
|
+
bun add @tkeron/html-parser
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## Usage
|
|
34
34
|
|
|
35
35
|
```typescript
|
|
36
|
-
import { parseHTML } from "html-parser";
|
|
36
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
37
37
|
|
|
38
38
|
// Parse HTML string into DOM Document
|
|
39
39
|
const html =
|
|
@@ -51,7 +51,7 @@ console.log(heading); // "Hello World"
|
|
|
51
51
|
### Simple Example
|
|
52
52
|
|
|
53
53
|
```typescript
|
|
54
|
-
import { parseHTML } from "html-parser";
|
|
54
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
55
55
|
|
|
56
56
|
const html = `
|
|
57
57
|
<div class="container">
|
|
@@ -117,4 +117,4 @@ MIT
|
|
|
117
117
|
|
|
118
118
|
## Support
|
|
119
119
|
|
|
120
|
-
If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/
|
|
120
|
+
If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/tkeron/html-parser).
|
package/bun.lock
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"": {
|
|
6
6
|
"name": "@tkeron/html-parser",
|
|
7
7
|
"devDependencies": {
|
|
8
|
-
"@types/bun": "^1.3.
|
|
8
|
+
"@types/bun": "^1.3.6",
|
|
9
9
|
},
|
|
10
10
|
"peerDependencies": {
|
|
11
11
|
"typescript": "^5.9.3",
|
|
@@ -13,11 +13,11 @@
|
|
|
13
13
|
},
|
|
14
14
|
},
|
|
15
15
|
"packages": {
|
|
16
|
-
"@types/bun": ["@types/bun@1.3.
|
|
16
|
+
"@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
|
|
17
17
|
|
|
18
18
|
"@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
|
|
19
19
|
|
|
20
|
-
"bun-types": ["bun-types@1.3.
|
|
20
|
+
"bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
|
|
21
21
|
|
|
22
22
|
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
|
23
23
|
|
package/index.ts
CHANGED
|
@@ -4,11 +4,6 @@ import {
|
|
|
4
4
|
astToDOM,
|
|
5
5
|
} from './src/dom-simulator.js';
|
|
6
6
|
|
|
7
|
-
/**
|
|
8
|
-
* Parse HTML string into Document object
|
|
9
|
-
* @param html The HTML string to parse
|
|
10
|
-
* @returns A Document object
|
|
11
|
-
*/
|
|
12
7
|
export function parseHTML(html: string = ""): Document {
|
|
13
8
|
const tokens = tokenize(html);
|
|
14
9
|
const ast = parse(tokens);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tkeron/html-parser",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "A fast and lightweight HTML parser for Bun",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"module": "index.ts",
|
|
@@ -8,16 +8,17 @@
|
|
|
8
8
|
"author": "tkeron",
|
|
9
9
|
"license": "MIT",
|
|
10
10
|
"devDependencies": {
|
|
11
|
-
"@types/bun": "^1.3.
|
|
11
|
+
"@types/bun": "^1.3.6"
|
|
12
12
|
},
|
|
13
13
|
"peerDependencies": {
|
|
14
14
|
"typescript": "^5.9.3"
|
|
15
15
|
},
|
|
16
16
|
"keywords": [
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
20
|
-
"
|
|
17
|
+
"html",
|
|
18
|
+
"parser",
|
|
19
|
+
"dom",
|
|
20
|
+
"bun",
|
|
21
|
+
"tokenizer"
|
|
21
22
|
],
|
|
22
23
|
"repository": {
|
|
23
24
|
"url": "git@github.com:tkeron/html-parser.git"
|
package/src/css-selector.ts
CHANGED
|
@@ -14,36 +14,47 @@ function parseSelector(selector: string): SelectorGroup[] {
|
|
|
14
14
|
|
|
15
15
|
return parts.map((part) => {
|
|
16
16
|
const trimmed = part.trim();
|
|
17
|
-
let tokens: SelectorToken[];
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
17
|
+
let tokens: SelectorToken[] = [];
|
|
18
|
+
|
|
19
|
+
// Handle universal selector
|
|
20
|
+
if (trimmed === '*') {
|
|
21
|
+
// Match any element - we'll handle this specially
|
|
22
|
+
return { tokens: [] };
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Parse complex selectors like p#intro.first or .foo.bar.baz
|
|
26
|
+
let remaining = trimmed;
|
|
27
|
+
|
|
28
|
+
// Extract tag name first if present
|
|
29
|
+
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9]*)/);
|
|
30
|
+
if (tagMatch) {
|
|
31
|
+
tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
|
|
32
|
+
remaining = remaining.slice(tagMatch[1].length);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Extract all IDs (HTML5 allows IDs starting with digits)
|
|
36
|
+
const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
|
|
37
|
+
for (const match of idMatches) {
|
|
38
|
+
tokens.push({ type: "id", value: match[1] });
|
|
39
|
+
}
|
|
40
|
+
remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
|
|
41
|
+
|
|
42
|
+
// Extract all classes
|
|
43
|
+
const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
|
|
44
|
+
for (const match of classMatches) {
|
|
45
|
+
tokens.push({ type: "class", value: match[1] });
|
|
46
|
+
}
|
|
47
|
+
remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
|
|
48
|
+
|
|
49
|
+
// Extract attributes
|
|
50
|
+
const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
|
|
51
|
+
for (const match of attrMatches) {
|
|
52
|
+
tokens.push({
|
|
53
|
+
type: "attribute",
|
|
54
|
+
value: match[1].trim(),
|
|
55
|
+
attributeName: match[1].trim(),
|
|
56
|
+
attributeValue: match[2] ? match[2].trim() : undefined
|
|
57
|
+
});
|
|
47
58
|
}
|
|
48
59
|
|
|
49
60
|
return { tokens };
|
|
@@ -67,11 +78,9 @@ function matchesToken(element: any, token: SelectorToken): boolean {
|
|
|
67
78
|
return element.attributes?.id === token.value;
|
|
68
79
|
case "attribute":
|
|
69
80
|
const attrValue = element.attributes?.[token.attributeName || ""];
|
|
70
|
-
// If no attribute value specified in selector, just check if attribute exists
|
|
71
81
|
if (token.attributeValue === undefined) {
|
|
72
82
|
return attrValue !== undefined;
|
|
73
83
|
}
|
|
74
|
-
// Otherwise check for exact match
|
|
75
84
|
return attrValue === token.attributeValue;
|
|
76
85
|
default:
|
|
77
86
|
return false;
|
|
@@ -79,6 +88,10 @@ function matchesToken(element: any, token: SelectorToken): boolean {
|
|
|
79
88
|
}
|
|
80
89
|
|
|
81
90
|
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
|
|
91
|
+
// Universal selector - matches any element
|
|
92
|
+
if (tokens.length === 0) {
|
|
93
|
+
return true;
|
|
94
|
+
}
|
|
82
95
|
return tokens.every((token) => matchesToken(element, token));
|
|
83
96
|
}
|
|
84
97
|
|