@tkeron/html-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +120 -0
- package/bun.lock +29 -0
- package/index.ts +18 -0
- package/package.json +25 -0
- package/src/css-selector.ts +172 -0
- package/src/dom-simulator.ts +592 -0
- package/src/dom-types.ts +78 -0
- package/src/parser.ts +355 -0
- package/src/tokenizer.ts +413 -0
- package/tests/advanced.test.ts +487 -0
- package/tests/api-integration.test.ts +114 -0
- package/tests/dom-extended.test.ts +173 -0
- package/tests/dom.test.ts +482 -0
- package/tests/google-dom.test.ts +118 -0
- package/tests/google-homepage.txt +13 -0
- package/tests/official/README.md +87 -0
- package/tests/official/acid/acid-tests.test.ts +309 -0
- package/tests/official/final-output/final-output.test.ts +361 -0
- package/tests/official/html5lib/tokenizer-utils.ts +204 -0
- package/tests/official/html5lib/tokenizer.test.ts +184 -0
- package/tests/official/html5lib/tree-construction-utils.ts +208 -0
- package/tests/official/html5lib/tree-construction.test.ts +250 -0
- package/tests/official/validator/validator-tests.test.ts +237 -0
- package/tests/official/validator-nu/validator-nu.test.ts +335 -0
- package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
- package/tests/official/wpt/wpt-tests.test.ts +409 -0
- package/tests/parser.test.ts +642 -0
- package/tests/selectors.test.ts +65 -0
- package/tests/test-page-0.txt +362 -0
- package/tests/tokenizer.test.ts +666 -0
- package/tsconfig.json +25 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# On every push to main: install dependencies, run the test suite, and
# publish the package to the npm registry.
name: html-parser package
on:
  push:
    branches:
      - main

jobs:
  build-test-publish:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4.0.3
        with:
          node-version: 22.x
          registry-url: "https://registry.npmjs.org/"

      - uses: oven-sh/setup-bun@v2

      # Install and test WITHOUT the publish token in the environment, so
      # dependency install scripts and test code cannot read it.
      - name: Install dependencies and run tests
        run: |
          bun i
          bun test

      # Only the publish command receives the npm token. This step is skipped
      # automatically when the previous step fails.
      - name: Publish to npm
        run: npm publish --access public
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 tkeron
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# HTML Parser - Powered by Bun Native Tokenizer
|
|
2
|
+
|
|
3
|
+
> ⚠️ **Work in Progress** - This package is currently under active development and not yet published to npm.
|
|
4
|
+
|
|
5
|
+
A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- ⚡ **Bun Native Tokenizer**: Optimized specifically for Bun runtime
|
|
10
|
+
- 🚀 **Ultra Fast**: Leverages Bun's native optimizations
|
|
11
|
+
- 🪶 **Lightweight**: Minimal dependencies, native implementation
|
|
12
|
+
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
13
|
+
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
14
|
+
- ✅ **Well Tested**: Comprehensive unit test suite (181/181 passing)
|
|
15
|
+
- 🔄 **100% Compatible**: Drop-in replacement, same API
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
> **Note**: This package is not yet published to npm. For now, you can clone and build locally.
|
|
20
|
+
|
|
21
|
+
Once published, it will be available as:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
npm install @tkeron/html-parser
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Or with Bun:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
bun add @tkeron/html-parser
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
```typescript
|
|
36
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
37
|
+
|
|
38
|
+
// Parse HTML string into DOM Document
|
|
39
|
+
const html =
|
|
40
|
+
"<html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>";
|
|
41
|
+
const document = parseHTML(html);
|
|
42
|
+
|
|
43
|
+
// Use standard DOM methods
|
|
44
|
+
const title = document.querySelector("title")?.textContent;
|
|
45
|
+
const heading = document.querySelector("h1")?.textContent;
|
|
46
|
+
|
|
47
|
+
console.log(title); // "Test"
|
|
48
|
+
console.log(heading); // "Hello World"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Simple Example
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
import { parseHTML } from "@tkeron/html-parser";
|
|
55
|
+
|
|
56
|
+
const html = `
|
|
57
|
+
<div class="container">
|
|
58
|
+
<p>Hello, world!</p>
|
|
59
|
+
<span id="info">This is a test</span>
|
|
60
|
+
</div>
|
|
61
|
+
`;
|
|
62
|
+
|
|
63
|
+
const doc = parseHTML(html);
|
|
64
|
+
const container = doc.querySelector(".container");
|
|
65
|
+
const info = doc.getElementById("info");
|
|
66
|
+
|
|
67
|
+
console.log(container?.children.length); // 2
|
|
68
|
+
console.log(info?.textContent); // "This is a test"
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## API
|
|
72
|
+
|
|
73
|
+
### `parseHTML(html: string): Document`
|
|
74
|
+
|
|
75
|
+
Parses an HTML string and returns a DOM Document object.
|
|
76
|
+
|
|
77
|
+
**Parameters:**
|
|
78
|
+
|
|
79
|
+
- `html` (string): The HTML string to parse
|
|
80
|
+
|
|
81
|
+
**Returns:**
|
|
82
|
+
|
|
83
|
+
- `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
|
|
84
|
+
|
|
85
|
+
## Development
|
|
86
|
+
|
|
87
|
+
This project is built with Bun. To get started:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Install dependencies
|
|
91
|
+
bun install
|
|
92
|
+
|
|
93
|
+
# Run tests
|
|
94
|
+
bun test
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Testing
|
|
99
|
+
|
|
100
|
+
Run the test suite:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
bun test
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## License
|
|
107
|
+
|
|
108
|
+
MIT
|
|
109
|
+
|
|
110
|
+
## Contributing
|
|
111
|
+
|
|
112
|
+
1. Fork the repository
|
|
113
|
+
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
|
|
114
|
+
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
|
|
115
|
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
|
116
|
+
5. Open a Pull Request
|
|
117
|
+
|
|
118
|
+
## Support
|
|
119
|
+
|
|
120
|
+
If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/tkeron/html-parser).
|
package/bun.lock
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lockfileVersion": 1,
|
|
3
|
+
"workspaces": {
|
|
4
|
+
"": {
|
|
5
|
+
"name": "html-parser",
|
|
6
|
+
"devDependencies": {
|
|
7
|
+
"@types/bun": "latest",
|
|
8
|
+
},
|
|
9
|
+
"peerDependencies": {
|
|
10
|
+
"typescript": "^5.8.3",
|
|
11
|
+
},
|
|
12
|
+
},
|
|
13
|
+
},
|
|
14
|
+
"packages": {
|
|
15
|
+
"@types/bun": ["@types/bun@1.2.18", "", { "dependencies": { "bun-types": "1.2.18" } }, "sha512-Xf6RaWVheyemaThV0kUfaAUvCNokFr+bH8Jxp+tTZfx7dAPA8z9ePnP9S9+Vspzuxxx9JRAXhnyccRj3GyCMdQ=="],
|
|
16
|
+
|
|
17
|
+
"@types/node": ["@types/node@24.0.4", "", { "dependencies": { "undici-types": "~7.8.0" } }, "sha512-ulyqAkrhnuNq9pB76DRBTkcS6YsmDALy6Ua63V8OhrOBgbcYt6IOdzpw5P1+dyRIyMerzLkeYWBeOXPpA9GMAA=="],
|
|
18
|
+
|
|
19
|
+
"@types/react": ["@types/react@19.1.8", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-AwAfQ2Wa5bCx9WP8nZL2uMZWod7J7/JSplxbTmBQ5ms6QpqNYm672H0Vu9ZVKVngQ+ii4R/byguVEUZQyeg44g=="],
|
|
20
|
+
|
|
21
|
+
"bun-types": ["bun-types@1.2.18", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-04+Eha5NP7Z0A9YgDAzMk5PHR16ZuLVa83b26kH5+cp1qZW4F6FmAURngE7INf4tKOvCE69vYvDEwoNl1tGiWw=="],
|
|
22
|
+
|
|
23
|
+
"csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
|
|
24
|
+
|
|
25
|
+
"typescript": ["typescript@5.8.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ=="],
|
|
26
|
+
|
|
27
|
+
"undici-types": ["undici-types@7.8.0", "", {}, "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw=="],
|
|
28
|
+
}
|
|
29
|
+
}
|
package/index.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { tokenize } from './src/tokenizer.js';
|
|
2
|
+
import { parse } from './src/parser.js';
|
|
3
|
+
import {
|
|
4
|
+
astToDOM,
|
|
5
|
+
} from './src/dom-simulator.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Turns an HTML string into a Document by running the three pipeline stages
 * in order: `tokenize`, then `parse`, then `astToDOM`.
 *
 * @param html Markup to parse; the empty string is used when omitted.
 * @returns Whatever `astToDOM` produces for the parsed input.
 */
export function parseHTML(html: string = ""): Document {
  return astToDOM(parse(tokenize(html)));
}
|
|
17
|
+
|
|
18
|
+
|
package/package.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@tkeron/html-parser",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "A fast and lightweight HTML parser for Bun",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"module": "index.ts",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"author": "tkeron",
|
|
9
|
+
"license": "MIT",
|
|
10
|
+
"devDependencies": {
|
|
11
|
+
"@types/bun": "latest"
|
|
12
|
+
},
|
|
13
|
+
"peerDependencies": {
|
|
14
|
+
"typescript": "^5.8.3"
|
|
15
|
+
},
|
|
16
|
+
"keywords": [
|
|
17
|
+
"cli",
|
|
18
|
+
"commands",
|
|
19
|
+
"command-line",
|
|
20
|
+
"arguments"
|
|
21
|
+
],
|
|
22
|
+
"repository": {
|
|
23
|
+
"url": "git@github.com:tkeron/html-parser.git"
|
|
24
|
+
}
|
|
25
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/** One simple selector: a tag name, a class, an id, or an attribute test. */
interface SelectorToken {
  /** Which kind of simple selector this token is. */
  type: "tag" | "class" | "id" | "attribute";
  /**
   * The selector's name: the lower-cased tag name, the class or id without
   * its `.` / `#` prefix, or the attribute name for attribute tokens.
   */
  value: string;
  /** Attribute name to look up; populated for `"attribute"` tokens. */
  attributeName?: string;
  /**
   * Value the attribute must equal. When left `undefined`, an attribute
   * token only tests that the attribute is present.
   */
  attributeValue?: string;
}
|
|
7
|
+
|
|
8
|
+
/**
 * One whitespace-separated segment of a selector. An element matches the
 * group only when it satisfies every token in `tokens`.
 */
interface SelectorGroup {
  /** Simple selectors that must all hold for the same element. */
  tokens: SelectorToken[];
}
|
|
11
|
+
|
|
12
|
+
/**
 * Parses a selector string into a chain of selector groups, one per
 * whitespace-separated segment (each segment is a descendant step).
 *
 * Every segment is read as a compound selector: an optional leading tag name
 * followed by any mix of `#id`, `.class` and `[attr]` / `[attr=value]` parts,
 * e.g. `div.card#hero` or `input[type="email"]`. Tag names are lower-cased.
 *
 * An attribute written with an explicit empty value (`[alt=""]`) produces
 * `attributeValue: ""` (exact match against the empty string); an attribute
 * written without `=` produces `attributeValue: undefined` (presence test).
 *
 * A segment that cannot be read as a compound selector falls back to the
 * single-token interpretation: `#rest` -> id, `.rest` -> class, anything
 * else -> lower-cased tag.
 *
 * Combinators other than whitespace, pseudo-classes and comma lists are not
 * interpreted here.
 *
 * @param selector The selector text.
 * @returns One group per segment, in source order.
 */
function parseSelector(selector: string): SelectorGroup[] {
  // One alternative per simple selector: "#id" / ".class" / bare name, or a
  // bracketed attribute test with an optional, optionally quoted value.
  const simpleSelector =
    /([#.]?)([^#.\[\]]+)|\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g;

  // Single-token reading used when a segment is not a well-formed compound.
  const legacyTokens = (part: string): SelectorToken[] => {
    if (part.startsWith("#")) {
      return [{ type: "id", value: part.slice(1) }];
    }
    if (part.startsWith(".")) {
      return [{ type: "class", value: part.slice(1) }];
    }
    return [{ type: "tag", value: part.toLowerCase() }];
  };

  const parseCompound = (part: string): SelectorToken[] => {
    const tokens: SelectorToken[] = [];
    let consumed = 0;

    for (const match of part.matchAll(simpleSelector)) {
      const [text, prefix, name, attrName, attrValue] = match;

      // A gap means some text matched neither form; do not guess.
      if (match.index !== consumed) {
        return legacyTokens(part);
      }
      consumed += text.length;

      if (attrName !== undefined) {
        const trimmedName = attrName.trim();
        tokens.push({
          type: "attribute",
          value: trimmedName,
          attributeName: trimmedName,
          // Compare against undefined, not truthiness, so `[a=""]` keeps "".
          attributeValue: attrValue !== undefined ? attrValue.trim() : undefined,
        });
      } else if (prefix === "#") {
        tokens.push({ type: "id", value: name ?? "" });
      } else if (prefix === ".") {
        tokens.push({ type: "class", value: name ?? "" });
      } else if (match.index === 0) {
        tokens.push({ type: "tag", value: (name ?? "").toLowerCase() });
      } else {
        // A bare name is only valid as the first part of a compound.
        return legacyTokens(part);
      }
    }

    return consumed === part.length && tokens.length > 0
      ? tokens
      : legacyTokens(part);
  };

  return selector
    .trim()
    .split(/\s+/)
    .map((part) => ({ tokens: parseCompound(part) }));
}
|
|
52
|
+
|
|
53
|
+
/**
 * Tests a single simple selector against an element.
 *
 * - `"tag"`: case-insensitive tag-name comparison; the value `"*"` matches
 *   any element.
 * - `"class"`: the token value must be one of the whitespace-separated names
 *   in the element's `class` (or `className`) attribute.
 * - `"id"`: the `id` attribute must equal the token value.
 * - `"attribute"`: with no `attributeValue`, the attribute must exist;
 *   otherwise its value must equal `attributeValue` exactly.
 *
 * @param element Node to test; anything without a `tagName` never matches.
 * @param token The simple selector to apply.
 * @returns `true` when the element satisfies the token.
 */
function matchesToken(element: any, token: SelectorToken): boolean {
  if (!element || !element.tagName) {
    return false;
  }

  switch (token.type) {
    case "tag":
      return (
        token.value === "*" || element.tagName.toLowerCase() === token.value
      );

    case "class": {
      const rawClass =
        element.attributes?.class || element.attributes?.className || "";
      // A non-string class attribute cannot contain the class name.
      if (typeof rawClass !== "string") {
        return false;
      }
      return rawClass.split(/\s+/).filter(Boolean).includes(token.value);
    }

    case "id":
      return element.attributes?.id === token.value;

    case "attribute": {
      const actual = element.attributes?.[token.attributeName || ""];
      // No expected value in the selector: only presence matters.
      if (token.attributeValue === undefined) {
        return actual !== undefined;
      }
      return actual === token.attributeValue;
    }

    default:
      return false;
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Checks an element against every token of a compound selector.
 *
 * @param element Node to test.
 * @param tokens Simple selectors that must all hold; an empty list matches.
 * @returns `false` as soon as one token fails, otherwise `true`.
 */
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
  for (const token of tokens) {
    if (!matchesToken(element, token)) {
      return false;
    }
  }
  return true;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Collects, in document order and without duplicates, the descendants of
 * `node` that match a chain of descendant-combined selector groups.
 *
 * `groupIndex` is the number of leading groups already satisfied by the
 * ancestors walked so far. Each step advances greedily: the first ancestor
 * that matches group `k` is taken as the match for `k`, which leaves the most
 * room for the remaining groups. Because every node is visited exactly once
 * with a single index, an element can be pushed at most once, and matches
 * nested inside other matches are still found.
 *
 * `node` itself is never tested, only its descendants.
 *
 * @param node Root of the subtree to search.
 * @param selectorGroups The full descendant chain.
 * @param groupIndex Index of the group the next matching element must satisfy.
 * @param results Output array; matching elements are appended to it.
 */
function findElementsDescendant(
  node: any,
  selectorGroups: SelectorGroup[],
  groupIndex: number,
  results: any[]
): void {
  if (groupIndex >= selectorGroups.length) {
    return;
  }

  const currentGroup = selectorGroups[groupIndex];
  if (!currentGroup) {
    return;
  }

  const isLastGroup = groupIndex === selectorGroups.length - 1;

  for (const child of node.childNodes || []) {
    // Index the child's own descendants must satisfy next.
    let childGroupIndex = groupIndex;

    if (child.nodeType === 1 && matchesSelector(child, currentGroup.tokens)) {
      if (isLastGroup) {
        // Whole chain satisfied. Keep the same index below so nested
        // matches of the last group are collected too.
        results.push(child);
      } else {
        childGroupIndex = groupIndex + 1;
      }
    }

    findElementsDescendant(child, selectorGroups, childGroupIndex, results);
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Dispatches a parsed selector to the right search routine: a single group
 * goes to `findElementsSimple`, anything else to `findElementsDescendant`
 * starting at group 0.
 *
 * @param node Root of the search.
 * @param selectorGroups Parsed selector chain.
 * @param results Output array; matching elements are appended to it.
 */
function findElements(
  node: any,
  selectorGroups: SelectorGroup[],
  results: any[]
): void {
  if (selectorGroups.length !== 1) {
    findElementsDescendant(node, selectorGroups, 0, results);
    return;
  }

  const onlyGroup = selectorGroups[0];
  if (!onlyGroup) {
    return;
  }
  findElementsSimple(node, onlyGroup.tokens, results);
}
|
|
145
|
+
|
|
146
|
+
/**
 * Pre-order search for every element that satisfies a single compound
 * selector. The starting `node` itself is tested as well as its descendants.
 *
 * @param node Root of the search.
 * @param tokens Simple selectors an element must all satisfy.
 * @param results Output array; matches are appended in document order.
 */
function findElementsSimple(
  node: any,
  tokens: SelectorToken[],
  results: any[]
): void {
  // Explicit stack instead of recursion; children are pushed in reverse so
  // they are popped (visited) in their original order.
  const pending: any[] = [node];

  while (pending.length > 0) {
    const current = pending.pop();

    if (current.nodeType === 1 && matchesSelector(current, tokens)) {
      results.push(current);
    }

    const children = Array.from(current.childNodes || []);
    for (let i = children.length - 1; i >= 0; i--) {
      pending.push(children[i]);
    }
  }
}
|
|
161
|
+
|
|
162
|
+
/**
 * Returns every element under `root` that matches `selector`.
 *
 * A selector without a top-level comma is parsed and searched directly.
 * A comma-separated selector list (`"h1, h2"`) is split into alternatives;
 * an element matching any alternative is returned once, and the combined
 * result is in document order. Commas inside `[...]` are not separators, so
 * attribute values containing commas keep working. Empty alternatives are
 * ignored.
 *
 * @param root Node to search from.
 * @param selector Selector text, optionally a comma-separated list.
 * @returns The matching elements (possibly empty).
 */
export function querySelectorAll(root: any, selector: string): any[] {
  // Split on commas that are not inside an attribute bracket.
  const alternatives: string[] = [];
  let bracketDepth = 0;
  let current = "";
  for (const ch of selector) {
    if (ch === "[") {
      bracketDepth++;
    } else if (ch === "]" && bracketDepth > 0) {
      bracketDepth--;
    }
    if (ch === "," && bracketDepth === 0) {
      alternatives.push(current);
      current = "";
    } else {
      current += ch;
    }
  }
  alternatives.push(current);

  // Single selector: same path as before.
  if (alternatives.length === 1) {
    const results: any[] = [];
    findElements(root, parseSelector(selector), results);
    return results;
  }

  // Selector list: union of the alternatives, de-duplicated.
  const matched = new Set<any>();
  for (const alternative of alternatives) {
    if (alternative.trim() === "") {
      continue;
    }
    const partial: any[] = [];
    findElements(root, parseSelector(alternative), partial);
    for (const element of partial) {
      matched.add(element);
    }
  }

  // Re-walk the tree once so the union comes out in document order.
  const ordered: any[] = [];
  const collect = (node: any): void => {
    if (matched.has(node)) {
      ordered.push(node);
    }
    for (const child of node.childNodes || []) {
      collect(child);
    }
  };
  collect(root);
  return ordered;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Returns the first element that `querySelectorAll` finds for `selector`,
 * or `null` when nothing matches.
 *
 * @param root Node to search from.
 * @param selector Selector text.
 * @returns The first match, or `null`.
 */
export function querySelector(root: any, selector: string): any | null {
  const [first] = querySelectorAll(root, selector);
  return first ? first : null;
}
|