@tkeron/html-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ name: html-parser package
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+
7
+ jobs:
8
+ build-test-publish:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - uses: actions/setup-node@v4.0.3
13
+ with:
14
+ node-version: 22.x
15
+ registry-url: "https://registry.npmjs.org/"
16
+
17
+ - uses: oven-sh/setup-bun@v2
18
+
19
+ - run: |
20
+ bun i
21
+ bun test
22
+ npm publish --access public
23
+ env:
24
+ NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 tkeron
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,120 @@
1
+ # HTML Parser - Powered by Bun Native Tokenizer
2
+
3
+ > ⚠️ **Work in Progress** - This package is currently under active development and not yet published to npm.
4
+
5
+ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM Document objects. **Now powered by a native Bun tokenizer** for optimal performance.
6
+
7
+ ## Features
8
+
9
+ - ⚡ **Bun Native Tokenizer**: Optimized specifically for Bun runtime
10
+ - 🚀 **Ultra Fast**: Leverages Bun's native optimizations
11
+ - 🪶 **Lightweight**: Minimal dependencies, native implementation
12
+ - 🌐 **Standards Compliant**: Returns standard DOM Document objects
13
+ - 🔧 **TypeScript Support**: Full TypeScript definitions included
14
+ - ✅ **Well Tested**: Comprehensive unit test suite (181/181 passing)
15
+ - 🔄 **100% Compatible**: Drop-in replacement, same API
16
+
17
+ ## Installation
18
+
19
+ > **Note**: This package is not yet published to npm. For now, you can clone and build locally.
20
+
21
+ Once published, it will be available as:
22
+
23
+ ```bash
24
+ npm install @tkeron/html-parser
25
+ ```
26
+
27
+ Or with Bun:
28
+
29
+ ```bash
30
+ bun add @tkeron/html-parser
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ```typescript
36
+ import { parseHTML } from "@tkeron/html-parser";
37
+
38
+ // Parse HTML string into DOM Document
39
+ const html =
40
+ "<html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>";
41
+ const document = parseHTML(html);
42
+
43
+ // Use standard DOM methods
44
+ const title = document.querySelector("title")?.textContent;
45
+ const heading = document.querySelector("h1")?.textContent;
46
+
47
+ console.log(title); // "Test"
48
+ console.log(heading); // "Hello World"
49
+ ```
50
+
51
+ ### Simple Example
52
+
53
+ ```typescript
54
+ import { parseHTML } from "@tkeron/html-parser";
55
+
56
+ const html = `
57
+ <div class="container">
58
+ <p>Hello, world!</p>
59
+ <span id="info">This is a test</span>
60
+ </div>
61
+ `;
62
+
63
+ const doc = parseHTML(html);
64
+ const container = doc.querySelector(".container");
65
+ const info = doc.getElementById("info");
66
+
67
+ console.log(container?.children.length); // 2
68
+ console.log(info?.textContent); // "This is a test"
69
+ ```
70
+
71
+ ## API
72
+
73
+ ### `parseHTML(html: string): Document`
74
+
75
+ Parses an HTML string and returns a DOM Document object.
76
+
77
+ **Parameters:**
78
+
79
+ - `html` (string): The HTML string to parse
80
+
81
+ **Returns:**
82
+
83
+ - `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
84
+
85
+ ## Development
86
+
87
+ This project is built with Bun. To get started:
88
+
89
+ ```bash
90
+ # Install dependencies
91
+ bun install
92
+
93
+ # Run tests
94
+ bun test
95
+
96
+ ```
97
+
98
+ ## Testing
99
+
100
+ Run the test suite:
101
+
102
+ ```bash
103
+ bun test
104
+ ```
105
+
106
+ ## License
107
+
108
+ MIT
109
+
110
+ ## Contributing
111
+
112
+ 1. Fork the repository
113
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
114
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
115
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
116
+ 5. Open a Pull Request
117
+
118
+ ## Support
119
+
120
+ If you encounter any issues or have questions, please file an issue on the [GitHub repository](https://github.com/tkeron/html-parser).
package/bun.lock ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "lockfileVersion": 1,
3
+ "workspaces": {
4
+ "": {
5
+ "name": "html-parser",
6
+ "devDependencies": {
7
+ "@types/bun": "latest",
8
+ },
9
+ "peerDependencies": {
10
+ "typescript": "^5.8.3",
11
+ },
12
+ },
13
+ },
14
+ "packages": {
15
+ "@types/bun": ["@types/bun@1.2.18", "", { "dependencies": { "bun-types": "1.2.18" } }, "sha512-Xf6RaWVheyemaThV0kUfaAUvCNokFr+bH8Jxp+tTZfx7dAPA8z9ePnP9S9+Vspzuxxx9JRAXhnyccRj3GyCMdQ=="],
16
+
17
+ "@types/node": ["@types/node@24.0.4", "", { "dependencies": { "undici-types": "~7.8.0" } }, "sha512-ulyqAkrhnuNq9pB76DRBTkcS6YsmDALy6Ua63V8OhrOBgbcYt6IOdzpw5P1+dyRIyMerzLkeYWBeOXPpA9GMAA=="],
18
+
19
+ "@types/react": ["@types/react@19.1.8", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-AwAfQ2Wa5bCx9WP8nZL2uMZWod7J7/JSplxbTmBQ5ms6QpqNYm672H0Vu9ZVKVngQ+ii4R/byguVEUZQyeg44g=="],
20
+
21
+ "bun-types": ["bun-types@1.2.18", "", { "dependencies": { "@types/node": "*" }, "peerDependencies": { "@types/react": "^19" } }, "sha512-04+Eha5NP7Z0A9YgDAzMk5PHR16ZuLVa83b26kH5+cp1qZW4F6FmAURngE7INf4tKOvCE69vYvDEwoNl1tGiWw=="],
22
+
23
+ "csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
24
+
25
+ "typescript": ["typescript@5.8.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ=="],
26
+
27
+ "undici-types": ["undici-types@7.8.0", "", {}, "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw=="],
28
+ }
29
+ }
package/index.ts ADDED
@@ -0,0 +1,18 @@
1
+ import { tokenize } from './src/tokenizer.js';
2
+ import { parse } from './src/parser.js';
3
+ import {
4
+ astToDOM,
5
+ } from './src/dom-simulator.js';
6
+
7
/**
 * Turns an HTML string into a Document by chaining the three pipeline
 * stages imported by this module: `tokenize`, then `parse`, then `astToDOM`.
 *
 * @param html Markup to parse; an omitted argument is treated as the empty string.
 * @returns The Document produced by `astToDOM` for the parsed markup.
 */
export function parseHTML(html: string = ""): Document {
  return astToDOM(parse(tokenize(html)));
}
17
+
18
+
package/package.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "@tkeron/html-parser",
3
+ "version": "0.1.0",
4
+ "description": "A fast and lightweight HTML parser for Bun",
5
+ "main": "index.js",
6
+ "module": "index.ts",
7
+ "type": "module",
8
+ "author": "tkeron",
9
+ "license": "MIT",
10
+ "devDependencies": {
11
+ "@types/bun": "latest"
12
+ },
13
+ "peerDependencies": {
14
+ "typescript": "^5.8.3"
15
+ },
16
+ "keywords": [
17
+ "cli",
18
+ "commands",
19
+ "command-line",
20
+ "arguments"
21
+ ],
22
+ "repository": {
23
+ "url": "git@github.com:tkeron/html-parser.git"
24
+ }
25
+ }
@@ -0,0 +1,172 @@
1
/** One simple selector: a tag name, a class, an id, or an attribute test. */
interface SelectorToken {
  type: "tag" | "class" | "id" | "attribute";
  /** Tag name (lower-cased), class name, id, or attribute name, depending on `type`. */
  value: string;
  /** Attribute name; only set on `attribute` tokens. */
  attributeName?: string;
  /** Required attribute value; `undefined` means "attribute merely has to exist". */
  attributeValue?: string;
}

/** A compound selector: every token must match the same element. */
interface SelectorGroup {
  tokens: SelectorToken[];
}

/**
 * Splits a selector on whitespace (the descendant combinator) while leaving
 * whitespace inside `[...]` brackets and quoted attribute values untouched,
 * so that `[title="hello world"]` stays in one piece.
 */
function splitSelectorParts(selector: string): string[] {
  const parts: string[] = [];
  let current = "";
  let bracketDepth = 0;
  let quote: string | null = null;

  for (const ch of selector) {
    if (quote !== null) {
      // Inside a quoted attribute value: copy verbatim until the closing quote.
      current += ch;
      if (ch === quote) quote = null;
      continue;
    }
    if (bracketDepth > 0 && (ch === '"' || ch === "'")) {
      quote = ch;
      current += ch;
      continue;
    }
    if (ch === "[") {
      bracketDepth++;
    } else if (ch === "]" && bracketDepth > 0) {
      bracketDepth--;
    }
    if (bracketDepth === 0 && /\s/.test(ch)) {
      if (current) {
        parts.push(current);
        current = "";
      }
      continue;
    }
    current += ch;
  }
  if (current) parts.push(current);
  return parts;
}

/**
 * Parses one whitespace-free compound selector such as `div.box#main[role="x"]`
 * into its tokens. Returns `null` when the text cannot be fully consumed, so
 * the caller can fall back to treating the whole part as a tag name.
 */
function parseCompoundSelector(part: string): SelectorToken[] | null {
  const tokens: SelectorToken[] = [];
  let pos = 0;

  // Optional leading tag name: everything up to the first '#', '.', or '['.
  const tagMatch = /^[^#.[\]]+/.exec(part);
  if (tagMatch) {
    tokens.push({ type: "tag", value: tagMatch[0].trim().toLowerCase() });
    pos = tagMatch[0].length;
  }

  // Sticky regex: matches exactly at `lastIndex`, either `#id` / `.class`
  // or `[name]` / `[name=value]` with a double-quoted, single-quoted, or bare value.
  const piece =
    /([#.])([^#.[\]]+)|\[([^=\]]+)(?:=(?:"([^"]*)"|'([^']*)'|([^\]]*)))?\]/y;

  while (pos < part.length) {
    piece.lastIndex = pos;
    const m = piece.exec(part);
    if (!m) return null;

    const sigil = m[1];
    if (sigil !== undefined) {
      tokens.push({ type: sigil === "#" ? "id" : "class", value: m[2] ?? "" });
    } else {
      const attrName = (m[3] ?? "").trim();
      const quoted = m[4] ?? m[5];
      let attrValue: string | undefined;
      if (quoted !== undefined) {
        // Quoted values are taken literally, including the empty string.
        attrValue = quoted;
      } else if (m[6] !== undefined) {
        // Bare value: trim and drop stray quotes left by sloppy input.
        attrValue = m[6].trim().replace(/^["']|["']$/g, "");
      }
      tokens.push({
        type: "attribute",
        value: attrName,
        attributeName: attrName,
        attributeValue: attrValue,
      });
    }
    pos = piece.lastIndex;
  }

  return tokens.length > 0 ? tokens : null;
}

/**
 * Parses a selector string into a chain of compound selectors separated by
 * the descendant combinator (whitespace).
 *
 * Each compound may combine an optional tag name with any number of
 * `#id`, `.class`, `[attr]` and `[attr=value]` parts, e.g. `ul li.item[data-x="1"]`.
 * Parts that cannot be parsed this way are kept as a single lower-cased
 * `tag` token. Other combinators (`>`, `+`, `~`) and comma-separated lists
 * are not interpreted here.
 *
 * @param selector The selector text.
 * @returns One group per whitespace-separated compound; an empty array for
 *          an empty or whitespace-only selector.
 */
function parseSelector(selector: string): SelectorGroup[] {
  return splitSelectorParts(selector.trim()).map((part) => {
    const tokens: SelectorToken[] = parseCompoundSelector(part) ?? [
      { type: "tag", value: part.toLowerCase() },
    ];
    return { tokens };
  });
}
52
+
53
/**
 * Tests a single simple-selector token against one element.
 *
 * @param element Node-like object; it must have a string `tagName`, and its
 *                attributes are read from `element.attributes` by key.
 * @param token   The simple selector to test.
 * @returns `true` when the element satisfies the token, `false` otherwise
 *          (including when `element` is missing or has no tag name).
 */
function matchesToken(element: any, token: SelectorToken): boolean {
  if (!element || !element.tagName || typeof element.tagName !== "string") {
    return false;
  }

  switch (token.type) {
    case "tag":
      // "*" is the universal selector and matches every element.
      return token.value === "*" || element.tagName.toLowerCase() === token.value;
    case "class": {
      const classAttr =
        element.attributes?.class || element.attributes?.className || "";
      // A non-string class attribute cannot be split into class names.
      if (typeof classAttr !== "string") {
        return false;
      }
      return classAttr.split(/\s+/).filter(Boolean).includes(token.value);
    }
    case "id":
      return element.attributes?.id === token.value;
    case "attribute": {
      const attrValue = element.attributes?.[token.attributeName || ""];
      // No value in the selector: the attribute only has to be present.
      if (token.attributeValue === undefined) {
        return attrValue !== undefined;
      }
      // Otherwise require an exact match.
      return attrValue === token.attributeValue;
    }
    default:
      return false;
  }
}
80
+
81
/**
 * Reports whether `element` satisfies every token of a compound selector.
 * An empty token list matches any element.
 */
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
  for (const candidate of tokens) {
    if (!matchesToken(element, candidate)) {
      return false;
    }
  }
  return true;
}
84
+
85
/**
 * Collects, in document order, the elements below `node` that satisfy a
 * chain of compound selectors joined by the descendant combinator.
 *
 * `groupIndex` is the index of the group the next matching element must
 * satisfy, i.e. the number of leading groups already matched by ancestors.
 * Earlier groups are matched greedily on the way down, which is sufficient
 * for descendant chains. Each node is visited exactly once, so an element is
 * never reported twice even when several of its ancestors match an earlier
 * group. The walk also continues below an element that matched the last
 * group, so nested matches are found too.
 *
 * `node` itself is never tested, only its descendants.
 *
 * @param node           Root of the subtree to search; children are read from `childNodes`.
 * @param selectorGroups The parsed selector chain.
 * @param groupIndex     Index of the group currently being looked for.
 * @param results        Output array; matching elements are appended to it.
 */
function findElementsDescendant(
  node: any,
  selectorGroups: SelectorGroup[],
  groupIndex: number,
  results: any[]
): void {
  if (groupIndex >= selectorGroups.length) {
    return;
  }

  const currentGroup = selectorGroups[groupIndex];
  if (!currentGroup) {
    return;
  }

  const isLastGroup = groupIndex === selectorGroups.length - 1;

  for (const child of node.childNodes || []) {
    let nextIndex = groupIndex;

    // nodeType 1 marks element nodes; other node kinds are only traversed.
    if (child.nodeType === 1 && matchesSelector(child, currentGroup.tokens)) {
      if (isLastGroup) {
        results.push(child);
      } else {
        // This ancestor satisfies the current group; its descendants look for the next one.
        nextIndex = groupIndex + 1;
      }
    }

    findElementsDescendant(child, selectorGroups, nextIndex, results);
  }
}
129
+
130
/**
 * Dispatches a parsed selector to the right search routine: a single compound
 * selector goes to `findElementsSimple`, anything else goes to
 * `findElementsDescendant` starting at the first group.
 *
 * @param node           Root of the search.
 * @param selectorGroups Parsed selector chain.
 * @param results        Output array that receives the matches.
 */
function findElements(
  node: any,
  selectorGroups: SelectorGroup[],
  results: any[]
): void {
  if (selectorGroups.length !== 1) {
    findElementsDescendant(node, selectorGroups, 0, results);
    return;
  }

  const onlyGroup = selectorGroups[0];
  if (!onlyGroup) {
    return;
  }
  findElementsSimple(node, onlyGroup.tokens, results);
}
145
+
146
/**
 * Pre-order walk of `node` and all of its descendants, appending every
 * element (nodeType 1) that satisfies the compound selector `tokens`.
 * Unlike the descendant search, `node` itself is tested as well.
 *
 * @param node    Root of the walk; children are read from `childNodes`.
 * @param tokens  Compound selector every reported element must satisfy.
 * @param results Output array that receives the matches.
 */
function findElementsSimple(
  node: any,
  tokens: SelectorToken[],
  results: any[]
): void {
  const isElement = node.nodeType === 1;
  if (isElement && matchesSelector(node, tokens)) {
    results.push(node);
  }

  const children = node.childNodes || [];
  for (const descendant of children) {
    findElementsSimple(descendant, tokens, results);
  }
}
161
+
162
/**
 * Returns every element found by searching from `root` with the given selector.
 *
 * @param root     Node to search from.
 * @param selector Selector text, parsed with `parseSelector`.
 * @returns The matches collected by `findElements`, possibly empty.
 */
export function querySelectorAll(root: any, selector: string): any[] {
  const matches: any[] = [];
  findElements(root, parseSelector(selector), matches);
  return matches;
}
168
+
169
/**
 * Returns the first result of `querySelectorAll(root, selector)`,
 * or `null` when that first entry is missing or falsy.
 */
export function querySelector(root: any, selector: string): any | null {
  const [first] = querySelectorAll(root, selector);
  return first ? first : null;
}