@tkeron/html-parser 1.4.1 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -1
- package/index.ts +6 -2
- package/package.json +1 -1
- package/src/parser/index.ts +1 -1
- package/src/parser/parse.ts +30 -0
- package/src/selectors/parse-selector.ts +2 -2
- package/tests/helpers/tree-adapter.ts +49 -0
- package/tests/selector-underscore-ids.test.ts +90 -0
- package/tests/tree-construction-adoption01.test.ts +30 -16
package/README.md
CHANGED
|
@@ -9,8 +9,9 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
|
|
|
9
9
|
- 🪶 **Lightweight**: Zero external dependencies
|
|
10
10
|
- 🌐 **Standards Compliant**: Returns standard DOM Document objects
|
|
11
11
|
- 🔧 **TypeScript Support**: Full TypeScript definitions included
|
|
12
|
-
- ✅ **Well Tested**: Comprehensive test suite (
|
|
12
|
+
- ✅ **Well Tested**: Comprehensive test suite (5660+ tests passing)
|
|
13
13
|
- 🎯 **HTML5 Spec**: Implements Adoption Agency Algorithm for proper formatting element handling
|
|
14
|
+
- 🧩 **Fragment Parsing**: Parse HTML fragments with context element support
|
|
14
15
|
|
|
15
16
|
## Installation
|
|
16
17
|
|
|
@@ -76,6 +77,28 @@ Parses an HTML string and returns a DOM Document object.
|
|
|
76
77
|
|
|
77
78
|
- `Document`: A standard DOM Document object with all the usual methods like `querySelector`, `getElementById`, etc.
|
|
78
79
|
|
|
80
|
+
### `parseHTMLFragment(html: string, contextTagName: string): Node[]`
|
|
81
|
+
|
|
82
|
+
Parses an HTML string as a fragment within a context element. Useful for parsing innerHTML-style content.
|
|
83
|
+
|
|
84
|
+
**Parameters:**
|
|
85
|
+
|
|
86
|
+
- `html` (string): The HTML string to parse
|
|
87
|
+
- `contextTagName` (string): The tag name of the context element (e.g., `"div"`, `"body"`)
|
|
88
|
+
|
|
89
|
+
**Returns:**
|
|
90
|
+
|
|
91
|
+
- `Node[]`: An array of parsed nodes
|
|
92
|
+
|
|
93
|
+
**Example:**
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
import { parseHTMLFragment } from "@tkeron/html-parser";
|
|
97
|
+
|
|
98
|
+
const nodes = parseHTMLFragment("<b>Hello</b> <i>World</i>", "div");
|
|
99
|
+
console.log(nodes.length); // 3 (b element, text node, i element)
|
|
100
|
+
```
|
|
101
|
+
|
|
79
102
|
## Development
|
|
80
103
|
|
|
81
104
|
This project is built with Bun. To get started:
|
package/index.ts
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
import { tokenize } from "./src/tokenizer/index.js";
|
|
2
|
-
import { parse } from "./src/parser/index.js";
|
|
2
|
+
import { parse, parseFragment } from "./src/parser/index.js";
|
|
3
3
|
import { astToDOM } from "./src/dom-simulator.js";
|
|
4
4
|
|
|
5
5
|
export function parseHTML(html: string = ""): Document {
|
|
6
6
|
const tokens = tokenize(html);
|
|
7
7
|
const ast = parse(tokens);
|
|
8
|
-
// If parse already returns a DOM document, return it directly
|
|
9
8
|
if (ast && typeof ast.nodeType === "number" && ast.nodeType === 9) {
|
|
10
9
|
return ast;
|
|
11
10
|
}
|
|
12
11
|
return astToDOM(ast);
|
|
13
12
|
}
|
|
13
|
+
|
|
14
|
+
export function parseHTMLFragment(html: string, contextTagName: string): any[] {
|
|
15
|
+
const tokens = tokenize(html);
|
|
16
|
+
return parseFragment(tokens, contextTagName);
|
|
17
|
+
}
|
package/package.json
CHANGED
package/src/parser/index.ts
CHANGED
package/src/parser/parse.ts
CHANGED
|
@@ -1112,3 +1112,33 @@ const getActiveFormattingElementsBeforeMarker = (state: ParserState): any[] => {
|
|
|
1112
1112
|
}
|
|
1113
1113
|
return result;
|
|
1114
1114
|
};
|
|
1115
|
+
|
|
1116
|
+
export const parseFragment = (tokens: Token[], contextTagName: string): any => {
|
|
1117
|
+
const root = createDocument();
|
|
1118
|
+
const contextElement = createElement(contextTagName.toLowerCase(), {});
|
|
1119
|
+
appendChild(root, contextElement);
|
|
1120
|
+
|
|
1121
|
+
const state: ParserState = {
|
|
1122
|
+
tokens,
|
|
1123
|
+
position: 0,
|
|
1124
|
+
length: tokens.length,
|
|
1125
|
+
stack: [root, contextElement],
|
|
1126
|
+
root,
|
|
1127
|
+
insertionMode: InsertionMode.InBody,
|
|
1128
|
+
errors: [],
|
|
1129
|
+
activeFormattingElements: [],
|
|
1130
|
+
};
|
|
1131
|
+
|
|
1132
|
+
while (state.position < state.length) {
|
|
1133
|
+
const token = getCurrentToken(state);
|
|
1134
|
+
|
|
1135
|
+
if (!token || token.type === TokenType.EOF) {
|
|
1136
|
+
break;
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
parseToken(state, token);
|
|
1140
|
+
advance(state);
|
|
1141
|
+
}
|
|
1142
|
+
|
|
1143
|
+
return contextElement.childNodes;
|
|
1144
|
+
};
|
|
@@ -19,11 +19,11 @@ export const parseSelector = (selector: string): SelectorGroup[] => {
|
|
|
19
19
|
remaining = remaining.slice(tagMatch[1].length);
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
-
const idMatches = remaining.matchAll(/#([a-zA-Z0-
|
|
22
|
+
const idMatches = remaining.matchAll(/#([a-zA-Z0-9_-][a-zA-Z0-9_-]*)/g);
|
|
23
23
|
for (const match of idMatches) {
|
|
24
24
|
tokens.push({ type: "id", value: match[1] });
|
|
25
25
|
}
|
|
26
|
-
remaining = remaining.replace(/#[a-zA-Z0-
|
|
26
|
+
remaining = remaining.replace(/#[a-zA-Z0-9_-][a-zA-Z0-9_-]*/g, "");
|
|
27
27
|
|
|
28
28
|
const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
|
|
29
29
|
for (const match of classMatches) {
|
|
@@ -68,3 +68,52 @@ export function serializeToHtml5lib(
|
|
|
68
68
|
serialize(doc, 0);
|
|
69
69
|
return lines.join("\n") + "\n";
|
|
70
70
|
}
|
|
71
|
+
|
|
72
|
+
export function serializeFragmentToHtml5lib(nodes: any[]): string {
|
|
73
|
+
const lines: string[] = [];
|
|
74
|
+
|
|
75
|
+
function serialize(node: any, depth: number): void {
|
|
76
|
+
const indent = "| " + " ".repeat(depth);
|
|
77
|
+
|
|
78
|
+
if (node.nodeType === 1) {
|
|
79
|
+
const tagName = node.tagName.toLowerCase();
|
|
80
|
+
const ns = node.namespaceURI;
|
|
81
|
+
|
|
82
|
+
let nsPrefix = "";
|
|
83
|
+
if (ns === "http://www.w3.org/2000/svg") {
|
|
84
|
+
nsPrefix = "svg ";
|
|
85
|
+
} else if (ns === "http://www.w3.org/1998/Math/MathML") {
|
|
86
|
+
nsPrefix = "math ";
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
lines.push(`${indent}<${nsPrefix}${tagName}>`);
|
|
90
|
+
|
|
91
|
+
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
92
|
+
a.localeCompare(b),
|
|
93
|
+
);
|
|
94
|
+
for (const [name, value] of attrs) {
|
|
95
|
+
lines.push(`${indent} ${name}="${value}"`);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (node.tagName.toLowerCase() === "template" && node.content) {
|
|
99
|
+
lines.push(`${indent} content`);
|
|
100
|
+
serialize(node.content, depth + 2);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
for (const child of node.childNodes || []) {
|
|
104
|
+
serialize(child, depth + 1);
|
|
105
|
+
}
|
|
106
|
+
} else if (node.nodeType === 3) {
|
|
107
|
+
lines.push(`${indent}"${node.textContent}"`);
|
|
108
|
+
} else if (node.nodeType === 8) {
|
|
109
|
+
const commentData = node.data || node.nodeValue || node.textContent || "";
|
|
110
|
+
lines.push(`${indent}<!-- ${commentData} -->`);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
for (const node of nodes) {
|
|
115
|
+
serialize(node, 0);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
return lines.join("\n") + "\n";
|
|
119
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../index";
|
|
3
|
+
|
|
4
|
+
describe("querySelector with underscore IDs", () => {
|
|
5
|
+
it("should find element with ID starting with single underscore", () => {
|
|
6
|
+
const doc = parseHTML(
|
|
7
|
+
"<html><body><div id='_test'>Content</div></body></html>",
|
|
8
|
+
);
|
|
9
|
+
const result = doc.querySelector("#_test");
|
|
10
|
+
expect(result).not.toBeNull();
|
|
11
|
+
expect(result?.tagName).toBe("DIV");
|
|
12
|
+
expect(result?.id).toBe("_test");
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
it("should find element with ID starting with double underscore", () => {
|
|
16
|
+
const doc = parseHTML(
|
|
17
|
+
"<html><body><div id='__test'>Content</div></body></html>",
|
|
18
|
+
);
|
|
19
|
+
const result = doc.querySelector("#__test");
|
|
20
|
+
expect(result).not.toBeNull();
|
|
21
|
+
expect(result?.tagName).toBe("DIV");
|
|
22
|
+
expect(result?.id).toBe("__test");
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it("should find element with complex underscore ID", () => {
|
|
26
|
+
const doc = parseHTML(
|
|
27
|
+
"<html><body><div id='__tkeron_component_root__'>Content</div></body></html>",
|
|
28
|
+
);
|
|
29
|
+
const result = doc.querySelector("#__tkeron_component_root__");
|
|
30
|
+
expect(result).not.toBeNull();
|
|
31
|
+
expect(result?.tagName).toBe("DIV");
|
|
32
|
+
expect(result?.id).toBe("__tkeron_component_root__");
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
it("should find underscore ID from child element context", () => {
|
|
36
|
+
const doc = parseHTML(
|
|
37
|
+
"<html><body><div id='__root'><p>Nested</p></div></body></html>",
|
|
38
|
+
);
|
|
39
|
+
const body = doc.querySelector("body");
|
|
40
|
+
const result = body?.querySelector("#__root");
|
|
41
|
+
expect(result).not.toBeNull();
|
|
42
|
+
expect(result?.tagName).toBe("DIV");
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it("should find nested element with underscore ID", () => {
|
|
46
|
+
const doc = parseHTML(
|
|
47
|
+
"<html><body><div><span id='_nested'>Text</span></div></body></html>",
|
|
48
|
+
);
|
|
49
|
+
const result = doc.querySelector("#_nested");
|
|
50
|
+
expect(result).not.toBeNull();
|
|
51
|
+
expect(result?.tagName).toBe("SPAN");
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
it("should return null for non-existent underscore ID", () => {
|
|
55
|
+
const doc = parseHTML(
|
|
56
|
+
"<html><body><div id='other'>Content</div></body></html>",
|
|
57
|
+
);
|
|
58
|
+
const result = doc.querySelector("#_nonexistent");
|
|
59
|
+
expect(result).toBeNull();
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
it("should work with querySelectorAll for underscore IDs", () => {
|
|
63
|
+
const doc = parseHTML(
|
|
64
|
+
"<html><body><div id='_a'>A</div><div id='_b'>B</div></body></html>",
|
|
65
|
+
);
|
|
66
|
+
const resultA = doc.querySelectorAll("#_a");
|
|
67
|
+
const resultB = doc.querySelectorAll("#_b");
|
|
68
|
+
expect(resultA.length).toBe(1);
|
|
69
|
+
expect(resultB.length).toBe(1);
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it("should find ID starting with hyphen", () => {
|
|
73
|
+
const doc = parseHTML(
|
|
74
|
+
"<html><body><div id='-test'>Content</div></body></html>",
|
|
75
|
+
);
|
|
76
|
+
const result = doc.querySelector("#-test");
|
|
77
|
+
expect(result).not.toBeNull();
|
|
78
|
+
expect(result?.tagName).toBe("DIV");
|
|
79
|
+
expect(result?.id).toBe("-test");
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
it("should find ID with mixed underscore and hyphen at start", () => {
|
|
83
|
+
const doc = parseHTML(
|
|
84
|
+
"<html><body><div id='_-mixed'>Content</div></body></html>",
|
|
85
|
+
);
|
|
86
|
+
const result = doc.querySelector("#_-mixed");
|
|
87
|
+
expect(result).not.toBeNull();
|
|
88
|
+
expect(result?.id).toBe("_-mixed");
|
|
89
|
+
});
|
|
90
|
+
});
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
import { expect, it, describe } from "bun:test";
|
|
2
|
-
import { parseHTML } from "../index";
|
|
3
|
-
import {
|
|
2
|
+
import { parseHTML, parseHTMLFragment } from "../index";
|
|
3
|
+
import {
|
|
4
|
+
serializeToHtml5lib,
|
|
5
|
+
serializeFragmentToHtml5lib,
|
|
6
|
+
} from "./helpers/tree-adapter";
|
|
4
7
|
import { readFileSync } from "fs";
|
|
5
8
|
|
|
6
9
|
describe("Tree Construction Adoption01 Tests", () => {
|
|
@@ -15,10 +18,18 @@ describe("Tree Construction Adoption01 Tests", () => {
|
|
|
15
18
|
let data = "";
|
|
16
19
|
let document = "";
|
|
17
20
|
let inDocument = false;
|
|
18
|
-
let inData = true;
|
|
21
|
+
let inData = true;
|
|
22
|
+
let isFragmentTest = false;
|
|
23
|
+
let fragmentContext = "";
|
|
19
24
|
|
|
20
25
|
for (const line of lines) {
|
|
21
|
-
if (line.startsWith("#document")) {
|
|
26
|
+
if (line.startsWith("#document-fragment")) {
|
|
27
|
+
isFragmentTest = true;
|
|
28
|
+
inDocument = false;
|
|
29
|
+
inData = false;
|
|
30
|
+
} else if (isFragmentTest && !fragmentContext && !line.startsWith("#")) {
|
|
31
|
+
fragmentContext = line.trim();
|
|
32
|
+
} else if (line.startsWith("#document")) {
|
|
22
33
|
inDocument = true;
|
|
23
34
|
inData = false;
|
|
24
35
|
} else if (line.startsWith("#errors")) {
|
|
@@ -31,18 +42,21 @@ describe("Tree Construction Adoption01 Tests", () => {
|
|
|
31
42
|
}
|
|
32
43
|
}
|
|
33
44
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
testFn(`Adoption test ${index + 1}`, () => {
|
|
40
|
-
const doc = parseHTML(data);
|
|
41
|
-
const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
|
|
42
|
-
const serialized = serializeToHtml5lib(doc, {
|
|
43
|
-
skipImplicitDoctype: !hasExplicitDoctype,
|
|
45
|
+
if (isFragmentTest) {
|
|
46
|
+
it(`Adoption test ${index + 1} (fragment: ${fragmentContext})`, () => {
|
|
47
|
+
const nodes = parseHTMLFragment(data, fragmentContext);
|
|
48
|
+
const serialized = serializeFragmentToHtml5lib(nodes);
|
|
49
|
+
expect(serialized).toBe(document);
|
|
44
50
|
});
|
|
45
|
-
|
|
46
|
-
|
|
51
|
+
} else {
|
|
52
|
+
it(`Adoption test ${index + 1}`, () => {
|
|
53
|
+
const doc = parseHTML(data);
|
|
54
|
+
const hasExplicitDoctype = data.toLowerCase().includes("<!doctype");
|
|
55
|
+
const serialized = serializeToHtml5lib(doc, {
|
|
56
|
+
skipImplicitDoctype: !hasExplicitDoctype,
|
|
57
|
+
});
|
|
58
|
+
expect(serialized).toBe(document);
|
|
59
|
+
});
|
|
60
|
+
}
|
|
47
61
|
});
|
|
48
62
|
});
|