@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
package/tests/advanced.test.ts
CHANGED
|
@@ -1,21 +1,29 @@
|
|
|
1
1
|
// @ts-nocheck
|
|
2
|
-
import { expect, test, describe, it } from
|
|
3
|
-
import { tokenize, TokenType } from
|
|
4
|
-
import {
|
|
2
|
+
import { expect, test, describe, it } from "bun:test";
|
|
3
|
+
import { tokenize, TokenType } from "../src/tokenizer/index.js";
|
|
4
|
+
import {
|
|
5
|
+
parse,
|
|
6
|
+
ASTNodeType,
|
|
7
|
+
domToAST,
|
|
8
|
+
type ASTNode,
|
|
9
|
+
} from "../src/parser/index";
|
|
5
10
|
|
|
6
11
|
function parseToAST(html: string): ASTNode {
|
|
7
12
|
const tokens = tokenize(html);
|
|
8
13
|
const dom = parse(tokens);
|
|
9
14
|
const ast = domToAST(dom);
|
|
10
|
-
|
|
11
|
-
const hasExplicitHtml =
|
|
15
|
+
|
|
16
|
+
const hasExplicitHtml =
|
|
17
|
+
html.includes("<html") ||
|
|
18
|
+
html.includes("<!DOCTYPE") ||
|
|
19
|
+
html.includes("<!doctype");
|
|
12
20
|
if (hasExplicitHtml) {
|
|
13
21
|
return ast;
|
|
14
22
|
}
|
|
15
|
-
|
|
16
|
-
const htmlEl = ast.children?.find(c => c.tagName ===
|
|
23
|
+
|
|
24
|
+
const htmlEl = ast.children?.find((c) => c.tagName === "html");
|
|
17
25
|
if (htmlEl) {
|
|
18
|
-
const bodyEl = htmlEl.children?.find(c => c.tagName ===
|
|
26
|
+
const bodyEl = htmlEl.children?.find((c) => c.tagName === "body");
|
|
19
27
|
if (bodyEl && bodyEl.children) {
|
|
20
28
|
return { type: ASTNodeType.Document, children: bodyEl.children };
|
|
21
29
|
}
|
|
@@ -23,44 +31,47 @@ function parseToAST(html: string): ASTNode {
|
|
|
23
31
|
return ast;
|
|
24
32
|
}
|
|
25
33
|
|
|
26
|
-
describe(
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
it('should handle attributes with no spaces', () => {
|
|
34
|
+
describe("HTML Parser & Tokenizer - Advanced Tests", () => {
|
|
35
|
+
describe("Tokenizer Edge Cases", () => {
|
|
36
|
+
it("should handle attributes with no spaces", () => {
|
|
30
37
|
const tokens = tokenize('<div class="test"id="main"data-value="123">');
|
|
31
38
|
expect(tokens.length).toBeGreaterThan(0);
|
|
32
39
|
const tag = tokens[0]!;
|
|
33
|
-
|
|
40
|
+
|
|
34
41
|
expect(tag.attributes).toEqual({
|
|
35
|
-
class:
|
|
36
|
-
id:
|
|
37
|
-
|
|
42
|
+
class: "test",
|
|
43
|
+
id: "main",
|
|
44
|
+
"data-value": "123",
|
|
38
45
|
});
|
|
39
46
|
});
|
|
40
47
|
|
|
41
|
-
it(
|
|
42
|
-
const tokens = tokenize(
|
|
48
|
+
it("should handle mixed quote styles", () => {
|
|
49
|
+
const tokens = tokenize(
|
|
50
|
+
`<div class='single' id="double" data-test='mix "quoted" content'>`,
|
|
51
|
+
);
|
|
43
52
|
expect(tokens.length).toBeGreaterThan(0);
|
|
44
53
|
const tag = tokens[0]!;
|
|
45
|
-
|
|
46
|
-
expect(tag.attributes!.class).toBe(
|
|
47
|
-
expect(tag.attributes!.id).toBe(
|
|
48
|
-
expect(tag.attributes![
|
|
54
|
+
|
|
55
|
+
expect(tag.attributes!.class).toBe("single");
|
|
56
|
+
expect(tag.attributes!.id).toBe("double");
|
|
57
|
+
expect(tag.attributes!["data-test"]).toBe('mix "quoted" content');
|
|
49
58
|
});
|
|
50
59
|
|
|
51
|
-
it(
|
|
52
|
-
const tokens = tokenize(
|
|
60
|
+
it("should handle unicode characters", () => {
|
|
61
|
+
const tokens = tokenize(
|
|
62
|
+
'<div title="测试" data-emoji="🚀" class="lorem">',
|
|
63
|
+
);
|
|
53
64
|
expect(tokens.length).toBeGreaterThan(0);
|
|
54
65
|
const tag = tokens[0]!;
|
|
55
|
-
|
|
66
|
+
|
|
56
67
|
expect(tag.attributes).toEqual({
|
|
57
|
-
title:
|
|
58
|
-
|
|
59
|
-
class:
|
|
68
|
+
title: "测试",
|
|
69
|
+
"data-emoji": "🚀",
|
|
70
|
+
class: "lorem",
|
|
60
71
|
});
|
|
61
72
|
});
|
|
62
73
|
|
|
63
|
-
it(
|
|
74
|
+
it("should handle complex CDATA content as bogus comment", () => {
|
|
64
75
|
const complexContent = `
|
|
65
76
|
function test() {
|
|
66
77
|
return "<div>HTML inside JS</div>";
|
|
@@ -70,41 +81,43 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
70
81
|
const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
|
|
71
82
|
expect(tokens.length).toBeGreaterThan(0);
|
|
72
83
|
const cdataToken = tokens[0]!;
|
|
73
|
-
|
|
84
|
+
|
|
74
85
|
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
75
|
-
expect(cdataToken.value).toBe(
|
|
86
|
+
expect(cdataToken.value).toBe("[CDATA[" + complexContent + "]]");
|
|
76
87
|
});
|
|
77
88
|
|
|
78
|
-
it(
|
|
79
|
-
let html =
|
|
89
|
+
it("should handle performance with large documents", () => {
|
|
90
|
+
let html = "<div>";
|
|
80
91
|
for (let i = 0; i < 1000; i++) {
|
|
81
92
|
html += `<p id="para-${i}">Content ${i}</p>`;
|
|
82
93
|
}
|
|
83
|
-
html +=
|
|
84
|
-
|
|
94
|
+
html += "</div>";
|
|
95
|
+
|
|
85
96
|
const startTime = Date.now();
|
|
86
97
|
const tokens = tokenize(html);
|
|
87
98
|
const endTime = Date.now();
|
|
88
|
-
|
|
99
|
+
|
|
89
100
|
expect(tokens.length).toBeGreaterThan(2000);
|
|
90
|
-
expect(endTime - startTime).toBeLessThan(1000);
|
|
101
|
+
expect(endTime - startTime).toBeLessThan(1000);
|
|
91
102
|
});
|
|
92
103
|
});
|
|
93
104
|
|
|
94
|
-
describe(
|
|
95
|
-
it(
|
|
96
|
-
const ast = parseToAST(
|
|
97
|
-
|
|
105
|
+
describe("Parser DOM-like Functionality", () => {
|
|
106
|
+
it("should create proper parent-child relationships", () => {
|
|
107
|
+
const ast = parseToAST(
|
|
108
|
+
"<div><section><article><h1>Title</h1><p>Content</p></article></section></div>",
|
|
109
|
+
);
|
|
110
|
+
|
|
98
111
|
const divElement = ast.children![0]!;
|
|
99
112
|
const sectionElement = divElement.children![0]!;
|
|
100
113
|
const articleElement = sectionElement.children![0]!;
|
|
101
|
-
|
|
114
|
+
|
|
102
115
|
expect(articleElement.children).toHaveLength(2);
|
|
103
|
-
expect(articleElement.children![0]!.tagName).toBe(
|
|
104
|
-
expect(articleElement.children![1]!.tagName).toBe(
|
|
116
|
+
expect(articleElement.children![0]!.tagName).toBe("h1");
|
|
117
|
+
expect(articleElement.children![1]!.tagName).toBe("p");
|
|
105
118
|
});
|
|
106
119
|
|
|
107
|
-
it(
|
|
120
|
+
it("should handle complex navigation scenarios", () => {
|
|
108
121
|
const html = `
|
|
109
122
|
<nav>
|
|
110
123
|
<ul>
|
|
@@ -115,21 +128,27 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
115
128
|
</nav>
|
|
116
129
|
`;
|
|
117
130
|
const ast = parseToAST(html);
|
|
118
|
-
|
|
119
|
-
const navElement = ast.children!.find(
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
131
|
+
|
|
132
|
+
const navElement = ast.children!.find(
|
|
133
|
+
(child) => child.tagName === "nav",
|
|
134
|
+
)!;
|
|
135
|
+
const ulElement = navElement.children!.find(
|
|
136
|
+
(child) => child.tagName === "ul",
|
|
137
|
+
)!;
|
|
138
|
+
const liElements = ulElement.children!.filter(
|
|
139
|
+
(child) => child.tagName === "li",
|
|
140
|
+
);
|
|
141
|
+
|
|
123
142
|
expect(liElements).toHaveLength(3);
|
|
124
|
-
|
|
143
|
+
|
|
125
144
|
liElements.forEach((li, index) => {
|
|
126
|
-
const anchor = li.children!.find(child => child.tagName ===
|
|
145
|
+
const anchor = li.children!.find((child) => child.tagName === "a")!;
|
|
127
146
|
expect(anchor.attributes!.href).toBeDefined();
|
|
128
147
|
expect(anchor.children![0]!.type).toBe(ASTNodeType.Text);
|
|
129
148
|
});
|
|
130
149
|
});
|
|
131
150
|
|
|
132
|
-
it(
|
|
151
|
+
it("should handle form elements with complex attributes", () => {
|
|
133
152
|
const html = `
|
|
134
153
|
<form action="/submit" method="post">
|
|
135
154
|
<input type="email" name="email" required pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$">
|
|
@@ -141,15 +160,19 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
141
160
|
</form>
|
|
142
161
|
`;
|
|
143
162
|
const ast = parseToAST(html);
|
|
144
|
-
|
|
145
|
-
const formElement = ast.children!.find(
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
163
|
+
|
|
164
|
+
const formElement = ast.children!.find(
|
|
165
|
+
(child) => child.tagName === "form",
|
|
166
|
+
)!;
|
|
167
|
+
expect(formElement.attributes!.action).toBe("/submit");
|
|
168
|
+
expect(formElement.attributes!.method).toBe("post");
|
|
169
|
+
|
|
149
170
|
const formElements: ASTNode[] = [];
|
|
150
171
|
const traverse = (node: ASTNode) => {
|
|
151
172
|
if (node.type === ASTNodeType.Element) {
|
|
152
|
-
if (
|
|
173
|
+
if (
|
|
174
|
+
["input", "select", "textarea", "option"].includes(node.tagName!)
|
|
175
|
+
) {
|
|
153
176
|
formElements.push(node);
|
|
154
177
|
}
|
|
155
178
|
}
|
|
@@ -158,18 +181,20 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
158
181
|
}
|
|
159
182
|
};
|
|
160
183
|
traverse(formElement);
|
|
161
|
-
|
|
184
|
+
|
|
162
185
|
expect(formElements.length).toBeGreaterThan(3);
|
|
163
|
-
|
|
164
|
-
const emailInput = formElements.find(
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
186
|
+
|
|
187
|
+
const emailInput = formElements.find(
|
|
188
|
+
(el) => el.attributes?.name === "email",
|
|
189
|
+
);
|
|
190
|
+
expect(emailInput!.attributes!.required).toBe("");
|
|
191
|
+
expect(emailInput!.attributes!.pattern).toContain("@");
|
|
192
|
+
|
|
193
|
+
const selectElement = formElements.find((el) => el.tagName === "select");
|
|
194
|
+
expect(selectElement!.attributes!.multiple).toBe("");
|
|
170
195
|
});
|
|
171
196
|
|
|
172
|
-
it(
|
|
197
|
+
it("should handle table structures", () => {
|
|
173
198
|
const html = `
|
|
174
199
|
<table>
|
|
175
200
|
<thead>
|
|
@@ -191,18 +216,24 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
191
216
|
</table>
|
|
192
217
|
`;
|
|
193
218
|
const ast = parseToAST(html);
|
|
194
|
-
|
|
195
|
-
const tableElement = ast.children!.find(
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
219
|
+
|
|
220
|
+
const tableElement = ast.children!.find(
|
|
221
|
+
(child) => child.tagName === "table",
|
|
222
|
+
)!;
|
|
223
|
+
|
|
224
|
+
const thead = tableElement.children!.find(
|
|
225
|
+
(child) => child.tagName === "thead",
|
|
226
|
+
);
|
|
227
|
+
const tbody = tableElement.children!.find(
|
|
228
|
+
(child) => child.tagName === "tbody",
|
|
229
|
+
);
|
|
230
|
+
|
|
200
231
|
expect(thead).toBeDefined();
|
|
201
232
|
expect(tbody).toBeDefined();
|
|
202
|
-
|
|
233
|
+
|
|
203
234
|
const rows: ASTNode[] = [];
|
|
204
235
|
const traverse = (node: ASTNode) => {
|
|
205
|
-
if (node.tagName ===
|
|
236
|
+
if (node.tagName === "tr") {
|
|
206
237
|
rows.push(node);
|
|
207
238
|
}
|
|
208
239
|
if (node.children) {
|
|
@@ -210,23 +241,23 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
210
241
|
}
|
|
211
242
|
};
|
|
212
243
|
traverse(tableElement);
|
|
213
|
-
|
|
214
|
-
expect(rows).toHaveLength(3);
|
|
244
|
+
|
|
245
|
+
expect(rows).toHaveLength(3);
|
|
215
246
|
});
|
|
216
247
|
|
|
217
|
-
it(
|
|
248
|
+
it("should handle mixed inline content", () => {
|
|
218
249
|
const html = `
|
|
219
250
|
<p>This is <strong>bold</strong> and <em>italic</em>.
|
|
220
251
|
Here's a <a href="https://example.com">link</a> and
|
|
221
252
|
<code>inline code</code>.</p>
|
|
222
253
|
`;
|
|
223
254
|
const ast = parseToAST(html);
|
|
224
|
-
|
|
225
|
-
const pElement = ast.children!.find(child => child.tagName ===
|
|
226
|
-
|
|
255
|
+
|
|
256
|
+
const pElement = ast.children!.find((child) => child.tagName === "p")!;
|
|
257
|
+
|
|
227
258
|
let textNodes = 0;
|
|
228
259
|
let elementNodes = 0;
|
|
229
|
-
|
|
260
|
+
|
|
230
261
|
const traverse = (node: ASTNode) => {
|
|
231
262
|
if (node.type === ASTNodeType.Text && (node as any).content?.trim()) {
|
|
232
263
|
textNodes++;
|
|
@@ -237,16 +268,16 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
237
268
|
node.children.forEach(traverse);
|
|
238
269
|
}
|
|
239
270
|
};
|
|
240
|
-
|
|
271
|
+
|
|
241
272
|
if (pElement.children) {
|
|
242
273
|
pElement.children.forEach(traverse);
|
|
243
274
|
}
|
|
244
|
-
|
|
245
|
-
expect(elementNodes).toBeGreaterThan(3);
|
|
275
|
+
|
|
276
|
+
expect(elementNodes).toBeGreaterThan(3);
|
|
246
277
|
expect(textNodes).toBeGreaterThan(0);
|
|
247
278
|
});
|
|
248
279
|
|
|
249
|
-
it(
|
|
280
|
+
it("should preserve complete document structure", () => {
|
|
250
281
|
const html = `<!DOCTYPE html>
|
|
251
282
|
<html lang="en">
|
|
252
283
|
<head>
|
|
@@ -270,53 +301,71 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
270
301
|
</footer>
|
|
271
302
|
</body>
|
|
272
303
|
</html>`;
|
|
273
|
-
|
|
304
|
+
|
|
274
305
|
const ast = parseToAST(html);
|
|
275
|
-
|
|
276
|
-
const doctype = ast.children!.find(
|
|
306
|
+
|
|
307
|
+
const doctype = ast.children!.find(
|
|
308
|
+
(child) => child.type === ASTNodeType.Doctype,
|
|
309
|
+
);
|
|
277
310
|
expect(doctype).toBeDefined();
|
|
278
|
-
|
|
279
|
-
const htmlElement = ast.children!.find(
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
311
|
+
|
|
312
|
+
const htmlElement = ast.children!.find(
|
|
313
|
+
(child) => child.tagName === "html",
|
|
314
|
+
)!;
|
|
315
|
+
expect(htmlElement.attributes!.lang).toBe("en");
|
|
316
|
+
|
|
317
|
+
const headElement = htmlElement.children!.find(
|
|
318
|
+
(child) => child.tagName === "head",
|
|
319
|
+
);
|
|
320
|
+
const bodyElement = htmlElement.children!.find(
|
|
321
|
+
(child) => child.tagName === "body",
|
|
322
|
+
);
|
|
323
|
+
|
|
285
324
|
expect(headElement).toBeDefined();
|
|
286
325
|
expect(bodyElement).toBeDefined();
|
|
287
|
-
|
|
288
|
-
const headerElement = bodyElement!.children!.find(
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
326
|
+
|
|
327
|
+
const headerElement = bodyElement!.children!.find(
|
|
328
|
+
(child) => child.tagName === "header",
|
|
329
|
+
);
|
|
330
|
+
const mainElement = bodyElement!.children!.find(
|
|
331
|
+
(child) => child.tagName === "main",
|
|
332
|
+
);
|
|
333
|
+
const footerElement = bodyElement!.children!.find(
|
|
334
|
+
(child) => child.tagName === "footer",
|
|
335
|
+
);
|
|
336
|
+
|
|
292
337
|
expect(headerElement).toBeDefined();
|
|
293
338
|
expect(mainElement).toBeDefined();
|
|
294
339
|
expect(footerElement).toBeDefined();
|
|
295
|
-
|
|
296
|
-
expect(headerElement!.attributes!.id).toBe(
|
|
340
|
+
|
|
341
|
+
expect(headerElement!.attributes!.id).toBe("main-header");
|
|
297
342
|
});
|
|
298
343
|
});
|
|
299
344
|
|
|
300
|
-
describe(
|
|
301
|
-
it(
|
|
345
|
+
describe("Real-world Content Handling", () => {
|
|
346
|
+
it("should handle SVG content", () => {
|
|
302
347
|
const svg = `
|
|
303
348
|
<svg width="100" height="100" xmlns="http://www.w3.org/2000/svg">
|
|
304
349
|
<circle cx="50" cy="50" r="40" fill="red"/>
|
|
305
350
|
<text x="50" y="50">SVG</text>
|
|
306
351
|
</svg>
|
|
307
352
|
`;
|
|
308
|
-
|
|
353
|
+
|
|
309
354
|
const ast = parseToAST(svg);
|
|
310
|
-
|
|
311
|
-
const svgElement = ast.children!.find(
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
355
|
+
|
|
356
|
+
const svgElement = ast.children!.find(
|
|
357
|
+
(child) => child.tagName === "svg",
|
|
358
|
+
)!;
|
|
359
|
+
expect(svgElement.attributes!.xmlns).toBe("http://www.w3.org/2000/svg");
|
|
360
|
+
|
|
361
|
+
const circleElement = svgElement.children!.find(
|
|
362
|
+
(child) => child.tagName === "circle",
|
|
363
|
+
);
|
|
315
364
|
expect(circleElement).toBeDefined();
|
|
316
|
-
expect(circleElement!.attributes!.fill).toBe(
|
|
365
|
+
expect(circleElement!.attributes!.fill).toBe("red");
|
|
317
366
|
});
|
|
318
367
|
|
|
319
|
-
it(
|
|
368
|
+
it("should handle script and style tags", () => {
|
|
320
369
|
const html = `
|
|
321
370
|
<body>
|
|
322
371
|
<script type="text/javascript">
|
|
@@ -329,9 +378,9 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
329
378
|
</style>
|
|
330
379
|
</body>
|
|
331
380
|
`;
|
|
332
|
-
|
|
381
|
+
|
|
333
382
|
const ast = parseToAST(html);
|
|
334
|
-
|
|
383
|
+
|
|
335
384
|
function findByTagName(node: ASTNode, tagName: string): ASTNode | null {
|
|
336
385
|
if (node.tagName === tagName) return node;
|
|
337
386
|
if (node.children) {
|
|
@@ -342,159 +391,131 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
342
391
|
}
|
|
343
392
|
return null;
|
|
344
393
|
}
|
|
345
|
-
|
|
346
|
-
const scriptElement = findByTagName(ast,
|
|
347
|
-
const styleElement = findByTagName(ast,
|
|
348
|
-
|
|
349
|
-
expect(scriptElement!.attributes!.type).toBe(
|
|
350
|
-
expect(styleElement!.attributes!.type).toBe(
|
|
394
|
+
|
|
395
|
+
const scriptElement = findByTagName(ast, "script");
|
|
396
|
+
const styleElement = findByTagName(ast, "style");
|
|
397
|
+
|
|
398
|
+
expect(scriptElement!.attributes!.type).toBe("text/javascript");
|
|
399
|
+
expect(styleElement!.attributes!.type).toBe("text/css");
|
|
351
400
|
});
|
|
352
401
|
});
|
|
353
402
|
|
|
354
|
-
describe(
|
|
355
|
-
it(
|
|
356
|
-
let html =
|
|
403
|
+
describe("Error Recovery and Edge Cases", () => {
|
|
404
|
+
it("should handle extreme nesting depth", () => {
|
|
405
|
+
let html = "";
|
|
357
406
|
const depth = 100;
|
|
358
|
-
|
|
407
|
+
|
|
359
408
|
for (let i = 0; i < depth; i++) {
|
|
360
409
|
html += `<div level="${i}">`;
|
|
361
410
|
}
|
|
362
|
-
html +=
|
|
411
|
+
html += "Deep content";
|
|
363
412
|
for (let i = 0; i < depth; i++) {
|
|
364
|
-
html +=
|
|
413
|
+
html += "</div>";
|
|
365
414
|
}
|
|
366
|
-
|
|
415
|
+
|
|
367
416
|
const ast = parseToAST(html);
|
|
368
|
-
|
|
417
|
+
|
|
369
418
|
let current = ast.children![0]!;
|
|
370
419
|
for (let i = 0; i < depth - 1; i++) {
|
|
371
|
-
expect(current.tagName).toBe(
|
|
420
|
+
expect(current.tagName).toBe("div");
|
|
372
421
|
expect(current.attributes!.level).toBe(i.toString());
|
|
373
|
-
current = current.children!.find(
|
|
422
|
+
current = current.children!.find(
|
|
423
|
+
(child) => child.type === ASTNodeType.Element,
|
|
424
|
+
)!;
|
|
374
425
|
}
|
|
375
|
-
|
|
376
|
-
const textNode = current.children!.find(
|
|
377
|
-
|
|
426
|
+
|
|
427
|
+
const textNode = current.children!.find(
|
|
428
|
+
(child) => child.type === ASTNodeType.Text,
|
|
429
|
+
)!;
|
|
430
|
+
expect((textNode as any).content).toBe("Deep content");
|
|
378
431
|
});
|
|
379
432
|
|
|
380
|
-
it(
|
|
381
|
-
const malformedHTML =
|
|
433
|
+
it("should handle malformed HTML gracefully", () => {
|
|
434
|
+
const malformedHTML = "<div><p><span>Text</div></span></p>";
|
|
382
435
|
const ast = parseToAST(malformedHTML);
|
|
383
|
-
|
|
436
|
+
|
|
384
437
|
const divElement = ast.children![0]!;
|
|
385
|
-
expect(divElement.tagName).toBe(
|
|
438
|
+
expect(divElement.tagName).toBe("div");
|
|
386
439
|
expect(divElement.children!.length).toBeGreaterThan(0);
|
|
387
440
|
});
|
|
388
441
|
|
|
389
|
-
it(
|
|
390
|
-
const html =
|
|
442
|
+
it("should handle orphaned closing tags", () => {
|
|
443
|
+
const html = "</div><p>Valid content</p></span>";
|
|
391
444
|
const ast = parseToAST(html);
|
|
392
|
-
|
|
445
|
+
|
|
393
446
|
const pElement = ast.children!.find(
|
|
394
|
-
child => child.type === ASTNodeType.Element && child.tagName ===
|
|
447
|
+
(child) => child.type === ASTNodeType.Element && child.tagName === "p",
|
|
395
448
|
)!;
|
|
396
449
|
expect(pElement).toBeDefined();
|
|
397
|
-
expect((pElement.children![0]! as any).content).toBe(
|
|
398
|
-
});
|
|
399
|
-
|
|
400
|
-
it.skip('should handle mixed content types in single document', () => {
|
|
401
|
-
const complexHTML = `
|
|
402
|
-
<?xml version="1.0"?>
|
|
403
|
-
<!DOCTYPE html>
|
|
404
|
-
<!-- Document start -->
|
|
405
|
-
<html>
|
|
406
|
-
<head>
|
|
407
|
-
<title>Test & Demo</title>
|
|
408
|
-
<![CDATA[Raw data here]]>
|
|
409
|
-
</head>
|
|
410
|
-
<body>
|
|
411
|
-
<h1>Main Title</h1>
|
|
412
|
-
<p>Paragraph with <strong>bold</strong> text.</p>
|
|
413
|
-
<!-- Body content -->
|
|
414
|
-
</body>
|
|
415
|
-
</html>
|
|
416
|
-
<!-- Document end -->
|
|
417
|
-
`;
|
|
418
|
-
|
|
419
|
-
const ast = parseToAST(complexHTML);
|
|
420
|
-
|
|
421
|
-
const nodeCounts: Record<string, number> = {
|
|
422
|
-
'processing-instruction': 0,
|
|
423
|
-
[ASTNodeType.Doctype]: 0,
|
|
424
|
-
[ASTNodeType.Comment]: 0,
|
|
425
|
-
[ASTNodeType.Element]: 0,
|
|
426
|
-
[ASTNodeType.Text]: 0,
|
|
427
|
-
[ASTNodeType.CDATA]: 0
|
|
428
|
-
};
|
|
429
|
-
|
|
430
|
-
const traverse = (node: ASTNode) => {
|
|
431
|
-
if (node.type in nodeCounts) {
|
|
432
|
-
nodeCounts[node.type]++;
|
|
433
|
-
}
|
|
434
|
-
if (node.children) {
|
|
435
|
-
node.children.forEach(traverse);
|
|
436
|
-
}
|
|
437
|
-
};
|
|
438
|
-
|
|
439
|
-
ast.children!.forEach(traverse);
|
|
440
|
-
|
|
441
|
-
expect(nodeCounts['processing-instruction']).toBeGreaterThan(0);
|
|
442
|
-
expect(nodeCounts[ASTNodeType.Doctype]).toBeGreaterThan(0);
|
|
443
|
-
expect(nodeCounts[ASTNodeType.Comment]).toBeGreaterThan(0);
|
|
444
|
-
expect(nodeCounts[ASTNodeType.Element]).toBeGreaterThan(0);
|
|
445
|
-
expect(nodeCounts[ASTNodeType.Text]).toBeGreaterThan(0);
|
|
446
|
-
expect(nodeCounts[ASTNodeType.CDATA]).toBeGreaterThan(0);
|
|
450
|
+
expect((pElement.children![0]! as any).content).toBe("Valid content");
|
|
447
451
|
});
|
|
448
452
|
});
|
|
449
453
|
|
|
450
|
-
describe(
|
|
451
|
-
it(
|
|
454
|
+
describe("Security and Template Edge Cases", () => {
|
|
455
|
+
it("should treat javascript: urls as regular attribute values", () => {
|
|
452
456
|
const html = `<a href="javascript:alert('XSS')">Click me</a>`;
|
|
453
457
|
const ast = parseToAST(html);
|
|
454
|
-
const aElement = ast.children!.find(child => child.tagName ===
|
|
458
|
+
const aElement = ast.children!.find((child) => child.tagName === "a")!;
|
|
455
459
|
expect(aElement).toBeDefined();
|
|
456
460
|
expect(aElement.attributes!.href).toBe("javascript:alert('XSS')");
|
|
457
461
|
});
|
|
458
462
|
|
|
459
|
-
it(
|
|
463
|
+
it("should correctly parse event handler attributes like onerror", () => {
|
|
460
464
|
const html = `<img src="invalid" onerror="alert('XSS')">`;
|
|
461
465
|
const ast = parseToAST(html);
|
|
462
|
-
const imgElement = ast.children!.find(
|
|
466
|
+
const imgElement = ast.children!.find(
|
|
467
|
+
(child) => child.tagName === "img",
|
|
468
|
+
)!;
|
|
463
469
|
expect(imgElement).toBeDefined();
|
|
464
470
|
expect(imgElement.attributes!.onerror).toBe("alert('XSS')");
|
|
465
471
|
});
|
|
466
472
|
|
|
467
|
-
it(
|
|
473
|
+
it("should treat template engine syntax as plain text", () => {
|
|
468
474
|
const html = `<div>{{ user.name }}</div><p>Hello, <%= name %></p>`;
|
|
469
475
|
const ast = parseToAST(html);
|
|
470
476
|
|
|
471
|
-
const divElement = ast.children!.find(
|
|
477
|
+
const divElement = ast.children!.find(
|
|
478
|
+
(child) => child.tagName === "div",
|
|
479
|
+
)!;
|
|
472
480
|
expect(divElement).toBeDefined();
|
|
473
|
-
const divText = divElement.children!.find(
|
|
474
|
-
|
|
481
|
+
const divText = divElement.children!.find(
|
|
482
|
+
(child) => child.type === ASTNodeType.Text,
|
|
483
|
+
)!;
|
|
484
|
+
expect((divText as any).content).toBe("{{ user.name }}");
|
|
475
485
|
|
|
476
|
-
const pElement = ast.children!.find(child => child.tagName ===
|
|
486
|
+
const pElement = ast.children!.find((child) => child.tagName === "p")!;
|
|
477
487
|
expect(pElement).toBeDefined();
|
|
478
|
-
const pText = pElement.children!.find(
|
|
479
|
-
|
|
488
|
+
const pText = pElement.children!.find(
|
|
489
|
+
(child) => child.type === ASTNodeType.Text,
|
|
490
|
+
)!;
|
|
491
|
+
expect((pText as any).content).toBe("Hello, <%= name %>");
|
|
480
492
|
});
|
|
481
493
|
|
|
482
|
-
it(
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
494
|
+
it("should handle null characters in content gracefully", () => {
|
|
495
|
+
const html = "<div>Hello\0World</div>";
|
|
496
|
+
const ast = parseToAST(html);
|
|
497
|
+
const divElement = ast.children!.find(
|
|
498
|
+
(child) => child.tagName === "div",
|
|
499
|
+
)!;
|
|
500
|
+
const textNode = divElement.children!.find(
|
|
501
|
+
(child) => child.type === ASTNodeType.Text,
|
|
502
|
+
)!;
|
|
503
|
+
expect((textNode as any).content).toBe("Hello\uFFFDWorld");
|
|
488
504
|
});
|
|
489
505
|
|
|
490
|
-
it(
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
506
|
+
it("should handle control characters in content", () => {
|
|
507
|
+
const html = "<div>Line1\x08\x09Line2\x0BLine3\x0CLine4\x0DLine5</div>";
|
|
508
|
+
const ast = parseToAST(html);
|
|
509
|
+
const divElement = ast.children!.find(
|
|
510
|
+
(child) => child.tagName === "div",
|
|
511
|
+
)!;
|
|
512
|
+
const textNode = divElement.children!.find(
|
|
513
|
+
(child) => child.type === ASTNodeType.Text,
|
|
514
|
+
)!;
|
|
515
|
+
expect((textNode as any).content).toContain("\x09");
|
|
516
|
+
expect((textNode as any).content).toContain("\x0D");
|
|
517
|
+
expect((textNode as any).content).toContain("Line1");
|
|
518
|
+
expect((textNode as any).content).toContain("Line5");
|
|
519
|
+
});
|
|
499
520
|
});
|
|
500
|
-
});
|
|
521
|
+
});
|