@tkeron/html-parser 0.1.7 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +5 -0
- package/index.ts +4 -0
- package/package.json +7 -1
- package/src/css-selector.ts +1 -1
- package/src/dom-simulator.ts +41 -17
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +509 -143
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +190 -118
- package/tests/advanced.test.ts +121 -108
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +9 -10
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +60 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +173 -193
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +25 -32
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +32 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/custom-elements.test.ts +0 -745
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/tests/advanced.test.ts
CHANGED
|
@@ -1,11 +1,32 @@
|
|
|
1
|
-
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
import { expect, test, describe, it } from 'bun:test';
|
|
2
3
|
import { tokenize, TokenType } from '../src/tokenizer';
|
|
3
|
-
import { parse, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
|
+
import { parse, ASTNodeType, domToAST, type ASTNode } from '../src/parser';
|
|
5
|
+
|
|
6
|
+
function parseToAST(html: string): ASTNode {
|
|
7
|
+
const tokens = tokenize(html);
|
|
8
|
+
const dom = parse(tokens);
|
|
9
|
+
const ast = domToAST(dom);
|
|
10
|
+
|
|
11
|
+
const hasExplicitHtml = html.includes('<html') || html.includes('<!DOCTYPE') || html.includes('<!doctype');
|
|
12
|
+
if (hasExplicitHtml) {
|
|
13
|
+
return ast;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
const htmlEl = ast.children?.find(c => c.tagName === 'html');
|
|
17
|
+
if (htmlEl) {
|
|
18
|
+
const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
|
|
19
|
+
if (bodyEl && bodyEl.children) {
|
|
20
|
+
return { type: ASTNodeType.Document, children: bodyEl.children };
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
return ast;
|
|
24
|
+
}
|
|
4
25
|
|
|
5
26
|
describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
6
27
|
|
|
7
28
|
describe('Tokenizer Edge Cases', () => {
|
|
8
|
-
|
|
29
|
+
it('should handle attributes with no spaces', () => {
|
|
9
30
|
const tokens = tokenize('<div class="test"id="main"data-value="123">');
|
|
10
31
|
expect(tokens.length).toBeGreaterThan(0);
|
|
11
32
|
const tag = tokens[0]!;
|
|
@@ -17,7 +38,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
17
38
|
});
|
|
18
39
|
});
|
|
19
40
|
|
|
20
|
-
|
|
41
|
+
it('should handle mixed quote styles', () => {
|
|
21
42
|
const tokens = tokenize(`<div class='single' id="double" data-test='mix "quoted" content'>`);
|
|
22
43
|
expect(tokens.length).toBeGreaterThan(0);
|
|
23
44
|
const tag = tokens[0]!;
|
|
@@ -27,7 +48,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
27
48
|
expect(tag.attributes!['data-test']).toBe('mix "quoted" content');
|
|
28
49
|
});
|
|
29
50
|
|
|
30
|
-
|
|
51
|
+
it('should handle unicode characters', () => {
|
|
31
52
|
const tokens = tokenize('<div title="测试" data-emoji="🚀" class="lorem">');
|
|
32
53
|
expect(tokens.length).toBeGreaterThan(0);
|
|
33
54
|
const tag = tokens[0]!;
|
|
@@ -39,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
39
60
|
});
|
|
40
61
|
});
|
|
41
62
|
|
|
42
|
-
|
|
63
|
+
it('should handle complex CDATA content as bogus comment', () => {
|
|
43
64
|
const complexContent = `
|
|
44
65
|
function test() {
|
|
45
66
|
return "<div>HTML inside JS</div>";
|
|
@@ -50,11 +71,11 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
50
71
|
expect(tokens.length).toBeGreaterThan(0);
|
|
51
72
|
const cdataToken = tokens[0]!;
|
|
52
73
|
|
|
53
|
-
expect(cdataToken.type).toBe(TokenType.
|
|
54
|
-
expect(cdataToken.value).toBe(complexContent);
|
|
74
|
+
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
75
|
+
expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
|
|
55
76
|
});
|
|
56
77
|
|
|
57
|
-
|
|
78
|
+
it('should handle performance with large documents', () => {
|
|
58
79
|
let html = '<div>';
|
|
59
80
|
for (let i = 0; i < 1000; i++) {
|
|
60
81
|
html += `<p id="para-${i}">Content ${i}</p>`;
|
|
@@ -71,23 +92,19 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
71
92
|
});
|
|
72
93
|
|
|
73
94
|
describe('Parser DOM-like Functionality', () => {
|
|
74
|
-
|
|
75
|
-
const
|
|
76
|
-
const ast = parse(tokens);
|
|
95
|
+
it('should create proper parent-child relationships', () => {
|
|
96
|
+
const ast = parseToAST('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
|
|
77
97
|
|
|
78
98
|
const divElement = ast.children![0]!;
|
|
79
99
|
const sectionElement = divElement.children![0]!;
|
|
80
100
|
const articleElement = sectionElement.children![0]!;
|
|
81
101
|
|
|
82
|
-
expect(sectionElement.parent).toBe(divElement);
|
|
83
|
-
expect(articleElement.parent).toBe(sectionElement);
|
|
84
|
-
|
|
85
102
|
expect(articleElement.children).toHaveLength(2);
|
|
86
103
|
expect(articleElement.children![0]!.tagName).toBe('h1');
|
|
87
104
|
expect(articleElement.children![1]!.tagName).toBe('p');
|
|
88
105
|
});
|
|
89
106
|
|
|
90
|
-
|
|
107
|
+
it('should handle complex navigation scenarios', () => {
|
|
91
108
|
const html = `
|
|
92
109
|
<nav>
|
|
93
110
|
<ul>
|
|
@@ -97,8 +114,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
97
114
|
</ul>
|
|
98
115
|
</nav>
|
|
99
116
|
`;
|
|
100
|
-
const
|
|
101
|
-
const ast = parse(tokens);
|
|
117
|
+
const ast = parseToAST(html);
|
|
102
118
|
|
|
103
119
|
const navElement = ast.children!.find(child => child.tagName === 'nav')!;
|
|
104
120
|
const ulElement = navElement.children!.find(child => child.tagName === 'ul')!;
|
|
@@ -109,11 +125,11 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
109
125
|
liElements.forEach((li, index) => {
|
|
110
126
|
const anchor = li.children!.find(child => child.tagName === 'a')!;
|
|
111
127
|
expect(anchor.attributes!.href).toBeDefined();
|
|
112
|
-
expect(anchor.children![0]!.type).toBe(ASTNodeType.
|
|
128
|
+
expect(anchor.children![0]!.type).toBe(ASTNodeType.Text);
|
|
113
129
|
});
|
|
114
130
|
});
|
|
115
131
|
|
|
116
|
-
|
|
132
|
+
it('should handle form elements with complex attributes', () => {
|
|
117
133
|
const html = `
|
|
118
134
|
<form action="/submit" method="post">
|
|
119
135
|
<input type="email" name="email" required pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$">
|
|
@@ -124,8 +140,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
124
140
|
<textarea name="comments" rows="4" cols="50"></textarea>
|
|
125
141
|
</form>
|
|
126
142
|
`;
|
|
127
|
-
const
|
|
128
|
-
const ast = parse(tokens);
|
|
143
|
+
const ast = parseToAST(html);
|
|
129
144
|
|
|
130
145
|
const formElement = ast.children!.find(child => child.tagName === 'form')!;
|
|
131
146
|
expect(formElement.attributes!.action).toBe('/submit');
|
|
@@ -133,7 +148,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
133
148
|
|
|
134
149
|
const formElements: ASTNode[] = [];
|
|
135
150
|
const traverse = (node: ASTNode) => {
|
|
136
|
-
if (node.type === ASTNodeType.
|
|
151
|
+
if (node.type === ASTNodeType.Element) {
|
|
137
152
|
if (['input', 'select', 'textarea', 'option'].includes(node.tagName!)) {
|
|
138
153
|
formElements.push(node);
|
|
139
154
|
}
|
|
@@ -154,7 +169,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
154
169
|
expect(selectElement!.attributes!.multiple).toBe('');
|
|
155
170
|
});
|
|
156
171
|
|
|
157
|
-
|
|
172
|
+
it('should handle table structures', () => {
|
|
158
173
|
const html = `
|
|
159
174
|
<table>
|
|
160
175
|
<thead>
|
|
@@ -175,8 +190,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
175
190
|
</tbody>
|
|
176
191
|
</table>
|
|
177
192
|
`;
|
|
178
|
-
const
|
|
179
|
-
const ast = parse(tokens);
|
|
193
|
+
const ast = parseToAST(html);
|
|
180
194
|
|
|
181
195
|
const tableElement = ast.children!.find(child => child.tagName === 'table')!;
|
|
182
196
|
|
|
@@ -200,14 +214,13 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
200
214
|
expect(rows).toHaveLength(3);
|
|
201
215
|
});
|
|
202
216
|
|
|
203
|
-
|
|
217
|
+
it('should handle mixed inline content', () => {
|
|
204
218
|
const html = `
|
|
205
219
|
<p>This is <strong>bold</strong> and <em>italic</em>.
|
|
206
220
|
Here's a <a href="https://example.com">link</a> and
|
|
207
221
|
<code>inline code</code>.</p>
|
|
208
222
|
`;
|
|
209
|
-
const
|
|
210
|
-
const ast = parse(tokens);
|
|
223
|
+
const ast = parseToAST(html);
|
|
211
224
|
|
|
212
225
|
const pElement = ast.children!.find(child => child.tagName === 'p')!;
|
|
213
226
|
|
|
@@ -215,9 +228,9 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
215
228
|
let elementNodes = 0;
|
|
216
229
|
|
|
217
230
|
const traverse = (node: ASTNode) => {
|
|
218
|
-
if (node.type === ASTNodeType.
|
|
231
|
+
if (node.type === ASTNodeType.Text && (node as any).content?.trim()) {
|
|
219
232
|
textNodes++;
|
|
220
|
-
} else if (node.type === ASTNodeType.
|
|
233
|
+
} else if (node.type === ASTNodeType.Element) {
|
|
221
234
|
elementNodes++;
|
|
222
235
|
}
|
|
223
236
|
if (node.children) {
|
|
@@ -233,7 +246,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
233
246
|
expect(textNodes).toBeGreaterThan(0);
|
|
234
247
|
});
|
|
235
248
|
|
|
236
|
-
|
|
249
|
+
it('should preserve complete document structure', () => {
|
|
237
250
|
const html = `<!DOCTYPE html>
|
|
238
251
|
<html lang="en">
|
|
239
252
|
<head>
|
|
@@ -258,10 +271,9 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
258
271
|
</body>
|
|
259
272
|
</html>`;
|
|
260
273
|
|
|
261
|
-
const
|
|
262
|
-
const ast = parse(tokens);
|
|
274
|
+
const ast = parseToAST(html);
|
|
263
275
|
|
|
264
|
-
const doctype = ast.children!.find(child => child.type === ASTNodeType.
|
|
276
|
+
const doctype = ast.children!.find(child => child.type === ASTNodeType.Doctype);
|
|
265
277
|
expect(doctype).toBeDefined();
|
|
266
278
|
|
|
267
279
|
const htmlElement = ast.children!.find(child => child.tagName === 'html')!;
|
|
@@ -286,7 +298,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
286
298
|
});
|
|
287
299
|
|
|
288
300
|
describe('Real-world Content Handling', () => {
|
|
289
|
-
|
|
301
|
+
it('should handle SVG content', () => {
|
|
290
302
|
const svg = `
|
|
291
303
|
<svg width="100" height="100" xmlns="http://www.w3.org/2000/svg">
|
|
292
304
|
<circle cx="50" cy="50" r="40" fill="red"/>
|
|
@@ -294,8 +306,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
294
306
|
</svg>
|
|
295
307
|
`;
|
|
296
308
|
|
|
297
|
-
const
|
|
298
|
-
const ast = parse(tokens);
|
|
309
|
+
const ast = parseToAST(svg);
|
|
299
310
|
|
|
300
311
|
const svgElement = ast.children!.find(child => child.tagName === 'svg')!;
|
|
301
312
|
expect(svgElement.attributes!.xmlns).toBe('http://www.w3.org/2000/svg');
|
|
@@ -305,23 +316,35 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
305
316
|
expect(circleElement!.attributes!.fill).toBe('red');
|
|
306
317
|
});
|
|
307
318
|
|
|
308
|
-
|
|
319
|
+
it('should handle script and style tags', () => {
|
|
309
320
|
const html = `
|
|
310
|
-
<
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
321
|
+
<body>
|
|
322
|
+
<script type="text/javascript">
|
|
323
|
+
function hello() {
|
|
324
|
+
alert("Hello");
|
|
325
|
+
}
|
|
326
|
+
</script>
|
|
327
|
+
<style type="text/css">
|
|
328
|
+
.class { color: red; }
|
|
329
|
+
</style>
|
|
330
|
+
</body>
|
|
318
331
|
`;
|
|
319
332
|
|
|
320
|
-
const
|
|
321
|
-
|
|
333
|
+
const ast = parseToAST(html);
|
|
334
|
+
|
|
335
|
+
function findByTagName(node: ASTNode, tagName: string): ASTNode | null {
|
|
336
|
+
if (node.tagName === tagName) return node;
|
|
337
|
+
if (node.children) {
|
|
338
|
+
for (const child of node.children) {
|
|
339
|
+
const found = findByTagName(child, tagName);
|
|
340
|
+
if (found) return found;
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
return null;
|
|
344
|
+
}
|
|
322
345
|
|
|
323
|
-
const scriptElement = ast
|
|
324
|
-
const styleElement = ast
|
|
346
|
+
const scriptElement = findByTagName(ast, 'script');
|
|
347
|
+
const styleElement = findByTagName(ast, 'style');
|
|
325
348
|
|
|
326
349
|
expect(scriptElement!.attributes!.type).toBe('text/javascript');
|
|
327
350
|
expect(styleElement!.attributes!.type).toBe('text/css');
|
|
@@ -329,7 +352,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
329
352
|
});
|
|
330
353
|
|
|
331
354
|
describe('Error Recovery and Edge Cases', () => {
|
|
332
|
-
|
|
355
|
+
it('should handle extreme nesting depth', () => {
|
|
333
356
|
let html = '';
|
|
334
357
|
const depth = 100;
|
|
335
358
|
|
|
@@ -341,43 +364,40 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
341
364
|
html += '</div>';
|
|
342
365
|
}
|
|
343
366
|
|
|
344
|
-
const
|
|
345
|
-
const ast = parse(tokens);
|
|
367
|
+
const ast = parseToAST(html);
|
|
346
368
|
|
|
347
369
|
let current = ast.children![0]!;
|
|
348
370
|
for (let i = 0; i < depth - 1; i++) {
|
|
349
371
|
expect(current.tagName).toBe('div');
|
|
350
372
|
expect(current.attributes!.level).toBe(i.toString());
|
|
351
|
-
current = current.children!.find(child => child.type === ASTNodeType.
|
|
373
|
+
current = current.children!.find(child => child.type === ASTNodeType.Element)!;
|
|
352
374
|
}
|
|
353
375
|
|
|
354
|
-
const textNode = current.children!.find(child => child.type === ASTNodeType.
|
|
355
|
-
expect(textNode.content).toBe('Deep content');
|
|
376
|
+
const textNode = current.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
377
|
+
expect((textNode as any).content).toBe('Deep content');
|
|
356
378
|
});
|
|
357
379
|
|
|
358
|
-
|
|
380
|
+
it('should handle malformed HTML gracefully', () => {
|
|
359
381
|
const malformedHTML = '<div><p><span>Text</div></span></p>';
|
|
360
|
-
const
|
|
361
|
-
const ast = parse(tokens);
|
|
382
|
+
const ast = parseToAST(malformedHTML);
|
|
362
383
|
|
|
363
384
|
const divElement = ast.children![0]!;
|
|
364
385
|
expect(divElement.tagName).toBe('div');
|
|
365
386
|
expect(divElement.children!.length).toBeGreaterThan(0);
|
|
366
387
|
});
|
|
367
388
|
|
|
368
|
-
|
|
389
|
+
it('should handle orphaned closing tags', () => {
|
|
369
390
|
const html = '</div><p>Valid content</p></span>';
|
|
370
|
-
const
|
|
371
|
-
const ast = parse(tokens);
|
|
391
|
+
const ast = parseToAST(html);
|
|
372
392
|
|
|
373
393
|
const pElement = ast.children!.find(
|
|
374
|
-
child => child.type === ASTNodeType.
|
|
394
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'p'
|
|
375
395
|
)!;
|
|
376
396
|
expect(pElement).toBeDefined();
|
|
377
|
-
expect(pElement.children![0]
|
|
397
|
+
expect((pElement.children![0]! as any).content).toBe('Valid content');
|
|
378
398
|
});
|
|
379
399
|
|
|
380
|
-
|
|
400
|
+
it.skip('should handle mixed content types in single document', () => {
|
|
381
401
|
const complexHTML = `
|
|
382
402
|
<?xml version="1.0"?>
|
|
383
403
|
<!DOCTYPE html>
|
|
@@ -396,21 +416,20 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
396
416
|
<!-- Document end -->
|
|
397
417
|
`;
|
|
398
418
|
|
|
399
|
-
const
|
|
400
|
-
const ast = parse(tokens);
|
|
419
|
+
const ast = parseToAST(complexHTML);
|
|
401
420
|
|
|
402
|
-
const nodeCounts = {
|
|
403
|
-
|
|
404
|
-
[ASTNodeType.
|
|
405
|
-
[ASTNodeType.
|
|
406
|
-
[ASTNodeType.
|
|
407
|
-
[ASTNodeType.
|
|
421
|
+
const nodeCounts: Record<string, number> = {
|
|
422
|
+
'processing-instruction': 0,
|
|
423
|
+
[ASTNodeType.Doctype]: 0,
|
|
424
|
+
[ASTNodeType.Comment]: 0,
|
|
425
|
+
[ASTNodeType.Element]: 0,
|
|
426
|
+
[ASTNodeType.Text]: 0,
|
|
408
427
|
[ASTNodeType.CDATA]: 0
|
|
409
428
|
};
|
|
410
429
|
|
|
411
430
|
const traverse = (node: ASTNode) => {
|
|
412
431
|
if (node.type in nodeCounts) {
|
|
413
|
-
nodeCounts[node.type
|
|
432
|
+
nodeCounts[node.type]++;
|
|
414
433
|
}
|
|
415
434
|
if (node.children) {
|
|
416
435
|
node.children.forEach(traverse);
|
|
@@ -419,69 +438,63 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
419
438
|
|
|
420
439
|
ast.children!.forEach(traverse);
|
|
421
440
|
|
|
422
|
-
expect(nodeCounts[
|
|
423
|
-
expect(nodeCounts[ASTNodeType.
|
|
424
|
-
expect(nodeCounts[ASTNodeType.
|
|
425
|
-
expect(nodeCounts[ASTNodeType.
|
|
426
|
-
expect(nodeCounts[ASTNodeType.
|
|
441
|
+
expect(nodeCounts['processing-instruction']).toBeGreaterThan(0);
|
|
442
|
+
expect(nodeCounts[ASTNodeType.Doctype]).toBeGreaterThan(0);
|
|
443
|
+
expect(nodeCounts[ASTNodeType.Comment]).toBeGreaterThan(0);
|
|
444
|
+
expect(nodeCounts[ASTNodeType.Element]).toBeGreaterThan(0);
|
|
445
|
+
expect(nodeCounts[ASTNodeType.Text]).toBeGreaterThan(0);
|
|
427
446
|
expect(nodeCounts[ASTNodeType.CDATA]).toBeGreaterThan(0);
|
|
428
447
|
});
|
|
429
448
|
});
|
|
430
449
|
|
|
431
450
|
describe('Security and Template Edge Cases', () => {
|
|
432
|
-
|
|
451
|
+
it('should treat javascript: urls as regular attribute values', () => {
|
|
433
452
|
const html = `<a href="javascript:alert('XSS')">Click me</a>`;
|
|
434
|
-
const
|
|
435
|
-
const ast = parse(tokens);
|
|
453
|
+
const ast = parseToAST(html);
|
|
436
454
|
const aElement = ast.children!.find(child => child.tagName === 'a')!;
|
|
437
455
|
expect(aElement).toBeDefined();
|
|
438
456
|
expect(aElement.attributes!.href).toBe("javascript:alert('XSS')");
|
|
439
457
|
});
|
|
440
458
|
|
|
441
|
-
|
|
459
|
+
it('should correctly parse event handler attributes like onerror', () => {
|
|
442
460
|
const html = `<img src="invalid" onerror="alert('XSS')">`;
|
|
443
|
-
const
|
|
444
|
-
const ast = parse(tokens);
|
|
461
|
+
const ast = parseToAST(html);
|
|
445
462
|
const imgElement = ast.children!.find(child => child.tagName === 'img')!;
|
|
446
463
|
expect(imgElement).toBeDefined();
|
|
447
464
|
expect(imgElement.attributes!.onerror).toBe("alert('XSS')");
|
|
448
465
|
});
|
|
449
466
|
|
|
450
|
-
|
|
467
|
+
it('should treat template engine syntax as plain text', () => {
|
|
451
468
|
const html = `<div>{{ user.name }}</div><p>Hello, <%= name %></p>`;
|
|
452
|
-
const
|
|
453
|
-
const ast = parse(tokens);
|
|
469
|
+
const ast = parseToAST(html);
|
|
454
470
|
|
|
455
471
|
const divElement = ast.children!.find(child => child.tagName === 'div')!;
|
|
456
472
|
expect(divElement).toBeDefined();
|
|
457
|
-
const divText = divElement.children!.find(child => child.type === ASTNodeType.
|
|
458
|
-
expect(divText.content).toBe('{{ user.name }}');
|
|
473
|
+
const divText = divElement.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
474
|
+
expect((divText as any).content).toBe('{{ user.name }}');
|
|
459
475
|
|
|
460
476
|
const pElement = ast.children!.find(child => child.tagName === 'p')!;
|
|
461
477
|
expect(pElement).toBeDefined();
|
|
462
|
-
const pText = pElement.children!.find(child => child.type === ASTNodeType.
|
|
463
|
-
expect(pText.content).toBe('Hello, <%= name %>');
|
|
478
|
+
const pText = pElement.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
479
|
+
expect((pText as any).content).toBe('Hello, <%= name %>');
|
|
464
480
|
});
|
|
465
481
|
|
|
466
|
-
|
|
482
|
+
it('should handle null characters in content gracefully', () => {
|
|
467
483
|
const html = '<div>Hello\0World</div>';
|
|
468
|
-
const
|
|
469
|
-
const ast = parse(tokens);
|
|
484
|
+
const ast = parseToAST(html);
|
|
470
485
|
const divElement = ast.children!.find(child => child.tagName === 'div')!;
|
|
471
|
-
const textNode = divElement.children!.find(child => child.type === ASTNodeType.
|
|
472
|
-
expect(textNode.content).toBe('Hello\uFFFDWorld');
|
|
486
|
+
const textNode = divElement.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
487
|
+
expect((textNode as any).content).toBe('Hello\uFFFDWorld');
|
|
473
488
|
});
|
|
474
489
|
|
|
475
|
-
|
|
490
|
+
it('should handle control characters in content', () => {
|
|
476
491
|
const html = '<div>Line1\x08\x09Line2\x0BLine3\x0CLine4\x0DLine5</div>';
|
|
477
|
-
const
|
|
478
|
-
const ast = parse(tokens);
|
|
492
|
+
const ast = parseToAST(html);
|
|
479
493
|
const divElement = ast.children!.find(child => child.tagName === 'div')!;
|
|
480
|
-
const textNode = divElement.children!.find(child => child.type === ASTNodeType.
|
|
481
|
-
expect(textNode.content).toContain('\x09');
|
|
482
|
-
expect(textNode.content).toContain('\x0D');
|
|
483
|
-
expect(textNode.content).toContain('Line1');
|
|
484
|
-
expect(textNode.content).toContain('Line5');
|
|
485
|
-
});
|
|
494
|
+
const textNode = divElement.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
495
|
+
expect((textNode as any).content).toContain('\x09');
|
|
496
|
+
expect((textNode as any).content).toContain('\x0D');
|
|
497
|
+
expect((textNode as any).content).toContain('Line1');
|
|
498
|
+
expect((textNode as any).content).toContain('Line5'); });
|
|
486
499
|
});
|
|
487
|
-
});
|
|
500
|
+
});
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { describe, it, expect } from 'bun:test';
|
|
2
|
+
import { parseHTML } from '../index';
|
|
3
|
+
|
|
4
|
+
describe('Custom Elements in <head>', () => {
|
|
5
|
+
|
|
6
|
+
it('should keep <meta-tags> custom element in head', () => {
|
|
7
|
+
const doc = parseHTML(
|
|
8
|
+
'<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
|
|
9
|
+
);
|
|
10
|
+
|
|
11
|
+
const metaTags = doc.head?.querySelector('meta-tags');
|
|
12
|
+
expect(metaTags).toBeTruthy();
|
|
13
|
+
expect(metaTags?.parentElement?.tagName).toBe('HEAD');
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
it('should keep <social-meta> custom element in head', () => {
|
|
17
|
+
const doc = parseHTML(
|
|
18
|
+
'<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
|
|
19
|
+
);
|
|
20
|
+
|
|
21
|
+
const socialMeta = doc.head?.querySelector('social-meta');
|
|
22
|
+
expect(socialMeta).toBeTruthy();
|
|
23
|
+
expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
it('should keep any <custom-element> with hyphen in head', () => {
|
|
27
|
+
const doc = parseHTML(
|
|
28
|
+
'<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
|
|
29
|
+
);
|
|
30
|
+
|
|
31
|
+
const myComponent = doc.head?.querySelector('my-component');
|
|
32
|
+
expect(myComponent).toBeTruthy();
|
|
33
|
+
expect(myComponent?.parentElement?.tagName).toBe('HEAD');
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('should still eject non-custom elements like <div> to body', () => {
|
|
37
|
+
const doc = parseHTML(
|
|
38
|
+
'<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
const divInHead = doc.head?.querySelector('div');
|
|
42
|
+
const divInBody = doc.body?.querySelector('div');
|
|
43
|
+
expect(divInHead).toBeFalsy();
|
|
44
|
+
expect(divInBody).toBeTruthy();
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
it('should handle nested custom elements in head', () => {
|
|
48
|
+
const doc = parseHTML(
|
|
49
|
+
'<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
|
|
50
|
+
);
|
|
51
|
+
|
|
52
|
+
const myWrapper = doc.head?.querySelector('my-wrapper');
|
|
53
|
+
expect(myWrapper).toBeTruthy();
|
|
54
|
+
expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
|
|
55
|
+
|
|
56
|
+
const innerComp = myWrapper?.querySelector('inner-comp');
|
|
57
|
+
expect(innerComp).toBeTruthy();
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('should keep custom elements with attributes in head', () => {
|
|
61
|
+
const doc = parseHTML(
|
|
62
|
+
'<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
const seoMeta = doc.head?.querySelector('seo-meta');
|
|
66
|
+
expect(seoMeta).toBeTruthy();
|
|
67
|
+
expect(seoMeta?.getAttribute('property')).toBe('og:title');
|
|
68
|
+
expect(seoMeta?.getAttribute('content')).toBe('Test');
|
|
69
|
+
expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it('should keep self-closing custom elements in head', () => {
|
|
73
|
+
const doc = parseHTML(
|
|
74
|
+
'<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
|
|
75
|
+
);
|
|
76
|
+
|
|
77
|
+
const customVoid = doc.head?.querySelector('custom-void');
|
|
78
|
+
expect(customVoid).toBeTruthy();
|
|
79
|
+
expect(customVoid?.parentElement?.tagName).toBe('HEAD');
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
it('should handle custom elements mixed with standard head elements', () => {
|
|
83
|
+
const doc = parseHTML(
|
|
84
|
+
'<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
|
|
85
|
+
);
|
|
86
|
+
|
|
87
|
+
const title = doc.head?.querySelector('title');
|
|
88
|
+
const metaTags = doc.head?.querySelector('meta-tags');
|
|
89
|
+
const link = doc.head?.querySelector('link');
|
|
90
|
+
|
|
91
|
+
expect(title).toBeTruthy();
|
|
92
|
+
expect(metaTags).toBeTruthy();
|
|
93
|
+
expect(link).toBeTruthy();
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
it('should handle custom element containing text in head', () => {
|
|
97
|
+
const doc = parseHTML(
|
|
98
|
+
'<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
const inlineScript = doc.head?.querySelector('inline-script');
|
|
102
|
+
expect(inlineScript).toBeTruthy();
|
|
103
|
+
expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
|
|
104
|
+
});
|
|
105
|
+
});
|