@tkeron/html-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +120 -0
- package/bun.lock +29 -0
- package/index.ts +18 -0
- package/package.json +25 -0
- package/src/css-selector.ts +172 -0
- package/src/dom-simulator.ts +592 -0
- package/src/dom-types.ts +78 -0
- package/src/parser.ts +355 -0
- package/src/tokenizer.ts +413 -0
- package/tests/advanced.test.ts +487 -0
- package/tests/api-integration.test.ts +114 -0
- package/tests/dom-extended.test.ts +173 -0
- package/tests/dom.test.ts +482 -0
- package/tests/google-dom.test.ts +118 -0
- package/tests/google-homepage.txt +13 -0
- package/tests/official/README.md +87 -0
- package/tests/official/acid/acid-tests.test.ts +309 -0
- package/tests/official/final-output/final-output.test.ts +361 -0
- package/tests/official/html5lib/tokenizer-utils.ts +204 -0
- package/tests/official/html5lib/tokenizer.test.ts +184 -0
- package/tests/official/html5lib/tree-construction-utils.ts +208 -0
- package/tests/official/html5lib/tree-construction.test.ts +250 -0
- package/tests/official/validator/validator-tests.test.ts +237 -0
- package/tests/official/validator-nu/validator-nu.test.ts +335 -0
- package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
- package/tests/official/wpt/wpt-tests.test.ts +409 -0
- package/tests/parser.test.ts +642 -0
- package/tests/selectors.test.ts +65 -0
- package/tests/test-page-0.txt +362 -0
- package/tests/tokenizer.test.ts +666 -0
- package/tsconfig.json +25 -0
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
import { expect, test, describe } from 'bun:test';
|
|
2
|
+
import { tokenize, TokenType } from '../src/tokenizer';
|
|
3
|
+
import { parse, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
|
+
|
|
5
|
+
describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
6
|
+
|
|
7
|
+
describe('Tokenizer Edge Cases', () => {
|
|
8
|
+
// Attributes written back-to-back, with no whitespace after a closing quote,
// must still come out as separate name/value pairs on the first token.
test('should handle attributes with no spaces', () => {
  const markup = '<div class="test"id="main"data-value="123">';
  const tokenList = tokenize(markup);
  expect(tokenList.length).toBeGreaterThan(0);

  const openingTag = tokenList[0]!;
  const expectedAttributes = {
    class: 'test',
    id: 'main',
    'data-value': '123',
  };
  expect(openingTag.attributes).toEqual(expectedAttributes);
});
|
|
19
|
+
|
|
20
|
+
// Single- and double-quoted values may be mixed on one tag, and a single-quoted
// value may itself contain double quotes.
test('should handle mixed quote styles', () => {
  const markup = `<div class='single' id="double" data-test='mix "quoted" content'>`;
  const tokenList = tokenize(markup);
  expect(tokenList.length).toBeGreaterThan(0);

  const openingTag = tokenList[0]!;
  const attrs = openingTag.attributes!;

  expect(attrs.class).toBe('single');
  expect(attrs.id).toBe('double');
  expect(attrs['data-test']).toBe('mix "quoted" content');
});
|
|
29
|
+
|
|
30
|
+
// Attribute values containing CJK text, an emoji and an accented Latin letter
// are expected to be returned unchanged.
test('should handle unicode characters', () => {
  const markup = '<div title="测试" data-emoji="🚀" class="café">';
  const tokenList = tokenize(markup);
  expect(tokenList.length).toBeGreaterThan(0);

  const openingTag = tokenList[0]!;
  const expectedAttributes = {
    title: '测试',
    'data-emoji': '🚀',
    class: 'café',
  };
  expect(openingTag.attributes).toEqual(expectedAttributes);
});
|
|
41
|
+
|
|
42
|
+
// A CDATA section whose payload spans several lines and contains markup-like
// text should yield a CDATA token whose value is the payload itself.
test('should handle complex CDATA content', () => {
  const payload = `
      function test() {
        return "<div>HTML inside JS</div>";
      }
      var x = "String with <tags>";
    `;
  const wrapped = `<![CDATA[${payload}]]>`;

  const tokenList = tokenize(wrapped);
  expect(tokenList.length).toBeGreaterThan(0);

  const firstToken = tokenList[0]!;
  expect(firstToken.type).toBe(TokenType.CDATA);
  expect(firstToken.value).toBe(payload);
});
|
|
56
|
+
|
|
57
|
+
// Smoke-checks tokenizer speed: a <div> wrapping 1000 sibling paragraphs must
// tokenize into more than 2000 tokens in under one second.
test('should handle performance with large documents', () => {
  const paragraphCount = 1000;
  const paragraphs = Array.from(
    { length: paragraphCount },
    (_, i) => `<p id="para-${i}">Content ${i}</p>`,
  );
  const html = `<div>${paragraphs.join('')}</div>`;

  // performance.now() is monotonic with sub-millisecond resolution, so the
  // measured duration cannot be distorted by wall-clock adjustments the way a
  // Date.now() difference can.
  const startedAt = performance.now();
  const tokens = tokenize(html);
  const elapsedMs = performance.now() - startedAt;

  expect(tokens.length).toBeGreaterThan(2000);
  expect(elapsedMs).toBeLessThan(1000);
});
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
describe('Parser DOM-like Functionality', () => {
|
|
74
|
+
// Nested elements must point back at their parent node, and an element's
// children must appear in source order.
test('should create proper parent-child relationships', () => {
  const markup = '<div><section><article><h1>Title</h1><p>Content</p></article></section></div>';
  const root = parse(tokenize(markup));

  const outerDiv = root.children![0]!;
  const section = outerDiv.children![0]!;
  const article = section.children![0]!;

  expect(section.parent).toBe(outerDiv);
  expect(article.parent).toBe(section);

  const articleChildren = article.children;
  expect(articleChildren).toHaveLength(2);
  expect(articleChildren![0]!.tagName).toBe('h1');
  expect(articleChildren![1]!.tagName).toBe('p');
});
|
|
89
|
+
|
|
90
|
+
// Parses a <nav> containing a three-item link list and checks that every <li>
// holds an anchor with the expected href and a text node as its first child.
test('should handle complex navigation scenarios', () => {
  const html = `
      <nav>
        <ul>
          <li><a href="#home">Home</a></li>
          <li><a href="#about">About</a></li>
          <li><a href="#contact">Contact</a></li>
        </ul>
      </nav>
    `;
  const ast = parse(tokenize(html));

  // The markup is surrounded by whitespace, so elements are looked up by tag
  // name rather than by position among their siblings.
  const navElement = ast.children!.find(child => child.tagName === 'nav')!;
  const ulElement = navElement.children!.find(child => child.tagName === 'ul')!;
  const liElements = ulElement.children!.filter(child => child.tagName === 'li');

  expect(liElements).toHaveLength(3);

  const anchors = liElements.map(li => li.children!.find(child => child.tagName === 'a')!);

  // Pin the exact href values instead of merely checking they are defined, so
  // a truncated or swapped attribute value is caught.
  expect(anchors.map(anchor => anchor.attributes!.href)).toEqual(['#home', '#about', '#contact']);

  for (const anchor of anchors) {
    expect(anchor.children![0]!.type).toBe(ASTNodeType.TEXT);
  }
});
|
|
115
|
+
|
|
116
|
+
// Parses a form and checks its own attributes, then collects every form-control
// descendant to verify boolean attributes (stored as '') and a regex pattern value.
test('should handle form elements with complex attributes', () => {
  const markup = `
      <form action="/submit" method="post">
        <input type="email" name="email" required pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$">
        <select name="country" multiple>
          <option value="us" selected>United States</option>
          <option value="ca">Canada</option>
        </select>
        <textarea name="comments" rows="4" cols="50"></textarea>
      </form>
    `;
  const root = parse(tokenize(markup));

  const form = root.children!.find(child => child.tagName === 'form')!;
  expect(form.attributes!.action).toBe('/submit');
  expect(form.attributes!.method).toBe('post');

  // Depth-first, document-order walk gathering the control elements.
  const controlTags = ['input', 'select', 'textarea', 'option'];
  const controls: ASTNode[] = [];
  function collectControls(node: ASTNode): void {
    const isControl = node.type === ASTNodeType.ELEMENT && controlTags.includes(node.tagName!);
    if (isControl) {
      controls.push(node);
    }
    if (node.children) {
      for (const child of node.children) {
        collectControls(child);
      }
    }
  }
  collectControls(form);

  expect(controls.length).toBeGreaterThan(3);

  const emailField = controls.find(el => el.attributes?.name === 'email');
  expect(emailField!.attributes!.required).toBe('');
  expect(emailField!.attributes!.pattern).toContain('@');

  const countrySelect = controls.find(el => el.tagName === 'select');
  expect(countrySelect!.attributes!.multiple).toBe('');
});
|
|
156
|
+
|
|
157
|
+
// Parses a table with a header section and a two-row body, then checks that both
// sections exist and that three <tr> elements are found anywhere under <table>.
test('should handle table structures', () => {
  const markup = `
      <table>
        <thead>
          <tr>
            <th>Name</th>
            <th>Age</th>
          </tr>
        </thead>
        <tbody>
          <tr>
            <td>John</td>
            <td>30</td>
          </tr>
          <tr>
            <td>Jane</td>
            <td>25</td>
          </tr>
        </tbody>
      </table>
    `;
  const root = parse(tokenize(markup));

  const table = root.children!.find(child => child.tagName === 'table')!;

  const headSection = table.children!.find(child => child.tagName === 'thead');
  const bodySection = table.children!.find(child => child.tagName === 'tbody');

  expect(headSection).toBeDefined();
  expect(bodySection).toBeDefined();

  // Only the number of rows is asserted, so an explicit work stack is used
  // instead of recursion; visiting order is irrelevant here.
  const rowNodes: ASTNode[] = [];
  const pending: ASTNode[] = [table];
  while (pending.length > 0) {
    const node = pending.pop()!;
    if (node.tagName === 'tr') {
      rowNodes.push(node);
    }
    if (node.children) {
      pending.push(...node.children);
    }
  }

  expect(rowNodes).toHaveLength(3);
});
|
|
202
|
+
|
|
203
|
+
// A paragraph that interleaves text with inline elements should produce both
// element nodes and non-blank text nodes beneath the <p>.
test('should handle mixed inline content', () => {
  const markup = `
      <p>This is <strong>bold</strong> and <em>italic</em>.
      Here's a <a href="https://example.com">link</a> and
      <code>inline code</code>.</p>
    `;
  const root = parse(tokenize(markup));

  const paragraph = root.children!.find(child => child.tagName === 'p')!;

  let textCount = 0;
  let elementCount = 0;

  // Counts non-blank text nodes and element nodes for the given node and all
  // of its descendants.
  function tally(node: ASTNode): void {
    const isNonBlankText = node.type === ASTNodeType.TEXT && node.content!.trim();
    if (isNonBlankText) {
      textCount++;
    } else if (node.type === ASTNodeType.ELEMENT) {
      elementCount++;
    }
    if (node.children) {
      for (const child of node.children) {
        tally(child);
      }
    }
  }

  // The walk starts at the paragraph's children, so the <p> itself is not counted.
  if (paragraph.children) {
    for (const child of paragraph.children) {
      tally(child);
    }
  }

  expect(elementCount).toBeGreaterThan(3);
  expect(textCount).toBeGreaterThan(0);
});
|
|
235
|
+
|
|
236
|
+
// Parses a full HTML document and walks doctype -> html -> head/body ->
// header/main/footer, checking each level exists with the expected attributes.
test('should preserve complete document structure', () => {
  const documentSource = `<!DOCTYPE html>
      <html lang="en">
        <head>
          <meta charset="UTF-8">
          <title>Test Document</title>
        </head>
        <body>
          <header id="main-header">
            <h1>Welcome</h1>
          </header>
          <main>
            <section class="content">
              <article>
                <h2>Article Title</h2>
                <p>Content here.</p>
              </article>
            </section>
          </main>
          <footer>
            <p>© 2025 Test</p>
          </footer>
        </body>
      </html>`;

  const root = parse(tokenize(documentSource));

  // Looks up the first direct child of `parent` with the given tag name.
  const childByTag = (parent: ASTNode, tag: string) =>
    parent.children!.find(child => child.tagName === tag);

  const doctypeNode = root.children!.find(child => child.type === ASTNodeType.DOCTYPE);
  expect(doctypeNode).toBeDefined();

  const htmlNode = childByTag(root, 'html')!;
  expect(htmlNode.attributes!.lang).toBe('en');

  const headNode = childByTag(htmlNode, 'head');
  const bodyNode = childByTag(htmlNode, 'body');

  expect(headNode).toBeDefined();
  expect(bodyNode).toBeDefined();

  const headerNode = childByTag(bodyNode!, 'header');
  const mainNode = childByTag(bodyNode!, 'main');
  const footerNode = childByTag(bodyNode!, 'footer');

  expect(headerNode).toBeDefined();
  expect(mainNode).toBeDefined();
  expect(footerNode).toBeDefined();

  expect(headerNode!.attributes!.id).toBe('main-header');
});
|
|
286
|
+
});
|
|
287
|
+
|
|
288
|
+
describe('Real-world Content Handling', () => {
|
|
289
|
+
// Inline SVG: the namespace attribute on <svg> and the attributes of a
// self-closing <circle> child must both be readable from the tree.
test('should handle SVG content', () => {
  const markup = `
      <svg width="100" height="100" xmlns="http://www.w3.org/2000/svg">
        <circle cx="50" cy="50" r="40" fill="red"/>
        <text x="50" y="50">SVG</text>
      </svg>
    `;

  const root = parse(tokenize(markup));

  const svgNode = root.children!.find(child => child.tagName === 'svg')!;
  expect(svgNode.attributes!.xmlns).toBe('http://www.w3.org/2000/svg');

  const circleNode = svgNode.children!.find(child => child.tagName === 'circle');
  expect(circleNode).toBeDefined();
  expect(circleNode!.attributes!.fill).toBe('red');
});
|
|
307
|
+
|
|
308
|
+
// <script> and <style> elements with non-trivial bodies should appear as
// top-level elements carrying their `type` attribute.
test('should handle script and style tags', () => {
  const markup = `
      <script type="text/javascript">
        function hello() {
          alert("Hello");
        }
      </script>
      <style type="text/css">
        .class { color: red; }
      </style>
    `;

  const root = parse(tokenize(markup));
  const topLevel = root.children!;

  const scriptNode = topLevel.find(child => child.tagName === 'script');
  const styleNode = topLevel.find(child => child.tagName === 'style');

  expect(scriptNode!.attributes!.type).toBe('text/javascript');
  expect(styleNode!.attributes!.type).toBe('text/css');
});
|
|
329
|
+
});
|
|
330
|
+
|
|
331
|
+
describe('Error Recovery and Edge Cases', () => {
|
|
332
|
+
// Builds 100 nested <div level="N"> elements around a single text node and
// verifies every level of the resulting chain, including the innermost div.
test('should handle extreme nesting depth', () => {
  const depth = 100;

  const openingTags = Array.from({ length: depth }, (_, i) => `<div level="${i}">`).join('');
  const html = `${openingTags}Deep content${'</div>'.repeat(depth)}`;

  const ast = parse(tokenize(html));

  // Walk down from the outermost div; each step checks the current level and
  // then descends into its only element child.
  let current = ast.children![0]!;
  for (let i = 0; i < depth - 1; i++) {
    expect(current.tagName).toBe('div');
    expect(current.attributes!.level).toBe(i.toString());
    current = current.children!.find(child => child.type === ASTNodeType.ELEMENT)!;
  }

  // The loop above stops one level short, so the innermost div has to be
  // checked explicitly; otherwise a wrong tag or level there goes unnoticed.
  expect(current.tagName).toBe('div');
  expect(current.attributes!.level).toBe((depth - 1).toString());

  const textNode = current.children!.find(child => child.type === ASTNodeType.TEXT)!;
  expect(textNode.content).toBe('Deep content');
});
|
|
357
|
+
|
|
358
|
+
// Closing tags in the wrong order must not prevent the outer <div> from being
// built with at least one child.
test('should handle malformed HTML gracefully', () => {
  const brokenMarkup = '<div><p><span>Text</div></span></p>';
  const root = parse(tokenize(brokenMarkup));

  const firstNode = root.children![0]!;
  expect(firstNode.tagName).toBe('div');
  expect(firstNode.children!.length).toBeGreaterThan(0);
});
|
|
367
|
+
|
|
368
|
+
// Closing tags with no matching opener, before and after valid markup, must
// not stop the valid <p> and its text from being parsed.
test('should handle orphaned closing tags', () => {
  const markup = '</div><p>Valid content</p></span>';
  const root = parse(tokenize(markup));

  const isParagraph = (node: ASTNode) =>
    node.type === ASTNodeType.ELEMENT && node.tagName === 'p';
  const paragraph = root.children!.find(isParagraph)!;

  expect(paragraph).toBeDefined();
  expect(paragraph.children![0]!.content).toBe('Valid content');
});
|
|
379
|
+
|
|
380
|
+
// One document mixing a processing instruction, doctype, comments, elements,
// text and CDATA: at least one node of every kind must be present in the tree.
test('should handle mixed content types in single document', () => {
  const documentSource = `
      <?xml version="1.0"?>
      <!DOCTYPE html>
      <!-- Document start -->
      <html>
        <head>
          <title>Test & Demo</title>
          <![CDATA[Raw data here]]>
        </head>
        <body>
          <h1>Main Title</h1>
          <p>Paragraph with <strong>bold</strong> text.</p>
          <!-- Body content -->
        </body>
      </html>
      <!-- Document end -->
    `;

  const root = parse(tokenize(documentSource));

  // Per-kind counters; node kinds not listed here are ignored by the walk.
  const tally = {
    [ASTNodeType.PROCESSING_INSTRUCTION]: 0,
    [ASTNodeType.DOCTYPE]: 0,
    [ASTNodeType.COMMENT]: 0,
    [ASTNodeType.ELEMENT]: 0,
    [ASTNodeType.TEXT]: 0,
    [ASTNodeType.CDATA]: 0,
  };

  function countKinds(node: ASTNode): void {
    if (node.type in tally) {
      tally[node.type as keyof typeof tally]++;
    }
    if (node.children) {
      for (const child of node.children) {
        countKinds(child);
      }
    }
  }

  // Start from the root's children so the root node itself is not counted.
  for (const topLevelNode of root.children!) {
    countKinds(topLevelNode);
  }

  expect(tally[ASTNodeType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
  expect(tally[ASTNodeType.DOCTYPE]).toBeGreaterThan(0);
  expect(tally[ASTNodeType.COMMENT]).toBeGreaterThan(0);
  expect(tally[ASTNodeType.ELEMENT]).toBeGreaterThan(0);
  expect(tally[ASTNodeType.TEXT]).toBeGreaterThan(0);
  expect(tally[ASTNodeType.CDATA]).toBeGreaterThan(0);
});
|
|
429
|
+
});
|
|
430
|
+
|
|
431
|
+
describe('Security and Template Edge Cases', () => {
|
|
432
|
+
// The parser does no sanitising: a javascript: URL in href is expected to be
// returned verbatim as the attribute value.
test('should treat javascript: urls as regular attribute values', () => {
  const markup = `<a href="javascript:alert('XSS')">Click me</a>`;
  const root = parse(tokenize(markup));

  const anchor = root.children!.find(child => child.tagName === 'a')!;

  expect(anchor).toBeDefined();
  expect(anchor.attributes!.href).toBe("javascript:alert('XSS')");
});
|
|
440
|
+
|
|
441
|
+
// Inline event-handler attributes are expected to be returned verbatim, like
// any other attribute value.
test('should correctly parse event handler attributes like onerror', () => {
  const markup = `<img src="invalid" onerror="alert('XSS')">`;
  const root = parse(tokenize(markup));

  const image = root.children!.find(child => child.tagName === 'img')!;

  expect(image).toBeDefined();
  expect(image.attributes!.onerror).toBe("alert('XSS')");
});
|
|
449
|
+
|
|
450
|
+
// Mustache-style ({{ }}) and ERB-style (<%= %>) placeholders are not HTML and
// should end up unchanged inside text nodes.
test('should treat template engine syntax as plain text', () => {
  const markup = `<div>{{ user.name }}</div><p>Hello, <%= name %></p>`;
  const root = parse(tokenize(markup));

  // Returns the first text-node child of `element`.
  const firstText = (element: ASTNode) =>
    element.children!.find(child => child.type === ASTNodeType.TEXT)!;

  const divNode = root.children!.find(child => child.tagName === 'div')!;
  expect(divNode).toBeDefined();
  expect(firstText(divNode).content).toBe('{{ user.name }}');

  const paragraphNode = root.children!.find(child => child.tagName === 'p')!;
  expect(paragraphNode).toBeDefined();
  expect(firstText(paragraphNode).content).toBe('Hello, <%= name %>');
});
|
|
465
|
+
|
|
466
|
+
// A NUL character inside text content is expected to come back as U+FFFD
// (the replacement character).
test('should handle null characters in content gracefully', () => {
  const markup = '<div>Hello\0World</div>';
  const root = parse(tokenize(markup));

  const divNode = root.children!.find(child => child.tagName === 'div')!;
  const text = divNode.children!.find(child => child.type === ASTNodeType.TEXT)!;

  expect(text.content).toBe('Hello\uFFFDWorld');
});
|
|
474
|
+
|
|
475
|
+
// Text containing assorted control characters: tab and carriage return must
// survive, as must the surrounding visible text.
test('should handle control characters in content', () => {
  const markup = '<div>Line1\x08\x09Line2\x0BLine3\x0CLine4\x0DLine5</div>';
  const root = parse(tokenize(markup));

  const divNode = root.children!.find(child => child.tagName === 'div')!;
  const text = divNode.children!.find(child => child.type === ASTNodeType.TEXT)!;

  const expectedFragments = ['\x09', '\x0D', 'Line1', 'Line5'];
  for (const fragment of expectedFragments) {
    expect(text.content).toContain(fragment);
  }
});
|
|
486
|
+
});
|
|
487
|
+
});
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import { describe, it, expect } from 'bun:test';
|
|
2
|
+
import { parseHTML } from '../index';
|
|
3
|
+
|
|
4
|
+
describe('API Integration Tests - Real DOM Usage', () => {
|
|
5
|
+
// End-to-end walk through the DOM-style API: method presence on the document
// and on elements, selector lookups, attribute mutation, and appending a
// newly created element that can then be found by id.
it('should work like real DOM API - querySelector and setAttribute integrated', () => {
  const source = `
      <html>
        <head>
          <title>Test Document</title>
        </head>
        <body>
          <div id="container" class="main">
            <h1>Welcome</h1>
            <p class="intro">This is a paragraph.</p>
            <ul>
              <li>Item 1</li>
              <li>Item 2</li>
            </ul>
          </div>
        </body>
      </html>
    `;
  const doc = parseHTML(source);

  // The document must expose the query and factory methods.
  expect(typeof doc.querySelector).toBe('function');
  expect(typeof doc.querySelectorAll).toBe('function');
  expect(typeof doc.createElement).toBe('function');
  expect(typeof doc.createTextNode).toBe('function');

  const containerEl = doc.querySelector('#container');
  expect(containerEl).toBeTruthy();
  // Tag names are reported upper-cased by this API.
  expect(containerEl?.tagName).toBe('DIV');

  // Elements must expose scoped queries and attribute accessors.
  expect(typeof containerEl?.querySelector).toBe('function');
  expect(typeof containerEl?.setAttribute).toBe('function');
  expect(typeof containerEl?.getAttribute).toBe('function');

  const heading = containerEl?.querySelector('h1');
  expect(heading).toBeTruthy();
  expect(heading?.tagName).toBe('H1');
  expect(heading?.textContent).toBe('Welcome');

  heading?.setAttribute('class', 'title');
  expect(heading?.getAttribute('class')).toBe('title');
  expect(heading?.hasAttribute('class')).toBe(true);

  const listItems = doc.querySelectorAll('li');
  expect(listItems.length).toBe(2);
  expect(listItems[0]?.tagName).toBe('LI');
  expect(listItems[1]?.tagName).toBe('LI');

  // A freshly created element becomes queryable once appended.
  const createdSpan = doc.createElement('span');
  createdSpan.setAttribute('id', 'new-span');
  containerEl?.appendChild(createdSpan);

  const locatedSpan = containerEl?.querySelector('#new-span');
  expect(locatedSpan).toBeTruthy();
  expect(locatedSpan?.tagName).toBe('SPAN');
  expect(locatedSpan?.getAttribute('id')).toBe('new-span');
});
|
|
60
|
+
|
|
61
|
+
// Query, mutate and extend a small tree: read the <p> text, set an attribute
// on it, then append a created <span> and find it again by id.
it('should work for complex DOM manipulation scenarios', () => {
  const doc = parseHTML('<div><p>Hello</p></div>');

  const wrapper = doc.querySelector('div');
  expect(wrapper).toBeTruthy();

  const paragraph = wrapper?.querySelector('p');
  expect(paragraph).toBeTruthy();
  expect(paragraph?.textContent).toBe('Hello');

  paragraph?.setAttribute('class', 'greeting');
  expect(paragraph?.getAttribute('class')).toBe('greeting');

  const createdSpan = doc.createElement('span');
  createdSpan.setAttribute('id', 'dynamic');
  wrapper?.appendChild(createdSpan);

  const locatedSpan = wrapper?.querySelector('#dynamic');
  expect(locatedSpan).toBeTruthy();
  expect(locatedSpan?.getAttribute('id')).toBe('dynamic');
});
|
|
83
|
+
|
|
84
|
+
// Removing one of two children must shrink childNodes to one, make the removed
// element unreachable via querySelector, and leave its sibling in place.
it('should handle removeChild correctly', () => {
  const doc = parseHTML('<div><p>First</p><span>Second</span></div>');
  const wrapper = doc.querySelector('div');
  const paragraph = wrapper?.querySelector('p');

  // Precondition: both children are present before the removal.
  expect(wrapper?.childNodes.length).toBe(2);
  expect(paragraph?.textContent).toBe('First');

  if (paragraph) {
    wrapper?.removeChild(paragraph);
  }

  expect(wrapper?.childNodes.length).toBe(1);
  expect(wrapper?.querySelector('p')).toBe(null);
  expect(wrapper?.querySelector('span')).toBeTruthy();
});
|
|
100
|
+
|
|
101
|
+
// The same element must be reachable from the document and from doc.body, and
// an attribute set through one reference must be readable afterwards.
it('should demonstrate clean API without helper functions', () => {
  const doc = parseHTML('<html><body><div id="test">Content</div></body></html>');

  const testDiv = doc.querySelector('#test');
  testDiv?.setAttribute('class', 'active');
  const className = testDiv?.getAttribute('class');
  const bodyDiv = doc.body?.querySelector('#test');

  expect(testDiv).toBeTruthy();
  expect(className).toBe('active');
  expect(bodyDiv).toBeTruthy();
  // toBe checks identity, like the former `===` comparison, but on failure it
  // reports both operands instead of an uninformative "expected true, got false".
  expect(bodyDiv).toBe(testDiv);
});
|
|
114
|
+
});
|