@tkeron/html-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +120 -0
- package/bun.lock +29 -0
- package/index.ts +18 -0
- package/package.json +25 -0
- package/src/css-selector.ts +172 -0
- package/src/dom-simulator.ts +592 -0
- package/src/dom-types.ts +78 -0
- package/src/parser.ts +355 -0
- package/src/tokenizer.ts +413 -0
- package/tests/advanced.test.ts +487 -0
- package/tests/api-integration.test.ts +114 -0
- package/tests/dom-extended.test.ts +173 -0
- package/tests/dom.test.ts +482 -0
- package/tests/google-dom.test.ts +118 -0
- package/tests/google-homepage.txt +13 -0
- package/tests/official/README.md +87 -0
- package/tests/official/acid/acid-tests.test.ts +309 -0
- package/tests/official/final-output/final-output.test.ts +361 -0
- package/tests/official/html5lib/tokenizer-utils.ts +204 -0
- package/tests/official/html5lib/tokenizer.test.ts +184 -0
- package/tests/official/html5lib/tree-construction-utils.ts +208 -0
- package/tests/official/html5lib/tree-construction.test.ts +250 -0
- package/tests/official/validator/validator-tests.test.ts +237 -0
- package/tests/official/validator-nu/validator-nu.test.ts +335 -0
- package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
- package/tests/official/wpt/wpt-tests.test.ts +409 -0
- package/tests/parser.test.ts +642 -0
- package/tests/selectors.test.ts +65 -0
- package/tests/test-page-0.txt +362 -0
- package/tests/tokenizer.test.ts +666 -0
- package/tsconfig.json +25 -0
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
import { expect, test, describe } from 'bun:test';
|
|
2
|
+
import { tokenize } from '../src/tokenizer';
|
|
3
|
+
import { parse, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
|
+
import { file } from 'bun';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
|
|
7
|
+
describe('HTML Parser', () => {
|
|
8
|
+
|
|
9
|
+
// Basic element parsing: plain tags, attributes, self-closing syntax and void elements.
describe('Basic Elements', () => {
  // Runs the tokenizer and the parser back to back on a markup string.
  const toAst = (markup: string) => parse(tokenize(markup));

  test('should parse simple element', () => {
    const root = toAst('<div></div>');

    // The root returned by parse() is expected to be a document wrapping one element.
    expect(root.type).toBe(ASTNodeType.DOCUMENT);
    expect(root.children).toHaveLength(1);

    const div = root.children![0]!;
    expect(div.type).toBe(ASTNodeType.ELEMENT);
    expect(div.tagName).toBe('div');
    expect(div.children).toHaveLength(0);
  });

  test('should parse element with attributes', () => {
    const expectedAttributes = {
      class: 'container',
      id: 'main',
    };

    const div = toAst('<div class="container" id="main"></div>').children![0]!;

    expect(div.attributes).toEqual(expectedAttributes);
  });

  test('should parse self-closing elements', () => {
    const img = toAst('<img src="test.jpg" alt="test"/>').children![0]!;

    expect(img.type).toBe(ASTNodeType.ELEMENT);
    expect(img.tagName).toBe('img');
    expect(img.isSelfClosing).toBe(true);
    expect(img.attributes).toEqual({
      src: 'test.jpg',
      alt: 'test',
    });
  });

  test('should parse void elements correctly', () => {
    const root = toAst('<br><hr><input type="text">');

    expect(root.children).toHaveLength(3);

    // Each void element must appear as its own sibling, flagged as self-closing.
    ['br', 'hr', 'input'].forEach((expectedTag, position) => {
      const node = root.children![position]!;
      expect(node.tagName).toBe(expectedTag);
      expect(node.isSelfClosing).toBe(true);
    });
  });
});
|
|
61
|
+
|
|
62
|
+
// Nesting: child elements, deep chains, and sibling ordering under one parent.
describe('Nested Elements', () => {
  // Runs the tokenizer and the parser back to back on a markup string.
  const toAst = (markup: string) => parse(tokenize(markup));

  test('should parse nested elements', () => {
    const div = toAst('<div><p>Hello</p></div>').children![0]!;
    expect(div.tagName).toBe('div');
    expect(div.children).toHaveLength(1);

    const paragraph = div.children![0]!;
    expect(paragraph.tagName).toBe('p');
    expect(paragraph.children).toHaveLength(1);

    const text = paragraph.children![0]!;
    expect(text.type).toBe(ASTNodeType.TEXT);
    expect(text.content).toBe('Hello');
  });

  test('should parse deeply nested elements', () => {
    const root = toAst('<div><section><article><h1>Title</h1></article></section></div>');

    // Walk div -> section -> article -> h1 by always taking the first child.
    const div = root.children![0]!;
    const section = div.children![0]!;
    const article = section.children![0]!;
    const heading = article.children![0]!;

    expect(heading.tagName).toBe('h1');
    expect(heading.children![0]!.content).toBe('Title');
  });

  test('should handle multiple siblings', () => {
    const div = toAst('<div><p>First</p><p>Second</p><p>Third</p></div>').children![0]!;

    expect(div.children).toHaveLength(3);
    expect(div.children![0]!.tagName).toBe('p');

    // Sibling order must match source order.
    ['First', 'Second', 'Third'].forEach((expectedText, position) => {
      expect(div.children![position]!.children![0]!.content).toBe(expectedText);
    });
  });
});
|
|
106
|
+
|
|
107
|
+
// Text nodes: bare text, text interleaved with elements, and entity decoding.
describe('Text Content', () => {
  test('should parse text content', () => {
    const tokens = tokenize('Hello World');
    const ast = parse(tokens);

    expect(ast.children).toHaveLength(1);
    const textNode = ast.children![0]!;
    expect(textNode.type).toBe(ASTNodeType.TEXT);
    expect(textNode.content).toBe('Hello World');
  });

  test('should parse mixed text and elements', () => {
    const tokens = tokenize('Before <strong>bold</strong> after');
    const ast = parse(tokens);

    // Surrounding whitespace belongs to the text nodes on either side of <strong>.
    expect(ast.children).toHaveLength(3);
    expect(ast.children![0]!.content).toBe('Before ');
    expect(ast.children![1]!.tagName).toBe('strong');
    expect(ast.children![1]!.children![0]!.content).toBe('bold');
    expect(ast.children![2]!.content).toBe(' after');
  });

  test('should handle entities in text', () => {
    // The input must contain real named character references; the expected
    // text content is their decoded form.
    const tokens = tokenize('<p>&amp; &lt; &gt;</p>');
    const ast = parse(tokens);

    const pElement = ast.children![0]!;
    const textNode = pElement.children![0]!;
    expect(textNode.content).toBe('& < >');
  });
});
|
|
138
|
+
|
|
139
|
+
// Non-element nodes: each input is expected to yield exactly one node of the
// given type, carrying the given content.
describe('Comments and Special Nodes', () => {
  const singleNodeCases = [
    {
      title: 'should parse HTML comments',
      markup: '<!-- This is a comment -->',
      nodeType: ASTNodeType.COMMENT,
      content: ' This is a comment ',
    },
    {
      title: 'should parse DOCTYPE',
      markup: '<!DOCTYPE html>',
      nodeType: ASTNodeType.DOCTYPE,
      content: 'html',
    },
    {
      title: 'should parse CDATA sections',
      markup: '<![CDATA[Some raw data]]>',
      nodeType: ASTNodeType.CDATA,
      content: 'Some raw data',
    },
    {
      title: 'should parse processing instructions',
      markup: '<?xml version="1.0"?>',
      nodeType: ASTNodeType.PROCESSING_INSTRUCTION,
      // The expected content keeps the leading "<?" and drops the trailing "?>".
      content: '<?xml version="1.0"',
    },
  ];

  for (const { title, markup, nodeType, content } of singleNodeCases) {
    test(title, () => {
      const root = parse(tokenize(markup));

      expect(root.children).toHaveLength(1);
      const onlyNode = root.children![0]!;
      expect(onlyNode.type).toBe(nodeType);
      expect(onlyNode.content).toBe(content);
    });
  }
});
|
|
180
|
+
|
|
181
|
+
// Whole-document parsing: doctype plus an <html> element holding <head> and <body>.
describe('Complete HTML Documents', () => {
  test('should parse complete HTML document', () => {
    const source = `<!DOCTYPE html>
<html lang="en">
<head>
<title>Test Document</title>
<meta charset="UTF-8">
</head>
<body>
<h1>Hello World</h1>
<p>This is a test paragraph.</p>
<!-- This is a comment -->
</body>
</html>`;

    const root = parse(tokenize(source));

    expect(root.children!.length).toBeGreaterThan(1);

    const htmlNode = root.children!.find(
      node => node.type === ASTNodeType.ELEMENT && node.tagName === 'html'
    )!;

    expect(htmlNode).toBeDefined();
    expect(htmlNode.attributes!.lang).toBe('en');

    // Only element children are counted, so text nodes between tags are ignored.
    const elementsUnderHtml = htmlNode.children!.filter(
      node => node.type === ASTNodeType.ELEMENT
    );
    expect(elementsUnderHtml).toHaveLength(2);

    const head = elementsUnderHtml.find(node => node.tagName === 'head')!;
    const body = elementsUnderHtml.find(node => node.tagName === 'body')!;

    expect(head).toBeDefined();
    expect(body).toBeDefined();
  });
});
|
|
220
|
+
|
|
221
|
+
// Smoke test against a fixture file stored next to this test.
describe('real web scenarios', () => {
  test('should parse real-world HTML', async () => {
    const html = await file(join(__dirname, "test-page-0.txt")).text();
    const tokens = tokenize(html);
    const ast = parse(tokens);

    // Without assertions this test could only fail if tokenize/parse threw.
    // Pin the minimum contract: a document root with at least one child.
    expect(ast.type).toBe(ASTNodeType.DOCUMENT);
    expect(ast.children!.length).toBeGreaterThan(0);
  });
});
|
|
228
|
+
|
|
229
|
+
// Malformed input: the parser is expected to still produce a usable tree.
describe('Error Recovery', () => {
  // Parses the markup and returns the first top-level node.
  const firstNodeOf = (markup: string) => parse(tokenize(markup)).children![0]!;

  test('should handle unclosed tags', () => {
    const div = firstNodeOf('<div><p>Unclosed paragraph</div>');
    expect(div.tagName).toBe('div');

    // The <p> is never closed in the input but must still be a child of <div>.
    const paragraph = div.children![0]!;
    expect(paragraph.tagName).toBe('p');
  });

  test('should handle unexpected closing tags', () => {
    const div = firstNodeOf('<div></span></div>');
    expect(div.tagName).toBe('div');
  });

  test('should handle malformed attributes', () => {
    // The class value is missing its closing quote.
    const div = firstNodeOf('<div class="test id="main">Content</div>');
    expect(div.tagName).toBe('div');
    expect(div.attributes).toBeDefined();
  });
});
|
|
258
|
+
|
|
259
|
+
// Implicit end tags: a new <li> or <p> closes the previous open one.
describe('Auto-closing Tags', () => {
  // Builds a predicate matching element nodes with the given tag name.
  const elementNamed = (tag: string) => (node: ASTNode) =>
    node.type === ASTNodeType.ELEMENT && node.tagName === tag;

  test('should auto-close list items', () => {
    const root = parse(tokenize('<ul><li>First<li>Second</ul>'));

    const list = root.children![0]!;
    const items = list.children!.filter(elementNamed('li'));

    expect(items).toHaveLength(2);
    expect(items[0]!.children![0]!.content).toBe('First');
    expect(items[1]!.children![0]!.content).toBe('Second');
  });

  test('should auto-close paragraph tags', () => {
    const root = parse(tokenize('<p>First paragraph<p>Second paragraph'));

    const paragraphs = root.children!.filter(elementNamed('p'));

    expect(paragraphs).toHaveLength(2);
    expect(paragraphs[0]!.children![0]!.content).toBe('First paragraph');
    expect(paragraphs[1]!.children![0]!.content).toBe('Second paragraph');
  });
});
|
|
287
|
+
|
|
288
|
+
// Whitespace: text inside elements is kept verbatim; layout-only whitespace
// between structural tags must not prevent finding those tags.
describe('Whitespace Handling', () => {
  test('should preserve significant whitespace', () => {
    const root = parse(tokenize('<p> Hello World </p>'));

    const paragraph = root.children![0]!;
    const text = paragraph.children![0]!;
    expect(text.content).toBe(' Hello World ');
  });

  test('should skip insignificant whitespace', () => {
    const source = `<html>
<head>
<title>Test</title>
</head>
</html>`;
    const root = parse(tokenize(source));

    // Builds a predicate matching element nodes with the given tag name.
    const elementNamed = (tag: string) => (node: ASTNode) =>
      node.type === ASTNodeType.ELEMENT && node.tagName === tag;

    const htmlNode = root.children!.find(elementNamed('html'))!;
    const head = htmlNode.children!.find(elementNamed('head'))!;

    expect(head).toBeDefined();
  });
});
|
|
317
|
+
|
|
318
|
+
// Structural checks against the fixture page stored next to this test.
describe("complete web page", () => {
  test('should parse a complete web page', async () => {
    const fixturePath = join(__dirname, "test-page-0.txt");
    const source = await file(fixturePath).text();
    const root = parse(tokenize(source));

    // Builds a predicate matching element nodes with the given tag name.
    const elementNamed = (tag: string) => (node: ASTNode) =>
      node.type === ASTNodeType.ELEMENT && node.tagName === tag;

    expect(root.children!.length).toBeGreaterThanOrEqual(3);

    const htmlNode = root.children!.find(elementNamed('html'))!;
    expect(htmlNode.type).toBe(ASTNodeType.ELEMENT);
    expect(htmlNode.tagName).toBe('html');
    // The test expects the fixture's <html> tag to carry lang="es".
    expect(htmlNode.attributes!.lang).toBe('es');

    const head = htmlNode.children!.find(elementNamed('head'))!;
    const body = htmlNode.children!.find(elementNamed('body'))!;
    expect(head).toBeDefined();
    expect(body).toBeDefined();
  });
});
|
|
340
|
+
|
|
341
|
+
// Edge cases: unusual attribute forms, odd comments, broken nesting, deep trees.
describe('Advanced Edge Cases', () => {
  // Parses the markup and returns the first top-level node.
  const firstNodeOf = (markup: string) => parse(tokenize(markup)).children![0]!;

  test('should handle empty attributes', () => {
    const input = firstNodeOf('<input disabled checked="" value="">');

    // A bare attribute and an explicitly empty one both map to ''.
    expect(input.attributes).toEqual({
      disabled: '',
      checked: '',
      value: '',
    });
  });

  test('should handle attributes with special characters', () => {
    const div = firstNodeOf('<div data-test="hello-world" class="my_class-123">');

    expect(div.attributes).toEqual({
      'data-test': 'hello-world',
      'class': 'my_class-123',
    });
  });

  test('should handle mixed quotes in attributes', () => {
    const div = firstNodeOf(`<div title='He said "Hello"' data-info="She's here">`);
    const attributes = div.attributes!;

    expect(attributes.title).toBe('He said "Hello"');
    expect(attributes['data-info']).toBe("She's here");
  });

  test('should handle deeply nested comments', () => {
    const div = firstNodeOf('<div><!-- Outer <!-- Inner --> comment --></div>');

    expect(div.children!.length).toBeGreaterThanOrEqual(1);
    expect(div.children![0]!.type).toBe(ASTNodeType.COMMENT);
  });

  test('should handle multiple consecutive whitespace', () => {
    const paragraph = firstNodeOf('<p> \n\t Hello \n\t World \n\t </p>');
    const text = paragraph.children![0]!;

    expect(text.content).toContain('Hello');
    expect(text.content).toContain('World');
  });

  test('should handle malformed nested tags', () => {
    // Closing tags arrive in the wrong order relative to the opening tags.
    const div = firstNodeOf('<div><p><span>Text</div></span></p>');

    expect(div.tagName).toBe('div');
    expect(div.children!.length).toBeGreaterThan(0);
  });

  test('should handle orphaned closing tags', () => {
    const root = parse(tokenize('</div><p>Content</p></span>'));

    const paragraph = root.children!.find(
      node => node.type === ASTNodeType.ELEMENT && node.tagName === 'p'
    )!;

    expect(paragraph).toBeDefined();
    expect(paragraph.children![0]!.content).toBe('Content');
  });

  test('should handle extreme nesting depth', () => {
    const depth = 50;

    // depth opening tags, the payload text, then depth closing tags.
    const openingTags = Array.from(
      { length: depth },
      (_, level) => `<div level="${level}">`
    ).join('');
    const source = openingTags + 'Deep content' + '</div>'.repeat(depth);

    let cursor = firstNodeOf(source);
    let level = 0;
    // Descend through all but the innermost div, checking each one on the way.
    while (level < depth - 1) {
      expect(cursor.tagName).toBe('div');
      expect(cursor.attributes!.level).toBe(level.toString());
      cursor = cursor.children!.find(node => node.type === ASTNodeType.ELEMENT)!;
      level += 1;
    }

    const text = cursor.children!.find(node => node.type === ASTNodeType.TEXT)!;
    expect(text.content).toBe('Deep content');
  });
});
|
|
426
|
+
|
|
427
|
+
// Character references: numeric (decimal and hex) and named, in text and in
// attribute values. Inputs carry the encoded form; expectations carry the
// decoded form.
describe('Complex Entity Handling', () => {
  test('should handle numeric character references', () => {
    // &#65; / &#x41; are "A" and &#8364; / &#x20AC; are the euro sign.
    const tokens = tokenize('<p>&#65; &#8364; &#x41; &#x20AC;</p>');
    const ast = parse(tokens);
    const pElement = ast.children![0]!;
    const textNode = pElement.children![0]!;
    expect(textNode.content).toBe('A € A €');
  });

  test('should handle mixed entities and text', () => {
    // Named references keep the markup well-formed: a raw "<testing>" would be
    // read as a tag, and a raw apostrophe would end this string literal.
    const tokens = tokenize('<p>R&amp;D &lt;testing&gt; &quot;quotes&quot; &apos;apostrophe&apos;</p>');
    const ast = parse(tokens);
    const pElement = ast.children![0]!;
    const textNode = pElement.children![0]!;
    expect(textNode.content).toBe('R&D <testing> "quotes" \'apostrophe\'');
  });

  test('should handle entities in attributes', () => {
    // &quot; is required inside a double-quoted attribute value; a raw quote
    // would terminate the value early.
    const tokens = tokenize('<div title="R&amp;D &lt;section&gt;" data-test="&quot;hello&quot;">');
    const ast = parse(tokens);
    const divElement = ast.children![0]!;
    expect(divElement.attributes!.title).toBe('R&D <section>');
    expect(divElement.attributes!['data-test']).toBe('"hello"');
  });
});
|
|
452
|
+
|
|
453
|
+
// Tree-shape checks on larger, realistic fragments: parent links, sibling
// order, forms, tables, inline content and a full document skeleton.
describe('DOM-like Functionality Tests', () => {
  test('should maintain parent-child relationships', () => {
    const tokens = tokenize('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
    const ast = parse(tokens);
    const divElement = ast.children![0]!;
    const sectionElement = divElement.children![0]!;
    const articleElement = sectionElement.children![0]!;

    // Each node's parent reference must be the very node that lists it as a child.
    expect(sectionElement.parent).toBe(divElement);
    expect(articleElement.parent).toBe(sectionElement);
    expect(articleElement.children).toHaveLength(2);
    expect(articleElement.children![0]!.tagName).toBe('h1');
    expect(articleElement.children![1]!.tagName).toBe('p');
  });

  test('should handle sibling navigation scenarios', () => {
    const tokens = tokenize('<nav><a href="#home">Home</a><a href="#about">About</a><a href="#contact">Contact</a></nav>');
    const ast = parse(tokens);
    const navElement = ast.children![0]!;
    const links = navElement.children!.filter(child => child.type === ASTNodeType.ELEMENT);
    expect(links).toHaveLength(3);

    for (const link of links) {
      expect(link.tagName).toBe('a');
      expect(link.attributes!.href).toBeDefined();
      expect(link.children![0]!.type).toBe(ASTNodeType.TEXT);
    }

    expect(links[0]!.children![0]!.content).toBe('Home');
    expect(links[1]!.children![0]!.content).toBe('About');
    expect(links[2]!.children![0]!.content).toBe('Contact');
  });

  test('should handle form elements with all attribute types', () => {
    const tokens = tokenize(`
<form action="/submit" method="post" enctype="multipart/form-data">
<input type="text" name="username" required placeholder="Enter username" maxlength="50">
<input type="password" name="password" required>
<input type="email" name="email" pattern="[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$">
<select name="country" multiple size="5">
<option value="us" selected>United States</option>
<option value="ca">Canada</option>
<option value="mx">Mexico</option>
</select>
<textarea name="comments" rows="4" cols="50" placeholder="Enter comments"></textarea>
<input type="checkbox" name="terms" id="terms" checked>
<label for="terms">I agree to the terms</label>
<button type="submit" disabled>Submit</button>
</form>
`);
    const ast = parse(tokens);
    const formElement = ast.children!.find(child => child.tagName === 'form')!;
    expect(formElement.attributes!.action).toBe('/submit');
    expect(formElement.attributes!.method).toBe('post');

    // Collect every form control anywhere below <form>, in document order.
    const inputs: ASTNode[] = [];
    const traverse = (node: ASTNode) => {
      if (node.type === ASTNodeType.ELEMENT) {
        if (['input', 'select', 'textarea', 'button'].includes(node.tagName!)) {
          inputs.push(node);
        }
      }
      if (node.children) {
        node.children.forEach(traverse);
      }
    };
    traverse(formElement);
    expect(inputs.length).toBeGreaterThan(5);

    // Boolean attributes written without a value are expected to map to ''.
    const usernameInput = inputs.find(input => input.attributes?.name === 'username');
    expect(usernameInput!.attributes!.required).toBe('');
    expect(usernameInput!.attributes!.placeholder).toBe('Enter username');

    const selectElement = inputs.find(input => input.tagName === 'select');
    expect(selectElement!.attributes!.multiple).toBe('');
  });

  test('should handle table structures correctly', () => {
    const tokens = tokenize(`
<table border="1" cellpadding="5" cellspacing="0">
<thead>
<tr>
<th scope="col">Name</th>
<th scope="col">Age</th>
<th scope="col">City</th>
</tr>
</thead>
<tbody>
<tr>
<td>John Doe</td>
<td>30</td>
<td>New York</td>
</tr>
<tr>
<td>Jane Smith</td>
<td>25</td>
<td>Los Angeles</td>
</tr>
</tbody>
</table>
`);
    const ast = parse(tokens);
    const tableElement = ast.children!.find(child => child.tagName === 'table')!;
    const thead = tableElement.children!.find(child => child.tagName === 'thead');
    const tbody = tableElement.children!.find(child => child.tagName === 'tbody');
    expect(thead).toBeDefined();
    expect(tbody).toBeDefined();

    // One header row plus two body rows.
    const rows: ASTNode[] = [];
    const traverse = (node: ASTNode) => {
      if (node.tagName === 'tr') {
        rows.push(node);
      }
      if (node.children) {
        node.children.forEach(traverse);
      }
    };
    traverse(tableElement);
    expect(rows).toHaveLength(3);
  });

  test('should handle mixed content with inline elements', () => {
    const tokens = tokenize(`
<p>This is <strong>bold text</strong> and this is <em>italic text</em>.
Here's a <a href="https://example.com" target="_blank">link</a> and some
<code>inline code</code>. Also <span class="highlight">highlighted text</span>.</p>
`);
    const ast = parse(tokens);
    const pElement = ast.children!.find(child => child.tagName === 'p')!;

    // Count non-blank text nodes and element nodes anywhere below <p>.
    let textNodes = 0;
    let elementNodes = 0;
    const traverse = (node: ASTNode) => {
      if (node.type === ASTNodeType.TEXT && node.content!.trim()) {
        textNodes++;
      } else if (node.type === ASTNodeType.ELEMENT) {
        elementNodes++;
      }
      if (node.children) {
        node.children.forEach(traverse);
      }
    };
    if (pElement.children) {
      pElement.children.forEach(traverse);
    }
    expect(elementNodes).toBeGreaterThan(3);
    expect(textNodes).toBeGreaterThan(0);
  });

  test('should preserve document structure integrity', () => {
    const tokens = tokenize(`<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Document</title>
<style>body { margin: 0; }</style>
<script>console.log('Hello');</script>
</head>
<body>
<header id="main-header">
<h1>Welcome</h1>
</header>
<main>
<section class="content">
<article>
<h2>Article Title</h2>
<p>Article content goes here.</p>
</article>
</section>
</main>
<footer>
<p>© 2025 Test Company</p>
</footer>
</body>
</html>`);
    const ast = parse(tokens);

    const doctype = ast.children!.find(child => child.type === ASTNodeType.DOCTYPE);
    expect(doctype).toBeDefined();

    const htmlElement = ast.children!.find(child => child.tagName === 'html')!;
    expect(htmlElement.attributes!.lang).toBe('en');

    const headElement = htmlElement.children!.find(child => child.tagName === 'head');
    const bodyElement = htmlElement.children!.find(child => child.tagName === 'body');
    expect(headElement).toBeDefined();
    expect(bodyElement).toBeDefined();

    // The landmark elements must be direct children of <body>.
    const headerElement = bodyElement!.children!.find(child => child.tagName === 'header');
    const mainElement = bodyElement!.children!.find(child => child.tagName === 'main');
    const footerElement = bodyElement!.children!.find(child => child.tagName === 'footer');
    expect(headerElement).toBeDefined();
    expect(mainElement).toBeDefined();
    expect(footerElement).toBeDefined();
    expect(headerElement!.attributes!.id).toBe('main-header');
  });
});
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
});
|