@tkeron/html-parser 0.1.7 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +5 -0
- package/index.ts +4 -0
- package/package.json +7 -1
- package/src/css-selector.ts +1 -1
- package/src/dom-simulator.ts +41 -17
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +509 -143
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +190 -118
- package/tests/advanced.test.ts +121 -108
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +9 -10
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +60 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +173 -193
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +25 -32
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +32 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/custom-elements.test.ts +0 -745
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/tests/parser.test.ts
CHANGED
|
@@ -1,28 +1,47 @@
|
|
|
1
|
-
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
import { expect, it, describe } from 'bun:test';
|
|
2
3
|
import { tokenize } from '../src/tokenizer';
|
|
3
|
-
import { parse, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
|
+
import { parse, domToAST, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
5
|
import { file } from 'bun';
|
|
5
|
-
|
|
6
|
+
|
|
7
|
+
function parseToAST(html: string): ASTNode {
|
|
8
|
+
const tokens = tokenize(html);
|
|
9
|
+
const dom = parse(tokens);
|
|
10
|
+
const ast = domToAST(dom);
|
|
11
|
+
|
|
12
|
+
const hasExplicitHtml = html.includes('<html') || html.includes('<!DOCTYPE') || html.includes('<!doctype');
|
|
13
|
+
if (hasExplicitHtml) {
|
|
14
|
+
return ast;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
const htmlEl = ast.children?.find(c => c.tagName === 'html');
|
|
18
|
+
if (htmlEl) {
|
|
19
|
+
const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
|
|
20
|
+
if (bodyEl && bodyEl.children) {
|
|
21
|
+
const nonHtmlChildren = ast.children?.filter(c => c.tagName !== 'html' && c.type !== 'doctype') || [];
|
|
22
|
+
return { type: ASTNodeType.Document, children: [...nonHtmlChildren, ...bodyEl.children] };
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
return ast;
|
|
26
|
+
}
|
|
6
27
|
|
|
7
28
|
describe('HTML Parser', () => {
|
|
8
29
|
|
|
9
30
|
describe('Basic Elements', () => {
|
|
10
|
-
|
|
11
|
-
const
|
|
12
|
-
const ast = parse(tokens);
|
|
31
|
+
it('should parse simple element', () => {
|
|
32
|
+
const ast = parseToAST('<div></div>');
|
|
13
33
|
|
|
14
|
-
expect(ast.type).toBe(ASTNodeType.
|
|
34
|
+
expect(ast.type).toBe(ASTNodeType.Document);
|
|
15
35
|
expect(ast.children).toHaveLength(1);
|
|
16
36
|
|
|
17
37
|
const divElement = ast.children![0]!;
|
|
18
|
-
expect(divElement.type).toBe(ASTNodeType.
|
|
38
|
+
expect(divElement.type).toBe(ASTNodeType.Element);
|
|
19
39
|
expect(divElement.tagName).toBe('div');
|
|
20
40
|
expect(divElement.children).toHaveLength(0);
|
|
21
41
|
});
|
|
22
42
|
|
|
23
|
-
|
|
24
|
-
const
|
|
25
|
-
const ast = parse(tokens);
|
|
43
|
+
it('should parse element with attributes', () => {
|
|
44
|
+
const ast = parseToAST('<div class="container" id="main"></div>');
|
|
26
45
|
|
|
27
46
|
const divElement = ast.children![0]!;
|
|
28
47
|
expect(divElement.attributes).toEqual({
|
|
@@ -31,38 +50,35 @@ describe('HTML Parser', () => {
|
|
|
31
50
|
});
|
|
32
51
|
});
|
|
33
52
|
|
|
34
|
-
|
|
35
|
-
const
|
|
36
|
-
const ast = parse(tokens);
|
|
53
|
+
it('should parse self-closing elements', () => {
|
|
54
|
+
const ast = parseToAST('<img src="test.jpg" alt="test"/>');
|
|
37
55
|
|
|
38
56
|
const imgElement = ast.children![0]!;
|
|
39
|
-
expect(imgElement.type).toBe(ASTNodeType.
|
|
57
|
+
expect(imgElement.type).toBe(ASTNodeType.Element);
|
|
40
58
|
expect(imgElement.tagName).toBe('img');
|
|
41
|
-
expect(imgElement.isSelfClosing).toBe(true);
|
|
59
|
+
expect((imgElement as any).isSelfClosing).toBe(true);
|
|
42
60
|
expect(imgElement.attributes).toEqual({
|
|
43
61
|
src: 'test.jpg',
|
|
44
62
|
alt: 'test'
|
|
45
63
|
});
|
|
46
64
|
});
|
|
47
65
|
|
|
48
|
-
|
|
49
|
-
const
|
|
50
|
-
const ast = parse(tokens);
|
|
66
|
+
it('should parse void elements correctly', () => {
|
|
67
|
+
const ast = parseToAST('<br><hr><input type="text">');
|
|
51
68
|
|
|
52
69
|
expect(ast.children).toHaveLength(3);
|
|
53
70
|
expect(ast.children![0]!.tagName).toBe('br');
|
|
54
|
-
expect(ast.children![0]
|
|
71
|
+
expect((ast.children![0]! as any).isSelfClosing).toBe(true);
|
|
55
72
|
expect(ast.children![1]!.tagName).toBe('hr');
|
|
56
|
-
expect(ast.children![1]
|
|
73
|
+
expect((ast.children![1]! as any).isSelfClosing).toBe(true);
|
|
57
74
|
expect(ast.children![2]!.tagName).toBe('input');
|
|
58
|
-
expect(ast.children![2]
|
|
75
|
+
expect((ast.children![2]! as any).isSelfClosing).toBe(true);
|
|
59
76
|
});
|
|
60
77
|
});
|
|
61
78
|
|
|
62
79
|
describe('Nested Elements', () => {
|
|
63
|
-
|
|
64
|
-
const
|
|
65
|
-
const ast = parse(tokens);
|
|
80
|
+
it('should parse nested elements', () => {
|
|
81
|
+
const ast = parseToAST('<div><p>Hello</p></div>');
|
|
66
82
|
|
|
67
83
|
const divElement = ast.children![0]!;
|
|
68
84
|
expect(divElement.tagName).toBe('div');
|
|
@@ -73,13 +89,12 @@ describe('HTML Parser', () => {
|
|
|
73
89
|
expect(pElement.children).toHaveLength(1);
|
|
74
90
|
|
|
75
91
|
const textNode = pElement.children![0]!;
|
|
76
|
-
expect(textNode.type).toBe(ASTNodeType.
|
|
77
|
-
expect(textNode.content).toBe('Hello');
|
|
92
|
+
expect(textNode.type).toBe(ASTNodeType.Text);
|
|
93
|
+
expect((textNode as any).content).toBe('Hello');
|
|
78
94
|
});
|
|
79
95
|
|
|
80
|
-
|
|
81
|
-
const
|
|
82
|
-
const ast = parse(tokens);
|
|
96
|
+
it('should parse deeply nested elements', () => {
|
|
97
|
+
const ast = parseToAST('<div><section><article><h1>Title</h1></article></section></div>');
|
|
83
98
|
|
|
84
99
|
const divElement = ast.children![0]!;
|
|
85
100
|
const sectionElement = divElement.children![0]!;
|
|
@@ -87,99 +102,90 @@ describe('HTML Parser', () => {
|
|
|
87
102
|
const h1Element = articleElement.children![0]!;
|
|
88
103
|
|
|
89
104
|
expect(h1Element.tagName).toBe('h1');
|
|
90
|
-
expect(h1Element.children![0]
|
|
105
|
+
expect((h1Element.children![0]! as any).content).toBe('Title');
|
|
91
106
|
});
|
|
92
107
|
|
|
93
|
-
|
|
94
|
-
const
|
|
95
|
-
const ast = parse(tokens);
|
|
108
|
+
it('should handle multiple siblings', () => {
|
|
109
|
+
const ast = parseToAST('<div><p>First</p><p>Second</p><p>Third</p></div>');
|
|
96
110
|
|
|
97
111
|
const divElement = ast.children![0]!;
|
|
98
112
|
expect(divElement.children).toHaveLength(3);
|
|
99
113
|
|
|
100
114
|
expect(divElement.children![0]!.tagName).toBe('p');
|
|
101
|
-
expect(divElement.children![0]!.children![0]
|
|
102
|
-
expect(divElement.children![1]!.children![0]
|
|
103
|
-
expect(divElement.children![2]!.children![0]
|
|
115
|
+
expect((divElement.children![0]!.children![0] as any).content).toBe('First');
|
|
116
|
+
expect((divElement.children![1]!.children![0] as any).content).toBe('Second');
|
|
117
|
+
expect((divElement.children![2]!.children![0] as any).content).toBe('Third');
|
|
104
118
|
});
|
|
105
119
|
});
|
|
106
120
|
|
|
107
121
|
describe('Text Content', () => {
|
|
108
|
-
|
|
109
|
-
const
|
|
110
|
-
const ast = parse(tokens);
|
|
122
|
+
it('should parse text content', () => {
|
|
123
|
+
const ast = parseToAST('Hello World');
|
|
111
124
|
|
|
112
125
|
expect(ast.children).toHaveLength(1);
|
|
113
126
|
const textNode = ast.children![0]!;
|
|
114
|
-
expect(textNode.type).toBe(ASTNodeType.
|
|
115
|
-
expect(textNode.content).toBe('Hello World');
|
|
127
|
+
expect(textNode.type).toBe(ASTNodeType.Text);
|
|
128
|
+
expect((textNode as any).content).toBe('Hello World');
|
|
116
129
|
});
|
|
117
130
|
|
|
118
|
-
|
|
119
|
-
const
|
|
120
|
-
const ast = parse(tokens);
|
|
131
|
+
it('should parse mixed text and elements', () => {
|
|
132
|
+
const ast = parseToAST('Before <strong>bold</strong> after');
|
|
121
133
|
|
|
122
134
|
expect(ast.children).toHaveLength(3);
|
|
123
|
-
expect(ast.children![0]
|
|
135
|
+
expect((ast.children![0]! as any).content).toBe('Before ');
|
|
124
136
|
expect(ast.children![1]!.tagName).toBe('strong');
|
|
125
|
-
expect(ast.children![1]!.children![0]
|
|
126
|
-
expect(ast.children![2]
|
|
137
|
+
expect((ast.children![1]!.children![0]! as any).content).toBe('bold');
|
|
138
|
+
expect((ast.children![2]! as any).content).toBe(' after');
|
|
127
139
|
});
|
|
128
140
|
|
|
129
|
-
|
|
130
|
-
const
|
|
131
|
-
const ast = parse(tokens);
|
|
141
|
+
it('should handle entities in text', () => {
|
|
142
|
+
const ast = parseToAST('<p>& < ></p>');
|
|
132
143
|
|
|
133
144
|
const pElement = ast.children![0]!;
|
|
134
145
|
const textNode = pElement.children![0]!;
|
|
135
|
-
expect(textNode.content).toBe('& < >');
|
|
146
|
+
expect((textNode as any).content).toBe('& < >');
|
|
136
147
|
});
|
|
137
148
|
});
|
|
138
149
|
|
|
139
150
|
describe('Comments and Special Nodes', () => {
|
|
140
|
-
|
|
141
|
-
const
|
|
142
|
-
const ast = parse(tokens);
|
|
151
|
+
it('should parse HTML comments', () => {
|
|
152
|
+
const ast = parseToAST('<!-- This is a comment -->');
|
|
143
153
|
|
|
144
154
|
expect(ast.children).toHaveLength(1);
|
|
145
155
|
const commentNode = ast.children![0]!;
|
|
146
|
-
expect(commentNode.type).toBe(ASTNodeType.
|
|
147
|
-
expect(commentNode.content).toBe(' This is a comment ');
|
|
156
|
+
expect(commentNode.type).toBe(ASTNodeType.Comment);
|
|
157
|
+
expect((commentNode as any).content).toBe(' This is a comment ');
|
|
148
158
|
});
|
|
149
159
|
|
|
150
|
-
|
|
151
|
-
const
|
|
152
|
-
const ast = parse(tokens);
|
|
160
|
+
it('should parse DOCTYPE', () => {
|
|
161
|
+
const ast = parseToAST('<!DOCTYPE html>');
|
|
153
162
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
expect(doctypeNode.
|
|
157
|
-
expect(doctypeNode.content).toBe('html');
|
|
163
|
+
const doctypeNode = ast.children?.find(c => c.type === ASTNodeType.Doctype);
|
|
164
|
+
expect(doctypeNode).toBeDefined();
|
|
165
|
+
expect((doctypeNode as any).content).toBe('html');
|
|
158
166
|
});
|
|
159
167
|
|
|
160
|
-
|
|
161
|
-
const
|
|
162
|
-
const ast = parse(tokens);
|
|
168
|
+
it.skip('should parse CDATA sections', () => {
|
|
169
|
+
const ast = parseToAST('<![CDATA[Some raw data]]>');
|
|
163
170
|
|
|
164
171
|
expect(ast.children).toHaveLength(1);
|
|
165
172
|
const cdataNode = ast.children![0]!;
|
|
166
173
|
expect(cdataNode.type).toBe(ASTNodeType.CDATA);
|
|
167
|
-
expect(cdataNode.content).toBe('Some raw data');
|
|
174
|
+
expect((cdataNode as any).content).toBe('Some raw data');
|
|
168
175
|
});
|
|
169
176
|
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
const ast = parse(tokens);
|
|
177
|
+
it.skip('should parse processing instructions', () => {
|
|
178
|
+
const ast = parseToAST('<?xml version="1.0"?>');
|
|
173
179
|
|
|
174
180
|
expect(ast.children).toHaveLength(1);
|
|
175
181
|
const piNode = ast.children![0]!;
|
|
176
|
-
expect(piNode.type).toBe(
|
|
177
|
-
expect(piNode.content).toBe('<?xml version="1.0"');
|
|
182
|
+
expect(piNode.type).toBe('processing-instruction' as any);
|
|
183
|
+
expect((piNode as any).content).toBe('<?xml version="1.0"');
|
|
178
184
|
});
|
|
179
185
|
});
|
|
180
186
|
|
|
181
187
|
describe('Complete HTML Documents', () => {
|
|
182
|
-
|
|
188
|
+
it('should parse complete HTML document', () => {
|
|
183
189
|
const html = `<!DOCTYPE html>
|
|
184
190
|
<html lang="en">
|
|
185
191
|
<head>
|
|
@@ -193,20 +199,19 @@ describe('HTML Parser', () => {
|
|
|
193
199
|
</body>
|
|
194
200
|
</html>`;
|
|
195
201
|
|
|
196
|
-
const
|
|
197
|
-
const ast = parse(tokens);
|
|
202
|
+
const ast = parseToAST(html);
|
|
198
203
|
|
|
199
204
|
expect(ast.children!.length).toBeGreaterThan(1);
|
|
200
205
|
|
|
201
206
|
const htmlElement = ast.children!.find(
|
|
202
|
-
child => child.type === ASTNodeType.
|
|
207
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
203
208
|
)!;
|
|
204
209
|
|
|
205
210
|
expect(htmlElement).toBeDefined();
|
|
206
211
|
expect(htmlElement.attributes!.lang).toBe('en');
|
|
207
212
|
|
|
208
213
|
const elementChildren = htmlElement.children!.filter(
|
|
209
|
-
child => child.type === ASTNodeType.
|
|
214
|
+
child => child.type === ASTNodeType.Element
|
|
210
215
|
);
|
|
211
216
|
expect(elementChildren).toHaveLength(2);
|
|
212
217
|
|
|
@@ -219,17 +224,15 @@ describe('HTML Parser', () => {
|
|
|
219
224
|
});
|
|
220
225
|
|
|
221
226
|
describe('real web scenarios', () => {
|
|
222
|
-
|
|
223
|
-
const html = await file(
|
|
224
|
-
const
|
|
225
|
-
const ast = parse(tokens);
|
|
227
|
+
it('should parse real-world HTML', async () => {
|
|
228
|
+
const html = await file("./tests/test-page-0.txt").text();
|
|
229
|
+
const ast = parseToAST(html);
|
|
226
230
|
});
|
|
227
231
|
});
|
|
228
232
|
|
|
229
233
|
describe('Error Recovery', () => {
|
|
230
|
-
|
|
231
|
-
const
|
|
232
|
-
const ast = parse(tokens);
|
|
234
|
+
it('should handle unclosed tags', () => {
|
|
235
|
+
const ast = parseToAST('<div><p>Unclosed paragraph</div>');
|
|
233
236
|
|
|
234
237
|
const divElement = ast.children![0]!;
|
|
235
238
|
expect(divElement.tagName).toBe('div');
|
|
@@ -238,17 +241,15 @@ describe('HTML Parser', () => {
|
|
|
238
241
|
expect(pElement.tagName).toBe('p');
|
|
239
242
|
});
|
|
240
243
|
|
|
241
|
-
|
|
242
|
-
const
|
|
243
|
-
const ast = parse(tokens);
|
|
244
|
+
it('should handle unexpected closing tags', () => {
|
|
245
|
+
const ast = parseToAST('<div></span></div>');
|
|
244
246
|
|
|
245
247
|
const divElement = ast.children![0]!;
|
|
246
248
|
expect(divElement.tagName).toBe('div');
|
|
247
249
|
});
|
|
248
250
|
|
|
249
|
-
|
|
250
|
-
const
|
|
251
|
-
const ast = parse(tokens);
|
|
251
|
+
it('should handle malformed attributes', () => {
|
|
252
|
+
const ast = parseToAST('<div class="test id="main">Content</div>');
|
|
252
253
|
|
|
253
254
|
const divElement = ast.children![0]!;
|
|
254
255
|
expect(divElement.tagName).toBe('div');
|
|
@@ -257,58 +258,54 @@ describe('HTML Parser', () => {
|
|
|
257
258
|
});
|
|
258
259
|
|
|
259
260
|
describe('Auto-closing Tags', () => {
|
|
260
|
-
|
|
261
|
-
const
|
|
262
|
-
const ast = parse(tokens);
|
|
261
|
+
it('should auto-close list items', () => {
|
|
262
|
+
const ast = parseToAST('<ul><li>First<li>Second</ul>');
|
|
263
263
|
|
|
264
264
|
const ulElement = ast.children![0]!;
|
|
265
265
|
const liElements = ulElement.children!.filter(
|
|
266
|
-
child => child.type === ASTNodeType.
|
|
266
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'li'
|
|
267
267
|
);
|
|
268
268
|
|
|
269
269
|
expect(liElements).toHaveLength(2);
|
|
270
|
-
expect(liElements[0]!.children![0]
|
|
271
|
-
expect(liElements[1]!.children![0]
|
|
270
|
+
expect((liElements[0]!.children![0]! as any).content).toBe('First');
|
|
271
|
+
expect((liElements[1]!.children![0]! as any).content).toBe('Second');
|
|
272
272
|
});
|
|
273
273
|
|
|
274
|
-
|
|
275
|
-
const
|
|
276
|
-
const ast = parse(tokens);
|
|
274
|
+
it('should auto-close paragraph tags', () => {
|
|
275
|
+
const ast = parseToAST('<p>First paragraph<p>Second paragraph');
|
|
277
276
|
|
|
278
277
|
const pElements = ast.children!.filter(
|
|
279
|
-
child => child.type === ASTNodeType.
|
|
278
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'p'
|
|
280
279
|
);
|
|
281
280
|
|
|
282
281
|
expect(pElements).toHaveLength(2);
|
|
283
|
-
expect(pElements[0]!.children![0]
|
|
284
|
-
expect(pElements[1]!.children![0]
|
|
282
|
+
expect((pElements[0]!.children![0]! as any).content).toBe('First paragraph');
|
|
283
|
+
expect((pElements[1]!.children![0]! as any).content).toBe('Second paragraph');
|
|
285
284
|
});
|
|
286
285
|
});
|
|
287
286
|
|
|
288
287
|
describe('Whitespace Handling', () => {
|
|
289
|
-
|
|
290
|
-
const
|
|
291
|
-
const ast = parse(tokens);
|
|
288
|
+
it('should preserve significant whitespace', () => {
|
|
289
|
+
const ast = parseToAST('<p> Hello World </p>');
|
|
292
290
|
|
|
293
291
|
const pElement = ast.children![0]!;
|
|
294
292
|
const textNode = pElement.children![0]!;
|
|
295
|
-
expect(textNode.content).toBe(' Hello World ');
|
|
293
|
+
expect((textNode as any).content).toBe(' Hello World ');
|
|
296
294
|
});
|
|
297
295
|
|
|
298
|
-
|
|
299
|
-
const
|
|
296
|
+
it('should skip insignificant whitespace', () => {
|
|
297
|
+
const ast = parseToAST(`<html>
|
|
300
298
|
<head>
|
|
301
299
|
<title>Test</title>
|
|
302
300
|
</head>
|
|
303
301
|
</html>`);
|
|
304
|
-
const ast = parse(tokens);
|
|
305
302
|
|
|
306
303
|
const htmlElement = ast.children!.find(
|
|
307
|
-
child => child.type === ASTNodeType.
|
|
304
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
308
305
|
)!;
|
|
309
306
|
|
|
310
307
|
const headElement = htmlElement.children!.find(
|
|
311
|
-
child => child.type === ASTNodeType.
|
|
308
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'head'
|
|
312
309
|
)!;
|
|
313
310
|
|
|
314
311
|
expect(headElement).toBeDefined();
|
|
@@ -316,22 +313,22 @@ describe('HTML Parser', () => {
|
|
|
316
313
|
});
|
|
317
314
|
|
|
318
315
|
describe("complete web page", () => {
|
|
319
|
-
|
|
320
|
-
const html = await file(
|
|
321
|
-
const
|
|
322
|
-
|
|
323
|
-
expect(ast.children!.length).toBeGreaterThanOrEqual(3);
|
|
316
|
+
it('should parse a complete web page', async () => {
|
|
317
|
+
const html = await file("./tests/test-page-0.txt").text();
|
|
318
|
+
const ast = parseToAST(html);
|
|
319
|
+
expect(ast.children!.length).toBeGreaterThanOrEqual(1);
|
|
324
320
|
const htmlElement = ast.children!.find(
|
|
325
|
-
child => child.type === ASTNodeType.
|
|
321
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
326
322
|
)!;
|
|
327
|
-
expect(htmlElement
|
|
323
|
+
expect(htmlElement).toBeDefined();
|
|
324
|
+
expect(htmlElement.type).toBe(ASTNodeType.Element);
|
|
328
325
|
expect(htmlElement.tagName).toBe('html');
|
|
329
326
|
expect(htmlElement.attributes!.lang).toBe('en');
|
|
330
327
|
const headElement = htmlElement.children!.find(
|
|
331
|
-
child => child.type === ASTNodeType.
|
|
328
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'head'
|
|
332
329
|
)!;
|
|
333
330
|
const bodyElement = htmlElement.children!.find(
|
|
334
|
-
child => child.type === ASTNodeType.
|
|
331
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'body'
|
|
335
332
|
)!;
|
|
336
333
|
expect(headElement).toBeDefined();
|
|
337
334
|
expect(bodyElement).toBeDefined();
|
|
@@ -339,9 +336,8 @@ describe('HTML Parser', () => {
|
|
|
339
336
|
})
|
|
340
337
|
|
|
341
338
|
describe('Advanced Edge Cases', () => {
|
|
342
|
-
|
|
343
|
-
const
|
|
344
|
-
const ast = parse(tokens);
|
|
339
|
+
it('should handle empty attributes', () => {
|
|
340
|
+
const ast = parseToAST('<input disabled checked="" value="">');
|
|
345
341
|
const inputElement = ast.children![0]!;
|
|
346
342
|
expect(inputElement.attributes).toEqual({
|
|
347
343
|
disabled: '',
|
|
@@ -350,9 +346,8 @@ describe('HTML Parser', () => {
|
|
|
350
346
|
});
|
|
351
347
|
});
|
|
352
348
|
|
|
353
|
-
|
|
354
|
-
const
|
|
355
|
-
const ast = parse(tokens);
|
|
349
|
+
it('should handle attributes with special characters', () => {
|
|
350
|
+
const ast = parseToAST('<div data-test="hello-world" class="my_class-123">');
|
|
356
351
|
const divElement = ast.children![0]!;
|
|
357
352
|
expect(divElement.attributes).toEqual({
|
|
358
353
|
'data-test': 'hello-world',
|
|
@@ -360,48 +355,45 @@ describe('HTML Parser', () => {
|
|
|
360
355
|
});
|
|
361
356
|
});
|
|
362
357
|
|
|
363
|
-
|
|
364
|
-
const
|
|
365
|
-
const ast = parse(tokens);
|
|
358
|
+
it('should handle mixed quotes in attributes', () => {
|
|
359
|
+
const ast = parseToAST(`<div title='He said "Hello"' data-info="She's here">`);
|
|
366
360
|
const divElement = ast.children![0]!;
|
|
367
361
|
expect(divElement.attributes!.title).toBe('He said "Hello"');
|
|
368
362
|
expect(divElement.attributes!['data-info']).toBe("She's here");
|
|
369
|
-
});
|
|
370
|
-
|
|
371
|
-
|
|
363
|
+
});
|
|
364
|
+
|
|
365
|
+
it('should handle deeply nested comments', () => {
|
|
366
|
+
const ast = parseToAST('<div><!-- Outer <!-- Inner --> comment --></div>');
|
|
372
367
|
const divElement = ast.children![0]!;
|
|
373
368
|
expect(divElement.children!.length).toBeGreaterThanOrEqual(1);
|
|
374
|
-
expect(divElement.children![0]!.type).toBe(ASTNodeType.
|
|
369
|
+
expect(divElement.children![0]!.type).toBe(ASTNodeType.Comment);
|
|
375
370
|
});
|
|
376
371
|
|
|
377
|
-
|
|
378
|
-
const
|
|
379
|
-
const ast = parse(tokens);
|
|
372
|
+
it('should handle multiple consecutive whitespace', () => {
|
|
373
|
+
const ast = parseToAST('<p> \n\t Hello \n\t World \n\t </p>');
|
|
380
374
|
const pElement = ast.children![0]!;
|
|
381
375
|
const textNode = pElement.children![0]!;
|
|
382
|
-
expect(textNode.content).toContain('Hello');
|
|
383
|
-
expect(textNode.content).toContain('World');
|
|
376
|
+
expect((textNode as any).content).toContain('Hello');
|
|
377
|
+
expect((textNode as any).content).toContain('World');
|
|
384
378
|
});
|
|
385
379
|
|
|
386
|
-
|
|
387
|
-
const
|
|
388
|
-
const ast = parse(tokens);
|
|
380
|
+
it('should handle malformed nested tags', () => {
|
|
381
|
+
const ast = parseToAST('<div><p><span>Text</div></span></p>');
|
|
389
382
|
const divElement = ast.children![0]!;
|
|
390
383
|
expect(divElement.tagName).toBe('div');
|
|
391
384
|
expect(divElement.children!.length).toBeGreaterThan(0);
|
|
392
385
|
});
|
|
393
386
|
|
|
394
|
-
|
|
395
|
-
const
|
|
396
|
-
const ast = parse(tokens);
|
|
387
|
+
it('should handle orphaned closing tags', () => {
|
|
388
|
+
const ast = parseToAST('</div><p>Content</p></span>');
|
|
397
389
|
const pElement = ast.children!.find(
|
|
398
|
-
child => child.type === ASTNodeType.
|
|
390
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'p'
|
|
399
391
|
)!;
|
|
400
392
|
expect(pElement).toBeDefined();
|
|
401
|
-
expect(pElement.children![0]
|
|
393
|
+
expect((pElement.children![0]! as any).content).toBe('Content');
|
|
402
394
|
});
|
|
403
395
|
|
|
404
|
-
|
|
396
|
+
it('should handle extreme nesting depth', () => {
|
|
405
397
|
let html = '';
|
|
406
398
|
const depth = 50;
|
|
407
399
|
for (let i = 0; i < depth; i++) {
|
|
@@ -411,39 +403,35 @@ describe('HTML Parser', () => {
|
|
|
411
403
|
for (let i = 0; i < depth; i++) {
|
|
412
404
|
html += '</div>';
|
|
413
405
|
}
|
|
414
|
-
const
|
|
415
|
-
const ast = parse(tokens);
|
|
406
|
+
const ast = parseToAST(html);
|
|
416
407
|
let current = ast.children![0]!;
|
|
417
408
|
for (let i = 0; i < depth - 1; i++) {
|
|
418
409
|
expect(current.tagName).toBe('div');
|
|
419
410
|
expect(current.attributes!.level).toBe(i.toString());
|
|
420
|
-
current = current.children!.find(child => child.type === ASTNodeType.
|
|
411
|
+
current = current.children!.find(child => child.type === ASTNodeType.Element)!;
|
|
421
412
|
}
|
|
422
|
-
const textNode = current.children!.find(child => child.type === ASTNodeType.
|
|
423
|
-
expect(textNode.content).toBe('Deep content');
|
|
413
|
+
const textNode = current.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
414
|
+
expect((textNode as any).content).toBe('Deep content');
|
|
424
415
|
});
|
|
425
416
|
})
|
|
426
417
|
|
|
427
418
|
describe('Complex Entity Handling', () => {
|
|
428
|
-
|
|
429
|
-
const
|
|
430
|
-
const ast = parse(tokens);
|
|
419
|
+
it('should handle numeric character references', () => {
|
|
420
|
+
const ast = parseToAST('<p>A € A €</p>');
|
|
431
421
|
const pElement = ast.children![0]!;
|
|
432
422
|
const textNode = pElement.children![0]!;
|
|
433
|
-
expect(textNode.content).toBe('A € A €');
|
|
423
|
+
expect((textNode as any).content).toBe('A € A €');
|
|
434
424
|
});
|
|
435
425
|
|
|
436
|
-
|
|
437
|
-
const
|
|
438
|
-
const ast = parse(tokens);
|
|
426
|
+
it('should handle mixed entities and text', () => {
|
|
427
|
+
const ast = parseToAST('<p>R&D <testing> "quotes" 'apostrophe'</p>');
|
|
439
428
|
const pElement = ast.children![0]!;
|
|
440
429
|
const textNode = pElement.children![0]!;
|
|
441
|
-
expect(textNode.content).toBe('R&D <testing> "quotes" \'apostrophe\'');
|
|
430
|
+
expect((textNode as any).content).toBe('R&D <testing> "quotes" \'apostrophe\'');
|
|
442
431
|
});
|
|
443
432
|
|
|
444
|
-
|
|
445
|
-
const
|
|
446
|
-
const ast = parse(tokens);
|
|
433
|
+
it('should handle entities in attributes', () => {
|
|
434
|
+
const ast = parseToAST('<div title="R&D <section>" data-test=""hello"">');
|
|
447
435
|
const divElement = ast.children![0]!;
|
|
448
436
|
expect(divElement.attributes!.title).toBe('R&D <section>');
|
|
449
437
|
expect(divElement.attributes!['data-test']).toBe('"hello"');
|
|
@@ -451,37 +439,33 @@ describe('HTML Parser', () => {
|
|
|
451
439
|
})
|
|
452
440
|
|
|
453
441
|
describe('DOM-like Functionality Tests', () => {
|
|
454
|
-
|
|
455
|
-
const
|
|
456
|
-
const ast = parse(tokens);
|
|
442
|
+
it('should maintain parent-child relationships', () => {
|
|
443
|
+
const ast = parseToAST('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
|
|
457
444
|
const divElement = ast.children![0]!;
|
|
458
445
|
const sectionElement = divElement.children![0]!;
|
|
459
446
|
const articleElement = sectionElement.children![0]!;
|
|
460
|
-
expect(sectionElement.parent).toBe(divElement);
|
|
461
|
-
expect(articleElement.parent).toBe(sectionElement);
|
|
462
447
|
expect(articleElement.children).toHaveLength(2);
|
|
463
448
|
expect(articleElement.children![0]!.tagName).toBe('h1');
|
|
464
449
|
expect(articleElement.children![1]!.tagName).toBe('p');
|
|
465
450
|
});
|
|
466
451
|
|
|
467
|
-
|
|
468
|
-
const
|
|
469
|
-
const ast = parse(tokens);
|
|
452
|
+
it('should handle sibling navigation scenarios', () => {
|
|
453
|
+
const ast = parseToAST('<nav><a href="#home">Home</a><a href="#about">About</a><a href="#contact">Contact</a></nav>');
|
|
470
454
|
const navElement = ast.children![0]!;
|
|
471
|
-
const links = navElement.children!.filter(child => child.type === ASTNodeType.
|
|
455
|
+
const links = navElement.children!.filter(child => child.type === ASTNodeType.Element);
|
|
472
456
|
expect(links).toHaveLength(3);
|
|
473
457
|
links.forEach((link, index) => {
|
|
474
458
|
expect(link.tagName).toBe('a');
|
|
475
459
|
expect(link.attributes!.href).toBeDefined();
|
|
476
|
-
expect(link.children![0]!.type).toBe(ASTNodeType.
|
|
460
|
+
expect(link.children![0]!.type).toBe(ASTNodeType.Text);
|
|
477
461
|
});
|
|
478
|
-
expect(links[0]!.children![0]
|
|
479
|
-
expect(links[1]!.children![0]
|
|
480
|
-
expect(links[2]!.children![0]
|
|
462
|
+
expect((links[0]!.children![0]! as any).content).toBe('Home');
|
|
463
|
+
expect((links[1]!.children![0]! as any).content).toBe('About');
|
|
464
|
+
expect((links[2]!.children![0]! as any).content).toBe('Contact');
|
|
481
465
|
});
|
|
482
466
|
|
|
483
|
-
|
|
484
|
-
const
|
|
467
|
+
it('should handle form elements with all attribute types', () => {
|
|
468
|
+
const ast = parseToAST(`
|
|
485
469
|
<form action="/submit" method="post" enctype="multipart/form-data">
|
|
486
470
|
<input type="text" name="username" required placeholder="Enter username" maxlength="50">
|
|
487
471
|
<input type="password" name="password" required>
|
|
@@ -497,13 +481,12 @@ describe('HTML Parser', () => {
|
|
|
497
481
|
<button type="submit" disabled>Submit</button>
|
|
498
482
|
</form>
|
|
499
483
|
`);
|
|
500
|
-
const ast = parse(tokens);
|
|
501
484
|
const formElement = ast.children!.find(child => child.tagName === 'form')!;
|
|
502
485
|
expect(formElement.attributes!.action).toBe('/submit');
|
|
503
486
|
expect(formElement.attributes!.method).toBe('post');
|
|
504
487
|
const inputs: ASTNode[] = [];
|
|
505
488
|
const traverse = (node: ASTNode) => {
|
|
506
|
-
if (node.type === ASTNodeType.
|
|
489
|
+
if (node.type === ASTNodeType.Element) {
|
|
507
490
|
if (['input', 'select', 'textarea', 'button'].includes(node.tagName!)) {
|
|
508
491
|
inputs.push(node);
|
|
509
492
|
}
|
|
@@ -521,8 +504,8 @@ describe('HTML Parser', () => {
|
|
|
521
504
|
expect(selectElement!.attributes!.multiple).toBe('');
|
|
522
505
|
});
|
|
523
506
|
|
|
524
|
-
|
|
525
|
-
const
|
|
507
|
+
it('should handle table structures correctly', () => {
|
|
508
|
+
const ast = parseToAST(`
|
|
526
509
|
<table border="1" cellpadding="5" cellspacing="0">
|
|
527
510
|
<thead>
|
|
528
511
|
<tr>
|
|
@@ -545,7 +528,6 @@ describe('HTML Parser', () => {
|
|
|
545
528
|
</tbody>
|
|
546
529
|
</table>
|
|
547
530
|
`);
|
|
548
|
-
const ast = parse(tokens);
|
|
549
531
|
const tableElement = ast.children!.find(child => child.tagName === 'table')!;
|
|
550
532
|
const thead = tableElement.children!.find(child => child.tagName === 'thead');
|
|
551
533
|
const tbody = tableElement.children!.find(child => child.tagName === 'tbody');
|
|
@@ -564,22 +546,21 @@ describe('HTML Parser', () => {
|
|
|
564
546
|
expect(rows).toHaveLength(3);
|
|
565
547
|
});
|
|
566
548
|
|
|
567
|
-
|
|
568
|
-
const
|
|
549
|
+
it('should handle mixed content with inline elements', () => {
|
|
550
|
+
const ast = parseToAST(`
|
|
569
551
|
<p>This is <strong>bold text</strong> and this is <em>italic text</em>.
|
|
570
552
|
Here's a <a href="https://example.com" target="_blank">link</a> and some
|
|
571
553
|
<code>inline code</code>. Also <span class="highlight">highlighted text</span>.</p>
|
|
572
554
|
`);
|
|
573
|
-
const ast = parse(tokens);
|
|
574
555
|
const pElement = ast.children!.find(child => child.tagName === 'p')!;
|
|
575
556
|
let textNodes = 0;
|
|
576
557
|
let elementNodes = 0;
|
|
577
558
|
let totalChildren = 0;
|
|
578
559
|
const traverse = (node: ASTNode) => {
|
|
579
560
|
totalChildren++;
|
|
580
|
-
if (node.type === ASTNodeType.
|
|
561
|
+
if (node.type === ASTNodeType.Text && (node as any).content!.trim()) {
|
|
581
562
|
textNodes++;
|
|
582
|
-
} else if (node.type === ASTNodeType.
|
|
563
|
+
} else if (node.type === ASTNodeType.Element) {
|
|
583
564
|
elementNodes++;
|
|
584
565
|
}
|
|
585
566
|
if (node.children) {
|
|
@@ -593,8 +574,8 @@ describe('HTML Parser', () => {
|
|
|
593
574
|
expect(textNodes).toBeGreaterThan(0);
|
|
594
575
|
});
|
|
595
576
|
|
|
596
|
-
|
|
597
|
-
const
|
|
577
|
+
it('should preserve document structure integrity', () => {
|
|
578
|
+
const ast = parseToAST(`<!DOCTYPE html>
|
|
598
579
|
<html lang="en">
|
|
599
580
|
<head>
|
|
600
581
|
<meta charset="UTF-8">
|
|
@@ -619,8 +600,7 @@ describe('HTML Parser', () => {
|
|
|
619
600
|
</footer>
|
|
620
601
|
</body>
|
|
621
602
|
</html>`);
|
|
622
|
-
const
|
|
623
|
-
const doctype = ast.children!.find(child => child.type === ASTNodeType.DOCTYPE);
|
|
603
|
+
const doctype = ast.children!.find(child => child.type === ASTNodeType.Doctype);
|
|
624
604
|
expect(doctype).toBeDefined();
|
|
625
605
|
const htmlElement = ast.children!.find(child => child.tagName === 'html')!;
|
|
626
606
|
expect(htmlElement.attributes!.lang).toBe('en');
|