@tkeron/html-parser 0.1.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +8 -3
- package/index.ts +4 -0
- package/package.json +13 -6
- package/src/css-selector.ts +45 -27
- package/src/dom-simulator.ts +162 -20
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +478 -183
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +59 -139
- package/tests/advanced.test.ts +119 -106
- package/tests/custom-elements.test.ts +172 -162
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +637 -0
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +43 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +172 -193
- package/tests/selectors.test.ts +64 -1
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +83 -0
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +24 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/void-elements.test.ts +471 -0
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/tests/parser.test.ts
CHANGED
|
@@ -1,28 +1,46 @@
|
|
|
1
|
-
|
|
1
|
+
// @ts-nocheck
|
|
2
|
+
import { expect, it, describe } from 'bun:test';
|
|
2
3
|
import { tokenize } from '../src/tokenizer';
|
|
3
|
-
import { parse, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
|
+
import { parse, domToAST, ASTNodeType, type ASTNode } from '../src/parser';
|
|
4
5
|
import { file } from 'bun';
|
|
5
|
-
|
|
6
|
+
|
|
7
|
+
function parseToAST(html: string): ASTNode {
|
|
8
|
+
const tokens = tokenize(html);
|
|
9
|
+
const dom = parse(tokens);
|
|
10
|
+
const ast = domToAST(dom);
|
|
11
|
+
|
|
12
|
+
const hasExplicitHtml = html.includes('<html') || html.includes('<!DOCTYPE') || html.includes('<!doctype');
|
|
13
|
+
if (hasExplicitHtml) {
|
|
14
|
+
return ast;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
const htmlEl = ast.children?.find(c => c.tagName === 'html');
|
|
18
|
+
if (htmlEl) {
|
|
19
|
+
const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
|
|
20
|
+
if (bodyEl && bodyEl.children) {
|
|
21
|
+
return { type: ASTNodeType.Document, children: bodyEl.children };
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
return ast;
|
|
25
|
+
}
|
|
6
26
|
|
|
7
27
|
describe('HTML Parser', () => {
|
|
8
28
|
|
|
9
29
|
describe('Basic Elements', () => {
|
|
10
|
-
|
|
11
|
-
const
|
|
12
|
-
const ast = parse(tokens);
|
|
30
|
+
it('should parse simple element', () => {
|
|
31
|
+
const ast = parseToAST('<div></div>');
|
|
13
32
|
|
|
14
|
-
expect(ast.type).toBe(ASTNodeType.
|
|
33
|
+
expect(ast.type).toBe(ASTNodeType.Document);
|
|
15
34
|
expect(ast.children).toHaveLength(1);
|
|
16
35
|
|
|
17
36
|
const divElement = ast.children![0]!;
|
|
18
|
-
expect(divElement.type).toBe(ASTNodeType.
|
|
37
|
+
expect(divElement.type).toBe(ASTNodeType.Element);
|
|
19
38
|
expect(divElement.tagName).toBe('div');
|
|
20
39
|
expect(divElement.children).toHaveLength(0);
|
|
21
40
|
});
|
|
22
41
|
|
|
23
|
-
|
|
24
|
-
const
|
|
25
|
-
const ast = parse(tokens);
|
|
42
|
+
it('should parse element with attributes', () => {
|
|
43
|
+
const ast = parseToAST('<div class="container" id="main"></div>');
|
|
26
44
|
|
|
27
45
|
const divElement = ast.children![0]!;
|
|
28
46
|
expect(divElement.attributes).toEqual({
|
|
@@ -31,38 +49,35 @@ describe('HTML Parser', () => {
|
|
|
31
49
|
});
|
|
32
50
|
});
|
|
33
51
|
|
|
34
|
-
|
|
35
|
-
const
|
|
36
|
-
const ast = parse(tokens);
|
|
52
|
+
it('should parse self-closing elements', () => {
|
|
53
|
+
const ast = parseToAST('<img src="test.jpg" alt="test"/>');
|
|
37
54
|
|
|
38
55
|
const imgElement = ast.children![0]!;
|
|
39
|
-
expect(imgElement.type).toBe(ASTNodeType.
|
|
56
|
+
expect(imgElement.type).toBe(ASTNodeType.Element);
|
|
40
57
|
expect(imgElement.tagName).toBe('img');
|
|
41
|
-
expect(imgElement.isSelfClosing).toBe(true);
|
|
58
|
+
expect((imgElement as any).isSelfClosing).toBe(true);
|
|
42
59
|
expect(imgElement.attributes).toEqual({
|
|
43
60
|
src: 'test.jpg',
|
|
44
61
|
alt: 'test'
|
|
45
62
|
});
|
|
46
63
|
});
|
|
47
64
|
|
|
48
|
-
|
|
49
|
-
const
|
|
50
|
-
const ast = parse(tokens);
|
|
65
|
+
it('should parse void elements correctly', () => {
|
|
66
|
+
const ast = parseToAST('<br><hr><input type="text">');
|
|
51
67
|
|
|
52
68
|
expect(ast.children).toHaveLength(3);
|
|
53
69
|
expect(ast.children![0]!.tagName).toBe('br');
|
|
54
|
-
expect(ast.children![0]
|
|
70
|
+
expect((ast.children![0]! as any).isSelfClosing).toBe(true);
|
|
55
71
|
expect(ast.children![1]!.tagName).toBe('hr');
|
|
56
|
-
expect(ast.children![1]
|
|
72
|
+
expect((ast.children![1]! as any).isSelfClosing).toBe(true);
|
|
57
73
|
expect(ast.children![2]!.tagName).toBe('input');
|
|
58
|
-
expect(ast.children![2]
|
|
74
|
+
expect((ast.children![2]! as any).isSelfClosing).toBe(true);
|
|
59
75
|
});
|
|
60
76
|
});
|
|
61
77
|
|
|
62
78
|
describe('Nested Elements', () => {
|
|
63
|
-
|
|
64
|
-
const
|
|
65
|
-
const ast = parse(tokens);
|
|
79
|
+
it('should parse nested elements', () => {
|
|
80
|
+
const ast = parseToAST('<div><p>Hello</p></div>');
|
|
66
81
|
|
|
67
82
|
const divElement = ast.children![0]!;
|
|
68
83
|
expect(divElement.tagName).toBe('div');
|
|
@@ -73,13 +88,12 @@ describe('HTML Parser', () => {
|
|
|
73
88
|
expect(pElement.children).toHaveLength(1);
|
|
74
89
|
|
|
75
90
|
const textNode = pElement.children![0]!;
|
|
76
|
-
expect(textNode.type).toBe(ASTNodeType.
|
|
77
|
-
expect(textNode.content).toBe('Hello');
|
|
91
|
+
expect(textNode.type).toBe(ASTNodeType.Text);
|
|
92
|
+
expect((textNode as any).content).toBe('Hello');
|
|
78
93
|
});
|
|
79
94
|
|
|
80
|
-
|
|
81
|
-
const
|
|
82
|
-
const ast = parse(tokens);
|
|
95
|
+
it('should parse deeply nested elements', () => {
|
|
96
|
+
const ast = parseToAST('<div><section><article><h1>Title</h1></article></section></div>');
|
|
83
97
|
|
|
84
98
|
const divElement = ast.children![0]!;
|
|
85
99
|
const sectionElement = divElement.children![0]!;
|
|
@@ -87,99 +101,90 @@ describe('HTML Parser', () => {
|
|
|
87
101
|
const h1Element = articleElement.children![0]!;
|
|
88
102
|
|
|
89
103
|
expect(h1Element.tagName).toBe('h1');
|
|
90
|
-
expect(h1Element.children![0]
|
|
104
|
+
expect((h1Element.children![0]! as any).content).toBe('Title');
|
|
91
105
|
});
|
|
92
106
|
|
|
93
|
-
|
|
94
|
-
const
|
|
95
|
-
const ast = parse(tokens);
|
|
107
|
+
it('should handle multiple siblings', () => {
|
|
108
|
+
const ast = parseToAST('<div><p>First</p><p>Second</p><p>Third</p></div>');
|
|
96
109
|
|
|
97
110
|
const divElement = ast.children![0]!;
|
|
98
111
|
expect(divElement.children).toHaveLength(3);
|
|
99
112
|
|
|
100
113
|
expect(divElement.children![0]!.tagName).toBe('p');
|
|
101
|
-
expect(divElement.children![0]!.children![0]
|
|
102
|
-
expect(divElement.children![1]!.children![0]
|
|
103
|
-
expect(divElement.children![2]!.children![0]
|
|
114
|
+
expect((divElement.children![0]!.children![0] as any).content).toBe('First');
|
|
115
|
+
expect((divElement.children![1]!.children![0] as any).content).toBe('Second');
|
|
116
|
+
expect((divElement.children![2]!.children![0] as any).content).toBe('Third');
|
|
104
117
|
});
|
|
105
118
|
});
|
|
106
119
|
|
|
107
120
|
describe('Text Content', () => {
|
|
108
|
-
|
|
109
|
-
const
|
|
110
|
-
const ast = parse(tokens);
|
|
121
|
+
it('should parse text content', () => {
|
|
122
|
+
const ast = parseToAST('Hello World');
|
|
111
123
|
|
|
112
124
|
expect(ast.children).toHaveLength(1);
|
|
113
125
|
const textNode = ast.children![0]!;
|
|
114
|
-
expect(textNode.type).toBe(ASTNodeType.
|
|
115
|
-
expect(textNode.content).toBe('Hello World');
|
|
126
|
+
expect(textNode.type).toBe(ASTNodeType.Text);
|
|
127
|
+
expect((textNode as any).content).toBe('Hello World');
|
|
116
128
|
});
|
|
117
129
|
|
|
118
|
-
|
|
119
|
-
const
|
|
120
|
-
const ast = parse(tokens);
|
|
130
|
+
it('should parse mixed text and elements', () => {
|
|
131
|
+
const ast = parseToAST('Before <strong>bold</strong> after');
|
|
121
132
|
|
|
122
133
|
expect(ast.children).toHaveLength(3);
|
|
123
|
-
expect(ast.children![0]
|
|
134
|
+
expect((ast.children![0]! as any).content).toBe('Before ');
|
|
124
135
|
expect(ast.children![1]!.tagName).toBe('strong');
|
|
125
|
-
expect(ast.children![1]!.children![0]
|
|
126
|
-
expect(ast.children![2]
|
|
136
|
+
expect((ast.children![1]!.children![0]! as any).content).toBe('bold');
|
|
137
|
+
expect((ast.children![2]! as any).content).toBe(' after');
|
|
127
138
|
});
|
|
128
139
|
|
|
129
|
-
|
|
130
|
-
const
|
|
131
|
-
const ast = parse(tokens);
|
|
140
|
+
it('should handle entities in text', () => {
|
|
141
|
+
const ast = parseToAST('<p>&amp; &lt; &gt;</p>');
|
|
132
142
|
|
|
133
143
|
const pElement = ast.children![0]!;
|
|
134
144
|
const textNode = pElement.children![0]!;
|
|
135
|
-
expect(textNode.content).toBe('& < >');
|
|
145
|
+
expect((textNode as any).content).toBe('& < >');
|
|
136
146
|
});
|
|
137
147
|
});
|
|
138
148
|
|
|
139
149
|
describe('Comments and Special Nodes', () => {
|
|
140
|
-
|
|
141
|
-
const
|
|
142
|
-
const ast = parse(tokens);
|
|
150
|
+
it('should parse HTML comments', () => {
|
|
151
|
+
const ast = parseToAST('<!-- This is a comment -->');
|
|
143
152
|
|
|
144
153
|
expect(ast.children).toHaveLength(1);
|
|
145
154
|
const commentNode = ast.children![0]!;
|
|
146
|
-
expect(commentNode.type).toBe(ASTNodeType.
|
|
147
|
-
expect(commentNode.content).toBe(' This is a comment ');
|
|
155
|
+
expect(commentNode.type).toBe(ASTNodeType.Comment);
|
|
156
|
+
expect((commentNode as any).content).toBe(' This is a comment ');
|
|
148
157
|
});
|
|
149
158
|
|
|
150
|
-
|
|
151
|
-
const
|
|
152
|
-
const ast = parse(tokens);
|
|
159
|
+
it('should parse DOCTYPE', () => {
|
|
160
|
+
const ast = parseToAST('<!DOCTYPE html>');
|
|
153
161
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
expect(doctypeNode.
|
|
157
|
-
expect(doctypeNode.content).toBe('html');
|
|
162
|
+
const doctypeNode = ast.children?.find(c => c.type === ASTNodeType.Doctype);
|
|
163
|
+
expect(doctypeNode).toBeDefined();
|
|
164
|
+
expect((doctypeNode as any).content).toBe('html');
|
|
158
165
|
});
|
|
159
166
|
|
|
160
|
-
|
|
161
|
-
const
|
|
162
|
-
const ast = parse(tokens);
|
|
167
|
+
it.skip('should parse CDATA sections', () => {
|
|
168
|
+
const ast = parseToAST('<![CDATA[Some raw data]]>');
|
|
163
169
|
|
|
164
170
|
expect(ast.children).toHaveLength(1);
|
|
165
171
|
const cdataNode = ast.children![0]!;
|
|
166
172
|
expect(cdataNode.type).toBe(ASTNodeType.CDATA);
|
|
167
|
-
expect(cdataNode.content).toBe('Some raw data');
|
|
173
|
+
expect((cdataNode as any).content).toBe('Some raw data');
|
|
168
174
|
});
|
|
169
175
|
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
const ast = parse(tokens);
|
|
176
|
+
it.skip('should parse processing instructions', () => {
|
|
177
|
+
const ast = parseToAST('<?xml version="1.0"?>');
|
|
173
178
|
|
|
174
179
|
expect(ast.children).toHaveLength(1);
|
|
175
180
|
const piNode = ast.children![0]!;
|
|
176
|
-
expect(piNode.type).toBe(
|
|
177
|
-
expect(piNode.content).toBe('<?xml version="1.0"');
|
|
181
|
+
expect(piNode.type).toBe('processing-instruction' as any);
|
|
182
|
+
expect((piNode as any).content).toBe('<?xml version="1.0"');
|
|
178
183
|
});
|
|
179
184
|
});
|
|
180
185
|
|
|
181
186
|
describe('Complete HTML Documents', () => {
|
|
182
|
-
|
|
187
|
+
it('should parse complete HTML document', () => {
|
|
183
188
|
const html = `<!DOCTYPE html>
|
|
184
189
|
<html lang="en">
|
|
185
190
|
<head>
|
|
@@ -193,20 +198,19 @@ describe('HTML Parser', () => {
|
|
|
193
198
|
</body>
|
|
194
199
|
</html>`;
|
|
195
200
|
|
|
196
|
-
const
|
|
197
|
-
const ast = parse(tokens);
|
|
201
|
+
const ast = parseToAST(html);
|
|
198
202
|
|
|
199
203
|
expect(ast.children!.length).toBeGreaterThan(1);
|
|
200
204
|
|
|
201
205
|
const htmlElement = ast.children!.find(
|
|
202
|
-
child => child.type === ASTNodeType.
|
|
206
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
203
207
|
)!;
|
|
204
208
|
|
|
205
209
|
expect(htmlElement).toBeDefined();
|
|
206
210
|
expect(htmlElement.attributes!.lang).toBe('en');
|
|
207
211
|
|
|
208
212
|
const elementChildren = htmlElement.children!.filter(
|
|
209
|
-
child => child.type === ASTNodeType.
|
|
213
|
+
child => child.type === ASTNodeType.Element
|
|
210
214
|
);
|
|
211
215
|
expect(elementChildren).toHaveLength(2);
|
|
212
216
|
|
|
@@ -219,17 +223,15 @@ describe('HTML Parser', () => {
|
|
|
219
223
|
});
|
|
220
224
|
|
|
221
225
|
describe('real web scenarios', () => {
|
|
222
|
-
|
|
223
|
-
const html = await file(
|
|
224
|
-
const
|
|
225
|
-
const ast = parse(tokens);
|
|
226
|
+
it('should parse real-world HTML', async () => {
|
|
227
|
+
const html = await file("./tests/test-page-0.txt").text();
|
|
228
|
+
const ast = parseToAST(html);
|
|
226
229
|
});
|
|
227
230
|
});
|
|
228
231
|
|
|
229
232
|
describe('Error Recovery', () => {
|
|
230
|
-
|
|
231
|
-
const
|
|
232
|
-
const ast = parse(tokens);
|
|
233
|
+
it('should handle unclosed tags', () => {
|
|
234
|
+
const ast = parseToAST('<div><p>Unclosed paragraph</div>');
|
|
233
235
|
|
|
234
236
|
const divElement = ast.children![0]!;
|
|
235
237
|
expect(divElement.tagName).toBe('div');
|
|
@@ -238,17 +240,15 @@ describe('HTML Parser', () => {
|
|
|
238
240
|
expect(pElement.tagName).toBe('p');
|
|
239
241
|
});
|
|
240
242
|
|
|
241
|
-
|
|
242
|
-
const
|
|
243
|
-
const ast = parse(tokens);
|
|
243
|
+
it('should handle unexpected closing tags', () => {
|
|
244
|
+
const ast = parseToAST('<div></span></div>');
|
|
244
245
|
|
|
245
246
|
const divElement = ast.children![0]!;
|
|
246
247
|
expect(divElement.tagName).toBe('div');
|
|
247
248
|
});
|
|
248
249
|
|
|
249
|
-
|
|
250
|
-
const
|
|
251
|
-
const ast = parse(tokens);
|
|
250
|
+
it('should handle malformed attributes', () => {
|
|
251
|
+
const ast = parseToAST('<div class="test id="main">Content</div>');
|
|
252
252
|
|
|
253
253
|
const divElement = ast.children![0]!;
|
|
254
254
|
expect(divElement.tagName).toBe('div');
|
|
@@ -257,58 +257,54 @@ describe('HTML Parser', () => {
|
|
|
257
257
|
});
|
|
258
258
|
|
|
259
259
|
describe('Auto-closing Tags', () => {
|
|
260
|
-
|
|
261
|
-
const
|
|
262
|
-
const ast = parse(tokens);
|
|
260
|
+
it('should auto-close list items', () => {
|
|
261
|
+
const ast = parseToAST('<ul><li>First<li>Second</ul>');
|
|
263
262
|
|
|
264
263
|
const ulElement = ast.children![0]!;
|
|
265
264
|
const liElements = ulElement.children!.filter(
|
|
266
|
-
child => child.type === ASTNodeType.
|
|
265
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'li'
|
|
267
266
|
);
|
|
268
267
|
|
|
269
268
|
expect(liElements).toHaveLength(2);
|
|
270
|
-
expect(liElements[0]!.children![0]
|
|
271
|
-
expect(liElements[1]!.children![0]
|
|
269
|
+
expect((liElements[0]!.children![0]! as any).content).toBe('First');
|
|
270
|
+
expect((liElements[1]!.children![0]! as any).content).toBe('Second');
|
|
272
271
|
});
|
|
273
272
|
|
|
274
|
-
|
|
275
|
-
const
|
|
276
|
-
const ast = parse(tokens);
|
|
273
|
+
it('should auto-close paragraph tags', () => {
|
|
274
|
+
const ast = parseToAST('<p>First paragraph<p>Second paragraph');
|
|
277
275
|
|
|
278
276
|
const pElements = ast.children!.filter(
|
|
279
|
-
child => child.type === ASTNodeType.
|
|
277
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'p'
|
|
280
278
|
);
|
|
281
279
|
|
|
282
280
|
expect(pElements).toHaveLength(2);
|
|
283
|
-
expect(pElements[0]!.children![0]
|
|
284
|
-
expect(pElements[1]!.children![0]
|
|
281
|
+
expect((pElements[0]!.children![0]! as any).content).toBe('First paragraph');
|
|
282
|
+
expect((pElements[1]!.children![0]! as any).content).toBe('Second paragraph');
|
|
285
283
|
});
|
|
286
284
|
});
|
|
287
285
|
|
|
288
286
|
describe('Whitespace Handling', () => {
|
|
289
|
-
|
|
290
|
-
const
|
|
291
|
-
const ast = parse(tokens);
|
|
287
|
+
it('should preserve significant whitespace', () => {
|
|
288
|
+
const ast = parseToAST('<p> Hello World </p>');
|
|
292
289
|
|
|
293
290
|
const pElement = ast.children![0]!;
|
|
294
291
|
const textNode = pElement.children![0]!;
|
|
295
|
-
expect(textNode.content).toBe(' Hello World ');
|
|
292
|
+
expect((textNode as any).content).toBe(' Hello World ');
|
|
296
293
|
});
|
|
297
294
|
|
|
298
|
-
|
|
299
|
-
const
|
|
295
|
+
it('should skip insignificant whitespace', () => {
|
|
296
|
+
const ast = parseToAST(`<html>
|
|
300
297
|
<head>
|
|
301
298
|
<title>Test</title>
|
|
302
299
|
</head>
|
|
303
300
|
</html>`);
|
|
304
|
-
const ast = parse(tokens);
|
|
305
301
|
|
|
306
302
|
const htmlElement = ast.children!.find(
|
|
307
|
-
child => child.type === ASTNodeType.
|
|
303
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
308
304
|
)!;
|
|
309
305
|
|
|
310
306
|
const headElement = htmlElement.children!.find(
|
|
311
|
-
child => child.type === ASTNodeType.
|
|
307
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'head'
|
|
312
308
|
)!;
|
|
313
309
|
|
|
314
310
|
expect(headElement).toBeDefined();
|
|
@@ -316,22 +312,22 @@ describe('HTML Parser', () => {
|
|
|
316
312
|
});
|
|
317
313
|
|
|
318
314
|
describe("complete web page", () => {
|
|
319
|
-
|
|
320
|
-
const html = await file(
|
|
321
|
-
const
|
|
322
|
-
|
|
323
|
-
expect(ast.children!.length).toBeGreaterThanOrEqual(3);
|
|
315
|
+
it('should parse a complete web page', async () => {
|
|
316
|
+
const html = await file("./tests/test-page-0.txt").text();
|
|
317
|
+
const ast = parseToAST(html);
|
|
318
|
+
expect(ast.children!.length).toBeGreaterThanOrEqual(1);
|
|
324
319
|
const htmlElement = ast.children!.find(
|
|
325
|
-
child => child.type === ASTNodeType.
|
|
320
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'html'
|
|
326
321
|
)!;
|
|
327
|
-
expect(htmlElement
|
|
322
|
+
expect(htmlElement).toBeDefined();
|
|
323
|
+
expect(htmlElement.type).toBe(ASTNodeType.Element);
|
|
328
324
|
expect(htmlElement.tagName).toBe('html');
|
|
329
325
|
expect(htmlElement.attributes!.lang).toBe('en');
|
|
330
326
|
const headElement = htmlElement.children!.find(
|
|
331
|
-
child => child.type === ASTNodeType.
|
|
327
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'head'
|
|
332
328
|
)!;
|
|
333
329
|
const bodyElement = htmlElement.children!.find(
|
|
334
|
-
child => child.type === ASTNodeType.
|
|
330
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'body'
|
|
335
331
|
)!;
|
|
336
332
|
expect(headElement).toBeDefined();
|
|
337
333
|
expect(bodyElement).toBeDefined();
|
|
@@ -339,9 +335,8 @@ describe('HTML Parser', () => {
|
|
|
339
335
|
})
|
|
340
336
|
|
|
341
337
|
describe('Advanced Edge Cases', () => {
|
|
342
|
-
|
|
343
|
-
const
|
|
344
|
-
const ast = parse(tokens);
|
|
338
|
+
it('should handle empty attributes', () => {
|
|
339
|
+
const ast = parseToAST('<input disabled checked="" value="">');
|
|
345
340
|
const inputElement = ast.children![0]!;
|
|
346
341
|
expect(inputElement.attributes).toEqual({
|
|
347
342
|
disabled: '',
|
|
@@ -350,9 +345,8 @@ describe('HTML Parser', () => {
|
|
|
350
345
|
});
|
|
351
346
|
});
|
|
352
347
|
|
|
353
|
-
|
|
354
|
-
const
|
|
355
|
-
const ast = parse(tokens);
|
|
348
|
+
it('should handle attributes with special characters', () => {
|
|
349
|
+
const ast = parseToAST('<div data-test="hello-world" class="my_class-123">');
|
|
356
350
|
const divElement = ast.children![0]!;
|
|
357
351
|
expect(divElement.attributes).toEqual({
|
|
358
352
|
'data-test': 'hello-world',
|
|
@@ -360,48 +354,45 @@ describe('HTML Parser', () => {
|
|
|
360
354
|
});
|
|
361
355
|
});
|
|
362
356
|
|
|
363
|
-
|
|
364
|
-
const
|
|
365
|
-
const ast = parse(tokens);
|
|
357
|
+
it('should handle mixed quotes in attributes', () => {
|
|
358
|
+
const ast = parseToAST(`<div title='He said "Hello"' data-info="She's here">`);
|
|
366
359
|
const divElement = ast.children![0]!;
|
|
367
360
|
expect(divElement.attributes!.title).toBe('He said "Hello"');
|
|
368
361
|
expect(divElement.attributes!['data-info']).toBe("She's here");
|
|
369
|
-
});
|
|
370
|
-
|
|
371
|
-
|
|
362
|
+
});
|
|
363
|
+
|
|
364
|
+
it('should handle deeply nested comments', () => {
|
|
365
|
+
const ast = parseToAST('<div><!-- Outer <!-- Inner --> comment --></div>');
|
|
372
366
|
const divElement = ast.children![0]!;
|
|
373
367
|
expect(divElement.children!.length).toBeGreaterThanOrEqual(1);
|
|
374
|
-
expect(divElement.children![0]!.type).toBe(ASTNodeType.
|
|
368
|
+
expect(divElement.children![0]!.type).toBe(ASTNodeType.Comment);
|
|
375
369
|
});
|
|
376
370
|
|
|
377
|
-
|
|
378
|
-
const
|
|
379
|
-
const ast = parse(tokens);
|
|
371
|
+
it('should handle multiple consecutive whitespace', () => {
|
|
372
|
+
const ast = parseToAST('<p> \n\t Hello \n\t World \n\t </p>');
|
|
380
373
|
const pElement = ast.children![0]!;
|
|
381
374
|
const textNode = pElement.children![0]!;
|
|
382
|
-
expect(textNode.content).toContain('Hello');
|
|
383
|
-
expect(textNode.content).toContain('World');
|
|
375
|
+
expect((textNode as any).content).toContain('Hello');
|
|
376
|
+
expect((textNode as any).content).toContain('World');
|
|
384
377
|
});
|
|
385
378
|
|
|
386
|
-
|
|
387
|
-
const
|
|
388
|
-
const ast = parse(tokens);
|
|
379
|
+
it('should handle malformed nested tags', () => {
|
|
380
|
+
const ast = parseToAST('<div><p><span>Text</div></span></p>');
|
|
389
381
|
const divElement = ast.children![0]!;
|
|
390
382
|
expect(divElement.tagName).toBe('div');
|
|
391
383
|
expect(divElement.children!.length).toBeGreaterThan(0);
|
|
392
384
|
});
|
|
393
385
|
|
|
394
|
-
|
|
395
|
-
const
|
|
396
|
-
const ast = parse(tokens);
|
|
386
|
+
it('should handle orphaned closing tags', () => {
|
|
387
|
+
const ast = parseToAST('</div><p>Content</p></span>');
|
|
397
388
|
const pElement = ast.children!.find(
|
|
398
|
-
child => child.type === ASTNodeType.
|
|
389
|
+
child => child.type === ASTNodeType.Element && child.tagName === 'p'
|
|
399
390
|
)!;
|
|
400
391
|
expect(pElement).toBeDefined();
|
|
401
|
-
expect(pElement.children![0]
|
|
392
|
+
expect((pElement.children![0]! as any).content).toBe('Content');
|
|
402
393
|
});
|
|
403
394
|
|
|
404
|
-
|
|
395
|
+
it('should handle extreme nesting depth', () => {
|
|
405
396
|
let html = '';
|
|
406
397
|
const depth = 50;
|
|
407
398
|
for (let i = 0; i < depth; i++) {
|
|
@@ -411,39 +402,35 @@ describe('HTML Parser', () => {
|
|
|
411
402
|
for (let i = 0; i < depth; i++) {
|
|
412
403
|
html += '</div>';
|
|
413
404
|
}
|
|
414
|
-
const
|
|
415
|
-
const ast = parse(tokens);
|
|
405
|
+
const ast = parseToAST(html);
|
|
416
406
|
let current = ast.children![0]!;
|
|
417
407
|
for (let i = 0; i < depth - 1; i++) {
|
|
418
408
|
expect(current.tagName).toBe('div');
|
|
419
409
|
expect(current.attributes!.level).toBe(i.toString());
|
|
420
|
-
current = current.children!.find(child => child.type === ASTNodeType.
|
|
410
|
+
current = current.children!.find(child => child.type === ASTNodeType.Element)!;
|
|
421
411
|
}
|
|
422
|
-
const textNode = current.children!.find(child => child.type === ASTNodeType.
|
|
423
|
-
expect(textNode.content).toBe('Deep content');
|
|
412
|
+
const textNode = current.children!.find(child => child.type === ASTNodeType.Text)!;
|
|
413
|
+
expect((textNode as any).content).toBe('Deep content');
|
|
424
414
|
});
|
|
425
415
|
})
|
|
426
416
|
|
|
427
417
|
describe('Complex Entity Handling', () => {
|
|
428
|
-
|
|
429
|
-
const
|
|
430
|
-
const ast = parse(tokens);
|
|
418
|
+
it('should handle numeric character references', () => {
|
|
419
|
+
const ast = parseToAST('<p>&#65; &#8364; &#x41; &#x20AC;</p>');
|
|
431
420
|
const pElement = ast.children![0]!;
|
|
432
421
|
const textNode = pElement.children![0]!;
|
|
433
|
-
expect(textNode.content).toBe('A € A €');
|
|
422
|
+
expect((textNode as any).content).toBe('A € A €');
|
|
434
423
|
});
|
|
435
424
|
|
|
436
|
-
|
|
437
|
-
const
|
|
438
|
-
const ast = parse(tokens);
|
|
425
|
+
it('should handle mixed entities and text', () => {
|
|
426
|
+
const ast = parseToAST('<p>R&amp;D &lt;testing&gt; &quot;quotes&quot; &apos;apostrophe&apos;</p>');
|
|
439
427
|
const pElement = ast.children![0]!;
|
|
440
428
|
const textNode = pElement.children![0]!;
|
|
441
|
-
expect(textNode.content).toBe('R&D <testing> "quotes" \'apostrophe\'');
|
|
429
|
+
expect((textNode as any).content).toBe('R&D <testing> "quotes" \'apostrophe\'');
|
|
442
430
|
});
|
|
443
431
|
|
|
444
|
-
|
|
445
|
-
const
|
|
446
|
-
const ast = parse(tokens);
|
|
432
|
+
it('should handle entities in attributes', () => {
|
|
433
|
+
const ast = parseToAST('<div title="R&amp;D &lt;section&gt;" data-test="&quot;hello&quot;">');
|
|
447
434
|
const divElement = ast.children![0]!;
|
|
448
435
|
expect(divElement.attributes!.title).toBe('R&D <section>');
|
|
449
436
|
expect(divElement.attributes!['data-test']).toBe('"hello"');
|
|
@@ -451,37 +438,33 @@ describe('HTML Parser', () => {
|
|
|
451
438
|
})
|
|
452
439
|
|
|
453
440
|
describe('DOM-like Functionality Tests', () => {
|
|
454
|
-
|
|
455
|
-
const
|
|
456
|
-
const ast = parse(tokens);
|
|
441
|
+
it('should maintain parent-child relationships', () => {
|
|
442
|
+
const ast = parseToAST('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
|
|
457
443
|
const divElement = ast.children![0]!;
|
|
458
444
|
const sectionElement = divElement.children![0]!;
|
|
459
445
|
const articleElement = sectionElement.children![0]!;
|
|
460
|
-
expect(sectionElement.parent).toBe(divElement);
|
|
461
|
-
expect(articleElement.parent).toBe(sectionElement);
|
|
462
446
|
expect(articleElement.children).toHaveLength(2);
|
|
463
447
|
expect(articleElement.children![0]!.tagName).toBe('h1');
|
|
464
448
|
expect(articleElement.children![1]!.tagName).toBe('p');
|
|
465
449
|
});
|
|
466
450
|
|
|
467
|
-
|
|
468
|
-
const
|
|
469
|
-
const ast = parse(tokens);
|
|
451
|
+
it('should handle sibling navigation scenarios', () => {
|
|
452
|
+
const ast = parseToAST('<nav><a href="#home">Home</a><a href="#about">About</a><a href="#contact">Contact</a></nav>');
|
|
470
453
|
const navElement = ast.children![0]!;
|
|
471
|
-
const links = navElement.children!.filter(child => child.type === ASTNodeType.
|
|
454
|
+
const links = navElement.children!.filter(child => child.type === ASTNodeType.Element);
|
|
472
455
|
expect(links).toHaveLength(3);
|
|
473
456
|
links.forEach((link, index) => {
|
|
474
457
|
expect(link.tagName).toBe('a');
|
|
475
458
|
expect(link.attributes!.href).toBeDefined();
|
|
476
|
-
expect(link.children![0]!.type).toBe(ASTNodeType.
|
|
459
|
+
expect(link.children![0]!.type).toBe(ASTNodeType.Text);
|
|
477
460
|
});
|
|
478
|
-
expect(links[0]!.children![0]
|
|
479
|
-
expect(links[1]!.children![0]
|
|
480
|
-
expect(links[2]!.children![0]
|
|
461
|
+
expect((links[0]!.children![0]! as any).content).toBe('Home');
|
|
462
|
+
expect((links[1]!.children![0]! as any).content).toBe('About');
|
|
463
|
+
expect((links[2]!.children![0]! as any).content).toBe('Contact');
|
|
481
464
|
});
|
|
482
465
|
|
|
483
|
-
|
|
484
|
-
const
|
|
466
|
+
it('should handle form elements with all attribute types', () => {
|
|
467
|
+
const ast = parseToAST(`
|
|
485
468
|
<form action="/submit" method="post" enctype="multipart/form-data">
|
|
486
469
|
<input type="text" name="username" required placeholder="Enter username" maxlength="50">
|
|
487
470
|
<input type="password" name="password" required>
|
|
@@ -497,13 +480,12 @@ describe('HTML Parser', () => {
|
|
|
497
480
|
<button type="submit" disabled>Submit</button>
|
|
498
481
|
</form>
|
|
499
482
|
`);
|
|
500
|
-
const ast = parse(tokens);
|
|
501
483
|
const formElement = ast.children!.find(child => child.tagName === 'form')!;
|
|
502
484
|
expect(formElement.attributes!.action).toBe('/submit');
|
|
503
485
|
expect(formElement.attributes!.method).toBe('post');
|
|
504
486
|
const inputs: ASTNode[] = [];
|
|
505
487
|
const traverse = (node: ASTNode) => {
|
|
506
|
-
if (node.type === ASTNodeType.
|
|
488
|
+
if (node.type === ASTNodeType.Element) {
|
|
507
489
|
if (['input', 'select', 'textarea', 'button'].includes(node.tagName!)) {
|
|
508
490
|
inputs.push(node);
|
|
509
491
|
}
|
|
@@ -521,8 +503,8 @@ describe('HTML Parser', () => {
|
|
|
521
503
|
expect(selectElement!.attributes!.multiple).toBe('');
|
|
522
504
|
});
|
|
523
505
|
|
|
524
|
-
|
|
525
|
-
const
|
|
506
|
+
it('should handle table structures correctly', () => {
|
|
507
|
+
const ast = parseToAST(`
|
|
526
508
|
<table border="1" cellpadding="5" cellspacing="0">
|
|
527
509
|
<thead>
|
|
528
510
|
<tr>
|
|
@@ -545,7 +527,6 @@ describe('HTML Parser', () => {
|
|
|
545
527
|
</tbody>
|
|
546
528
|
</table>
|
|
547
529
|
`);
|
|
548
|
-
const ast = parse(tokens);
|
|
549
530
|
const tableElement = ast.children!.find(child => child.tagName === 'table')!;
|
|
550
531
|
const thead = tableElement.children!.find(child => child.tagName === 'thead');
|
|
551
532
|
const tbody = tableElement.children!.find(child => child.tagName === 'tbody');
|
|
@@ -564,22 +545,21 @@ describe('HTML Parser', () => {
|
|
|
564
545
|
expect(rows).toHaveLength(3);
|
|
565
546
|
});
|
|
566
547
|
|
|
567
|
-
|
|
568
|
-
const
|
|
548
|
+
it('should handle mixed content with inline elements', () => {
|
|
549
|
+
const ast = parseToAST(`
|
|
569
550
|
<p>This is <strong>bold text</strong> and this is <em>italic text</em>.
|
|
570
551
|
Here's a <a href="https://example.com" target="_blank">link</a> and some
|
|
571
552
|
<code>inline code</code>. Also <span class="highlight">highlighted text</span>.</p>
|
|
572
553
|
`);
|
|
573
|
-
const ast = parse(tokens);
|
|
574
554
|
const pElement = ast.children!.find(child => child.tagName === 'p')!;
|
|
575
555
|
let textNodes = 0;
|
|
576
556
|
let elementNodes = 0;
|
|
577
557
|
let totalChildren = 0;
|
|
578
558
|
const traverse = (node: ASTNode) => {
|
|
579
559
|
totalChildren++;
|
|
580
|
-
if (node.type === ASTNodeType.
|
|
560
|
+
if (node.type === ASTNodeType.Text && (node as any).content!.trim()) {
|
|
581
561
|
textNodes++;
|
|
582
|
-
} else if (node.type === ASTNodeType.
|
|
562
|
+
} else if (node.type === ASTNodeType.Element) {
|
|
583
563
|
elementNodes++;
|
|
584
564
|
}
|
|
585
565
|
if (node.children) {
|
|
@@ -593,8 +573,8 @@ describe('HTML Parser', () => {
|
|
|
593
573
|
expect(textNodes).toBeGreaterThan(0);
|
|
594
574
|
});
|
|
595
575
|
|
|
596
|
-
|
|
597
|
-
const
|
|
576
|
+
it('should preserve document structure integrity', () => {
|
|
577
|
+
const ast = parseToAST(`<!DOCTYPE html>
|
|
598
578
|
<html lang="en">
|
|
599
579
|
<head>
|
|
600
580
|
<meta charset="UTF-8">
|
|
@@ -619,8 +599,7 @@ describe('HTML Parser', () => {
|
|
|
619
599
|
</footer>
|
|
620
600
|
</body>
|
|
621
601
|
</html>`);
|
|
622
|
-
const
|
|
623
|
-
const doctype = ast.children!.find(child => child.type === ASTNodeType.DOCTYPE);
|
|
602
|
+
const doctype = ast.children!.find(child => child.type === ASTNodeType.Doctype);
|
|
624
603
|
expect(doctype).toBeDefined();
|
|
625
604
|
const htmlElement = ast.children!.find(child => child.tagName === 'html')!;
|
|
626
605
|
expect(htmlElement.attributes!.lang).toBe('en');
|