@tkeron/html-parser 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/README.md +1 -7
  2. package/bun.lock +8 -3
  3. package/index.ts +4 -0
  4. package/package.json +13 -6
  5. package/src/css-selector.ts +45 -27
  6. package/src/dom-simulator.ts +162 -20
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +478 -183
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +59 -139
  12. package/tests/advanced.test.ts +119 -106
  13. package/tests/custom-elements.test.ts +172 -162
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +637 -0
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +43 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +172 -193
  45. package/tests/selectors.test.ts +64 -1
  46. package/tests/serializer-core.test.ts +16 -0
  47. package/tests/serializer-data/core.test +125 -0
  48. package/tests/serializer-data/injectmeta.test +66 -0
  49. package/tests/serializer-data/optionaltags.test +965 -0
  50. package/tests/serializer-data/options.test +60 -0
  51. package/tests/serializer-data/whitespace.test +51 -0
  52. package/tests/serializer-injectmeta.test.ts +16 -0
  53. package/tests/serializer-optionaltags.test.ts +16 -0
  54. package/tests/serializer-options.test.ts +16 -0
  55. package/tests/serializer-whitespace.test.ts +16 -0
  56. package/tests/tokenizer-namedEntities.test.ts +20 -0
  57. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  58. package/tests/tokenizer.test.ts +83 -0
  59. package/tests/tree-construction-adoption01.test.ts +37 -0
  60. package/tests/tree-construction-adoption02.test.ts +34 -0
  61. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  62. package/tests/tree-construction-entities02.test.ts +33 -0
  63. package/tests/tree-construction-html5test-com.test.ts +24 -0
  64. package/tests/tree-construction-math.test.ts +18 -0
  65. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  66. package/tests/tree-construction-noscript01.test.ts +18 -0
  67. package/tests/tree-construction-ruby.test.ts +21 -0
  68. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  69. package/tests/tree-construction-svg.test.ts +21 -0
  70. package/tests/tree-construction-template.test.ts +21 -0
  71. package/tests/tree-construction-tests10.test.ts +21 -0
  72. package/tests/tree-construction-tests11.test.ts +21 -0
  73. package/tests/tree-construction-tests20.test.ts +18 -0
  74. package/tests/tree-construction-tests21.test.ts +18 -0
  75. package/tests/tree-construction-tests23.test.ts +18 -0
  76. package/tests/tree-construction-tests24.test.ts +18 -0
  77. package/tests/tree-construction-tests5.test.ts +21 -0
  78. package/tests/tree-construction-tests6.test.ts +21 -0
  79. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  80. package/tests/void-elements.test.ts +471 -0
  81. package/tests/official/README.md +0 -87
  82. package/tests/official/acid/acid-tests.test.ts +0 -309
  83. package/tests/official/final-output/final-output.test.ts +0 -361
  84. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  85. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  86. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  87. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  88. package/tests/official/validator/validator-tests.test.ts +0 -237
  89. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  90. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  91. package/tests/official/wpt/wpt-tests.test.ts +0 -409
@@ -1,28 +1,46 @@
1
- import { expect, test, describe } from 'bun:test';
1
+ // @ts-nocheck
2
+ import { expect, it, describe } from 'bun:test';
2
3
  import { tokenize } from '../src/tokenizer';
3
- import { parse, ASTNodeType, type ASTNode } from '../src/parser';
4
+ import { parse, domToAST, ASTNodeType, type ASTNode } from '../src/parser';
4
5
  import { file } from 'bun';
5
- import { join } from 'path';
6
+
7
+ function parseToAST(html: string): ASTNode {
8
+ const tokens = tokenize(html);
9
+ const dom = parse(tokens);
10
+ const ast = domToAST(dom);
11
+
12
+ const hasExplicitHtml = html.includes('<html') || html.includes('<!DOCTYPE') || html.includes('<!doctype');
13
+ if (hasExplicitHtml) {
14
+ return ast;
15
+ }
16
+
17
+ const htmlEl = ast.children?.find(c => c.tagName === 'html');
18
+ if (htmlEl) {
19
+ const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
20
+ if (bodyEl && bodyEl.children) {
21
+ return { type: ASTNodeType.Document, children: bodyEl.children };
22
+ }
23
+ }
24
+ return ast;
25
+ }
6
26
 
7
27
  describe('HTML Parser', () => {
8
28
 
9
29
  describe('Basic Elements', () => {
10
- test('should parse simple element', () => {
11
- const tokens = tokenize('<div></div>');
12
- const ast = parse(tokens);
30
+ it('should parse simple element', () => {
31
+ const ast = parseToAST('<div></div>');
13
32
 
14
- expect(ast.type).toBe(ASTNodeType.DOCUMENT);
33
+ expect(ast.type).toBe(ASTNodeType.Document);
15
34
  expect(ast.children).toHaveLength(1);
16
35
 
17
36
  const divElement = ast.children![0]!;
18
- expect(divElement.type).toBe(ASTNodeType.ELEMENT);
37
+ expect(divElement.type).toBe(ASTNodeType.Element);
19
38
  expect(divElement.tagName).toBe('div');
20
39
  expect(divElement.children).toHaveLength(0);
21
40
  });
22
41
 
23
- test('should parse element with attributes', () => {
24
- const tokens = tokenize('<div class="container" id="main"></div>');
25
- const ast = parse(tokens);
42
+ it('should parse element with attributes', () => {
43
+ const ast = parseToAST('<div class="container" id="main"></div>');
26
44
 
27
45
  const divElement = ast.children![0]!;
28
46
  expect(divElement.attributes).toEqual({
@@ -31,38 +49,35 @@ describe('HTML Parser', () => {
31
49
  });
32
50
  });
33
51
 
34
- test('should parse self-closing elements', () => {
35
- const tokens = tokenize('<img src="test.jpg" alt="test"/>');
36
- const ast = parse(tokens);
52
+ it('should parse self-closing elements', () => {
53
+ const ast = parseToAST('<img src="test.jpg" alt="test"/>');
37
54
 
38
55
  const imgElement = ast.children![0]!;
39
- expect(imgElement.type).toBe(ASTNodeType.ELEMENT);
56
+ expect(imgElement.type).toBe(ASTNodeType.Element);
40
57
  expect(imgElement.tagName).toBe('img');
41
- expect(imgElement.isSelfClosing).toBe(true);
58
+ expect((imgElement as any).isSelfClosing).toBe(true);
42
59
  expect(imgElement.attributes).toEqual({
43
60
  src: 'test.jpg',
44
61
  alt: 'test'
45
62
  });
46
63
  });
47
64
 
48
- test('should parse void elements correctly', () => {
49
- const tokens = tokenize('<br><hr><input type="text">');
50
- const ast = parse(tokens);
65
+ it('should parse void elements correctly', () => {
66
+ const ast = parseToAST('<br><hr><input type="text">');
51
67
 
52
68
  expect(ast.children).toHaveLength(3);
53
69
  expect(ast.children![0]!.tagName).toBe('br');
54
- expect(ast.children![0]!.isSelfClosing).toBe(true);
70
+ expect((ast.children![0]! as any).isSelfClosing).toBe(true);
55
71
  expect(ast.children![1]!.tagName).toBe('hr');
56
- expect(ast.children![1]!.isSelfClosing).toBe(true);
72
+ expect((ast.children![1]! as any).isSelfClosing).toBe(true);
57
73
  expect(ast.children![2]!.tagName).toBe('input');
58
- expect(ast.children![2]!.isSelfClosing).toBe(true);
74
+ expect((ast.children![2]! as any).isSelfClosing).toBe(true);
59
75
  });
60
76
  });
61
77
 
62
78
  describe('Nested Elements', () => {
63
- test('should parse nested elements', () => {
64
- const tokens = tokenize('<div><p>Hello</p></div>');
65
- const ast = parse(tokens);
79
+ it('should parse nested elements', () => {
80
+ const ast = parseToAST('<div><p>Hello</p></div>');
66
81
 
67
82
  const divElement = ast.children![0]!;
68
83
  expect(divElement.tagName).toBe('div');
@@ -73,13 +88,12 @@ describe('HTML Parser', () => {
73
88
  expect(pElement.children).toHaveLength(1);
74
89
 
75
90
  const textNode = pElement.children![0]!;
76
- expect(textNode.type).toBe(ASTNodeType.TEXT);
77
- expect(textNode.content).toBe('Hello');
91
+ expect(textNode.type).toBe(ASTNodeType.Text);
92
+ expect((textNode as any).content).toBe('Hello');
78
93
  });
79
94
 
80
- test('should parse deeply nested elements', () => {
81
- const tokens = tokenize('<div><section><article><h1>Title</h1></article></section></div>');
82
- const ast = parse(tokens);
95
+ it('should parse deeply nested elements', () => {
96
+ const ast = parseToAST('<div><section><article><h1>Title</h1></article></section></div>');
83
97
 
84
98
  const divElement = ast.children![0]!;
85
99
  const sectionElement = divElement.children![0]!;
@@ -87,99 +101,90 @@ describe('HTML Parser', () => {
87
101
  const h1Element = articleElement.children![0]!;
88
102
 
89
103
  expect(h1Element.tagName).toBe('h1');
90
- expect(h1Element.children![0]!.content).toBe('Title');
104
+ expect((h1Element.children![0]! as any).content).toBe('Title');
91
105
  });
92
106
 
93
- test('should handle multiple siblings', () => {
94
- const tokens = tokenize('<div><p>First</p><p>Second</p><p>Third</p></div>');
95
- const ast = parse(tokens);
107
+ it('should handle multiple siblings', () => {
108
+ const ast = parseToAST('<div><p>First</p><p>Second</p><p>Third</p></div>');
96
109
 
97
110
  const divElement = ast.children![0]!;
98
111
  expect(divElement.children).toHaveLength(3);
99
112
 
100
113
  expect(divElement.children![0]!.tagName).toBe('p');
101
- expect(divElement.children![0]!.children![0]!.content).toBe('First');
102
- expect(divElement.children![1]!.children![0]!.content).toBe('Second');
103
- expect(divElement.children![2]!.children![0]!.content).toBe('Third');
114
+ expect((divElement.children![0]!.children![0] as any).content).toBe('First');
115
+ expect((divElement.children![1]!.children![0] as any).content).toBe('Second');
116
+ expect((divElement.children![2]!.children![0] as any).content).toBe('Third');
104
117
  });
105
118
  });
106
119
 
107
120
  describe('Text Content', () => {
108
- test('should parse text content', () => {
109
- const tokens = tokenize('Hello World');
110
- const ast = parse(tokens);
121
+ it('should parse text content', () => {
122
+ const ast = parseToAST('Hello World');
111
123
 
112
124
  expect(ast.children).toHaveLength(1);
113
125
  const textNode = ast.children![0]!;
114
- expect(textNode.type).toBe(ASTNodeType.TEXT);
115
- expect(textNode.content).toBe('Hello World');
126
+ expect(textNode.type).toBe(ASTNodeType.Text);
127
+ expect((textNode as any).content).toBe('Hello World');
116
128
  });
117
129
 
118
- test('should parse mixed text and elements', () => {
119
- const tokens = tokenize('Before <strong>bold</strong> after');
120
- const ast = parse(tokens);
130
+ it('should parse mixed text and elements', () => {
131
+ const ast = parseToAST('Before <strong>bold</strong> after');
121
132
 
122
133
  expect(ast.children).toHaveLength(3);
123
- expect(ast.children![0]!.content).toBe('Before ');
134
+ expect((ast.children![0]! as any).content).toBe('Before ');
124
135
  expect(ast.children![1]!.tagName).toBe('strong');
125
- expect(ast.children![1]!.children![0]!.content).toBe('bold');
126
- expect(ast.children![2]!.content).toBe(' after');
136
+ expect((ast.children![1]!.children![0]! as any).content).toBe('bold');
137
+ expect((ast.children![2]! as any).content).toBe(' after');
127
138
  });
128
139
 
129
- test('should handle entities in text', () => {
130
- const tokens = tokenize('<p>&amp; &lt; &gt;</p>');
131
- const ast = parse(tokens);
140
+ it('should handle entities in text', () => {
141
+ const ast = parseToAST('<p>&amp; &lt; &gt;</p>');
132
142
 
133
143
  const pElement = ast.children![0]!;
134
144
  const textNode = pElement.children![0]!;
135
- expect(textNode.content).toBe('& < >');
145
+ expect((textNode as any).content).toBe('& < >');
136
146
  });
137
147
  });
138
148
 
139
149
  describe('Comments and Special Nodes', () => {
140
- test('should parse HTML comments', () => {
141
- const tokens = tokenize('<!-- This is a comment -->');
142
- const ast = parse(tokens);
150
+ it('should parse HTML comments', () => {
151
+ const ast = parseToAST('<!-- This is a comment -->');
143
152
 
144
153
  expect(ast.children).toHaveLength(1);
145
154
  const commentNode = ast.children![0]!;
146
- expect(commentNode.type).toBe(ASTNodeType.COMMENT);
147
- expect(commentNode.content).toBe(' This is a comment ');
155
+ expect(commentNode.type).toBe(ASTNodeType.Comment);
156
+ expect((commentNode as any).content).toBe(' This is a comment ');
148
157
  });
149
158
 
150
- test('should parse DOCTYPE', () => {
151
- const tokens = tokenize('<!DOCTYPE html>');
152
- const ast = parse(tokens);
159
+ it('should parse DOCTYPE', () => {
160
+ const ast = parseToAST('<!DOCTYPE html>');
153
161
 
154
- expect(ast.children).toHaveLength(1);
155
- const doctypeNode = ast.children![0]!;
156
- expect(doctypeNode.type).toBe(ASTNodeType.DOCTYPE);
157
- expect(doctypeNode.content).toBe('html');
162
+ const doctypeNode = ast.children?.find(c => c.type === ASTNodeType.Doctype);
163
+ expect(doctypeNode).toBeDefined();
164
+ expect((doctypeNode as any).content).toBe('html');
158
165
  });
159
166
 
160
- test('should parse CDATA sections', () => {
161
- const tokens = tokenize('<![CDATA[Some raw data]]>');
162
- const ast = parse(tokens);
167
+ it.skip('should parse CDATA sections', () => {
168
+ const ast = parseToAST('<![CDATA[Some raw data]]>');
163
169
 
164
170
  expect(ast.children).toHaveLength(1);
165
171
  const cdataNode = ast.children![0]!;
166
172
  expect(cdataNode.type).toBe(ASTNodeType.CDATA);
167
- expect(cdataNode.content).toBe('Some raw data');
173
+ expect((cdataNode as any).content).toBe('Some raw data');
168
174
  });
169
175
 
170
- test('should parse processing instructions', () => {
171
- const tokens = tokenize('<?xml version="1.0"?>');
172
- const ast = parse(tokens);
176
+ it.skip('should parse processing instructions', () => {
177
+ const ast = parseToAST('<?xml version="1.0"?>');
173
178
 
174
179
  expect(ast.children).toHaveLength(1);
175
180
  const piNode = ast.children![0]!;
176
- expect(piNode.type).toBe(ASTNodeType.PROCESSING_INSTRUCTION);
177
- expect(piNode.content).toBe('<?xml version="1.0"');
181
+ expect(piNode.type).toBe('processing-instruction' as any);
182
+ expect((piNode as any).content).toBe('<?xml version="1.0"');
178
183
  });
179
184
  });
180
185
 
181
186
  describe('Complete HTML Documents', () => {
182
- test('should parse complete HTML document', () => {
187
+ it('should parse complete HTML document', () => {
183
188
  const html = `<!DOCTYPE html>
184
189
  <html lang="en">
185
190
  <head>
@@ -193,20 +198,19 @@ describe('HTML Parser', () => {
193
198
  </body>
194
199
  </html>`;
195
200
 
196
- const tokens = tokenize(html);
197
- const ast = parse(tokens);
201
+ const ast = parseToAST(html);
198
202
 
199
203
  expect(ast.children!.length).toBeGreaterThan(1);
200
204
 
201
205
  const htmlElement = ast.children!.find(
202
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'html'
206
+ child => child.type === ASTNodeType.Element && child.tagName === 'html'
203
207
  )!;
204
208
 
205
209
  expect(htmlElement).toBeDefined();
206
210
  expect(htmlElement.attributes!.lang).toBe('en');
207
211
 
208
212
  const elementChildren = htmlElement.children!.filter(
209
- child => child.type === ASTNodeType.ELEMENT
213
+ child => child.type === ASTNodeType.Element
210
214
  );
211
215
  expect(elementChildren).toHaveLength(2);
212
216
 
@@ -219,17 +223,15 @@ describe('HTML Parser', () => {
219
223
  });
220
224
 
221
225
  describe('real web scenarios', () => {
222
- test('should parse real-world HTML', async () => {
223
- const html = await file(join(__dirname, "test-page-0.txt")).text();
224
- const tokens = tokenize(html);
225
- const ast = parse(tokens);
226
+ it('should parse real-world HTML', async () => {
227
+ const html = await file("./tests/test-page-0.txt").text();
228
+ const ast = parseToAST(html);
226
229
  });
227
230
  });
228
231
 
229
232
  describe('Error Recovery', () => {
230
- test('should handle unclosed tags', () => {
231
- const tokens = tokenize('<div><p>Unclosed paragraph</div>');
232
- const ast = parse(tokens);
233
+ it('should handle unclosed tags', () => {
234
+ const ast = parseToAST('<div><p>Unclosed paragraph</div>');
233
235
 
234
236
  const divElement = ast.children![0]!;
235
237
  expect(divElement.tagName).toBe('div');
@@ -238,17 +240,15 @@ describe('HTML Parser', () => {
238
240
  expect(pElement.tagName).toBe('p');
239
241
  });
240
242
 
241
- test('should handle unexpected closing tags', () => {
242
- const tokens = tokenize('<div></span></div>');
243
- const ast = parse(tokens);
243
+ it('should handle unexpected closing tags', () => {
244
+ const ast = parseToAST('<div></span></div>');
244
245
 
245
246
  const divElement = ast.children![0]!;
246
247
  expect(divElement.tagName).toBe('div');
247
248
  });
248
249
 
249
- test('should handle malformed attributes', () => {
250
- const tokens = tokenize('<div class="test id="main">Content</div>');
251
- const ast = parse(tokens);
250
+ it('should handle malformed attributes', () => {
251
+ const ast = parseToAST('<div class="test id="main">Content</div>');
252
252
 
253
253
  const divElement = ast.children![0]!;
254
254
  expect(divElement.tagName).toBe('div');
@@ -257,58 +257,54 @@ describe('HTML Parser', () => {
257
257
  });
258
258
 
259
259
  describe('Auto-closing Tags', () => {
260
- test('should auto-close list items', () => {
261
- const tokens = tokenize('<ul><li>First<li>Second</ul>');
262
- const ast = parse(tokens);
260
+ it('should auto-close list items', () => {
261
+ const ast = parseToAST('<ul><li>First<li>Second</ul>');
263
262
 
264
263
  const ulElement = ast.children![0]!;
265
264
  const liElements = ulElement.children!.filter(
266
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'li'
265
+ child => child.type === ASTNodeType.Element && child.tagName === 'li'
267
266
  );
268
267
 
269
268
  expect(liElements).toHaveLength(2);
270
- expect(liElements[0]!.children![0]!.content).toBe('First');
271
- expect(liElements[1]!.children![0]!.content).toBe('Second');
269
+ expect((liElements[0]!.children![0]! as any).content).toBe('First');
270
+ expect((liElements[1]!.children![0]! as any).content).toBe('Second');
272
271
  });
273
272
 
274
- test('should auto-close paragraph tags', () => {
275
- const tokens = tokenize('<p>First paragraph<p>Second paragraph');
276
- const ast = parse(tokens);
273
+ it('should auto-close paragraph tags', () => {
274
+ const ast = parseToAST('<p>First paragraph<p>Second paragraph');
277
275
 
278
276
  const pElements = ast.children!.filter(
279
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'p'
277
+ child => child.type === ASTNodeType.Element && child.tagName === 'p'
280
278
  );
281
279
 
282
280
  expect(pElements).toHaveLength(2);
283
- expect(pElements[0]!.children![0]!.content).toBe('First paragraph');
284
- expect(pElements[1]!.children![0]!.content).toBe('Second paragraph');
281
+ expect((pElements[0]!.children![0]! as any).content).toBe('First paragraph');
282
+ expect((pElements[1]!.children![0]! as any).content).toBe('Second paragraph');
285
283
  });
286
284
  });
287
285
 
288
286
  describe('Whitespace Handling', () => {
289
- test('should preserve significant whitespace', () => {
290
- const tokens = tokenize('<p> Hello World </p>');
291
- const ast = parse(tokens);
287
+ it('should preserve significant whitespace', () => {
288
+ const ast = parseToAST('<p> Hello World </p>');
292
289
 
293
290
  const pElement = ast.children![0]!;
294
291
  const textNode = pElement.children![0]!;
295
- expect(textNode.content).toBe(' Hello World ');
292
+ expect((textNode as any).content).toBe(' Hello World ');
296
293
  });
297
294
 
298
- test('should skip insignificant whitespace', () => {
299
- const tokens = tokenize(`<html>
295
+ it('should skip insignificant whitespace', () => {
296
+ const ast = parseToAST(`<html>
300
297
  <head>
301
298
  <title>Test</title>
302
299
  </head>
303
300
  </html>`);
304
- const ast = parse(tokens);
305
301
 
306
302
  const htmlElement = ast.children!.find(
307
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'html'
303
+ child => child.type === ASTNodeType.Element && child.tagName === 'html'
308
304
  )!;
309
305
 
310
306
  const headElement = htmlElement.children!.find(
311
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'head'
307
+ child => child.type === ASTNodeType.Element && child.tagName === 'head'
312
308
  )!;
313
309
 
314
310
  expect(headElement).toBeDefined();
@@ -316,22 +312,22 @@ describe('HTML Parser', () => {
316
312
  });
317
313
 
318
314
  describe("complete web page", () => {
319
- test('should parse a complete web page', async () => {
320
- const html = await file(join(__dirname, "test-page-0.txt")).text();
321
- const tokens = tokenize(html);
322
- const ast = parse(tokens);
323
- expect(ast.children!.length).toBeGreaterThanOrEqual(3);
315
+ it('should parse a complete web page', async () => {
316
+ const html = await file("./tests/test-page-0.txt").text();
317
+ const ast = parseToAST(html);
318
+ expect(ast.children!.length).toBeGreaterThanOrEqual(1);
324
319
  const htmlElement = ast.children!.find(
325
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'html'
320
+ child => child.type === ASTNodeType.Element && child.tagName === 'html'
326
321
  )!;
327
- expect(htmlElement.type).toBe(ASTNodeType.ELEMENT);
322
+ expect(htmlElement).toBeDefined();
323
+ expect(htmlElement.type).toBe(ASTNodeType.Element);
328
324
  expect(htmlElement.tagName).toBe('html');
329
325
  expect(htmlElement.attributes!.lang).toBe('en');
330
326
  const headElement = htmlElement.children!.find(
331
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'head'
327
+ child => child.type === ASTNodeType.Element && child.tagName === 'head'
332
328
  )!;
333
329
  const bodyElement = htmlElement.children!.find(
334
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'body'
330
+ child => child.type === ASTNodeType.Element && child.tagName === 'body'
335
331
  )!;
336
332
  expect(headElement).toBeDefined();
337
333
  expect(bodyElement).toBeDefined();
@@ -339,9 +335,8 @@ describe('HTML Parser', () => {
339
335
  })
340
336
 
341
337
  describe('Advanced Edge Cases', () => {
342
- test('should handle empty attributes', () => {
343
- const tokens = tokenize('<input disabled checked="" value="">');
344
- const ast = parse(tokens);
338
+ it('should handle empty attributes', () => {
339
+ const ast = parseToAST('<input disabled checked="" value="">');
345
340
  const inputElement = ast.children![0]!;
346
341
  expect(inputElement.attributes).toEqual({
347
342
  disabled: '',
@@ -350,9 +345,8 @@ describe('HTML Parser', () => {
350
345
  });
351
346
  });
352
347
 
353
- test('should handle attributes with special characters', () => {
354
- const tokens = tokenize('<div data-test="hello-world" class="my_class-123">');
355
- const ast = parse(tokens);
348
+ it('should handle attributes with special characters', () => {
349
+ const ast = parseToAST('<div data-test="hello-world" class="my_class-123">');
356
350
  const divElement = ast.children![0]!;
357
351
  expect(divElement.attributes).toEqual({
358
352
  'data-test': 'hello-world',
@@ -360,48 +354,45 @@ describe('HTML Parser', () => {
360
354
  });
361
355
  });
362
356
 
363
- test('should handle mixed quotes in attributes', () => {
364
- const tokens = tokenize(`<div title='He said "Hello"' data-info="She's here">`);
365
- const ast = parse(tokens);
357
+ it('should handle mixed quotes in attributes', () => {
358
+ const ast = parseToAST(`<div title='He said "Hello"' data-info="She's here">`);
366
359
  const divElement = ast.children![0]!;
367
360
  expect(divElement.attributes!.title).toBe('He said "Hello"');
368
361
  expect(divElement.attributes!['data-info']).toBe("She's here");
369
- }); test('should handle deeply nested comments', () => {
370
- const tokens = tokenize('<div><!-- Outer <!-- Inner --> comment --></div>');
371
- const ast = parse(tokens);
362
+ });
363
+
364
+ it('should handle deeply nested comments', () => {
365
+ const ast = parseToAST('<div><!-- Outer <!-- Inner --> comment --></div>');
372
366
  const divElement = ast.children![0]!;
373
367
  expect(divElement.children!.length).toBeGreaterThanOrEqual(1);
374
- expect(divElement.children![0]!.type).toBe(ASTNodeType.COMMENT);
368
+ expect(divElement.children![0]!.type).toBe(ASTNodeType.Comment);
375
369
  });
376
370
 
377
- test('should handle multiple consecutive whitespace', () => {
378
- const tokens = tokenize('<p> \n\t Hello \n\t World \n\t </p>');
379
- const ast = parse(tokens);
371
+ it('should handle multiple consecutive whitespace', () => {
372
+ const ast = parseToAST('<p> \n\t Hello \n\t World \n\t </p>');
380
373
  const pElement = ast.children![0]!;
381
374
  const textNode = pElement.children![0]!;
382
- expect(textNode.content).toContain('Hello');
383
- expect(textNode.content).toContain('World');
375
+ expect((textNode as any).content).toContain('Hello');
376
+ expect((textNode as any).content).toContain('World');
384
377
  });
385
378
 
386
- test('should handle malformed nested tags', () => {
387
- const tokens = tokenize('<div><p><span>Text</div></span></p>');
388
- const ast = parse(tokens);
379
+ it('should handle malformed nested tags', () => {
380
+ const ast = parseToAST('<div><p><span>Text</div></span></p>');
389
381
  const divElement = ast.children![0]!;
390
382
  expect(divElement.tagName).toBe('div');
391
383
  expect(divElement.children!.length).toBeGreaterThan(0);
392
384
  });
393
385
 
394
- test('should handle orphaned closing tags', () => {
395
- const tokens = tokenize('</div><p>Content</p></span>');
396
- const ast = parse(tokens);
386
+ it('should handle orphaned closing tags', () => {
387
+ const ast = parseToAST('</div><p>Content</p></span>');
397
388
  const pElement = ast.children!.find(
398
- child => child.type === ASTNodeType.ELEMENT && child.tagName === 'p'
389
+ child => child.type === ASTNodeType.Element && child.tagName === 'p'
399
390
  )!;
400
391
  expect(pElement).toBeDefined();
401
- expect(pElement.children![0]!.content).toBe('Content');
392
+ expect((pElement.children![0]! as any).content).toBe('Content');
402
393
  });
403
394
 
404
- test('should handle extreme nesting depth', () => {
395
+ it('should handle extreme nesting depth', () => {
405
396
  let html = '';
406
397
  const depth = 50;
407
398
  for (let i = 0; i < depth; i++) {
@@ -411,39 +402,35 @@ describe('HTML Parser', () => {
411
402
  for (let i = 0; i < depth; i++) {
412
403
  html += '</div>';
413
404
  }
414
- const tokens = tokenize(html);
415
- const ast = parse(tokens);
405
+ const ast = parseToAST(html);
416
406
  let current = ast.children![0]!;
417
407
  for (let i = 0; i < depth - 1; i++) {
418
408
  expect(current.tagName).toBe('div');
419
409
  expect(current.attributes!.level).toBe(i.toString());
420
- current = current.children!.find(child => child.type === ASTNodeType.ELEMENT)!;
410
+ current = current.children!.find(child => child.type === ASTNodeType.Element)!;
421
411
  }
422
- const textNode = current.children!.find(child => child.type === ASTNodeType.TEXT)!;
423
- expect(textNode.content).toBe('Deep content');
412
+ const textNode = current.children!.find(child => child.type === ASTNodeType.Text)!;
413
+ expect((textNode as any).content).toBe('Deep content');
424
414
  });
425
415
  })
426
416
 
427
417
  describe('Complex Entity Handling', () => {
428
- test('should handle numeric character references', () => {
429
- const tokens = tokenize('<p>&#65; &#8364; &#x41; &#x20AC;</p>');
430
- const ast = parse(tokens);
418
+ it('should handle numeric character references', () => {
419
+ const ast = parseToAST('<p>&#65; &#8364; &#x41; &#x20AC;</p>');
431
420
  const pElement = ast.children![0]!;
432
421
  const textNode = pElement.children![0]!;
433
- expect(textNode.content).toBe('A € A €');
422
+ expect((textNode as any).content).toBe('A € A €');
434
423
  });
435
424
 
436
- test('should handle mixed entities and text', () => {
437
- const tokens = tokenize('<p>R&amp;D &lt;testing&gt; &quot;quotes&quot; &apos;apostrophe&apos;</p>');
438
- const ast = parse(tokens);
425
+ it('should handle mixed entities and text', () => {
426
+ const ast = parseToAST('<p>R&amp;D &lt;testing&gt; &quot;quotes&quot; &apos;apostrophe&apos;</p>');
439
427
  const pElement = ast.children![0]!;
440
428
  const textNode = pElement.children![0]!;
441
- expect(textNode.content).toBe('R&D <testing> "quotes" \'apostrophe\'');
429
+ expect((textNode as any).content).toBe('R&D <testing> "quotes" \'apostrophe\'');
442
430
  });
443
431
 
444
- test('should handle entities in attributes', () => {
445
- const tokens = tokenize('<div title="R&amp;D &lt;section&gt;" data-test="&quot;hello&quot;">');
446
- const ast = parse(tokens);
432
+ it('should handle entities in attributes', () => {
433
+ const ast = parseToAST('<div title="R&amp;D &lt;section&gt;" data-test="&quot;hello&quot;">');
447
434
  const divElement = ast.children![0]!;
448
435
  expect(divElement.attributes!.title).toBe('R&D <section>');
449
436
  expect(divElement.attributes!['data-test']).toBe('"hello"');
@@ -451,37 +438,33 @@ describe('HTML Parser', () => {
451
438
  })
452
439
 
453
440
  describe('DOM-like Functionality Tests', () => {
454
- test('should maintain parent-child relationships', () => {
455
- const tokens = tokenize('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
456
- const ast = parse(tokens);
441
+ it('should maintain parent-child relationships', () => {
442
+ const ast = parseToAST('<div><section><article><h1>Title</h1><p>Content</p></article></section></div>');
457
443
  const divElement = ast.children![0]!;
458
444
  const sectionElement = divElement.children![0]!;
459
445
  const articleElement = sectionElement.children![0]!;
460
- expect(sectionElement.parent).toBe(divElement);
461
- expect(articleElement.parent).toBe(sectionElement);
462
446
  expect(articleElement.children).toHaveLength(2);
463
447
  expect(articleElement.children![0]!.tagName).toBe('h1');
464
448
  expect(articleElement.children![1]!.tagName).toBe('p');
465
449
  });
466
450
 
467
- test('should handle sibling navigation scenarios', () => {
468
- const tokens = tokenize('<nav><a href="#home">Home</a><a href="#about">About</a><a href="#contact">Contact</a></nav>');
469
- const ast = parse(tokens);
451
+ it('should handle sibling navigation scenarios', () => {
452
+ const ast = parseToAST('<nav><a href="#home">Home</a><a href="#about">About</a><a href="#contact">Contact</a></nav>');
470
453
  const navElement = ast.children![0]!;
471
- const links = navElement.children!.filter(child => child.type === ASTNodeType.ELEMENT);
454
+ const links = navElement.children!.filter(child => child.type === ASTNodeType.Element);
472
455
  expect(links).toHaveLength(3);
473
456
  links.forEach((link, index) => {
474
457
  expect(link.tagName).toBe('a');
475
458
  expect(link.attributes!.href).toBeDefined();
476
- expect(link.children![0]!.type).toBe(ASTNodeType.TEXT);
459
+ expect(link.children![0]!.type).toBe(ASTNodeType.Text);
477
460
  });
478
- expect(links[0]!.children![0]!.content).toBe('Home');
479
- expect(links[1]!.children![0]!.content).toBe('About');
480
- expect(links[2]!.children![0]!.content).toBe('Contact');
461
+ expect((links[0]!.children![0]! as any).content).toBe('Home');
462
+ expect((links[1]!.children![0]! as any).content).toBe('About');
463
+ expect((links[2]!.children![0]! as any).content).toBe('Contact');
481
464
  });
482
465
 
483
- test('should handle form elements with all attribute types', () => {
484
- const tokens = tokenize(`
466
+ it('should handle form elements with all attribute types', () => {
467
+ const ast = parseToAST(`
485
468
  <form action="/submit" method="post" enctype="multipart/form-data">
486
469
  <input type="text" name="username" required placeholder="Enter username" maxlength="50">
487
470
  <input type="password" name="password" required>
@@ -497,13 +480,12 @@ describe('HTML Parser', () => {
497
480
  <button type="submit" disabled>Submit</button>
498
481
  </form>
499
482
  `);
500
- const ast = parse(tokens);
501
483
  const formElement = ast.children!.find(child => child.tagName === 'form')!;
502
484
  expect(formElement.attributes!.action).toBe('/submit');
503
485
  expect(formElement.attributes!.method).toBe('post');
504
486
  const inputs: ASTNode[] = [];
505
487
  const traverse = (node: ASTNode) => {
506
- if (node.type === ASTNodeType.ELEMENT) {
488
+ if (node.type === ASTNodeType.Element) {
507
489
  if (['input', 'select', 'textarea', 'button'].includes(node.tagName!)) {
508
490
  inputs.push(node);
509
491
  }
@@ -521,8 +503,8 @@ describe('HTML Parser', () => {
521
503
  expect(selectElement!.attributes!.multiple).toBe('');
522
504
  });
523
505
 
524
- test('should handle table structures correctly', () => {
525
- const tokens = tokenize(`
506
+ it('should handle table structures correctly', () => {
507
+ const ast = parseToAST(`
526
508
  <table border="1" cellpadding="5" cellspacing="0">
527
509
  <thead>
528
510
  <tr>
@@ -545,7 +527,6 @@ describe('HTML Parser', () => {
545
527
  </tbody>
546
528
  </table>
547
529
  `);
548
- const ast = parse(tokens);
549
530
  const tableElement = ast.children!.find(child => child.tagName === 'table')!;
550
531
  const thead = tableElement.children!.find(child => child.tagName === 'thead');
551
532
  const tbody = tableElement.children!.find(child => child.tagName === 'tbody');
@@ -564,22 +545,21 @@ describe('HTML Parser', () => {
564
545
  expect(rows).toHaveLength(3);
565
546
  });
566
547
 
567
- test('should handle mixed content with inline elements', () => {
568
- const tokens = tokenize(`
548
+ it('should handle mixed content with inline elements', () => {
549
+ const ast = parseToAST(`
569
550
  <p>This is <strong>bold text</strong> and this is <em>italic text</em>.
570
551
  Here's a <a href="https://example.com" target="_blank">link</a> and some
571
552
  <code>inline code</code>. Also <span class="highlight">highlighted text</span>.</p>
572
553
  `);
573
- const ast = parse(tokens);
574
554
  const pElement = ast.children!.find(child => child.tagName === 'p')!;
575
555
  let textNodes = 0;
576
556
  let elementNodes = 0;
577
557
  let totalChildren = 0;
578
558
  const traverse = (node: ASTNode) => {
579
559
  totalChildren++;
580
- if (node.type === ASTNodeType.TEXT && node.content!.trim()) {
560
+ if (node.type === ASTNodeType.Text && (node as any).content!.trim()) {
581
561
  textNodes++;
582
- } else if (node.type === ASTNodeType.ELEMENT) {
562
+ } else if (node.type === ASTNodeType.Element) {
583
563
  elementNodes++;
584
564
  }
585
565
  if (node.children) {
@@ -593,8 +573,8 @@ describe('HTML Parser', () => {
593
573
  expect(textNodes).toBeGreaterThan(0);
594
574
  });
595
575
 
596
- test('should preserve document structure integrity', () => {
597
- const tokens = tokenize(`<!DOCTYPE html>
576
+ it('should preserve document structure integrity', () => {
577
+ const ast = parseToAST(`<!DOCTYPE html>
598
578
  <html lang="en">
599
579
  <head>
600
580
  <meta charset="UTF-8">
@@ -619,8 +599,7 @@ describe('HTML Parser', () => {
619
599
  </footer>
620
600
  </body>
621
601
  </html>`);
622
- const ast = parse(tokens);
623
- const doctype = ast.children!.find(child => child.type === ASTNodeType.DOCTYPE);
602
+ const doctype = ast.children!.find(child => child.type === ASTNodeType.Doctype);
624
603
  expect(doctype).toBeDefined();
625
604
  const htmlElement = ast.children!.find(child => child.tagName === 'html')!;
626
605
  expect(htmlElement.attributes!.lang).toBe('en');