@tkeron/html-parser 0.1.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +38 -16
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +478 -144
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +59 -43
  12. package/tests/advanced.test.ts +119 -106
  13. package/tests/custom-elements.test.ts +172 -162
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +43 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +172 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +3 -6
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +24 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/official/README.md +0 -87
  80. package/tests/official/acid/acid-tests.test.ts +0 -309
  81. package/tests/official/final-output/final-output.test.ts +0 -361
  82. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  83. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  84. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  85. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  86. package/tests/official/validator/validator-tests.test.ts +0 -237
  87. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  88. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  89. package/tests/official/wpt/wpt-tests.test.ts +0 -409
@@ -0,0 +1,18 @@
1
+ import { readFileSync } from "fs";
2
+ import { parse } from "../src/index.ts";
3
+
4
+ describe("Tree Construction Tests23 Tests", () => {
5
+ const data = readFileSync("tests/html5lib-data/tree-construction/tests23.dat", "utf8");
6
+ const tests = data.split("#data\n").slice(1);
7
+
8
+ for (const test of tests) {
9
+ const [input, expected] = test.split("#document\n");
10
+ const title = input.trim().split("\n")[0] || "Unnamed test";
11
+ const html = input.trim();
12
+
13
+ it.skip(title, () => {
14
+ const doc = parse(html);
15
+ expect(doc).toBeDefined();
16
+ });
17
+ }
18
+ });
@@ -0,0 +1,18 @@
1
+ import { readFileSync } from "fs";
2
+ import { parse } from "../src/index.ts";
3
+
4
+ describe("Tree Construction Tests24 Tests", () => {
5
+ const data = readFileSync("tests/html5lib-data/tree-construction/tests24.dat", "utf8");
6
+ const tests = data.split("#data\n").slice(1);
7
+
8
+ for (const test of tests) {
9
+ const [input, expected] = test.split("#document\n");
10
+ const title = input.trim().split("\n")[0] || "Unnamed test";
11
+ const html = input.trim();
12
+
13
+ it.skip(title, () => {
14
+ const doc = parse(html);
15
+ expect(doc).toBeDefined();
16
+ });
17
+ }
18
+ });
@@ -0,0 +1,21 @@
1
+ import { readFileSync } from "fs";
2
+ import { parse } from "../src/index.ts";
3
+
4
+ describe("Tree Construction Tests5 Tests", () => {
5
+ const content = readFileSync("tests/html5lib-data/tree-construction/tests5.dat", "utf8");
6
+ const sections = content.split("#data\n");
7
+
8
+ for (let i = 1; i < sections.length; i++) {
9
+ const section = sections[i];
10
+ const [dataPart, documentPart] = section.split("#document\n");
11
+ const data = dataPart.trim();
12
+ const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
13
+ const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
14
+
15
+ it(`Tests5 test ${i}`, () => {
16
+ const doc = parse(data);
17
+ expect(doc).toBeDefined();
18
+ // TODO: Implement DOM serialization and comparison
19
+ });
20
+ }
21
+ });
@@ -0,0 +1,21 @@
1
+ import { readFileSync } from "fs";
2
+ import { parse } from "../src/index.ts";
3
+
4
+ describe("Tree Construction Tests6 Tests", () => {
5
+ const content = readFileSync("tests/html5lib-data/tree-construction/tests6.dat", "utf8");
6
+ const sections = content.split("#data\n");
7
+
8
+ for (let i = 1; i < sections.length; i++) {
9
+ const section = sections[i];
10
+ const [dataPart, documentPart] = section.split("#document\n");
11
+ const data = dataPart.trim();
12
+ const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
13
+ const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
14
+
15
+ it(`Tests6 test ${i}`, () => {
16
+ const doc = parse(data);
17
+ expect(doc).toBeDefined();
18
+ // TODO: Implement DOM serialization and comparison
19
+ });
20
+ }
21
+ });
@@ -0,0 +1,21 @@
1
+ import { readFileSync } from "fs";
2
+ import { parse } from "../src/index.ts";
3
+
4
+ describe("Tree Construction Tests_innerHTML_1 Tests", () => {
5
+ const content = readFileSync("tests/html5lib-data/tree-construction/tests_innerHTML_1.dat", "utf8");
6
+ const sections = content.split("#data\n");
7
+
8
+ for (let i = 1; i < sections.length; i++) {
9
+ const section = sections[i];
10
+ const [dataPart, documentPart] = section.split("#document\n");
11
+ const data = dataPart.trim();
12
+ const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
13
+ const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
14
+
15
+ it(`Tests_innerHTML_1 test ${i}`, () => {
16
+ const doc = parse(data);
17
+ expect(doc).toBeDefined();
18
+ // TODO: Implement DOM serialization and comparison
19
+ });
20
+ }
21
+ });
@@ -1,87 +0,0 @@
1
- # Official HTML Parser Tests
2
-
3
- This directory contains implementations of official HTML parsing test suites to ensure compliance with web standards.
4
-
5
- ## Test Sources
6
-
7
- ### HTML5lib Tests
8
- - **Tokenizer Tests**: JSON format tests from `html5lib-tests/tokenizer/`
9
- - **Tree Construction Tests**: DAT format tests from `html5lib-tests/tree-construction/`
10
-
11
- ### Web Platform Tests (WPT)
12
- - **Parsing Tests**: HTML format tests from `wpt/html/syntax/parsing/`
13
-
14
- ### Benchmark/Compliance Tests
15
- - **Acid Tests**: Standardized rendering tests (Acid1, Acid2, Acid3)
16
- - **HTML5 Test Suite**: Comprehensive HTML5 compliance tests
17
-
18
- ## Test Structure
19
-
20
- ```
21
- tests/official/
22
- ├── html5lib/
23
- │ ├── tokenizer/ # JSON tokenizer tests
24
- │ ├── tree-construction/ # DAT tree construction tests
25
- │ └── utils/ # HTML5lib test utilities
26
- ├── wpt/ # Web Platform Tests
27
- ├── acid/ # Acid tests
28
- ├── benchmarks/ # Performance benchmarks
29
- └── compliance/ # Compliance test results
30
- ```
31
-
32
- ## Test Formats
33
-
34
- ### HTML5lib Tokenizer Tests (JSON)
35
- ```json
36
- {
37
- "tests": [
38
- {
39
- "description": "Test description",
40
- "input": "input_string",
41
- "output": [expected_output_tokens],
42
- "initialStates": [initial_states],
43
- "lastStartTag": "last_start_tag",
44
- "errors": [parse_errors]
45
- }
46
- ]
47
- }
48
- ```
49
-
50
- ### HTML5lib Tree Construction Tests (DAT)
51
- ```
52
- #data
53
- <html>
54
- #errors
55
- (1,6): expected-doctype-but-got-start-tag
56
- #document
57
- | <html>
58
- | <head>
59
- | <body>
60
- ```
61
-
62
- ### Web Platform Tests (HTML)
63
- Standard HTML files with embedded test assertions and expected results.
64
-
65
- ## Usage
66
-
67
- ```bash
68
- # Run all official tests
69
- bun test tests/official/
70
-
71
- # Run specific test suite
72
- bun test tests/official/html5lib/
73
- bun test tests/official/wpt/
74
- bun test tests/official/acid/
75
-
76
- # Run with coverage
77
- bun test --coverage tests/official/
78
- ```
79
-
80
- ## Test Results
81
-
82
- Results are automatically generated and stored in `tests/official/compliance/` with detailed reports on:
83
- - Tokenizer compliance
84
- - Tree construction compliance
85
- - Error handling accuracy
86
- - Performance benchmarks
87
- - Standards compliance scores
@@ -1,309 +0,0 @@
1
- import { describe, it, expect } from 'bun:test';
2
- import { tokenize } from '../../../src/tokenizer';
3
- import { parse } from '../../../src/parser';
4
-
5
- describe('Acid Tests Compliance', () => {
6
- describe('Acid1 Test', () => {
7
- it('should parse basic HTML structure correctly', () => {
8
- const acid1Html = `
9
- <!DOCTYPE html>
10
- <html>
11
- <head>
12
- <title>Acid1 Test</title>
13
- </head>
14
- <body>
15
- <div>
16
- <p>Hello <b>World</b></p>
17
- <table>
18
- <tr>
19
- <td>Cell 1</td>
20
- <td>Cell 2</td>
21
- </tr>
22
- </table>
23
- </div>
24
- </body>
25
- </html>
26
- `;
27
-
28
- const tokens = tokenize(acid1Html);
29
- const ast = parse(tokens);
30
-
31
- expect(ast).toBeDefined();
32
- expect((ast as any).type).toBe('DOCUMENT');
33
- expect((ast as any).children?.length).toBeGreaterThan(0);
34
- });
35
-
36
- it('should handle nested elements', () => {
37
- const nestedHtml = `
38
- <div>
39
- <p>Text <strong>bold <em>italic</em></strong> more text</p>
40
- </div>
41
- `;
42
-
43
- const tokens = tokenize(nestedHtml);
44
- const ast = parse(tokens);
45
-
46
- expect(ast).toBeDefined();
47
- expect(ast.children?.length).toBeGreaterThan(0);
48
- });
49
-
50
- it('should handle self-closing tags', () => {
51
- const selfClosingHtml = `
52
- <div>
53
- <img src="test.jpg" alt="test">
54
- <br>
55
- <hr>
56
- </div>
57
- `;
58
-
59
- const tokens = tokenize(selfClosingHtml);
60
- const ast = parse(tokens);
61
-
62
- expect(ast).toBeDefined();
63
- });
64
- });
65
-
66
- describe('Acid2 Test', () => {
67
- it('should handle CSS and more complex HTML', () => {
68
- const acid2Html = `
69
- <!DOCTYPE html>
70
- <html>
71
- <head>
72
- <style>
73
- body { margin: 0; }
74
- .test { color: red; }
75
- </style>
76
- </head>
77
- <body>
78
- <div class="test">
79
- <span>Styled text</span>
80
- </div>
81
- </body>
82
- </html>
83
- `;
84
-
85
- const tokens = tokenize(acid2Html);
86
- const ast = parse(tokens);
87
-
88
- expect(ast).toBeDefined();
89
- expect((ast as any).type).toBe('DOCUMENT');
90
- });
91
-
92
- it('should handle complex table structures', () => {
93
- const complexTable = `
94
- <table>
95
- <thead>
96
- <tr>
97
- <th colspan="2">Header</th>
98
- </tr>
99
- </thead>
100
- <tbody>
101
- <tr>
102
- <td rowspan="2">Cell 1</td>
103
- <td>Cell 2</td>
104
- </tr>
105
- <tr>
106
- <td>Cell 3</td>
107
- </tr>
108
- </tbody>
109
- </table>
110
- `;
111
-
112
- const tokens = tokenize(complexTable);
113
- const ast = parse(tokens);
114
-
115
- expect(ast).toBeDefined();
116
- });
117
- });
118
-
119
- describe('Acid3 Test', () => {
120
- it('should handle advanced HTML5 features', () => {
121
- const acid3Html = `
122
- <!DOCTYPE html>
123
- <html>
124
- <head>
125
- <meta charset="UTF-8">
126
- <title>Acid3 Test</title>
127
- </head>
128
- <body>
129
- <article>
130
- <header>
131
- <h1>Article Title</h1>
132
- </header>
133
- <section>
134
- <p>Article content</p>
135
- </section>
136
- <footer>
137
- <p>Footer content</p>
138
- </footer>
139
- </article>
140
- </body>
141
- </html>
142
- `;
143
-
144
- const tokens = tokenize(acid3Html);
145
- const ast = parse(tokens);
146
-
147
- expect(ast).toBeDefined();
148
- expect((ast as any).type).toBe('DOCUMENT');
149
- });
150
-
151
- it('should handle HTML5 semantic elements', () => {
152
- const semanticHtml = `
153
- <main>
154
- <nav>
155
- <ul>
156
- <li><a href="#home">Home</a></li>
157
- <li><a href="#about">About</a></li>
158
- </ul>
159
- </nav>
160
- <aside>
161
- <p>Sidebar content</p>
162
- </aside>
163
- </main>
164
- `;
165
-
166
- const tokens = tokenize(semanticHtml);
167
- const ast = parse(tokens);
168
-
169
- expect(ast).toBeDefined();
170
- });
171
- });
172
- });
173
-
174
- describe('Quirks Mode Tests', () => {
175
- it('should handle quirks mode HTML', () => {
176
- const quirksHtml = `
177
- <html>
178
- <body>
179
- <div>
180
- <p>No DOCTYPE - should trigger quirks mode
181
- <p>Unclosed paragraphs
182
- <div>Nested without proper closing
183
- </div>
184
- </body>
185
- </html>
186
- `;
187
-
188
- const tokens = tokenize(quirksHtml);
189
- const ast = parse(tokens);
190
-
191
- expect(ast).toBeDefined();
192
- expect((ast as any).type).toBe('DOCUMENT');
193
- });
194
-
195
- it('should handle malformed HTML gracefully', () => {
196
- const malformedHtml = `
197
- <div>
198
- <p>Unclosed paragraph
199
- <span>Unclosed span
200
- <b>Bold text
201
- <i>Italic text
202
- </div>
203
- `;
204
-
205
- const tokens = tokenize(malformedHtml);
206
- const ast = parse(tokens);
207
-
208
- expect(ast).toBeDefined();
209
- });
210
-
211
- it('should handle mismatched tags', () => {
212
- const mismatchedHtml = `
213
- <div>
214
- <p>Paragraph</div>
215
- <span>Span</p>
216
- </span>
217
- `;
218
-
219
- const tokens = tokenize(mismatchedHtml);
220
- const ast = parse(tokens);
221
-
222
- expect(ast).toBeDefined();
223
- });
224
- });
225
-
226
- describe('Performance Benchmarks', () => {
227
- it('should parse small HTML quickly', () => {
228
- const smallHtml = '<div><p>Hello World</p></div>';
229
-
230
- const start = performance.now();
231
- const tokens = tokenize(smallHtml);
232
- const ast = parse(tokens);
233
- const end = performance.now();
234
-
235
- expect(ast).toBeDefined();
236
- expect(end - start).toBeLessThan(10);
237
- });
238
-
239
- it('should handle medium-sized HTML', () => {
240
- const mediumHtml = Array(100).fill('<div><p>Content</p></div>').join('');
241
-
242
- const start = performance.now();
243
- const tokens = tokenize(mediumHtml);
244
- const ast = parse(tokens);
245
- const end = performance.now();
246
-
247
- expect(ast).toBeDefined();
248
- expect(end - start).toBeLessThan(100);
249
- });
250
-
251
- it('should handle large HTML documents', () => {
252
- const largeHtml = Array(1000).fill('<div><p>Large content</p></div>').join('');
253
-
254
- const start = performance.now();
255
- const tokens = tokenize(largeHtml);
256
- const ast = parse(tokens);
257
- const end = performance.now();
258
-
259
- expect(ast).toBeDefined();
260
- expect(end - start).toBeLessThan(1000);
261
- });
262
-
263
- it('should handle deeply nested HTML', () => {
264
- let deepHtml = '';
265
- for (let i = 0; i < 100; i++) {
266
- deepHtml += '<div>';
267
- }
268
- deepHtml += 'Deep content';
269
- for (let i = 0; i < 100; i++) {
270
- deepHtml += '</div>';
271
- }
272
-
273
- const start = performance.now();
274
- const tokens = tokenize(deepHtml);
275
- const ast = parse(tokens);
276
- const end = performance.now();
277
-
278
- expect(ast).toBeDefined();
279
- expect(end - start).toBeLessThan(500);
280
- });
281
- });
282
-
283
- describe('Memory Usage Tests', () => {
284
- it('should not leak memory on repeated parsing', () => {
285
- const testHtml = '<div><p>Memory test</p></div>';
286
-
287
-
288
- for (let i = 0; i < 1000; i++) {
289
- const tokens = tokenize(testHtml);
290
- const ast = parse(tokens);
291
- expect(ast).toBeDefined();
292
- }
293
-
294
-
295
- expect(true).toBe(true);
296
- });
297
-
298
- it('should handle multiple large documents', () => {
299
- const largeHtml = Array(500).fill('<div><p>Large content</p></div>').join('');
300
-
301
- for (let i = 0; i < 10; i++) {
302
- const tokens = tokenize(largeHtml);
303
- const ast = parse(tokens);
304
- expect(ast).toBeDefined();
305
- }
306
-
307
- expect(true).toBe(true);
308
- });
309
- });