@tkeron/html-parser 0.1.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
@@ -1,87 +0,0 @@
1
- # Official HTML Parser Tests
2
-
3
- This directory contains implementations of official HTML parsing test suites to ensure compliance with web standards.
4
-
5
- ## Test Sources
6
-
7
- ### HTML5lib Tests
8
- - **Tokenizer Tests**: JSON format tests from `html5lib-tests/tokenizer/`
9
- - **Tree Construction Tests**: DAT format tests from `html5lib-tests/tree-construction/`
10
-
11
- ### Web Platform Tests (WPT)
12
- - **Parsing Tests**: HTML format tests from `wpt/html/syntax/parsing/`
13
-
14
- ### Benchmark/Compliance Tests
15
- - **Acid Tests**: Standardized rendering tests (Acid1, Acid2, Acid3)
16
- - **HTML5 Test Suite**: Comprehensive HTML5 compliance tests
17
-
18
- ## Test Structure
19
-
20
- ```
21
- tests/official/
22
- ├── html5lib/
23
- │ ├── tokenizer/ # JSON tokenizer tests
24
- │ ├── tree-construction/ # DAT tree construction tests
25
- │ └── utils/ # HTML5lib test utilities
26
- ├── wpt/ # Web Platform Tests
27
- ├── acid/ # Acid tests
28
- ├── benchmarks/ # Performance benchmarks
29
- └── compliance/ # Compliance test results
30
- ```
31
-
32
- ## Test Formats
33
-
34
- ### HTML5lib Tokenizer Tests (JSON)
35
- ```json
36
- {
37
- "tests": [
38
- {
39
- "description": "Test description",
40
- "input": "input_string",
41
- "output": [expected_output_tokens],
42
- "initialStates": [initial_states],
43
- "lastStartTag": "last_start_tag",
44
- "errors": [parse_errors]
45
- }
46
- ]
47
- }
48
- ```
49
-
50
- ### HTML5lib Tree Construction Tests (DAT)
51
- ```
52
- #data
53
- <html>
54
- #errors
55
- (1,6): expected-doctype-but-got-start-tag
56
- #document
57
- | <html>
58
- | <head>
59
- | <body>
60
- ```
61
-
62
- ### Web Platform Tests (HTML)
63
- Standard HTML files with embedded test assertions and expected results.
64
-
65
- ## Usage
66
-
67
- ```bash
68
- # Run all official tests
69
- bun test tests/official/
70
-
71
- # Run specific test suite
72
- bun test tests/official/html5lib/
73
- bun test tests/official/wpt/
74
- bun test tests/official/acid/
75
-
76
- # Run with coverage
77
- bun test --coverage tests/official/
78
- ```
79
-
80
- ## Test Results
81
-
82
- Results are automatically generated and stored in `tests/official/compliance/` with detailed reports on:
83
- - Tokenizer compliance
84
- - Tree construction compliance
85
- - Error handling accuracy
86
- - Performance benchmarks
87
- - Standards compliance scores
@@ -1,309 +0,0 @@
1
- import { describe, it, expect } from 'bun:test';
2
- import { tokenize } from '../../../src/tokenizer';
3
- import { parse } from '../../../src/parser';
4
-
5
- describe('Acid Tests Compliance', () => {
6
- describe('Acid1 Test', () => {
7
- it('should parse basic HTML structure correctly', () => {
8
- const acid1Html = `
9
- <!DOCTYPE html>
10
- <html>
11
- <head>
12
- <title>Acid1 Test</title>
13
- </head>
14
- <body>
15
- <div>
16
- <p>Hello <b>World</b></p>
17
- <table>
18
- <tr>
19
- <td>Cell 1</td>
20
- <td>Cell 2</td>
21
- </tr>
22
- </table>
23
- </div>
24
- </body>
25
- </html>
26
- `;
27
-
28
- const tokens = tokenize(acid1Html);
29
- const ast = parse(tokens);
30
-
31
- expect(ast).toBeDefined();
32
- expect((ast as any).type).toBe('DOCUMENT');
33
- expect((ast as any).children?.length).toBeGreaterThan(0);
34
- });
35
-
36
- it('should handle nested elements', () => {
37
- const nestedHtml = `
38
- <div>
39
- <p>Text <strong>bold <em>italic</em></strong> more text</p>
40
- </div>
41
- `;
42
-
43
- const tokens = tokenize(nestedHtml);
44
- const ast = parse(tokens);
45
-
46
- expect(ast).toBeDefined();
47
- expect(ast.children?.length).toBeGreaterThan(0);
48
- });
49
-
50
- it('should handle self-closing tags', () => {
51
- const selfClosingHtml = `
52
- <div>
53
- <img src="test.jpg" alt="test">
54
- <br>
55
- <hr>
56
- </div>
57
- `;
58
-
59
- const tokens = tokenize(selfClosingHtml);
60
- const ast = parse(tokens);
61
-
62
- expect(ast).toBeDefined();
63
- });
64
- });
65
-
66
- describe('Acid2 Test', () => {
67
- it('should handle CSS and more complex HTML', () => {
68
- const acid2Html = `
69
- <!DOCTYPE html>
70
- <html>
71
- <head>
72
- <style>
73
- body { margin: 0; }
74
- .test { color: red; }
75
- </style>
76
- </head>
77
- <body>
78
- <div class="test">
79
- <span>Styled text</span>
80
- </div>
81
- </body>
82
- </html>
83
- `;
84
-
85
- const tokens = tokenize(acid2Html);
86
- const ast = parse(tokens);
87
-
88
- expect(ast).toBeDefined();
89
- expect((ast as any).type).toBe('DOCUMENT');
90
- });
91
-
92
- it('should handle complex table structures', () => {
93
- const complexTable = `
94
- <table>
95
- <thead>
96
- <tr>
97
- <th colspan="2">Header</th>
98
- </tr>
99
- </thead>
100
- <tbody>
101
- <tr>
102
- <td rowspan="2">Cell 1</td>
103
- <td>Cell 2</td>
104
- </tr>
105
- <tr>
106
- <td>Cell 3</td>
107
- </tr>
108
- </tbody>
109
- </table>
110
- `;
111
-
112
- const tokens = tokenize(complexTable);
113
- const ast = parse(tokens);
114
-
115
- expect(ast).toBeDefined();
116
- });
117
- });
118
-
119
- describe('Acid3 Test', () => {
120
- it('should handle advanced HTML5 features', () => {
121
- const acid3Html = `
122
- <!DOCTYPE html>
123
- <html>
124
- <head>
125
- <meta charset="UTF-8">
126
- <title>Acid3 Test</title>
127
- </head>
128
- <body>
129
- <article>
130
- <header>
131
- <h1>Article Title</h1>
132
- </header>
133
- <section>
134
- <p>Article content</p>
135
- </section>
136
- <footer>
137
- <p>Footer content</p>
138
- </footer>
139
- </article>
140
- </body>
141
- </html>
142
- `;
143
-
144
- const tokens = tokenize(acid3Html);
145
- const ast = parse(tokens);
146
-
147
- expect(ast).toBeDefined();
148
- expect((ast as any).type).toBe('DOCUMENT');
149
- });
150
-
151
- it('should handle HTML5 semantic elements', () => {
152
- const semanticHtml = `
153
- <main>
154
- <nav>
155
- <ul>
156
- <li><a href="#home">Home</a></li>
157
- <li><a href="#about">About</a></li>
158
- </ul>
159
- </nav>
160
- <aside>
161
- <p>Sidebar content</p>
162
- </aside>
163
- </main>
164
- `;
165
-
166
- const tokens = tokenize(semanticHtml);
167
- const ast = parse(tokens);
168
-
169
- expect(ast).toBeDefined();
170
- });
171
- });
172
- });
173
-
174
- describe('Quirks Mode Tests', () => {
175
- it('should handle quirks mode HTML', () => {
176
- const quirksHtml = `
177
- <html>
178
- <body>
179
- <div>
180
- <p>No DOCTYPE - should trigger quirks mode
181
- <p>Unclosed paragraphs
182
- <div>Nested without proper closing
183
- </div>
184
- </body>
185
- </html>
186
- `;
187
-
188
- const tokens = tokenize(quirksHtml);
189
- const ast = parse(tokens);
190
-
191
- expect(ast).toBeDefined();
192
- expect((ast as any).type).toBe('DOCUMENT');
193
- });
194
-
195
- it('should handle malformed HTML gracefully', () => {
196
- const malformedHtml = `
197
- <div>
198
- <p>Unclosed paragraph
199
- <span>Unclosed span
200
- <b>Bold text
201
- <i>Italic text
202
- </div>
203
- `;
204
-
205
- const tokens = tokenize(malformedHtml);
206
- const ast = parse(tokens);
207
-
208
- expect(ast).toBeDefined();
209
- });
210
-
211
- it('should handle mismatched tags', () => {
212
- const mismatchedHtml = `
213
- <div>
214
- <p>Paragraph</div>
215
- <span>Span</p>
216
- </span>
217
- `;
218
-
219
- const tokens = tokenize(mismatchedHtml);
220
- const ast = parse(tokens);
221
-
222
- expect(ast).toBeDefined();
223
- });
224
- });
225
-
226
- describe('Performance Benchmarks', () => {
227
- it('should parse small HTML quickly', () => {
228
- const smallHtml = '<div><p>Hello World</p></div>';
229
-
230
- const start = performance.now();
231
- const tokens = tokenize(smallHtml);
232
- const ast = parse(tokens);
233
- const end = performance.now();
234
-
235
- expect(ast).toBeDefined();
236
- expect(end - start).toBeLessThan(10);
237
- });
238
-
239
- it('should handle medium-sized HTML', () => {
240
- const mediumHtml = Array(100).fill('<div><p>Content</p></div>').join('');
241
-
242
- const start = performance.now();
243
- const tokens = tokenize(mediumHtml);
244
- const ast = parse(tokens);
245
- const end = performance.now();
246
-
247
- expect(ast).toBeDefined();
248
- expect(end - start).toBeLessThan(100);
249
- });
250
-
251
- it('should handle large HTML documents', () => {
252
- const largeHtml = Array(1000).fill('<div><p>Large content</p></div>').join('');
253
-
254
- const start = performance.now();
255
- const tokens = tokenize(largeHtml);
256
- const ast = parse(tokens);
257
- const end = performance.now();
258
-
259
- expect(ast).toBeDefined();
260
- expect(end - start).toBeLessThan(1000);
261
- });
262
-
263
- it('should handle deeply nested HTML', () => {
264
- let deepHtml = '';
265
- for (let i = 0; i < 100; i++) {
266
- deepHtml += '<div>';
267
- }
268
- deepHtml += 'Deep content';
269
- for (let i = 0; i < 100; i++) {
270
- deepHtml += '</div>';
271
- }
272
-
273
- const start = performance.now();
274
- const tokens = tokenize(deepHtml);
275
- const ast = parse(tokens);
276
- const end = performance.now();
277
-
278
- expect(ast).toBeDefined();
279
- expect(end - start).toBeLessThan(500);
280
- });
281
- });
282
-
283
- describe('Memory Usage Tests', () => {
284
- it('should not leak memory on repeated parsing', () => {
285
- const testHtml = '<div><p>Memory test</p></div>';
286
-
287
-
288
- for (let i = 0; i < 1000; i++) {
289
- const tokens = tokenize(testHtml);
290
- const ast = parse(tokens);
291
- expect(ast).toBeDefined();
292
- }
293
-
294
-
295
- expect(true).toBe(true);
296
- });
297
-
298
- it('should handle multiple large documents', () => {
299
- const largeHtml = Array(500).fill('<div><p>Large content</p></div>').join('');
300
-
301
- for (let i = 0; i < 10; i++) {
302
- const tokens = tokenize(largeHtml);
303
- const ast = parse(tokens);
304
- expect(ast).toBeDefined();
305
- }
306
-
307
- expect(true).toBe(true);
308
- });
309
- });