@tkeron/html-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +120 -0
- package/bun.lock +29 -0
- package/index.ts +18 -0
- package/package.json +25 -0
- package/src/css-selector.ts +172 -0
- package/src/dom-simulator.ts +592 -0
- package/src/dom-types.ts +78 -0
- package/src/parser.ts +355 -0
- package/src/tokenizer.ts +413 -0
- package/tests/advanced.test.ts +487 -0
- package/tests/api-integration.test.ts +114 -0
- package/tests/dom-extended.test.ts +173 -0
- package/tests/dom.test.ts +482 -0
- package/tests/google-dom.test.ts +118 -0
- package/tests/google-homepage.txt +13 -0
- package/tests/official/README.md +87 -0
- package/tests/official/acid/acid-tests.test.ts +309 -0
- package/tests/official/final-output/final-output.test.ts +361 -0
- package/tests/official/html5lib/tokenizer-utils.ts +204 -0
- package/tests/official/html5lib/tokenizer.test.ts +184 -0
- package/tests/official/html5lib/tree-construction-utils.ts +208 -0
- package/tests/official/html5lib/tree-construction.test.ts +250 -0
- package/tests/official/validator/validator-tests.test.ts +237 -0
- package/tests/official/validator-nu/validator-nu.test.ts +335 -0
- package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
- package/tests/official/wpt/wpt-tests.test.ts +409 -0
- package/tests/parser.test.ts +642 -0
- package/tests/selectors.test.ts +65 -0
- package/tests/test-page-0.txt +362 -0
- package/tests/tokenizer.test.ts +666 -0
- package/tsconfig.json +25 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import { describe, it, expect } from 'bun:test';
|
|
2
|
+
import { parseHTML } from '../../../index';
|
|
3
|
+
|
|
4
|
+
/**
 * Normalizes text for comparison: leading and trailing whitespace is removed
 * and every internal run of whitespace becomes a single space.
 */
function normalizeText(text: string): string {
  const words = text.trim().split(/\s+/);
  return words.join(' ');
}
|
|
10
|
+
|
|
11
|
+
describe('Final HTML Output Validation', () => {
|
|
12
|
+
describe('Complete HTML Structure', () => {
  // Element lookups are asserted with toBeTruthy() rather than toBeDefined():
  // toBeDefined() only rejects `undefined`, so a lookup miss reported as
  // `null` (the standard DOM convention) would still pass the check.

  it('should parse and create DOM structure correctly', () => {
    const doc = parseHTML('<div class="container"><p>Hello World</p></div>');

    expect(doc).toBeDefined();
    expect(doc.nodeType).toBe(9); // DOCUMENT_NODE

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();
    expect(div?.className).toBe('container');

    const p = doc.querySelector('p');
    expect(p).toBeTruthy();
    expect(p?.textContent).toBe('Hello World');
  });

  it('should handle nested elements correctly', () => {
    const doc = parseHTML('<div><span><strong>Bold text</strong></span></div>');

    // Each lookup is scoped to the previous match, so this also checks nesting.
    const div = doc.querySelector('div');
    expect(div).toBeTruthy();

    const span = div?.querySelector('span');
    expect(span).toBeTruthy();

    const strong = span?.querySelector('strong');
    expect(strong).toBeTruthy();
    expect(strong?.textContent).toBe('Bold text');
  });

  it('should handle self-closing tags correctly', () => {
    const doc = parseHTML('<div><img src="test.jpg" alt="Test"><br><hr></div>');

    const img = doc.querySelector('img');
    expect(img).toBeTruthy();
    expect(img?.getAttribute('src')).toBe('test.jpg');
    expect(img?.getAttribute('alt')).toBe('Test');

    expect(doc.querySelector('br')).toBeTruthy();
    expect(doc.querySelector('hr')).toBeTruthy();
  });

  it('should handle attributes correctly', () => {
    const doc = parseHTML('<div id="main" class="container" data-value="test">Content</div>');

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();
    expect(div?.getAttribute('id')).toBe('main');
    expect(div?.getAttribute('class')).toBe('container');
    expect(div?.getAttribute('data-value')).toBe('test');
    expect(div?.textContent).toBe('Content');
  });

  it('should handle text content correctly', () => {
    const doc = parseHTML('<p>This is a <strong>bold</strong> text with <em>emphasis</em>.</p>');

    const p = doc.querySelector('p');
    expect(p).toBeTruthy();
    // Whitespace is normalized so the check does not depend on how text
    // around the inline elements is split.
    expect(normalizeText(p?.textContent || '')).toBe('This is a bold text with emphasis.');

    expect(doc.querySelector('strong')?.textContent).toBe('bold');
    expect(doc.querySelector('em')?.textContent).toBe('emphasis');
  });

  it('should handle comments correctly', () => {
    const doc = parseHTML('<div><!-- This is a comment --><p>Content</p></div>');

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();

    const p = doc.querySelector('p');
    expect(p).toBeTruthy();
    expect(p?.textContent).toBe('Content');

    // The comment is the first child of the div.
    const commentNode = div?.childNodes[0];
    expect(commentNode?.nodeType).toBe(8); // COMMENT_NODE
  });
});
|
|
109
|
+
|
|
110
|
+
describe('DOM Structure Validation', () => {
  // Element lookups are asserted with toBeTruthy() rather than toBeDefined():
  // toBeDefined() only rejects `undefined`, so a lookup miss reported as
  // `null` (the standard DOM convention) would still pass the check.

  it('should maintain correct parent-child relationships', () => {
    const doc = parseHTML('<div><p><span>Nested</span></p></div>');

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();
    expect(div?.children.length).toBe(1);

    const p = div?.children[0];
    expect(p?.tagName).toBe('P');
    expect(p?.children.length).toBe(1);

    const span = p?.children[0];
    expect(span?.tagName).toBe('SPAN');
    expect(span?.textContent).toBe('Nested');
  });

  it('should handle mixed content correctly', () => {
    const doc = parseHTML('<div>Text before <span>span content</span> text after</div>');

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();
    expect(div?.childNodes.length).toBe(3);

    const [leadingText, spanNode, trailingText] = [
      div?.childNodes[0],
      div?.childNodes[1],
      div?.childNodes[2],
    ];

    // First text node
    expect(leadingText?.nodeType).toBe(3); // TEXT_NODE
    expect(leadingText?.textContent).toBe('Text before ');

    // Span element
    expect(spanNode?.nodeType).toBe(1); // ELEMENT_NODE
    expect((spanNode as Element)?.tagName).toBe('SPAN');

    // Last text node
    expect(trailingText?.nodeType).toBe(3); // TEXT_NODE
    expect(trailingText?.textContent).toBe(' text after');
  });

  it('should handle void elements correctly', () => {
    const doc = parseHTML('<div><img src="test.jpg"><br><input type="text"></div>');

    const img = doc.querySelector('img');
    expect(img).toBeTruthy();
    expect(img?.getAttribute('src')).toBe('test.jpg');

    expect(doc.querySelector('br')).toBeTruthy();

    const input = doc.querySelector('input');
    expect(input).toBeTruthy();
    expect(input?.getAttribute('type')).toBe('text');
  });
});
|
|
168
|
+
|
|
169
|
+
describe('HTML5 Semantic Structure', () => {
  // Element lookups are asserted with toBeTruthy() rather than toBeDefined():
  // toBeDefined() only rejects `undefined`, so a lookup miss reported as
  // `null` (the standard DOM convention) would still pass the check.

  it('should handle complete HTML5 document structure', () => {
    // One array entry per source line; joined with newlines.
    const html = [
      '<!DOCTYPE html>',
      '<html>',
      '<head>',
      '<title>Test Document</title>',
      '</head>',
      '<body>',
      '<header>',
      '<nav>Navigation</nav>',
      '</header>',
      '<main>',
      '<article>',
      '<section>Content</section>',
      '</article>',
      '</main>',
      '<footer>Footer</footer>',
      '</body>',
      '</html>',
    ].join('\n');

    const doc = parseHTML(html);

    expect(doc).toBeDefined();
    expect(doc.documentElement?.tagName).toBe('HTML');

    // Structural elements only need to exist.
    for (const selector of ['head', 'body', 'header', 'main', 'article']) {
      expect(doc.querySelector(selector)).toBeTruthy();
    }

    // Leaf elements are checked through their text.
    expect(doc.querySelector('title')?.textContent).toBe('Test Document');
    expect(doc.querySelector('nav')?.textContent).toBe('Navigation');
    expect(doc.querySelector('section')?.textContent).toBe('Content');
    expect(doc.querySelector('footer')?.textContent).toBe('Footer');
  });

  it('should handle HTML5 form elements', () => {
    // One array entry per source line; joined with newlines.
    const html = [
      '<form>',
      '<fieldset>',
      '<legend>Contact Information</legend>',
      '<label for="email">Email:</label>',
      '<input type="email" id="email" required>',
      '<label for="phone">Phone:</label>',
      '<input type="tel" id="phone">',
      '<button type="submit">Submit</button>',
      '</fieldset>',
      '</form>',
    ].join('\n');

    const doc = parseHTML(html);

    expect(doc.querySelector('form')).toBeTruthy();
    expect(doc.querySelector('fieldset')).toBeTruthy();
    expect(doc.querySelector('legend')?.textContent).toBe('Contact Information');

    const emailInput = doc.querySelector('input[type="email"]');
    expect(emailInput).toBeTruthy();
    expect(emailInput?.getAttribute('id')).toBe('email');
    // `required` is a valueless (boolean) attribute in the fixture.
    expect(emailInput?.hasAttribute('required')).toBe(true);

    const phoneInput = doc.querySelector('input[type="tel"]');
    expect(phoneInput).toBeTruthy();
    expect(phoneInput?.getAttribute('id')).toBe('phone');

    const submitButton = doc.querySelector('button[type="submit"]');
    expect(submitButton).toBeTruthy();
    expect(submitButton?.textContent).toBe('Submit');
  });
});
|
|
259
|
+
|
|
260
|
+
describe('Error Handling and Edge Cases', () => {
  // Element lookups are asserted with toBeTruthy() rather than toBeDefined():
  // toBeDefined() only rejects `undefined`, so a lookup miss reported as
  // `null` (the standard DOM convention) would still pass the check.

  it('should handle malformed HTML gracefully', () => {
    // The <p> and the outer <div> are never closed.
    const malformedHTML = '<div><p>Unclosed paragraph<div>Another div</div>';

    const doc = parseHTML(malformedHTML);

    expect(doc).toBeDefined();
    expect(doc.nodeType).toBe(9); // DOCUMENT_NODE

    // Only requires that parsing recovered at least one div.
    const divs = doc.querySelectorAll('div');
    expect(divs.length).toBeGreaterThan(0);
  });

  it('should handle empty elements', () => {
    const doc = parseHTML('<div></div><p></p><span></span>');

    for (const tag of ['div', 'p', 'span']) {
      const element = doc.querySelector(tag);
      expect(element).toBeTruthy();
      expect(element?.textContent).toBe('');
    }
  });

  it('should handle special characters in text', () => {
    // The special characters are written as character references: a raw
    // apostrophe would end this single-quoted string literal early.
    const html = '<p>Special chars: &lt; &gt; &amp; &quot; &#39;</p>';

    const doc = parseHTML(html);

    const p = doc.querySelector('p');
    expect(p).toBeTruthy();
    // Only the literal prefix is asserted; how the references are decoded
    // is left to the implementation.
    expect(p?.textContent).toContain('Special chars:');
  });

  it('should handle multiple top-level elements', () => {
    const doc = parseHTML('<div>First</div><p>Second</p><span>Third</span>');

    expect(doc.querySelector('div')?.textContent).toBe('First');
    expect(doc.querySelector('p')?.textContent).toBe('Second');
    expect(doc.querySelector('span')?.textContent).toBe('Third');
  });
});
|
|
317
|
+
|
|
318
|
+
describe('DOM API Compliance', () => {
  // Element lookups are asserted with toBeTruthy() rather than toBeDefined():
  // toBeDefined() only rejects `undefined`, so a lookup miss reported as
  // `null` (the standard DOM convention) would still pass the check.

  it('should support basic DOM queries', () => {
    const doc = parseHTML('<div id="test" class="container"><p class="text">Hello</p></div>');

    // getElementById
    const byId = doc.getElementById('test');
    expect(byId).toBeTruthy();
    expect(byId?.tagName).toBe('DIV');

    // querySelector
    const bySelector = doc.querySelector('.container');
    expect(bySelector).toBeTruthy();
    expect(bySelector?.id).toBe('test');

    // querySelectorAll
    const byClass = doc.querySelectorAll('.text');
    expect(byClass.length).toBe(1);
    expect(byClass[0]?.textContent).toBe('Hello');
  });

  it('should support element traversal', () => {
    const doc = parseHTML('<div><p>First</p><p>Second</p><p>Third</p></div>');

    const div = doc.querySelector('div');
    expect(div).toBeTruthy();

    const children = div?.children;
    expect(children?.length).toBe(3);

    // Children must come back in document order.
    expect(children?.[0]?.textContent).toBe('First');
    expect(children?.[1]?.textContent).toBe('Second');
    expect(children?.[2]?.textContent).toBe('Third');
  });
});
|
|
361
|
+
});
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import { expect, describe, it } from 'bun:test';
|
|
2
|
+
import { tokenize, TokenType } from '../../../src/tokenizer';
|
|
3
|
+
import type { Token } from '../../../src/tokenizer';
|
|
4
|
+
|
|
5
|
+
/**
 * One test case in the HTML5lib tokenizer test format.
 */
export interface HTML5libTokenizerTest {
  /** Test name; the runner in this file uses it as the test title. */
  description: string;
  /** Markup passed to the tokenizer. */
  input: string;
  /** Token stream the tokenizer output is compared against. */
  output: Array<HTML5libTokenOutput>;
  /** States to run the case in; the runner defaults this to `['Data state']`. */
  initialStates?: Array<string>;
  /** Part of the test format; not read by the runner in this file. */
  lastStartTag?: string;
  /** Part of the test format; not read by the runner in this file. */
  errors?: Array<HTML5libError>;
  /** When true, `\uXXXX` escapes in the input and expected output are decoded before use. */
  doubleEscaped?: boolean;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Top-level shape of an HTML5lib tokenizer test file: a list of test cases.
 */
export interface HTML5libTokenizerTestSuite {
  /** Every test case in the suite. */
  tests: Array<HTML5libTokenizerTest>;
}
|
|
19
|
+
|
|
20
|
+
/**
 * An expected token in HTML5lib format: a tuple whose first element names
 * the token kind and whose remaining elements depend on that kind.
 */
export type HTML5libTokenOutput =
  // StartTag without the self-closing flag
  | [kind: 'StartTag', name: string, attributes: Record<string, string>]
  // StartTag with the self-closing flag
  | [kind: 'StartTag', name: string, attributes: Record<string, string>, selfClosing: boolean]
  // EndTag
  | [kind: 'EndTag', name: string]
  // Comment
  | [kind: 'Comment', data: string]
  // Character
  | [kind: 'Character', data: string]
  // DOCTYPE
  | [kind: 'DOCTYPE', name: string, second: string | null, third: string | null, flag: boolean];
|
|
27
|
+
|
|
28
|
+
/**
 * An entry of a test case's `errors` list: an error code with a line and
 * column number.
 */
export interface HTML5libError {
  /** Error code string. */
  code: string;
  /** Line number. */
  line: number;
  /** Column number. */
  col: number;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Converts an HTML5lib-format token tuple into the internal token shape.
 *
 * @param html5libToken - Tuple whose first element names the token kind.
 * @returns A partial internal token: `type`, `value` and `attributes` are
 *   always set; start tags also carry `isSelfClosing` and end tags
 *   `isClosing`.
 * @throws Error when the first element is not a known token kind.
 */
export function convertHTML5libToken(html5libToken: HTML5libTokenOutput): Partial<Token> {
  // Kept as a plain string so the error message below can still print it
  // after the switch has exhausted every declared kind.
  const type: string = html5libToken[0];

  // Switching on the tuple's own discriminant (not a copy of it) lets the
  // compiler narrow the union, so the kind-specific elements are only read
  // where they exist.
  switch (html5libToken[0]) {
    case 'DOCTYPE':
      return {
        type: TokenType.DOCTYPE,
        value: html5libToken[1] || '',
        attributes: {}
      };
    case 'StartTag': {
      const attributes = html5libToken[2];
      // The self-closing flag is only present on the 4-element form. It is
      // read through a widened view and checked at runtime, because fixtures
      // come from JSON and may not match the declared tuple shapes.
      const elements: readonly unknown[] = html5libToken;
      const selfClosing = elements[3];
      return {
        type: TokenType.TAG_OPEN,
        value: html5libToken[1] || '',
        attributes: (typeof attributes === 'object' && attributes !== null) ? attributes : {},
        isSelfClosing: typeof selfClosing === 'boolean' ? selfClosing : false
      };
    }
    case 'EndTag':
      return {
        type: TokenType.TAG_CLOSE,
        value: html5libToken[1] || '',
        attributes: {},
        isClosing: true
      };
    case 'Comment':
      return {
        type: TokenType.COMMENT,
        value: html5libToken[1] || '',
        attributes: {}
      };
    case 'Character':
      return {
        type: TokenType.TEXT,
        value: html5libToken[1] || '',
        attributes: {}
      };
    default:
      throw new Error(`Unknown HTML5lib token type: ${type}`);
  }
}
|
|
80
|
+
|
|
81
|
+
/**
 * Maps an internal token to the HTML5lib tuple form so it can be compared
 * with expected test output.
 *
 * @param token - Internal token to convert.
 * @returns The equivalent HTML5lib tuple. Start tags only carry the fourth
 *   (self-closing) element when the token is self-closing.
 * @throws Error for any token type other than DOCTYPE, TAG_OPEN, TAG_CLOSE,
 *   COMMENT or TEXT.
 */
export function convertToHTML5libToken(token: Token): HTML5libTokenOutput {
  const kind = token.type;

  if (kind === TokenType.DOCTYPE) {
    return ['DOCTYPE', token.value, null, null, true];
  }

  if (kind === TokenType.TAG_OPEN) {
    const attrs = token.attributes || {};
    return token.isSelfClosing
      ? ['StartTag', token.value, attrs, true]
      : ['StartTag', token.value, attrs];
  }

  if (kind === TokenType.TAG_CLOSE) {
    return ['EndTag', token.value];
  }

  if (kind === TokenType.COMMENT) {
    return ['Comment', token.value];
  }

  if (kind === TokenType.TEXT) {
    return ['Character', token.value];
  }

  throw new Error(`Unknown token type: ${kind}`);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Merges every run of adjacent TEXT tokens into one TEXT token and removes
 * EOF tokens; all other tokens are passed through unchanged and in order.
 *
 * A merged token takes the position of the first token that contributed
 * characters to the run. Runs whose combined text is empty produce no token.
 *
 * @param tokens - Token stream to normalize; it is not modified.
 * @returns A new array containing the normalized token stream.
 */
export function normalizeCharacterTokens(tokens: Token[]): Token[] {
  const normalized: Token[] = [];
  let pendingText = '';
  let pendingPosition: Token['position'] | undefined;

  // Emits the text collected so far (if any) as a single TEXT token.
  const flushPendingText = (): void => {
    if (pendingText) {
      normalized.push({
        type: TokenType.TEXT,
        value: pendingText,
        // Falls back to the start of input only when the contributing
        // token carried no position of its own.
        position: pendingPosition ?? { line: 1, column: 1, offset: 0 },
        attributes: {}
      });
    }
    pendingText = '';
    pendingPosition = undefined;
  };

  for (const token of tokens) {
    if (token.type === TokenType.TEXT) {
      // Remember where the run starts, not where it ends.
      if (pendingText === '') {
        pendingPosition = token.position;
      }
      pendingText += token.value;
      continue;
    }

    // Any non-text token ends the current text run.
    flushPendingText();
    if (token.type !== TokenType.EOF) {
      normalized.push(token);
    }
  }

  // Text at the very end of the stream has no following token to flush it.
  flushPendingText();

  return normalized;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Decodes literal `\uXXXX` sequences (a backslash, `u`, four hex digits)
 * into the UTF-16 code units they name.
 */
function decodeDoubleEscapedText(text: string): string {
  return text.replace(/\\u([0-9a-fA-F]{4})/g, (_match: string, hex: string) =>
    String.fromCharCode(parseInt(hex, 16))
  );
}

/**
 * Returns a decoded copy of a double-escaped value: strings are decoded,
 * arrays and plain objects (keys included) are decoded recursively, and
 * anything else is returned as is. The input is never modified.
 */
function decodeDoubleEscapedValue(value: unknown): unknown {
  if (typeof value === 'string') {
    return decodeDoubleEscapedText(value);
  }
  if (Array.isArray(value)) {
    return value.map(decodeDoubleEscapedValue);
  }
  if (typeof value === 'object' && value !== null) {
    return Object.fromEntries(
      Object.entries(value).map(([key, entry]) => [
        decodeDoubleEscapedText(key),
        decodeDoubleEscapedValue(entry)
      ])
    );
  }
  return value;
}

/**
 * Runs a single HTML5lib tokenizer test.
 *
 * Registers one `it` case per entry of `test.initialStates` (default
 * `['Data state']`). Each case tokenizes the input, merges adjacent
 * character tokens, converts the result to HTML5lib format and compares it
 * with the expected output.
 *
 * When `test.doubleEscaped` is set, `\uXXXX` escapes are decoded in the
 * input and in every string of the expected output. Decoding builds new
 * values, so the caller's `test` object is left untouched and the result is
 * the same no matter how many initial states are run.
 *
 * @param test - The test case to register.
 */
export function runHTML5libTokenizerTest(test: HTML5libTokenizerTest): void {
  const { description, input, output, initialStates = ['Data state'] } = test;

  const processedInput = test.doubleEscaped ? decodeDoubleEscapedText(input) : input;
  const processedExpectedOutput: unknown = test.doubleEscaped
    ? decodeDoubleEscapedValue(output)
    : output;

  for (const initialState of initialStates) {
    // NOTE(review): `tokenize` is called with the input only, so the initial
    // state affects the test title but not the tokenization itself.
    it(`${description} (${initialState})`, () => {
      // Tokenize the input
      const tokens = tokenize(processedInput);

      // Normalize character tokens
      const normalizedTokens = normalizeCharacterTokens(tokens);

      // Convert to HTML5lib format for comparison
      const actualOutput = normalizedTokens.map((token) => convertToHTML5libToken(token));

      // Compare outputs
      expect(actualOutput).toEqual(processedExpectedOutput);
    });
  }
}
|
|
186
|
+
|
|
187
|
+
/**
 * Registers every test of an HTML5lib tokenizer suite inside one `describe`
 * group titled `HTML5lib Tokenizer Tests: <suiteName>`.
 *
 * @param testSuite - Suite whose `tests` are registered in order.
 * @param suiteName - Label appended to the group title.
 */
export function runHTML5libTokenizerTestSuite(testSuite: HTML5libTokenizerTestSuite, suiteName: string): void {
  const groupTitle = `HTML5lib Tokenizer Tests: ${suiteName}`;

  describe(groupTitle, () => {
    for (const testCase of testSuite.tests) {
      runHTML5libTokenizerTest(testCase);
    }
  });
}
|
|
197
|
+
|
|
198
|
+
/**
 * Parses an HTML5lib tokenizer test suite from JSON text and registers its
 * tests.
 *
 * @param testData - JSON text of a suite: an object with a `tests` array.
 * @param suiteName - Label used in the registered group title.
 * @returns A promise that resolves once the suite has been registered.
 * @throws SyntaxError (as a rejection) when `testData` is not valid JSON.
 * @throws Error (as a rejection) when the parsed value has no `tests` array.
 */
export async function loadHTML5libTokenizerTests(testData: string, suiteName: string): Promise<void> {
  const parsed: unknown = JSON.parse(testData);

  // Check the one property the runner relies on, so a wrong file fails here
  // with a clear message instead of later with an opaque TypeError.
  const tests =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { tests?: unknown }).tests
      : undefined;
  if (!Array.isArray(tests)) {
    throw new Error(`HTML5lib tokenizer suite "${suiteName}" does not contain a "tests" array`);
  }

  // Only the presence of the array is verified; individual entries are
  // trusted to follow the test format.
  runHTML5libTokenizerTestSuite(parsed as HTML5libTokenizerTestSuite, suiteName);
}
|