@tkeron/html-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,361 @@
1
+ import { describe, it, expect } from 'bun:test';
2
+ import { parseHTML } from '../../../index';
3
+
4
/**
 * Normalizes text for comparison: every run of whitespace becomes a single
 * space, and leading/trailing whitespace is removed.
 */
function normalizeText(text: string): string {
  const collapsed = text.replace(/\s+/g, ' ');
  return collapsed.trim();
}
10
+
11
/**
 * End-to-end checks of the tree returned by `parseHTML`: structure,
 * attributes, text, comments, HTML5 semantic elements, malformed input and
 * the basic query/traversal API.
 *
 * Lookups that must find a node are asserted with `toBeTruthy()`.
 * `toBeDefined()` only rejects `undefined`, so a lookup miss reported as
 * `null` would slip through; `toBeTruthy()` rejects both.
 */
describe('Final HTML Output Validation', () => {
  describe('Complete HTML Structure', () => {
    it('should parse and create DOM structure correctly', () => {
      const html = '<div class="container"><p>Hello World</p></div>';

      const document = parseHTML(html);

      expect(document).toBeDefined();
      expect(document.nodeType).toBe(9); // DOCUMENT_NODE

      const div = document.querySelector('div');
      expect(div).toBeTruthy();
      expect(div?.className).toBe('container');

      const p = document.querySelector('p');
      expect(p).toBeTruthy();
      expect(p?.textContent).toBe('Hello World');
    });

    it('should handle nested elements correctly', () => {
      const html = '<div><span><strong>Bold text</strong></span></div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();

      const span = div?.querySelector('span');
      expect(span).toBeTruthy();

      const strong = span?.querySelector('strong');
      expect(strong).toBeTruthy();
      expect(strong?.textContent).toBe('Bold text');
    });

    it('should handle self-closing tags correctly', () => {
      const html = '<div><img src="test.jpg" alt="Test"><br><hr></div>';

      const document = parseHTML(html);

      const img = document.querySelector('img');
      expect(img).toBeTruthy();
      expect(img?.getAttribute('src')).toBe('test.jpg');
      expect(img?.getAttribute('alt')).toBe('Test');

      const br = document.querySelector('br');
      expect(br).toBeTruthy();

      const hr = document.querySelector('hr');
      expect(hr).toBeTruthy();
    });

    it('should handle attributes correctly', () => {
      const html = '<div id="main" class="container" data-value="test">Content</div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();
      expect(div?.getAttribute('id')).toBe('main');
      expect(div?.getAttribute('class')).toBe('container');
      expect(div?.getAttribute('data-value')).toBe('test');
      expect(div?.textContent).toBe('Content');
    });

    it('should handle text content correctly', () => {
      const html = '<p>This is a <strong>bold</strong> text with <em>emphasis</em>.</p>';

      const document = parseHTML(html);

      const p = document.querySelector('p');
      expect(p).toBeTruthy();
      expect(normalizeText(p?.textContent || '')).toBe('This is a bold text with emphasis.');

      const strong = document.querySelector('strong');
      expect(strong?.textContent).toBe('bold');

      const em = document.querySelector('em');
      expect(em?.textContent).toBe('emphasis');
    });

    it('should handle comments correctly', () => {
      const html = '<div><!-- This is a comment --><p>Content</p></div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();

      const p = document.querySelector('p');
      expect(p).toBeTruthy();
      expect(p?.textContent).toBe('Content');

      // The comment precedes <p>, so it must be the div's first child node.
      const commentNode = div?.childNodes[0];
      expect(commentNode?.nodeType).toBe(8); // COMMENT_NODE
    });
  });

  describe('DOM Structure Validation', () => {
    it('should maintain correct parent-child relationships', () => {
      const html = '<div><p><span>Nested</span></p></div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();
      expect(div?.children.length).toBe(1);

      const p = div?.children[0];
      expect(p?.tagName).toBe('P');
      expect(p?.children.length).toBe(1);

      const span = p?.children[0];
      expect(span?.tagName).toBe('SPAN');
      expect(span?.textContent).toBe('Nested');
    });

    it('should handle mixed content correctly', () => {
      const html = '<div>Text before <span>span content</span> text after</div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();
      expect(div?.childNodes.length).toBe(3);

      // First text node
      expect(div?.childNodes[0]?.nodeType).toBe(3); // TEXT_NODE
      expect(div?.childNodes[0]?.textContent).toBe('Text before ');

      // Span element
      expect(div?.childNodes[1]?.nodeType).toBe(1); // ELEMENT_NODE
      expect((div?.childNodes[1] as Element)?.tagName).toBe('SPAN');

      // Last text node
      expect(div?.childNodes[2]?.nodeType).toBe(3); // TEXT_NODE
      expect(div?.childNodes[2]?.textContent).toBe(' text after');
    });

    it('should handle void elements correctly', () => {
      const html = '<div><img src="test.jpg"><br><input type="text"></div>';

      const document = parseHTML(html);

      const img = document.querySelector('img');
      expect(img).toBeTruthy();
      expect(img?.getAttribute('src')).toBe('test.jpg');

      const br = document.querySelector('br');
      expect(br).toBeTruthy();

      const input = document.querySelector('input');
      expect(input).toBeTruthy();
      expect(input?.getAttribute('type')).toBe('text');
    });
  });

  describe('HTML5 Semantic Structure', () => {
    it('should handle complete HTML5 document structure', () => {
      const html = [
        '<!DOCTYPE html>',
        '<html>',
        '<head>',
        '<title>Test Document</title>',
        '</head>',
        '<body>',
        '<header>',
        '<nav>Navigation</nav>',
        '</header>',
        '<main>',
        '<article>',
        '<section>Content</section>',
        '</article>',
        '</main>',
        '<footer>Footer</footer>',
        '</body>',
        '</html>'
      ].join('\n');

      const document = parseHTML(html);

      expect(document).toBeDefined();
      expect(document.documentElement?.tagName).toBe('HTML');

      const head = document.querySelector('head');
      expect(head).toBeTruthy();

      const title = document.querySelector('title');
      expect(title?.textContent).toBe('Test Document');

      const body = document.querySelector('body');
      expect(body).toBeTruthy();

      const header = document.querySelector('header');
      expect(header).toBeTruthy();

      const nav = document.querySelector('nav');
      expect(nav?.textContent).toBe('Navigation');

      const main = document.querySelector('main');
      expect(main).toBeTruthy();

      const article = document.querySelector('article');
      expect(article).toBeTruthy();

      const section = document.querySelector('section');
      expect(section?.textContent).toBe('Content');

      const footer = document.querySelector('footer');
      expect(footer?.textContent).toBe('Footer');
    });

    it('should handle HTML5 form elements', () => {
      const html = [
        '<form>',
        '<fieldset>',
        '<legend>Contact Information</legend>',
        '<label for="email">Email:</label>',
        '<input type="email" id="email" required>',
        '<label for="phone">Phone:</label>',
        '<input type="tel" id="phone">',
        '<button type="submit">Submit</button>',
        '</fieldset>',
        '</form>'
      ].join('\n');

      const document = parseHTML(html);

      const form = document.querySelector('form');
      expect(form).toBeTruthy();

      const fieldset = document.querySelector('fieldset');
      expect(fieldset).toBeTruthy();

      const legend = document.querySelector('legend');
      expect(legend?.textContent).toBe('Contact Information');

      const emailInput = document.querySelector('input[type="email"]');
      expect(emailInput).toBeTruthy();
      expect(emailInput?.getAttribute('id')).toBe('email');
      expect(emailInput?.hasAttribute('required')).toBe(true);

      const phoneInput = document.querySelector('input[type="tel"]');
      expect(phoneInput).toBeTruthy();
      expect(phoneInput?.getAttribute('id')).toBe('phone');

      const submitButton = document.querySelector('button[type="submit"]');
      expect(submitButton).toBeTruthy();
      expect(submitButton?.textContent).toBe('Submit');
    });
  });

  describe('Error Handling and Edge Cases', () => {
    it('should handle malformed HTML gracefully', () => {
      const malformedHTML = '<div><p>Unclosed paragraph<div>Another div</div>';

      const document = parseHTML(malformedHTML);

      expect(document).toBeDefined();
      expect(document.nodeType).toBe(9); // DOCUMENT_NODE

      const divs = document.querySelectorAll('div');
      expect(divs.length).toBeGreaterThan(0);
    });

    it('should handle empty elements', () => {
      const html = '<div></div><p></p><span></span>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();
      expect(div?.textContent).toBe('');

      const p = document.querySelector('p');
      expect(p).toBeTruthy();
      expect(p?.textContent).toBe('');

      const span = document.querySelector('span');
      expect(span).toBeTruthy();
      expect(span?.textContent).toBe('');
    });

    it('should handle special characters in text', () => {
      const html = '<p>Special chars: &lt; &gt; &amp; &quot; &#39;</p>';

      const document = parseHTML(html);

      const p = document.querySelector('p');
      expect(p).toBeTruthy();
      // Only the literal prefix is pinned; the exact entity handling depends
      // on the parser implementation.
      expect(p?.textContent).toContain('Special chars:');
    });

    it('should handle multiple top-level elements', () => {
      const html = '<div>First</div><p>Second</p><span>Third</span>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div?.textContent).toBe('First');

      const p = document.querySelector('p');
      expect(p?.textContent).toBe('Second');

      const span = document.querySelector('span');
      expect(span?.textContent).toBe('Third');
    });
  });

  describe('DOM API Compliance', () => {
    it('should support basic DOM queries', () => {
      const html = '<div id="test" class="container"><p class="text">Hello</p></div>';

      const document = parseHTML(html);

      // Test getElementById
      const byId = document.getElementById('test');
      expect(byId).toBeTruthy();
      expect(byId?.tagName).toBe('DIV');

      // Test querySelector
      const bySelector = document.querySelector('.container');
      expect(bySelector).toBeTruthy();
      expect(bySelector?.id).toBe('test');

      // Test querySelectorAll
      const byClass = document.querySelectorAll('.text');
      expect(byClass.length).toBe(1);
      expect(byClass[0]?.textContent).toBe('Hello');
    });

    it('should support element traversal', () => {
      const html = '<div><p>First</p><p>Second</p><p>Third</p></div>';

      const document = parseHTML(html);

      const div = document.querySelector('div');
      expect(div).toBeTruthy();

      const children = div?.children;
      expect(children?.length).toBe(3);

      const firstP = children?.[0];
      expect(firstP?.textContent).toBe('First');

      const secondP = children?.[1];
      expect(secondP?.textContent).toBe('Second');

      const thirdP = children?.[2];
      expect(thirdP?.textContent).toBe('Third');
    });
  });
});
@@ -0,0 +1,204 @@
1
+ import { expect, describe, it } from 'bun:test';
2
+ import { tokenize, TokenType } from '../../../src/tokenizer';
3
+ import type { Token } from '../../../src/tokenizer';
4
+
5
// HTML5lib tokenizer test format

/**
 * One tokenizer test case as stored in an HTML5lib `.test` JSON file.
 */
export interface HTML5libTokenizerTest {
  /** Human-readable label; used as the test name. */
  description: string;
  /** Markup handed to the tokenizer. */
  input: string;
  /** Tokens the tokenizer is expected to emit, in order. */
  output: HTML5libTokenOutput[];
  /** Tokenizer states to run the case under; one test is registered per entry. */
  initialStates?: string[];
  /** Part of the HTML5lib format; not read by the runner in this file. */
  lastStartTag?: string;
  /** Part of the HTML5lib format; not read by the runner in this file. */
  errors?: HTML5libError[];
  /** When true, `input` and `output` carry literal `\uXXXX` sequences that must be unescaped. */
  doubleEscaped?: boolean;
}

/** Top-level shape of an HTML5lib tokenizer test file. */
export interface HTML5libTokenizerTestSuite {
  tests: HTML5libTokenizerTest[];
}

/**
 * Expected-output tuple, discriminated by its first element.
 *
 * The DOCTYPE name is `string | null`: the HTML5lib format uses `null` when
 * the doctype has no name (e.g. `<!DOCTYPE>`).
 */
export type HTML5libTokenOutput =
  | ['StartTag', string, Record<string, string>] // StartTag without self-closing
  | ['StartTag', string, Record<string, string>, boolean] // StartTag with self-closing
  | ['EndTag', string] // EndTag
  | ['Comment', string] // Comment
  | ['Character', string] // Character
  | ['DOCTYPE', string | null, string | null, string | null, boolean]; // DOCTYPE

/** A parse error entry of a test case: error code plus its line/column. */
export interface HTML5libError {
  code: string;
  line: number;
  col: number;
}
33
+
34
/**
 * Converts an HTML5lib expected-output tuple to our internal token format.
 *
 * Only the fields the tuple can supply are filled in, hence `Partial<Token>`.
 * The switch is on `html5libToken[0]` itself (not a copy) so the compiler
 * narrows the tuple union and the variant-specific slots are read safely.
 *
 * @param html5libToken - Tuple from a test case's `output` array.
 * @returns The token fields implied by the tuple.
 * @throws Error if the first element is not a known HTML5lib token kind.
 */
export function convertHTML5libToken(html5libToken: HTML5libTokenOutput): Partial<Token> {
  // Captured before the switch: in the default branch the tuple is narrowed
  // to `never` and can no longer be indexed for the error message.
  const kind: string = html5libToken[0];
  // Slot 1 is the tag/DOCTYPE name or the comment/character data; a missing
  // or null value (possible in JSON fixtures) falls back to ''.
  const value = html5libToken[1] || '';

  switch (html5libToken[0]) {
    case 'DOCTYPE':
      return {
        type: TokenType.DOCTYPE,
        value,
        attributes: {}
      };
    case 'StartTag': {
      const attributes = html5libToken[2];
      // The self-closing flag exists only on the four-element variant.
      const selfClosing = html5libToken.length === 4 && html5libToken[3] === true;
      return {
        type: TokenType.TAG_OPEN,
        value,
        // Runtime guard kept because fixtures come from untyped JSON.
        attributes: (typeof attributes === 'object' && attributes !== null) ? attributes : {},
        isSelfClosing: selfClosing
      };
    }
    case 'EndTag':
      return {
        type: TokenType.TAG_CLOSE,
        value,
        attributes: {},
        isClosing: true
      };
    case 'Comment':
      return {
        type: TokenType.COMMENT,
        value,
        attributes: {}
      };
    case 'Character':
      return {
        type: TokenType.TEXT,
        value,
        attributes: {}
      };
    default:
      throw new Error(`Unknown HTML5lib token type: ${kind}`);
  }
}
80
+
81
/**
 * Maps one of our tokens onto the HTML5lib tuple layout so it can be compared
 * with a test case's expected output.
 *
 * DOCTYPE tokens always report `null` public/system ids and `true` for the
 * correctness flag. A start tag gets the trailing boolean only when it is
 * self-closing.
 *
 * @throws Error for token types that have no HTML5lib counterpart.
 */
export function convertToHTML5libToken(token: Token): HTML5libTokenOutput {
  const { type, value } = token;

  if (type === TokenType.DOCTYPE) {
    return ['DOCTYPE', value, null, null, true];
  }

  if (type === TokenType.TAG_OPEN) {
    const attrs = token.attributes || {};
    if (token.isSelfClosing) {
      return ['StartTag', value, attrs, true];
    }
    return ['StartTag', value, attrs];
  }

  if (type === TokenType.TAG_CLOSE) {
    return ['EndTag', value];
  }

  if (type === TokenType.COMMENT) {
    return ['Comment', value];
  }

  if (type === TokenType.TEXT) {
    return ['Character', value];
  }

  throw new Error(`Unknown token type: ${type}`);
}
104
+
105
/**
 * Normalizes adjacent character tokens as per HTML5lib spec: each run of
 * consecutive TEXT tokens is merged into a single TEXT token, and EOF tokens
 * are dropped. All other tokens pass through unchanged, in order.
 *
 * A merged token carries the position of the FIRST text token of its run.
 * Runs whose combined value is empty produce no token.
 *
 * @param tokens - Tokens as produced by the tokenizer.
 * @returns A new array; the input array is not modified.
 */
export function normalizeCharacterTokens(tokens: Token[]): Token[] {
  const normalized: Token[] = [];
  let pendingText = '';
  let pendingPosition: Token['position'] | undefined;

  // Emits the accumulated text run (if any) and resets the accumulator.
  const flushPendingText = (): void => {
    if (pendingText) {
      normalized.push({
        type: TokenType.TEXT,
        value: pendingText,
        // Fallback only applies if the run's first token had no position.
        position: pendingPosition ?? { line: 1, column: 1, offset: 0 },
        attributes: {}
      });
    }
    pendingText = '';
    pendingPosition = undefined;
  };

  for (const token of tokens) {
    if (token.type === TokenType.TEXT) {
      // Remember where the run starts; later tokens only extend the value.
      if (pendingPosition === undefined) {
        pendingPosition = token.position;
      }
      pendingText += token.value;
      continue;
    }

    flushPendingText();
    if (token.type !== TokenType.EOF) {
      normalized.push(token);
    }
  }

  // Input may end on a text run with no terminating non-text token.
  flushPendingText();

  return normalized;
}
142
+
143
/**
 * Runs a single HTML5lib tokenizer test.
 *
 * Registers one `it` per entry of `test.initialStates` (default
 * `['Data state']`). The state only labels the test: `tokenize` is called
 * with the input alone in every case.
 *
 * For `doubleEscaped` tests, literal `\uXXXX` sequences are decoded in the
 * input and in every string of the expected output (names, data, attribute
 * names and values, DOCTYPE ids). The expected output is decoded into a copy,
 * so the fixture is never mutated and repeated runs see the same data.
 *
 * @param test - The test case to register.
 */
export function runHTML5libTokenizerTest(test: HTML5libTokenizerTest): void {
  const { description, input, output, initialStates = ['Data state'] } = test;

  // Decodes literal `\uXXXX` sequences into the UTF-16 code unit they name.
  const unescape = (text: string): string =>
    text.replace(/\\u([0-9a-fA-F]{4})/g, (_match, hex: string) =>
      String.fromCharCode(parseInt(hex, 16))
    );

  // Returns a decoded copy of one expected token, leaving the original intact.
  const unescapeToken = (token: HTML5libTokenOutput): HTML5libTokenOutput => {
    const parts: readonly unknown[] = token;
    const decoded = parts.map((part): unknown => {
      if (typeof part === 'string') {
        return unescape(part);
      }
      if (typeof part === 'object' && part !== null) {
        // Attribute record: both names and values may be escaped.
        const attributes: Record<string, unknown> = {};
        for (const [name, value] of Object.entries(part)) {
          attributes[unescape(name)] = typeof value === 'string' ? unescape(value) : value;
        }
        return attributes;
      }
      return part; // null ids and boolean flags are copied as-is
    });
    // map() loses the tuple type; each slot keeps its original kind.
    return decoded as HTML5libTokenOutput;
  };

  const processedInput = test.doubleEscaped ? unescape(input) : input;

  for (const initialState of initialStates) {
    it(`${description} (${initialState})`, () => {
      // Tokenize, merge adjacent text tokens, then convert for comparison.
      const tokens = tokenize(processedInput);
      const normalizedTokens = normalizeCharacterTokens(tokens);
      const actualOutput = normalizedTokens.map(convertToHTML5libToken);

      const processedExpectedOutput = test.doubleEscaped
        ? output.map(unescapeToken)
        : output;

      expect(actualOutput).toEqual(processedExpectedOutput);
    });
  }
}
186
+
187
/**
 * Registers every test of an HTML5lib tokenizer suite inside one `describe`
 * group titled `HTML5lib Tokenizer Tests: <suiteName>`.
 *
 * @param testSuite - Parsed suite whose `tests` are registered in order.
 * @param suiteName - Label appended to the group title.
 */
export function runHTML5libTokenizerTestSuite(testSuite: HTML5libTokenizerTestSuite, suiteName: string): void {
  const groupTitle = `HTML5lib Tokenizer Tests: ${suiteName}`;

  describe(groupTitle, () => {
    for (const testCase of testSuite.tests) {
      runHTML5libTokenizerTest(testCase);
    }
  });
}
197
+
198
/**
 * Loads and runs HTML5lib tokenizer tests from JSON.
 *
 * The parsed value is checked to be an object with a `tests` array before it
 * is treated as a suite, so a wrong or truncated file fails here with a clear
 * message instead of deep inside test registration.
 *
 * @param testData - JSON text of an HTML5lib tokenizer test file.
 * @param suiteName - Label used for the registered test group.
 * @returns A promise that resolves once the tests have been registered.
 * @throws SyntaxError if `testData` is not valid JSON (from `JSON.parse`).
 * @throws TypeError if the JSON is not an object with a `tests` array.
 */
export async function loadHTML5libTokenizerTests(testData: string, suiteName: string): Promise<void> {
  const parsed: unknown = JSON.parse(testData);

  const hasTestsArray =
    typeof parsed === 'object' &&
    parsed !== null &&
    Array.isArray((parsed as { tests?: unknown }).tests);

  if (!hasTestsArray) {
    throw new TypeError(
      `HTML5lib tokenizer suite "${suiteName}" must be a JSON object with a "tests" array`
    );
  }

  // Shape of the individual tests is not validated here.
  runHTML5libTokenizerTestSuite(parsed as HTML5libTokenizerTestSuite, suiteName);
}