@tkeron/html-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ import { describe, it } from 'bun:test';
2
+ import {
3
+ loadHTML5libTokenizerTests,
4
+ runHTML5libTokenizerTestSuite,
5
+ type HTML5libTokenizerTestSuite
6
+ } from './tokenizer-utils';
7
+
/**
 * Embedded sample of html5lib-style tokenizer cases.
 *
 * Every case pairs an `input` string with the token list expected in
 * `output`; each token is an array whose first element names the token kind
 * ("DOCTYPE", "StartTag", "EndTag", "Comment" or "Character").
 */
const basicTokenizerTests: HTML5libTokenizerTestSuite = {
  tests: [
    // Doctypes: the expected name is lower-case for both spellings.
    { description: 'Correct Doctype lowercase', input: '<!DOCTYPE html>', output: [['DOCTYPE', 'html', null, null, true]] },
    { description: 'Correct Doctype uppercase', input: '<!DOCTYPE HTML>', output: [['DOCTYPE', 'html', null, null, true]] },

    // Start and end tags.
    { description: 'Single Start Tag', input: '<h>', output: [['StartTag', 'h', {}]] },
    { description: 'Start Tag w/attribute', input: "<h a='b'>", output: [['StartTag', 'h', { a: 'b' }]] },
    { description: 'Start/End Tag', input: '<h></h>', output: [['StartTag', 'h', {}], ['EndTag', 'h']] },

    // Comments and plain text.
    { description: 'Simple comment', input: '<!--comment-->', output: [['Comment', 'comment']] },
    { description: 'Character data', input: 'Hello World', output: [['Character', 'Hello World']] },

    // Attribute and self-closing variations.
    { description: 'Multiple attributes', input: "<h a='b' c='d'>", output: [['StartTag', 'h', { a: 'b', c: 'd' }]] },
    { description: 'Self-closing tag', input: '<br/>', output: [['StartTag', 'br', {}, true]] },
    { description: 'Empty comment', input: '<!---->', output: [['Comment', '']] },

    // Character references.
    { description: 'Text with entities', input: '&amp;&lt;&gt;', output: [['Character', '&<>']] },
    { description: 'Numeric entity', input: '&#65;', output: [['Character', 'A']] },
    { description: 'Hex entity', input: '&#x41;', output: [['Character', 'A']] },

    // Unquoted values and tag-name case folding.
    { description: 'Unquoted attribute', input: '<h a=b>', output: [['StartTag', 'h', { a: 'b' }]] },
    { description: 'Tag with mixed case', input: '<DiV>', output: [['StartTag', 'div', {}]] },
  ],
};
88
+
/**
 * Character-reference cases: named references with and without a trailing
 * semicolon, and ampersands that never become a complete reference.
 * Cases that list `errors` give the expected error code and its line/column.
 */
const entityTests: HTML5libTokenizerTestSuite = {
  tests: [
    {
      description: 'Entity with trailing semicolon',
      input: "I'm &not;it",
      output: [['Character', "I'm ¬it"]],
    },
    {
      description: 'Entity without trailing semicolon',
      input: "I'm &notit",
      output: [['Character', "I'm ¬it"]],
      errors: [{ code: 'missing-semicolon-after-character-reference', line: 1, col: 9 }],
    },
    {
      description: 'Ampersand EOF',
      input: '&',
      output: [['Character', '&']],
    },
    {
      description: 'Unfinished entity',
      input: '&f',
      output: [['Character', '&f']],
    },
    {
      description: 'Ampersand, number sign',
      input: '&#',
      output: [['Character', '&#']],
      errors: [{ code: 'absence-of-digits-in-numeric-character-reference', line: 1, col: 3 }],
    },
  ],
};
125
+
/**
 * Comment cases: dashes inside the comment body, a comment cut off by the end
 * of input, the abrupt `<!-->` form and a comment opener nested in a comment.
 * Cases that list `errors` give the expected error code and its line/column.
 */
const commentTests: HTML5libTokenizerTestSuite = {
  tests: [
    {
      description: 'Comment, Central dash no space',
      input: '<!----->',
      output: [['Comment', '-']],
    },
    {
      description: 'Comment, two central dashes',
      input: '<!-- --comment -->',
      output: [['Comment', ' --comment ']],
    },
    {
      description: 'Unfinished comment',
      input: '<!--comment',
      output: [['Comment', 'comment']],
      errors: [{ code: 'eof-in-comment', line: 1, col: 12 }],
    },
    {
      description: 'Short comment',
      input: '<!-->',
      output: [['Comment', '']],
      errors: [{ code: 'abrupt-closing-of-empty-comment', line: 1, col: 5 }],
    },
    {
      description: 'Nested comment',
      input: '<!-- <!--test-->',
      output: [['Comment', ' <!--test']],
      errors: [{ code: 'nested-comment', line: 1, col: 10 }],
    },
  ],
};
165
+
// Register each embedded suite, in declaration order, under one top-level group.
describe('HTML5lib Tokenizer Tests', () => {
  const embeddedSuites: Array<[HTML5libTokenizerTestSuite, string]> = [
    [basicTokenizerTests, 'Basic Tokenizer'],
    [entityTests, 'Entity Handling'],
    [commentTests, 'Comment Handling'],
  ];

  for (const [suite, label] of embeddedSuites) {
    runHTML5libTokenizerTestSuite(suite, label);
  }
});
172
+
// Placeholder for running html5lib test files loaded from disk (when available).
describe('HTML5lib External Tests', () => {
  it('should be able to load external test files', async () => {
    // Real html5lib files would be read and handed to the loader like this:
    // const testData = await Bun.file('/path/to/test1.test').text();
    // await loadHTML5libTokenizerTests(testData, 'External Test');

    // Until such files exist, serialise the embedded basic suite to JSON and
    // pass that text through the same loader call.
    const serializedSuite = JSON.stringify(basicTokenizerTests);
    await loadHTML5libTokenizerTests(serializedSuite, 'Loaded Basic Tests');
  });
});
@@ -0,0 +1,208 @@
1
+ import { expect, describe, it } from 'bun:test';
2
+ import { parse } from '../../../src/parser';
3
+ import { tokenize } from '../../../src/tokenizer';
4
+ import type { ASTNode } from '../../../src/parser';
5
+
/**
 * One case from an html5lib tree-construction `.dat` file.
 *
 * Each field corresponds to a `#section` of the case as read by
 * `parseHTML5libDATFile`.
 */
export interface HTML5libTreeTest {
  /** Markup to parse (content of `#data`). */
  data: string;
  /** Expected tree dump (content of `#document`). */
  document: string;
  /** Non-blank lines of the `#errors` section. */
  errors: string[];
  /** Non-blank lines of the `#new-errors` section, when present. */
  newErrors?: string[];
  /** Content of the `#document-fragment` section, when present. */
  documentFragment?: string;
  /** Set to `true` when the case carries a `#script-on` marker. */
  scriptOn?: boolean;
  /** Set to `true` when the case carries a `#script-off` marker. */
  scriptOff?: boolean;
}
16
+
/** Section headers (without the leading `#`) recognised in tree-construction `.dat` files. */
const DAT_SECTION_HEADERS: ReadonlySet<string> = new Set([
  'data',
  'errors',
  'new-errors',
  'document-fragment',
  'script-off',
  'script-on',
  'document',
]);

/** Groups the file's lines into one array per test; a test starts at each `#data` line. */
function splitDATTests(lines: readonly string[]): string[][] {
  const groups: string[][] = [];
  let current: string[] | undefined;

  for (const line of lines) {
    if (line === '#data') {
      current = [line];
      groups.push(current);
    } else if (current) {
      // Lines before the first `#data` belong to no test and are dropped.
      current.push(line);
    }
  }

  return groups;
}

/** Stores the collected lines of one finished section on the test being built. */
function storeDATSection(
  test: Partial<HTML5libTreeTest>,
  section: string,
  lines: readonly string[],
): void {
  switch (section) {
    case 'data':
      test.data = lines.join('\n');
      break;
    case 'errors':
      test.errors = lines.filter((line) => line.trim() !== '');
      break;
    case 'new-errors':
      test.newErrors = lines.filter((line) => line.trim() !== '');
      break;
    case 'document-fragment':
      test.documentFragment = lines.join('\n');
      break;
    case 'script-off':
      test.scriptOff = true;
      break;
    case 'script-on':
      test.scriptOn = true;
      break;
    case 'document': {
      // The blank separator before the next test (or the file's final
      // newline) lands at the end of this section; it is not part of the tree.
      let end = lines.length;
      while (end > 0 && (lines[end - 1] ?? '').trim() === '') {
        end -= 1;
      }
      test.document = lines.slice(0, end).join('\n');
      break;
    }
  }
}

/** Parses the lines of a single test; returns `undefined` when `#data` or `#document` is missing. */
function parseDATTest(lines: readonly string[]): HTML5libTreeTest | undefined {
  const test: Partial<HTML5libTreeTest> = { errors: [] };
  let section = '';
  let sectionLines: string[] = [];

  for (const line of lines) {
    // Only known headers switch sections, so a data line such as "#foo"
    // stays part of the section it appears in.
    if (line.startsWith('#') && DAT_SECTION_HEADERS.has(line.slice(1))) {
      if (section) {
        storeDATSection(test, section, sectionLines);
      }
      section = line.slice(1);
      sectionLines = [];
    } else {
      sectionLines.push(line);
    }
  }
  if (section) {
    storeDATSection(test, section, sectionLines);
  }

  const { data, document } = test;
  // An empty `#data` is a legitimate input; an absent or empty `#document` is not.
  if (data === undefined || !document) {
    return undefined;
  }
  return { ...test, data, document, errors: test.errors ?? [] };
}

/**
 * Parses HTML5lib DAT format test files
 *
 * The text is read line by line (LF or CRLF). A new test begins at every
 * `#data` line rather than at blank lines, so blank lines inside `#data` or
 * `#document` no longer cut a test in two.
 *
 * @param content - Full text of a tree-construction `.dat` file.
 * @returns The tests that have both a `#data` section and a non-empty
 *   `#document` section, in file order. `errors` is always an array.
 */
export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
  const tests: HTML5libTreeTest[] = [];

  for (const group of splitDATTests(content.split(/\r?\n/))) {
    const test = parseDATTest(group);
    if (test) {
      tests.push(test);
    }
  }

  return tests;
}
99
+
/**
 * Converts AST to HTML5lib tree format
 *
 * Every emitted line starts with "| " followed by two spaces per nesting
 * level. A DOCUMENT node emits no line of its own and does not add a level,
 * so its children appear at the depth passed in (e.g. `| <html>`).
 *
 * - ELEMENT: `<tag>`, followed by its attributes sorted by name, one per
 *   line at the children's indentation, as `name="value"`.
 * - TEXT: the content in double quotes; whitespace-only text is skipped.
 * - COMMENT: `<!-- content -->`; CDATA: `<![CDATA[content]]>`.
 * - DOCTYPE: `<!DOCTYPE name>`, falling back to `html` when no name is set.
 *
 * @param node - Node to serialise, together with its descendants.
 * @param depth - Nesting level of `node`; 0 for the root.
 * @returns The tree lines in document order.
 */
export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): string[] {
  const lines: string[] = [];
  const indent = '| ' + '  '.repeat(depth);
  let childDepth = depth + 1;

  switch (node.type) {
    case 'DOCUMENT':
      // The document itself is not printed, so it must not indent its children.
      childDepth = depth;
      break;
    case 'DOCTYPE':
      lines.push(`${indent}<!DOCTYPE ${node.tagName || 'html'}>`);
      break;
    case 'ELEMENT': {
      lines.push(`${indent}<${node.tagName || 'unknown'}>`);

      if (node.attributes) {
        // Compare names only; the default sort would compare "name,value" strings.
        const attributes = Object.entries(node.attributes).sort(([left], [right]) =>
          left < right ? -1 : left > right ? 1 : 0,
        );
        for (const [name, value] of attributes) {
          lines.push(`${indent}  ${name}="${value}"`);
        }
      }
      break;
    }
    case 'TEXT':
      if (node.content && node.content.trim()) {
        lines.push(`${indent}"${node.content}"`);
      }
      break;
    case 'COMMENT':
      lines.push(`${indent}<!-- ${node.content || ''} -->`);
      break;
    case 'CDATA':
      lines.push(`${indent}<![CDATA[${node.content || ''}]]>`);
      break;
  }

  if (node.children) {
    for (const child of node.children) {
      lines.push(...convertASTToHTML5libTree(child, childDepth));
    }
  }

  return lines;
}
147
+
/**
 * Normalizes HTML5lib tree format for comparison
 *
 * Strips leading and trailing whitespace from every line and drops lines
 * that are empty afterwards. Whitespace inside a line is left untouched.
 *
 * @param tree - Tree dump with lines separated by "\n".
 * @returns The remaining lines joined with "\n".
 */
export function normalizeHTML5libTree(tree: string): string {
  const kept: string[] = [];

  for (const rawLine of tree.split('\n')) {
    const line = rawLine.trim();
    if (line.length > 0) {
      kept.push(line);
    }
  }

  return kept.join('\n');
}
158
+
/**
 * Runs a single HTML5lib tree construction test
 *
 * Registers one `it` case that tokenizes and parses `test.data`, serialises
 * the resulting AST with `convertASTToHTML5libTree` and compares it with
 * `test.document` after both sides went through `normalizeHTML5libTree`.
 *
 * Only `data` and `document` are read: `documentFragment`, `scriptOn` and
 * `scriptOff` are not consulted, so every case is parsed the same way.
 *
 * @param test - Case to run.
 * @param testName - Name given to the registered `it` case.
 */
export function runHTML5libTreeTest(test: HTML5libTreeTest, testName: string): void {
  it(testName, () => {
    const { data, document: expectedTree } = test;

    // Parse the HTML
    const ast = parse(tokenize(data));

    // Convert to HTML5lib tree format
    const actualTree = convertASTToHTML5libTree(ast).join('\n');

    // Compare the normalized trees
    expect(normalizeHTML5libTree(actualTree)).toBe(normalizeHTML5libTree(expectedTree));
  });
}
182
+
/**
 * Runs all tests from an HTML5lib tree construction test suite
 *
 * Wraps the cases in one `describe` group and registers each through
 * `runHTML5libTreeTest`. A case is named after its 1-based position and the
 * first 50 characters of its input, with newlines shown as spaces.
 *
 * @param tests - Cases to register, in order.
 * @param suiteName - Label appended to the group title.
 */
export function runHTML5libTreeTestSuite(tests: HTML5libTreeTest[], suiteName: string): void {
  const groupTitle = `HTML5lib Tree Construction Tests: ${suiteName}`;

  describe(groupTitle, () => {
    tests.forEach((test, position) => {
      const ordinal = position + 1;
      const preview = test.data.substring(0, 50).replace(/\n/g, ' ');
      runHTML5libTreeTest(test, `Test ${ordinal}: ${preview}...`);
    });
  });
}
194
+
/**
 * Loads and runs HTML5lib tree construction tests from DAT format
 *
 * Parses `testData` with `parseHTML5libDATFile` and registers the resulting
 * cases through `runHTML5libTreeTestSuite`. Nothing is awaited inside; the
 * function is `async` only in its signature.
 *
 * @param testData - Text of a `.dat` file.
 * @param suiteName - Label for the registered suite.
 */
export async function loadHTML5libTreeTests(testData: string, suiteName: string): Promise<void> {
  runHTML5libTreeTestSuite(parseHTML5libDATFile(testData), suiteName);
}
202
+
/**
 * Validates HTML5lib tree construction test format
 *
 * A test is accepted when `data` is a string (the empty string included,
 * since empty input is a legitimate case), `document` is non-empty and
 * `errors` is present.
 *
 * @param test - Case to check.
 * @returns `true` when the case has the required sections.
 */
export function validateHTML5libTreeTest(test: HTML5libTreeTest): boolean {
  return typeof test.data === 'string' && Boolean(test.document) && test.errors !== undefined;
}
@@ -0,0 +1,250 @@
1
+ import { describe, it, expect } from "bun:test";
2
+ import {
3
+ loadHTML5libTreeTests,
4
+ runHTML5libTreeTestSuite,
5
+ parseHTML5libDATFile,
6
+ type HTML5libTreeTest,
7
+ } from "./tree-construction-utils";
8
+
9
+ // Sample HTML5lib tree construction tests in DAT format
// Ten cases, each made of #data, #errors and #document sections, with a blank
// line between cases. In this file the text is only fed to parseHTML5libDATFile,
// and only the first case ("Test") is inspected.
// NOTE(review): the last case (<!--comment-->) lists a "-got-chars" error and
// places the comment after <body>; upstream html5lib appears to attach a
// comment seen before <html> to the document itself. Confirm before running
// these cases through the tree runner.
10
+ const basicTreeTestData = `#data
11
+ Test
12
+ #errors
13
+ (1,0): expected-doctype-but-got-chars
14
+ #document
15
+ | <html>
16
+ | <head>
17
+ | <body>
18
+ | "Test"
19
+
20
+ #data
21
+ <p>One<p>Two
22
+ #errors
23
+ (1,3): expected-doctype-but-got-start-tag
24
+ #document
25
+ | <html>
26
+ | <head>
27
+ | <body>
28
+ | <p>
29
+ | "One"
30
+ | <p>
31
+ | "Two"
32
+
33
+ #data
34
+ <html>
35
+ #errors
36
+ (1,6): expected-doctype-but-got-start-tag
37
+ #document
38
+ | <html>
39
+ | <head>
40
+ | <body>
41
+
42
+ #data
43
+ <head>
44
+ #errors
45
+ (1,6): expected-doctype-but-got-start-tag
46
+ #document
47
+ | <html>
48
+ | <head>
49
+ | <body>
50
+
51
+ #data
52
+ <body>
53
+ #errors
54
+ (1,6): expected-doctype-but-got-start-tag
55
+ #document
56
+ | <html>
57
+ | <head>
58
+ | <body>
59
+
60
+ #data
61
+ <html><head></head><body></body>
62
+ #errors
63
+ (1,6): expected-doctype-but-got-start-tag
64
+ #document
65
+ | <html>
66
+ | <head>
67
+ | <body>
68
+
69
+ #data
70
+ Line1<br>Line2
71
+ #errors
72
+ (1,0): expected-doctype-but-got-chars
73
+ #document
74
+ | <html>
75
+ | <head>
76
+ | <body>
77
+ | "Line1"
78
+ | <br>
79
+ | "Line2"
80
+
81
+ #data
82
+ <div>hello</div>
83
+ #errors
84
+ (1,5): expected-doctype-but-got-start-tag
85
+ #document
86
+ | <html>
87
+ | <head>
88
+ | <body>
89
+ | <div>
90
+ | "hello"
91
+
92
+ #data
93
+ <p><b>bold</b></p>
94
+ #errors
95
+ (1,3): expected-doctype-but-got-start-tag
96
+ #document
97
+ | <html>
98
+ | <head>
99
+ | <body>
100
+ | <p>
101
+ | <b>
102
+ | "bold"
103
+
104
+ #data
105
+ <!--comment-->
106
+ #errors
107
+ (1,0): expected-doctype-but-got-chars
108
+ #document
109
+ | <html>
110
+ | <head>
111
+ | <body>
112
+ | <!-- comment -->`;
113
+
// Four doctype cases in DAT format; every #errors section is empty. In this
// file the text is only fed to parseHTML5libDATFile, and only the first case
// is inspected.
// NOTE(review): the PUBLIC / SYSTEM cases expect a plain "<!DOCTYPE html>"
// line, i.e. without the identifiers. Confirm that this matches what the
// tree serialiser is meant to print.
114
+ const doctypeTestData = `#data
115
+ <!DOCTYPE html>
116
+ #errors
117
+ #document
118
+ | <!DOCTYPE html>
119
+ | <html>
120
+ | <head>
121
+ | <body>
122
+
123
+ #data
124
+ <!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>
125
+ #errors
126
+ #document
127
+ | <!DOCTYPE html>
128
+ | <html>
129
+ | <head>
130
+ | <title>
131
+ | "Test"
132
+ | <body>
133
+ | <p>
134
+ | "Hello"
135
+
136
+ #data
137
+ <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
138
+ #errors
139
+ #document
140
+ | <!DOCTYPE html>
141
+ | <html>
142
+ | <head>
143
+ | <body>
144
+
145
+ #data
146
+ <!DOCTYPE html SYSTEM "about:legacy-compat">
147
+ #errors
148
+ #document
149
+ | <!DOCTYPE html>
150
+ | <html>
151
+ | <head>
152
+ | <body>`;
153
+
// Three cases of misnested markup in DAT format, each listing several
// expected parse errors. In this file the text is only fed to
// parseHTML5libDATFile, and only the first case's errors are inspected.
154
+ const errorHandlingTestData = `#data
155
+ <b><table><td></b><i></table>
156
+ #errors
157
+ (1,3): expected-doctype-but-got-start-tag
158
+ (1,14): unexpected-cell-in-table-body
159
+ (1,18): unexpected-end-tag
160
+ (1,29): unexpected-cell-end-tag
161
+ (1,29): expected-closing-tag-but-got-eof
162
+ #document
163
+ | <html>
164
+ | <head>
165
+ | <body>
166
+ | <b>
167
+ | <table>
168
+ | <tbody>
169
+ | <tr>
170
+ | <td>
171
+ | <i>
172
+
173
+ #data
174
+ <p><b><div><marquee></p></b></div>
175
+ #errors
176
+ (1,3): expected-doctype-but-got-start-tag
177
+ (1,11): unexpected-end-tag
178
+ (1,24): unexpected-end-tag
179
+ (1,28): unexpected-end-tag
180
+ (1,34): end-tag-too-early
181
+ (1,34): expected-closing-tag-but-got-eof
182
+ #document
183
+ | <html>
184
+ | <head>
185
+ | <body>
186
+ | <p>
187
+ | <b>
188
+ | <div>
189
+ | <b>
190
+ | <marquee>
191
+ | <p>
192
+
193
+ #data
194
+ <a><p><a></a></p></a>
195
+ #errors
196
+ (1,3): expected-doctype-but-got-start-tag
197
+ (1,9): unexpected-start-tag-implies-end-tag
198
+ (1,9): adoption-agency-1.3
199
+ (1,21): unexpected-end-tag
200
+ #document
201
+ | <html>
202
+ | <head>
203
+ | <body>
204
+ | <a>
205
+ | <p>
206
+ | <a>
207
+ | <a>`;
208
+
// Checks that each embedded DAT sample parses and that its first case has the
// expected fields. The samples are parsed only; they are not run through the
// tree runner here.
describe("HTML5lib Tree Construction Tests", () => {
  it("should parse DAT format correctly", () => {
    const parsed = parseHTML5libDATFile(basicTreeTestData);
    expect(parsed.length).toBeGreaterThan(0);

    // Inspect the first case only
    const [firstTest] = parsed;
    if (!firstTest) return;

    expect(firstTest.data).toBe("Test");
    expect(firstTest.errors.length).toBeGreaterThan(0);
    expect(firstTest.document).toContain("<html>");
  });

  it("should handle doctype tests", () => {
    const parsed = parseHTML5libDATFile(doctypeTestData);
    expect(parsed.length).toBeGreaterThan(0);

    // Inspect the first doctype case only
    const [firstTest] = parsed;
    if (!firstTest) return;

    expect(firstTest.data).toBe("<!DOCTYPE html>");
    expect(firstTest.errors.length).toBe(0);
    expect(firstTest.document).toContain("<!DOCTYPE html>");
  });

  it("should handle error cases", () => {
    const parsed = parseHTML5libDATFile(errorHandlingTestData);
    expect(parsed.length).toBeGreaterThan(0);

    // Inspect the error list of the first case only
    const [firstTest] = parsed;
    if (!firstTest) return;

    expect(firstTest.errors.length).toBeGreaterThan(0);
    expect(firstTest.errors[0]).toContain("expected-doctype-but-got-start-tag");
  });
});