@tkeron/html-parser 0.1.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
@@ -1,171 +0,0 @@
1
- import { describe, it } from 'bun:test';
2
- import {
3
- loadHTML5libTokenizerTests,
4
- runHTML5libTokenizerTestSuite,
5
- type HTML5libTokenizerTestSuite
6
- } from './tokenizer-utils';
7
-
8
-
9
/**
 * Baseline tokenizer fixtures: each case pairs an `input` string with the
 * token list expected in `output`. Covers doctypes, start/end tags,
 * attributes, comments, plain character data and character references.
 */
const basicTokenizerTests: HTML5libTokenizerTestSuite = {
  tests: [
    // Doctypes: the expected name is lower-case regardless of input case.
    {
      description: "Correct Doctype lowercase",
      input: "<!DOCTYPE html>",
      output: [["DOCTYPE", "html", null, null, true]],
    },
    {
      description: "Correct Doctype uppercase",
      input: "<!DOCTYPE HTML>",
      output: [["DOCTYPE", "html", null, null, true]],
    },
    // Tags and attributes.
    {
      description: "Single Start Tag",
      input: "<h>",
      output: [["StartTag", "h", {}]],
    },
    {
      description: "Start Tag w/attribute",
      input: "<h a='b'>",
      output: [["StartTag", "h", { a: "b" }]],
    },
    {
      description: "Start/End Tag",
      input: "<h></h>",
      output: [
        ["StartTag", "h", {}],
        ["EndTag", "h"],
      ],
    },
    {
      description: "Simple comment",
      input: "<!--comment-->",
      output: [["Comment", "comment"]],
    },
    {
      description: "Character data",
      input: "Hello World",
      output: [["Character", "Hello World"]],
    },
    {
      description: "Multiple attributes",
      input: "<h a='b' c='d'>",
      output: [["StartTag", "h", { a: "b", c: "d" }]],
    },
    {
      // The trailing `true` is the self-closing flag of the start tag.
      description: "Self-closing tag",
      input: "<br/>",
      output: [["StartTag", "br", {}, true]],
    },
    {
      description: "Empty comment",
      input: "<!---->",
      output: [["Comment", ""]],
    },
    // Character references are expected already decoded in the output.
    {
      description: "Text with entities",
      input: "&amp;&lt;&gt;",
      output: [["Character", "&<>"]],
    },
    {
      description: "Numeric entity",
      input: "&#65;",
      output: [["Character", "A"]],
    },
    {
      description: "Hex entity",
      input: "&#x41;",
      output: [["Character", "A"]],
    },
    {
      description: "Unquoted attribute",
      input: "<h a=b>",
      output: [["StartTag", "h", { a: "b" }]],
    },
    {
      description: "Tag with mixed case",
      input: "<DiV>",
      output: [["StartTag", "div", {}]],
    },
  ],
};
88
-
89
-
90
/**
 * Character-reference fixtures, including malformed references. Cases that
 * list `errors` give the expected parse-error code with its line and column.
 */
const entityTests: HTML5libTokenizerTestSuite = {
  tests: [
    {
      description: "Entity with trailing semicolon",
      input: "I'm &not;it",
      output: [["Character", "I'm ¬it"]],
    },
    {
      description: "Entity without trailing semicolon",
      input: "I'm &notit",
      output: [["Character", "I'm ¬it"]],
      errors: [
        { code: "missing-semicolon-after-character-reference", line: 1, col: 9 },
      ],
    },
    {
      description: "Ampersand EOF",
      input: "&",
      output: [["Character", "&"]],
    },
    {
      description: "Unfinished entity",
      input: "&f",
      output: [["Character", "&f"]],
    },
    {
      description: "Ampersand, number sign",
      input: "&#",
      output: [["Character", "&#"]],
      errors: [
        { code: "absence-of-digits-in-numeric-character-reference", line: 1, col: 3 },
      ],
    },
  ],
};
125
-
126
-
127
/**
 * Comment fixtures: well-formed comments plus unterminated, abruptly closed
 * and nested ones. Cases that list `errors` give the expected parse-error
 * code with its line and column.
 */
const commentTests: HTML5libTokenizerTestSuite = {
  tests: [
    {
      description: "Comment, Central dash no space",
      input: "<!----->",
      output: [["Comment", "-"]],
    },
    {
      description: "Comment, two central dashes",
      input: "<!-- --comment -->",
      output: [["Comment", " --comment "]],
    },
    {
      description: "Unfinished comment",
      input: "<!--comment",
      output: [["Comment", "comment"]],
      errors: [{ code: "eof-in-comment", line: 1, col: 12 }],
    },
    {
      description: "Short comment",
      input: "<!-->",
      output: [["Comment", ""]],
      errors: [{ code: "abrupt-closing-of-empty-comment", line: 1, col: 5 }],
    },
    {
      description: "Nested comment",
      input: "<!-- <!--test-->",
      output: [["Comment", " <!--test"]],
      errors: [{ code: "nested-comment", line: 1, col: 10 }],
    },
  ],
};
165
-
166
-
167
// Register each fixture suite under one top-level group, in declaration order.
describe('HTML5lib Tokenizer Tests', () => {
  const suites: Array<[HTML5libTokenizerTestSuite, string]> = [
    [basicTokenizerTests, 'Basic Tokenizer'],
    [entityTests, 'Entity Handling'],
    [commentTests, 'Comment Handling'],
  ];

  for (const [suite, label] of suites) {
    runHTML5libTokenizerTestSuite(suite, label);
  }
});
@@ -1,194 +0,0 @@
1
- import { expect, describe, it } from 'bun:test';
2
- import { parse } from '../../../src/parser';
3
- import { tokenize } from '../../../src/tokenizer';
4
- import type { ASTNode } from '../../../src/parser';
5
-
6
-
7
/**
 * One test case read from an html5lib tree-construction `.dat` file.
 * Each field mirrors a `#section` of the case.
 */
export interface HTML5libTreeTest {
  /** Markup to parse, taken from the `#data` section. */
  data: string;
  /** Non-blank lines of the `#errors` section. */
  errors: string[];
  /** Non-blank lines of the `#new-errors` section, when the case has one. */
  newErrors?: string[];
  /** Content of the `#document-fragment` section, when the case has one. */
  documentFragment?: string;
  /** `true` when the case carries a `#script-off` marker. */
  scriptOff?: boolean;
  /** `true` when the case carries a `#script-on` marker. */
  scriptOn?: boolean;
  /** Expected tree dump, taken from the `#document` section. */
  document: string;
}
16
-
17
-
18
/** Header lines that open a section inside a single `.dat` test case. */
const DAT_SECTION_HEADERS: ReadonlySet<string> = new Set([
  '#data',
  '#errors',
  '#new-errors',
  '#document-fragment',
  '#script-off',
  '#script-on',
  '#document',
]);

/**
 * Parses the text of an html5lib tree-construction `.dat` file into test cases.
 *
 * A case starts at a line that is exactly `#data` and runs until the next such
 * line (or the end of input). Only the known section headers switch sections,
 * so blank lines and `#`-prefixed lines inside `#data` or inside a text node of
 * `#document` stay part of that section. Both LF and CRLF input are accepted.
 *
 * @param content Raw text of a `.dat` file.
 * @returns The cases that have both a `#data` and a `#document` section, in
 *   file order. Cases whose data is the empty string are kept.
 */
export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
  const lines = content.split(/\r?\n/);

  // Index of every line that opens a new case.
  const starts: number[] = [];
  lines.forEach((line, index) => {
    if (line === '#data') starts.push(index);
  });

  const tests: HTML5libTreeTest[] = [];
  starts.forEach((start, position) => {
    // For the last case `starts[position + 1]` is undefined, which `slice`
    // treats as "up to the end of the array".
    const test = parseDATTestCase(lines.slice(start, starts[position + 1]));
    if (test) tests.push(test);
  });
  return tests;
}

/**
 * Builds one test from the lines of a single case (the first line is `#data`).
 * Returns `undefined` when the case has no `#document` section.
 */
function parseDATTestCase(lines: readonly string[]): HTML5libTreeTest | undefined {
  // Section name (header without the leading '#') -> lines of that section.
  const sections = new Map<string, string[]>();
  let current: string[] | undefined;
  for (const line of lines) {
    if (DAT_SECTION_HEADERS.has(line)) {
      current = [];
      sections.set(line.slice(1), current);
    } else if (current) {
      current.push(line);
    }
  }

  const data = sections.get('data');
  const documentLines = sections.get('document');
  if (!data || !documentLines) return undefined;

  const isNonBlank = (line: string): boolean => line.trim().length > 0;

  // The blank line(s) separating this case from the next one (or ending the
  // file) land at the tail of `#document`; they are not part of the tree.
  let end = documentLines.length;
  while (end > 0 && !isNonBlank(documentLines[end - 1] ?? '')) end--;

  const test: HTML5libTreeTest = {
    data: data.join('\n'),
    errors: (sections.get('errors') ?? []).filter(isNonBlank),
    document: documentLines.slice(0, end).join('\n'),
  };

  const newErrors = sections.get('new-errors');
  if (newErrors) test.newErrors = newErrors.filter(isNonBlank);

  const fragment = sections.get('document-fragment');
  if (fragment) test.documentFragment = fragment.join('\n');

  if (sections.has('script-off')) test.scriptOff = true;
  if (sections.has('script-on')) test.scriptOn = true;

  return test;
}
97
-
98
-
99
/**
 * Serialises an AST node and its descendants into html5lib tree-dump lines.
 *
 * Every line starts with `"| "` followed by two spaces per nesting level.
 * A DOCUMENT node emits no line and does not add a level, so its children
 * appear at `depth`. Attribute lines are written one level below their
 * element, ordered by attribute name.
 *
 * @param node Root of the subtree to serialise.
 * @param depth Nesting level of `node`; callers normally leave the default.
 * @returns The dump lines in document order, without trailing newlines.
 */
export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): string[] {
  const lines: string[] = [];
  const indent = '| ' + '  '.repeat(depth);
  let childDepth = depth + 1;

  switch (node.type) {
    case 'DOCUMENT':
      // The document itself is not printed, so its children stay at this level.
      childDepth = depth;
      break;
    case 'DOCTYPE':
      lines.push(`${indent}<!DOCTYPE ${node.tagName || 'html'}>`);
      break;
    case 'ELEMENT': {
      lines.push(`${indent}<${node.tagName || 'unknown'}>`);

      if (node.attributes) {
        const attributeIndent = '| ' + '  '.repeat(depth + 1);
        // Order by name only; `<` on strings compares UTF-16 code units.
        const byName = Object.entries(node.attributes).sort(([a], [b]) =>
          a < b ? -1 : a > b ? 1 : 0,
        );
        for (const [name, value] of byName) {
          lines.push(`${attributeIndent}${name}="${value}"`);
        }
      }
      break;
    }
    case 'TEXT':
      // Whitespace-only text nodes are left out of the dump.
      if (node.content && node.content.trim()) {
        lines.push(`${indent}"${node.content}"`);
      }
      break;
    case 'COMMENT':
      lines.push(`${indent}<!-- ${node.content || ''} -->`);
      break;
    case 'CDATA':
      lines.push(`${indent}<![CDATA[${node.content || ''}]]>`);
      break;
  }

  if (node.children) {
    for (const child of node.children) {
      for (const line of convertASTToHTML5libTree(child, childDepth)) {
        lines.push(line);
      }
    }
  }

  return lines;
}
143
-
144
-
145
/**
 * Canonicalises a tree dump for comparison: every line is trimmed of leading
 * and trailing whitespace, and lines that end up empty are dropped.
 *
 * @param tree Dump text with `\n` line breaks.
 * @returns The remaining lines joined with `\n`.
 */
export function normalizeHTML5libTree(tree: string): string {
  const kept: string[] = [];
  for (const rawLine of tree.split('\n')) {
    const line = rawLine.trim();
    if (line.length > 0) {
      kept.push(line);
    }
  }
  return kept.join('\n');
}
152
-
153
-
154
/**
 * Registers one `it` case that tokenizes and parses `test.data`, dumps the
 * resulting AST, and compares it with `test.document` after both sides have
 * been normalised. Only `data` and `document` are used by the check.
 *
 * @param test The case to run.
 * @param testName Title passed to `it`.
 */
export function runHTML5libTreeTest(test: HTML5libTreeTest, testName: string): void {
  it(testName, () => {
    const ast = parse(tokenize(test.data));

    const actual = normalizeHTML5libTree(convertASTToHTML5libTree(ast).join('\n'));
    const expected = normalizeHTML5libTree(test.document);

    expect(actual).toBe(expected);
  });
}
174
-
175
-
176
/**
 * Registers a `describe` group holding one test per case.
 *
 * Each title is `Test <n>: ` followed by the first 50 characters of the
 * case's data (newlines shown as spaces) and a literal `...`.
 *
 * @param tests Cases to register, in order.
 * @param suiteName Label appended to the group title.
 */
export function runHTML5libTreeTestSuite(tests: HTML5libTreeTest[], suiteName: string): void {
  const groupTitle = `HTML5lib Tree Construction Tests: ${suiteName}`;

  describe(groupTitle, () => {
    tests.forEach((testCase, position) => {
      const preview = testCase.data.substring(0, 50).replace(/\n/g, ' ');
      const ordinal = position + 1;
      runHTML5libTreeTest(testCase, `Test ${ordinal}: ${preview}...`);
    });
  });
}
184
-
185
-
186
/**
 * Parses `.dat` text and registers the resulting cases as a test suite.
 * Nothing is awaited inside; the returned promise resolves once the suite
 * has been registered.
 *
 * @param testData Raw `.dat` file content.
 * @param suiteName Label for the registered suite.
 */
export async function loadHTML5libTreeTests(testData: string, suiteName: string): Promise<void> {
  runHTML5libTreeTestSuite(parseHTML5libDATFile(testData), suiteName);
}
190
-
191
-
192
/**
 * Checks that a test case has the fields needed to run it.
 *
 * Presence is tested by type rather than truthiness, so a case whose `data`
 * or `document` is the empty string still counts as valid.
 *
 * @param test Candidate case, possibly built from untyped input.
 * @returns `true` when `data` and `document` are strings and `errors` is an array.
 */
export function validateHTML5libTreeTest(test: HTML5libTreeTest): boolean {
  return (
    typeof test.data === 'string' &&
    typeof test.document === 'string' &&
    Array.isArray(test.errors)
  );
}
@@ -1,250 +0,0 @@
1
- import { describe, it, expect } from "bun:test";
2
- import {
3
- loadHTML5libTreeTests,
4
- runHTML5libTreeTestSuite,
5
- parseHTML5libDATFile,
6
- type HTML5libTreeTest,
7
- } from "./tree-construction-utils";
8
-
9
-
10
/**
 * Basic tree-construction fixtures in `.dat` form, one template literal per
 * case; cases are joined with a blank line.
 *
 * NOTE(review): every `#document` line here sits at the same depth. html5lib
 * dumps normally indent two spaces per nesting level — confirm whether the
 * indentation was lost before relying on these trees for comparison.
 */
const basicTreeTestData = [
  `#data
Test
#errors
(1,0): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "Test"`,
  `#data
<p>One<p>Two
#errors
(1,3): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <p>
| "One"
| <p>
| "Two"`,
  `#data
<html>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>`,
  `#data
<head>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>`,
  `#data
<body>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>`,
  `#data
<html><head></head><body></body>
#errors
(1,6): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>`,
  `#data
Line1<br>Line2
#errors
(1,0): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| "Line1"
| <br>
| "Line2"`,
  `#data
<div>hello</div>
#errors
(1,5): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <div>
| "hello"`,
  `#data
<p><b>bold</b></p>
#errors
(1,3): expected-doctype-but-got-start-tag
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| "bold"`,
  `#data
<!--comment-->
#errors
(1,0): expected-doctype-but-got-chars
#document
| <html>
| <head>
| <body>
| <!-- comment -->`,
].join('\n\n');
113
-
114
/**
 * Doctype fixtures in `.dat` form, one template literal per case; cases are
 * joined with a blank line. All four cases have an empty `#errors` section.
 *
 * NOTE(review): the `#document` lines are all at the same depth here —
 * confirm whether nesting indentation was lost.
 */
const doctypeTestData = [
  `#data
<!DOCTYPE html>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>`,
  `#data
<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <title>
| "Test"
| <body>
| <p>
| "Hello"`,
  `#data
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>`,
  `#data
<!DOCTYPE html SYSTEM "about:legacy-compat">
#errors
#document
| <!DOCTYPE html>
| <html>
| <head>
| <body>`,
].join('\n\n');
153
-
154
/**
 * Error-recovery fixtures in `.dat` form (misnested formatting elements and
 * table content), one template literal per case; cases are joined with a
 * blank line.
 *
 * NOTE(review): the `#document` lines are all at the same depth here —
 * confirm whether nesting indentation was lost.
 */
const errorHandlingTestData = [
  `#data
<b><table><td></b><i></table>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,14): unexpected-cell-in-table-body
(1,18): unexpected-end-tag
(1,29): unexpected-cell-end-tag
(1,29): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <b>
| <table>
| <tbody>
| <tr>
| <td>
| <i>`,
  `#data
<p><b><div><marquee></p></b></div>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,11): unexpected-end-tag
(1,24): unexpected-end-tag
(1,28): unexpected-end-tag
(1,34): end-tag-too-early
(1,34): expected-closing-tag-but-got-eof
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| <div>
| <b>
| <marquee>
| <p>`,
  `#data
<a><p><a></a></p></a>
#errors
(1,3): expected-doctype-but-got-start-tag
(1,9): unexpected-start-tag-implies-end-tag
(1,9): adoption-agency-1.3
(1,21): unexpected-end-tag
#document
| <html>
| <head>
| <body>
| <a>
| <p>
| <a>
| <a>`,
].join('\n\n');
208
-
209
-
210
// Checks that the `.dat` parser extracts data, errors and document sections
// from each inline fixture. Only the first case of every fixture is inspected.
describe("HTML5lib Tree Construction Tests", () => {
  it("should parse DAT format correctly", () => {
    const cases = parseHTML5libDATFile(basicTreeTestData);
    expect(cases.length).toBeGreaterThan(0);

    const [head] = cases;
    if (!head) return;
    expect(head.data).toBe("Test");
    expect(head.errors.length).toBeGreaterThan(0);
    expect(head.document).toContain("<html>");
  });

  it("should handle doctype tests", () => {
    const cases = parseHTML5libDATFile(doctypeTestData);
    expect(cases.length).toBeGreaterThan(0);

    const [head] = cases;
    if (!head) return;
    expect(head.data).toBe("<!DOCTYPE html>");
    expect(head.errors.length).toBe(0);
    expect(head.document).toContain("<!DOCTYPE html>");
  });

  it("should handle error cases", () => {
    const cases = parseHTML5libDATFile(errorHandlingTestData);
    expect(cases.length).toBeGreaterThan(0);

    const [head] = cases;
    if (!head) return;
    expect(head.errors.length).toBeGreaterThan(0);
    expect(head.errors[0]).toContain("expected-doctype-but-got-start-tag");
  });
});