@tkeron/html-parser 0.1.7 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -7
- package/bun.lock +5 -0
- package/index.ts +4 -0
- package/package.json +7 -1
- package/src/css-selector.ts +1 -1
- package/src/dom-simulator.ts +41 -17
- package/src/encoding.ts +39 -0
- package/src/index.ts +9 -0
- package/src/parser.ts +509 -143
- package/src/serializer.ts +450 -0
- package/src/tokenizer.ts +190 -118
- package/tests/advanced.test.ts +121 -108
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/dom-extended.test.ts +12 -12
- package/tests/dom-manipulation.test.ts +9 -10
- package/tests/dom.test.ts +32 -27
- package/tests/helpers/tokenizer-adapter.test.ts +70 -0
- package/tests/helpers/tokenizer-adapter.ts +65 -0
- package/tests/helpers/tree-adapter.test.ts +39 -0
- package/tests/helpers/tree-adapter.ts +60 -0
- package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
- package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
- package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
- package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
- package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
- package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
- package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
- package/tests/html5lib-data/tree-construction/math.dat +104 -0
- package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
- package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
- package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
- package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
- package/tests/html5lib-data/tree-construction/svg.dat +104 -0
- package/tests/html5lib-data/tree-construction/template.dat +1673 -0
- package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
- package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
- package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
- package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
- package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
- package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
- package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
- package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
- package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
- package/tests/parser.test.ts +173 -193
- package/tests/serializer-core.test.ts +16 -0
- package/tests/serializer-data/core.test +125 -0
- package/tests/serializer-data/injectmeta.test +66 -0
- package/tests/serializer-data/optionaltags.test +965 -0
- package/tests/serializer-data/options.test +60 -0
- package/tests/serializer-data/whitespace.test +51 -0
- package/tests/serializer-injectmeta.test.ts +16 -0
- package/tests/serializer-optionaltags.test.ts +16 -0
- package/tests/serializer-options.test.ts +16 -0
- package/tests/serializer-whitespace.test.ts +16 -0
- package/tests/tokenizer-namedEntities.test.ts +20 -0
- package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
- package/tests/tokenizer.test.ts +25 -32
- package/tests/tree-construction-adoption01.test.ts +37 -0
- package/tests/tree-construction-adoption02.test.ts +34 -0
- package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
- package/tests/tree-construction-entities02.test.ts +33 -0
- package/tests/tree-construction-html5test-com.test.ts +32 -0
- package/tests/tree-construction-math.test.ts +18 -0
- package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
- package/tests/tree-construction-noscript01.test.ts +18 -0
- package/tests/tree-construction-ruby.test.ts +21 -0
- package/tests/tree-construction-scriptdata01.test.ts +21 -0
- package/tests/tree-construction-svg.test.ts +21 -0
- package/tests/tree-construction-template.test.ts +21 -0
- package/tests/tree-construction-tests10.test.ts +21 -0
- package/tests/tree-construction-tests11.test.ts +21 -0
- package/tests/tree-construction-tests20.test.ts +18 -0
- package/tests/tree-construction-tests21.test.ts +18 -0
- package/tests/tree-construction-tests23.test.ts +18 -0
- package/tests/tree-construction-tests24.test.ts +18 -0
- package/tests/tree-construction-tests5.test.ts +21 -0
- package/tests/tree-construction-tests6.test.ts +21 -0
- package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
- package/tests/custom-elements.test.ts +0 -745
- package/tests/official/README.md +0 -87
- package/tests/official/acid/acid-tests.test.ts +0 -309
- package/tests/official/final-output/final-output.test.ts +0 -361
- package/tests/official/html5lib/tokenizer-utils.ts +0 -192
- package/tests/official/html5lib/tokenizer.test.ts +0 -171
- package/tests/official/html5lib/tree-construction-utils.ts +0 -194
- package/tests/official/html5lib/tree-construction.test.ts +0 -250
- package/tests/official/validator/validator-tests.test.ts +0 -237
- package/tests/official/validator-nu/validator-nu.test.ts +0 -335
- package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
- package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/parser.ts
CHANGED
|
@@ -1,38 +1,14 @@
|
|
|
1
1
|
import type { Token } from './tokenizer.js';
|
|
2
2
|
import { TokenType } from './tokenizer.js';
|
|
3
|
-
|
|
4
|
-
export interface ASTNode {
|
|
5
|
-
type: ASTNodeType;
|
|
6
|
-
tagName?: string;
|
|
7
|
-
attributes?: Record<string, string>;
|
|
8
|
-
children?: ASTNode[];
|
|
9
|
-
content?: string;
|
|
10
|
-
parent?: ASTNode;
|
|
11
|
-
isSelfClosing?: boolean;
|
|
12
|
-
position?: {
|
|
13
|
-
start: number;
|
|
14
|
-
end: number;
|
|
15
|
-
line: number;
|
|
16
|
-
column: number;
|
|
17
|
-
};
|
|
18
|
-
}
|
|
19
|
-
|
|
20
|
-
export enum ASTNodeType {
|
|
21
|
-
DOCUMENT = 'DOCUMENT',
|
|
22
|
-
ELEMENT = 'ELEMENT',
|
|
23
|
-
TEXT = 'TEXT',
|
|
24
|
-
COMMENT = 'COMMENT',
|
|
25
|
-
CDATA = 'CDATA',
|
|
26
|
-
DOCTYPE = 'DOCTYPE',
|
|
27
|
-
PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
|
|
28
|
-
}
|
|
3
|
+
import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
|
|
29
4
|
|
|
30
5
|
export interface ParserState {
|
|
31
6
|
tokens: Token[];
|
|
32
7
|
position: number;
|
|
33
8
|
length: number;
|
|
34
|
-
stack:
|
|
35
|
-
root:
|
|
9
|
+
stack: any[]; // DOM elements
|
|
10
|
+
root: any; // Document
|
|
11
|
+
insertionMode: InsertionMode;
|
|
36
12
|
errors: ParseError[];
|
|
37
13
|
}
|
|
38
14
|
|
|
@@ -44,6 +20,32 @@ export interface ParseError {
|
|
|
44
20
|
severity: 'error' | 'warning';
|
|
45
21
|
}
|
|
46
22
|
|
|
23
|
+
export enum InsertionMode {
|
|
24
|
+
Initial = 'initial',
|
|
25
|
+
BeforeHtml = 'beforeHtml',
|
|
26
|
+
BeforeHead = 'beforeHead',
|
|
27
|
+
InHead = 'inHead',
|
|
28
|
+
AfterHead = 'afterHead',
|
|
29
|
+
InBody = 'inBody'
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
export enum ASTNodeType {
|
|
33
|
+
Document = 'document',
|
|
34
|
+
Element = 'element',
|
|
35
|
+
Text = 'text',
|
|
36
|
+
Comment = 'comment',
|
|
37
|
+
Doctype = 'doctype',
|
|
38
|
+
CDATA = 'cdata'
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export interface ASTNode {
|
|
42
|
+
type: ASTNodeType;
|
|
43
|
+
tagName?: string;
|
|
44
|
+
value?: string;
|
|
45
|
+
attributes?: Record<string, string>;
|
|
46
|
+
children?: ASTNode[];
|
|
47
|
+
}
|
|
48
|
+
|
|
47
49
|
const VOID_ELEMENTS = new Set([
|
|
48
50
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
|
|
49
51
|
'link', 'meta', 'param', 'source', 'track', 'wbr'
|
|
@@ -57,7 +59,41 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
|
|
|
57
59
|
'li': ['li'],
|
|
58
60
|
'dt': ['dt', 'dd'],
|
|
59
61
|
'dd': ['dt', 'dd'],
|
|
62
|
+
'address': ['p'],
|
|
63
|
+
'article': ['p'],
|
|
64
|
+
'aside': ['p'],
|
|
65
|
+
'blockquote': ['p'],
|
|
66
|
+
'center': ['p'],
|
|
67
|
+
'details': ['p'],
|
|
68
|
+
'dialog': ['p'],
|
|
69
|
+
'dir': ['p'],
|
|
70
|
+
'div': ['p'],
|
|
71
|
+
'dl': ['p'],
|
|
72
|
+
'fieldset': ['p'],
|
|
73
|
+
'figcaption': ['p'],
|
|
74
|
+
'figure': ['p'],
|
|
75
|
+
'footer': ['p'],
|
|
76
|
+
'form': ['p'],
|
|
77
|
+
'h1': ['p'],
|
|
78
|
+
'h2': ['p'],
|
|
79
|
+
'h3': ['p'],
|
|
80
|
+
'h4': ['p'],
|
|
81
|
+
'h5': ['p'],
|
|
82
|
+
'h6': ['p'],
|
|
83
|
+
'header': ['p'],
|
|
84
|
+
'hgroup': ['p'],
|
|
85
|
+
'hr': ['p'],
|
|
86
|
+
'listing': ['p'],
|
|
87
|
+
'main': ['p'],
|
|
88
|
+
'menu': ['p'],
|
|
89
|
+
'nav': ['p'],
|
|
90
|
+
'ol': ['p'],
|
|
60
91
|
'p': ['p'],
|
|
92
|
+
'pre': ['p'],
|
|
93
|
+
'section': ['p'],
|
|
94
|
+
'summary': ['p'],
|
|
95
|
+
'table': ['p'],
|
|
96
|
+
'ul': ['p'],
|
|
61
97
|
'rt': ['rt', 'rp'],
|
|
62
98
|
'rp': ['rt', 'rp'],
|
|
63
99
|
'optgroup': ['optgroup'],
|
|
@@ -70,7 +106,7 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
|
|
|
70
106
|
'th': ['td', 'th']
|
|
71
107
|
};
|
|
72
108
|
|
|
73
|
-
export function parse(tokens: Token[]):
|
|
109
|
+
export function parse(tokens: Token[]): any {
|
|
74
110
|
const state = createParserState(tokens);
|
|
75
111
|
|
|
76
112
|
while (state.position < state.length) {
|
|
@@ -84,21 +120,134 @@ export function parse(tokens: Token[]): ASTNode {
|
|
|
84
120
|
advance(state);
|
|
85
121
|
}
|
|
86
122
|
|
|
123
|
+
// Create implicit html, head, body if needed
|
|
124
|
+
if (state.root.childNodes && state.root.childNodes.length > 0) {
|
|
125
|
+
let hasHtml = false;
|
|
126
|
+
for (const child of state.root.childNodes) {
|
|
127
|
+
if (child.nodeType === 1 && child.tagName === 'HTML') {
|
|
128
|
+
hasHtml = true;
|
|
129
|
+
state.root.documentElement = child;
|
|
130
|
+
break;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
if (!hasHtml) {
|
|
134
|
+
const html = createElement('html', {});
|
|
135
|
+
const head = createElement('head', {});
|
|
136
|
+
const body = createElement('body', {});
|
|
137
|
+
appendChild(html, head);
|
|
138
|
+
appendChild(html, body);
|
|
139
|
+
|
|
140
|
+
const doctypes: any[] = [];
|
|
141
|
+
const commentsBeforeHtml: any[] = [];
|
|
142
|
+
const bodyContent: any[] = [];
|
|
143
|
+
const children = [...state.root.childNodes];
|
|
144
|
+
|
|
145
|
+
let foundElement = false;
|
|
146
|
+
for (const child of children) {
|
|
147
|
+
if (child.nodeType === 10) {
|
|
148
|
+
doctypes.push(child);
|
|
149
|
+
} else if (child.nodeType === 8 && !foundElement) {
|
|
150
|
+
commentsBeforeHtml.push(child);
|
|
151
|
+
} else {
|
|
152
|
+
if (child.nodeType === 1) foundElement = true;
|
|
153
|
+
bodyContent.push(child);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
for (const content of bodyContent) {
|
|
158
|
+
appendChild(body, content);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
state.root.childNodes = [];
|
|
162
|
+
for (const doctype of doctypes) {
|
|
163
|
+
doctype.parentNode = null;
|
|
164
|
+
appendChild(state.root, doctype);
|
|
165
|
+
}
|
|
166
|
+
for (const comment of commentsBeforeHtml) {
|
|
167
|
+
comment.parentNode = null;
|
|
168
|
+
appendChild(state.root, comment);
|
|
169
|
+
}
|
|
170
|
+
appendChild(state.root, html);
|
|
171
|
+
state.root.documentElement = html;
|
|
172
|
+
state.root.head = head;
|
|
173
|
+
state.root.body = body;
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
87
177
|
while (state.stack.length > 1) {
|
|
88
178
|
const unclosedElement = state.stack.pop()!;
|
|
89
179
|
const currentToken = getCurrentToken(state);
|
|
90
|
-
addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.
|
|
180
|
+
addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
|
|
91
181
|
}
|
|
92
182
|
|
|
93
183
|
return state.root;
|
|
94
184
|
}
|
|
95
185
|
|
|
186
|
+
export function domToAST(dom: any): ASTNode {
|
|
187
|
+
function convert(node: any): ASTNode | null {
|
|
188
|
+
if (!node) return null;
|
|
189
|
+
|
|
190
|
+
if (node.nodeType === 9) {
|
|
191
|
+
const children: ASTNode[] = [];
|
|
192
|
+
if (node.childNodes) {
|
|
193
|
+
for (const child of node.childNodes) {
|
|
194
|
+
const converted = convert(child);
|
|
195
|
+
if (converted) children.push(converted);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
return {
|
|
199
|
+
type: ASTNodeType.Document,
|
|
200
|
+
children
|
|
201
|
+
};
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
if (node.nodeType === 1) {
|
|
205
|
+
const children: ASTNode[] = [];
|
|
206
|
+
if (node.childNodes) {
|
|
207
|
+
for (const child of node.childNodes) {
|
|
208
|
+
const converted = convert(child);
|
|
209
|
+
if (converted) children.push(converted);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
const tagName = node.tagName?.toLowerCase();
|
|
213
|
+
return {
|
|
214
|
+
type: ASTNodeType.Element,
|
|
215
|
+
tagName,
|
|
216
|
+
attributes: node.attributes || {},
|
|
217
|
+
children,
|
|
218
|
+
isSelfClosing: VOID_ELEMENTS.has(tagName)
|
|
219
|
+
} as ASTNode & { isSelfClosing: boolean };
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
if (node.nodeType === 3) {
|
|
223
|
+
return {
|
|
224
|
+
type: ASTNodeType.Text,
|
|
225
|
+
content: node.nodeValue || ''
|
|
226
|
+
} as ASTNode & { content: string };
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
if (node.nodeType === 8) {
|
|
230
|
+
return {
|
|
231
|
+
type: ASTNodeType.Comment,
|
|
232
|
+
content: node.nodeValue || ''
|
|
233
|
+
} as ASTNode & { content: string };
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
if (node.nodeType === 10) {
|
|
237
|
+
return {
|
|
238
|
+
type: ASTNodeType.Doctype,
|
|
239
|
+
content: node.name || 'html'
|
|
240
|
+
} as ASTNode & { content: string };
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
return null;
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
return convert(dom) || { type: ASTNodeType.Document, children: [] };
|
|
247
|
+
}
|
|
248
|
+
|
|
96
249
|
function createParserState(tokens: Token[]): ParserState {
|
|
97
|
-
const root
|
|
98
|
-
type: ASTNodeType.DOCUMENT,
|
|
99
|
-
children: [],
|
|
100
|
-
tagName: '#document'
|
|
101
|
-
};
|
|
250
|
+
const root = createDocument();
|
|
102
251
|
|
|
103
252
|
return {
|
|
104
253
|
tokens,
|
|
@@ -106,81 +255,240 @@ function createParserState(tokens: Token[]): ParserState {
|
|
|
106
255
|
length: tokens.length,
|
|
107
256
|
stack: [root],
|
|
108
257
|
root,
|
|
258
|
+
insertionMode: InsertionMode.Initial,
|
|
109
259
|
errors: []
|
|
110
260
|
};
|
|
111
261
|
}
|
|
112
262
|
|
|
113
263
|
function parseToken(state: ParserState, token: Token): void {
|
|
114
|
-
switch (
|
|
115
|
-
case
|
|
116
|
-
|
|
264
|
+
switch (state.insertionMode) {
|
|
265
|
+
case InsertionMode.Initial:
|
|
266
|
+
parseTokenInInitialMode(state, token);
|
|
117
267
|
break;
|
|
118
|
-
case
|
|
119
|
-
|
|
268
|
+
case InsertionMode.BeforeHtml:
|
|
269
|
+
parseTokenInBeforeHtmlMode(state, token);
|
|
120
270
|
break;
|
|
121
|
-
case
|
|
122
|
-
|
|
271
|
+
case InsertionMode.BeforeHead:
|
|
272
|
+
parseTokenInBeforeHeadMode(state, token);
|
|
123
273
|
break;
|
|
124
|
-
case
|
|
125
|
-
|
|
274
|
+
case InsertionMode.InHead:
|
|
275
|
+
parseTokenInInHeadMode(state, token);
|
|
126
276
|
break;
|
|
127
|
-
case
|
|
128
|
-
|
|
277
|
+
case InsertionMode.AfterHead:
|
|
278
|
+
parseTokenInAfterHeadMode(state, token);
|
|
129
279
|
break;
|
|
130
|
-
case
|
|
131
|
-
|
|
132
|
-
break;
|
|
133
|
-
case TokenType.PROCESSING_INSTRUCTION:
|
|
134
|
-
parseProcessingInstruction(state, token);
|
|
280
|
+
case InsertionMode.InBody:
|
|
281
|
+
parseTokenInInBodyMode(state, token);
|
|
135
282
|
break;
|
|
283
|
+
default:
|
|
284
|
+
parseTokenInInBodyMode(state, token); // fallback
|
|
136
285
|
}
|
|
137
286
|
}
|
|
138
287
|
|
|
139
|
-
function
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
288
|
+
function parseTokenInInitialMode(state: ParserState, token: Token): void {
|
|
289
|
+
if (token.type === TokenType.DOCTYPE) {
|
|
290
|
+
// TODO: Create DOCTYPE node
|
|
291
|
+
parseDoctype(state, token);
|
|
292
|
+
state.insertionMode = InsertionMode.BeforeHtml;
|
|
293
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
294
|
+
parseComment(state, token);
|
|
295
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
296
|
+
// Ignore whitespace
|
|
297
|
+
} else {
|
|
298
|
+
// No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
|
|
299
|
+
const doctype = createDoctype('html');
|
|
300
|
+
appendChild(state.root, doctype);
|
|
301
|
+
state.insertionMode = InsertionMode.BeforeHtml;
|
|
302
|
+
parseToken(state, token); // Re-parse in new mode
|
|
303
|
+
}
|
|
304
|
+
}
|
|
145
305
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
306
|
+
function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
|
|
307
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
|
|
308
|
+
const html = createElement('html', token.attributes || {});
|
|
309
|
+
appendChild(state.root, html);
|
|
310
|
+
state.root.documentElement = html;
|
|
311
|
+
state.stack.push(html);
|
|
312
|
+
state.insertionMode = InsertionMode.BeforeHead;
|
|
313
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
314
|
+
parseComment(state, token);
|
|
315
|
+
} else if (token.type === TokenType.DOCTYPE) {
|
|
316
|
+
// Ignore
|
|
317
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
318
|
+
// Ignore whitespace
|
|
319
|
+
} else {
|
|
320
|
+
const html = createElement('html', {});
|
|
321
|
+
appendChild(state.root, html);
|
|
322
|
+
state.root.documentElement = html;
|
|
323
|
+
state.stack.push(html);
|
|
324
|
+
state.insertionMode = InsertionMode.BeforeHead;
|
|
325
|
+
parseToken(state, token);
|
|
326
|
+
}
|
|
327
|
+
}
|
|
155
328
|
|
|
156
|
-
|
|
157
|
-
|
|
329
|
+
function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
|
|
330
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
|
|
331
|
+
const head = createElement('head', token.attributes || {});
|
|
332
|
+
appendChild(getCurrentParent(state), head);
|
|
333
|
+
state.root.head = head;
|
|
334
|
+
state.stack.push(head);
|
|
335
|
+
state.insertionMode = InsertionMode.InHead;
|
|
336
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
337
|
+
parseComment(state, token);
|
|
338
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
339
|
+
// Ignore whitespace
|
|
340
|
+
} else {
|
|
341
|
+
const head = createElement('head', {});
|
|
342
|
+
appendChild(getCurrentParent(state), head);
|
|
343
|
+
state.root.head = head;
|
|
344
|
+
state.stack.push(head);
|
|
345
|
+
state.insertionMode = InsertionMode.InHead;
|
|
346
|
+
parseToken(state, token);
|
|
158
347
|
}
|
|
348
|
+
}
|
|
159
349
|
|
|
160
|
-
|
|
350
|
+
function parseOpenTag(state: ParserState, token: Token): void {
|
|
351
|
+
const tagName = token.value.toLowerCase();
|
|
352
|
+
const currentParent = getCurrentParent(state);
|
|
353
|
+
const element = createElement(tagName, token.attributes || {});
|
|
354
|
+
appendChild(currentParent, element);
|
|
355
|
+
|
|
356
|
+
if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
|
|
161
357
|
state.stack.push(element);
|
|
162
358
|
}
|
|
163
359
|
}
|
|
164
360
|
|
|
165
|
-
function
|
|
166
|
-
const
|
|
361
|
+
function parseTokenInInHeadMode(state: ParserState, token: Token): void {
|
|
362
|
+
const currentElement = getCurrentElement(state);
|
|
363
|
+
const currentTagName = currentElement?.tagName?.toLowerCase();
|
|
167
364
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
365
|
+
if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
|
|
366
|
+
if (token.type === TokenType.TEXT) {
|
|
367
|
+
parseText(state, token);
|
|
368
|
+
return;
|
|
369
|
+
} else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
|
|
370
|
+
state.stack.pop();
|
|
371
|
+
return;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
if (token.type === TokenType.TAG_OPEN) {
|
|
376
|
+
const tagName = token.value.toLowerCase();
|
|
377
|
+
if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
|
|
378
|
+
parseOpenTag(state, token);
|
|
379
|
+
} else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
|
|
380
|
+
parseOpenTag(state, token);
|
|
381
|
+
} else if (tagName === 'head') {
|
|
382
|
+
// Ignore duplicate <head> tags
|
|
383
|
+
} else if (tagName.includes('-')) {
|
|
384
|
+
// Custom elements (tags with hyphens) are valid in <head>
|
|
385
|
+
parseOpenTag(state, token);
|
|
386
|
+
} else {
|
|
387
|
+
state.stack.pop();
|
|
388
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
389
|
+
parseToken(state, token);
|
|
390
|
+
}
|
|
391
|
+
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
392
|
+
const tagName = token.value.toLowerCase();
|
|
393
|
+
if (tagName === 'head') {
|
|
394
|
+
state.stack.pop();
|
|
395
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
396
|
+
} else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
|
|
397
|
+
if (currentTagName === tagName) {
|
|
398
|
+
state.stack.pop();
|
|
175
399
|
}
|
|
400
|
+
} else if (tagName.includes('-') && currentTagName === tagName) {
|
|
401
|
+
// Handle closing tags for custom elements in <head>
|
|
176
402
|
state.stack.pop();
|
|
177
|
-
found = true;
|
|
178
|
-
break;
|
|
179
403
|
}
|
|
404
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
405
|
+
parseComment(state, token);
|
|
406
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
407
|
+
} else {
|
|
408
|
+
state.stack.pop();
|
|
409
|
+
state.insertionMode = InsertionMode.AfterHead;
|
|
410
|
+
parseToken(state, token);
|
|
180
411
|
}
|
|
412
|
+
}
|
|
181
413
|
|
|
182
|
-
|
|
183
|
-
|
|
414
|
+
function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
|
|
415
|
+
if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
|
|
416
|
+
const body = createElement('body', token.attributes || {});
|
|
417
|
+
appendChild(getCurrentParent(state), body);
|
|
418
|
+
state.root.body = body;
|
|
419
|
+
state.stack.push(body);
|
|
420
|
+
state.insertionMode = InsertionMode.InBody;
|
|
421
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
422
|
+
parseComment(state, token);
|
|
423
|
+
} else if (token.type === TokenType.TEXT && token.value.trim() === '') {
|
|
424
|
+
// Ignore whitespace
|
|
425
|
+
} else {
|
|
426
|
+
const body = createElement('body', {});
|
|
427
|
+
appendChild(getCurrentParent(state), body);
|
|
428
|
+
state.root.body = body;
|
|
429
|
+
state.stack.push(body);
|
|
430
|
+
state.insertionMode = InsertionMode.InBody;
|
|
431
|
+
parseToken(state, token);
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
|
|
436
|
+
const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
|
|
437
|
+
|
|
438
|
+
function parseTokenInInBodyMode(state: ParserState, token: Token): void {
|
|
439
|
+
if (token.type === TokenType.TAG_OPEN) {
|
|
440
|
+
const tagName = token.value.toLowerCase();
|
|
441
|
+
|
|
442
|
+
handleAutoClosing(state, tagName);
|
|
443
|
+
|
|
444
|
+
const currentParent = getCurrentParent(state);
|
|
445
|
+
|
|
446
|
+
let namespaceURI: string | undefined;
|
|
447
|
+
if (tagName === 'svg') {
|
|
448
|
+
namespaceURI = SVG_NAMESPACE;
|
|
449
|
+
} else if (tagName === 'math') {
|
|
450
|
+
namespaceURI = MATHML_NAMESPACE;
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
const element = createElement(tagName, token.attributes || {}, namespaceURI);
|
|
454
|
+
|
|
455
|
+
appendChild(currentParent, element);
|
|
456
|
+
|
|
457
|
+
if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
|
|
458
|
+
state.stack.push(element);
|
|
459
|
+
}
|
|
460
|
+
} else if (token.type === TokenType.TAG_CLOSE) {
|
|
461
|
+
const tagName = token.value.toLowerCase();
|
|
462
|
+
|
|
463
|
+
// Generate implied end tags
|
|
464
|
+
const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
|
|
465
|
+
while (state.stack.length > 1) { // Don't pop document
|
|
466
|
+
const currentElement = getCurrentElement(state);
|
|
467
|
+
if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
|
|
468
|
+
break;
|
|
469
|
+
}
|
|
470
|
+
state.stack.pop();
|
|
471
|
+
addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
const currentElement = getCurrentElement(state);
|
|
475
|
+
if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
|
|
476
|
+
state.stack.pop();
|
|
477
|
+
} else {
|
|
478
|
+
// For now, just ignore unmatched closing tags
|
|
479
|
+
// TODO: Implement full adoption agency algorithm
|
|
480
|
+
addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
|
|
481
|
+
}
|
|
482
|
+
} else if (token.type === TokenType.TEXT) {
|
|
483
|
+
parseText(state, token);
|
|
484
|
+
} else if (token.type === TokenType.COMMENT) {
|
|
485
|
+
parseComment(state, token);
|
|
486
|
+
} else if (token.type === TokenType.CDATA) {
|
|
487
|
+
parseCDATA(state, token);
|
|
488
|
+
} else if (token.type === TokenType.DOCTYPE) {
|
|
489
|
+
// Ignore
|
|
490
|
+
} else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
|
|
491
|
+
parseProcessingInstruction(state, token);
|
|
184
492
|
}
|
|
185
493
|
}
|
|
186
494
|
|
|
@@ -192,76 +500,134 @@ function parseText(state: ParserState, token: Token): void {
|
|
|
192
500
|
return;
|
|
193
501
|
}
|
|
194
502
|
|
|
195
|
-
const textNode
|
|
196
|
-
|
|
197
|
-
content,
|
|
198
|
-
parent: currentParent,
|
|
199
|
-
position: token.position
|
|
200
|
-
};
|
|
201
|
-
|
|
202
|
-
if (currentParent.children) {
|
|
203
|
-
currentParent.children.push(textNode);
|
|
204
|
-
}
|
|
503
|
+
const textNode = createTextNode(content);
|
|
504
|
+
appendChild(currentParent, textNode);
|
|
205
505
|
}
|
|
206
506
|
|
|
207
507
|
function parseComment(state: ParserState, token: Token): void {
|
|
208
508
|
const currentParent = getCurrentParent(state);
|
|
209
509
|
|
|
210
|
-
const commentNode
|
|
211
|
-
|
|
212
|
-
content: token.value,
|
|
213
|
-
parent: currentParent,
|
|
214
|
-
position: token.position
|
|
215
|
-
};
|
|
216
|
-
|
|
217
|
-
if (currentParent.children) {
|
|
218
|
-
currentParent.children.push(commentNode);
|
|
219
|
-
}
|
|
510
|
+
const commentNode = createComment(token.value);
|
|
511
|
+
appendChild(currentParent, commentNode);
|
|
220
512
|
}
|
|
221
513
|
|
|
222
514
|
function parseCDATA(state: ParserState, token: Token): void {
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
const cdataNode: ASTNode = {
|
|
226
|
-
type: ASTNodeType.CDATA,
|
|
227
|
-
content: token.value,
|
|
228
|
-
parent: currentParent,
|
|
229
|
-
position: token.position
|
|
230
|
-
};
|
|
231
|
-
|
|
232
|
-
if (currentParent.children) {
|
|
233
|
-
currentParent.children.push(cdataNode);
|
|
234
|
-
}
|
|
515
|
+
// TODO: implement CDATA
|
|
235
516
|
}
|
|
236
517
|
|
|
237
518
|
function parseDoctype(state: ParserState, token: Token): void {
|
|
238
|
-
const
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
type: ASTNodeType.DOCTYPE,
|
|
242
|
-
content: token.value,
|
|
243
|
-
parent: currentParent,
|
|
244
|
-
position: token.position
|
|
245
|
-
};
|
|
246
|
-
|
|
247
|
-
if (currentParent.children) {
|
|
248
|
-
currentParent.children.push(doctypeNode);
|
|
249
|
-
}
|
|
519
|
+
const doctype = createDoctype(token.value || 'html');
|
|
520
|
+
appendChild(state.root, doctype);
|
|
521
|
+
state.root.doctype = doctype;
|
|
250
522
|
}
|
|
251
523
|
|
|
252
524
|
function parseProcessingInstruction(state: ParserState, token: Token): void {
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
const piNode: ASTNode = {
|
|
256
|
-
type: ASTNodeType.PROCESSING_INSTRUCTION,
|
|
257
|
-
content: token.value,
|
|
258
|
-
parent: currentParent,
|
|
259
|
-
position: token.position
|
|
260
|
-
};
|
|
525
|
+
// TODO: implement ProcessingInstruction
|
|
526
|
+
}
|
|
261
527
|
|
|
262
|
-
|
|
263
|
-
|
|
528
|
+
function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
|
|
529
|
+
// HTML5 Adoption Agency Algorithm - simplified but more correct implementation
|
|
530
|
+
|
|
531
|
+
// 1. If the current node is an HTML element whose tag name matches the token's tag name,
|
|
532
|
+
// then pop the current node off the stack of open elements and abort these steps.
|
|
533
|
+
const currentElement = getCurrentElement(state);
|
|
534
|
+
if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
|
|
535
|
+
state.stack.pop();
|
|
536
|
+
return;
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
// 2. Let outer loop counter be 0
|
|
540
|
+
let outerLoopCounter = 0;
|
|
541
|
+
const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
|
|
542
|
+
|
|
543
|
+
while (outerLoopCounter < 8) { // Prevent infinite loops
|
|
544
|
+
outerLoopCounter++;
|
|
545
|
+
|
|
546
|
+
// 3. Let the formatting element be the last element in the list of active formatting elements
|
|
547
|
+
// that is between the end of the list and the last scope marker or the start of the list,
|
|
548
|
+
// if any, that has the same tag name as the token.
|
|
549
|
+
|
|
550
|
+
// For simplicity, find the innermost element with matching tag name
|
|
551
|
+
let formattingElementIndex = -1;
|
|
552
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
553
|
+
const element = state.stack[i];
|
|
554
|
+
if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
|
|
555
|
+
formattingElementIndex = i;
|
|
556
|
+
break;
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
if (formattingElementIndex === -1) {
|
|
561
|
+
// No formatting element found, just find any element with matching tag name
|
|
562
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
563
|
+
const element = state.stack[i];
|
|
564
|
+
if (element.tagName && element.tagName.toLowerCase() === tagName) {
|
|
565
|
+
formattingElementIndex = i;
|
|
566
|
+
break;
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
if (formattingElementIndex === -1) {
|
|
572
|
+
// No matching element found, ignore the token
|
|
573
|
+
addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
|
|
574
|
+
return;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
const formattingElement = state.stack[formattingElementIndex];
|
|
578
|
+
|
|
579
|
+
// 4. If there is no element in the stack of open elements that has the same tag name as the
|
|
580
|
+
// formatting element, then remove the element from the list of active formatting elements
|
|
581
|
+
// and abort these steps.
|
|
582
|
+
let openElementIndex = -1;
|
|
583
|
+
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
584
|
+
if (state.stack[i] === formattingElement) {
|
|
585
|
+
openElementIndex = i;
|
|
586
|
+
break;
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
if (openElementIndex === -1) {
|
|
591
|
+
// Element not in stack, ignore
|
|
592
|
+
return;
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
// 5. If the element is not in the stack of open elements, then this is a parse error;
|
|
596
|
+
// remove the element from the list of active formatting elements and abort these steps.
|
|
597
|
+
// (Already checked above)
|
|
598
|
+
|
|
599
|
+
// 6. Let the furthest block be the topmost node in the stack of open elements that is lower
|
|
600
|
+
// in the stack than the formatting element, and is an element in the special category.
|
|
601
|
+
const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
|
|
602
|
+
|
|
603
|
+
let furthestBlockIndex = -1;
|
|
604
|
+
for (let i = openElementIndex + 1; i < state.stack.length; i++) {
|
|
605
|
+
const element = state.stack[i];
|
|
606
|
+
if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
|
|
607
|
+
furthestBlockIndex = i;
|
|
608
|
+
break;
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
if (furthestBlockIndex === -1) {
|
|
613
|
+
// No special element found, just pop elements until we reach the formatting element
|
|
614
|
+
while (state.stack.length > openElementIndex + 1) {
|
|
615
|
+
state.stack.pop();
|
|
616
|
+
}
|
|
617
|
+
state.stack.pop(); // Pop the formatting element
|
|
618
|
+
return;
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
// 7. Simplified: just pop everything until the formatting element
|
|
622
|
+
while (state.stack.length > openElementIndex + 1) {
|
|
623
|
+
state.stack.pop();
|
|
624
|
+
}
|
|
625
|
+
state.stack.pop(); // Pop the formatting element
|
|
626
|
+
return;
|
|
264
627
|
}
|
|
628
|
+
|
|
629
|
+
// If we get here, something went wrong, ignore the token
|
|
630
|
+
addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
|
|
265
631
|
}
|
|
266
632
|
|
|
267
633
|
function handleAutoClosing(state: ParserState, tagName: string): void {
|
|
@@ -269,19 +635,19 @@ function handleAutoClosing(state: ParserState, tagName: string): void {
|
|
|
269
635
|
if (!autoCloseList) return;
|
|
270
636
|
|
|
271
637
|
const currentElement = getCurrentElement(state);
|
|
272
|
-
if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName)) {
|
|
638
|
+
if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
|
|
273
639
|
state.stack.pop();
|
|
274
640
|
}
|
|
275
641
|
}
|
|
276
642
|
|
|
277
|
-
function getCurrentParent(state: ParserState):
|
|
278
|
-
return state.stack[state.stack.length - 1]
|
|
643
|
+
function getCurrentParent(state: ParserState): any {
|
|
644
|
+
return state.stack[state.stack.length - 1];
|
|
279
645
|
}
|
|
280
646
|
|
|
281
|
-
function getCurrentElement(state: ParserState):
|
|
647
|
+
function getCurrentElement(state: ParserState): any {
|
|
282
648
|
for (let i = state.stack.length - 1; i >= 0; i--) {
|
|
283
|
-
const element = state.stack[i]
|
|
284
|
-
if (element.
|
|
649
|
+
const element = state.stack[i];
|
|
650
|
+
if (element.nodeType === 1) { // ELEMENT_NODE
|
|
285
651
|
return element;
|
|
286
652
|
}
|
|
287
653
|
}
|
|
@@ -306,7 +672,7 @@ function addError(state: ParserState, message: string, position: number): void {
|
|
|
306
672
|
});
|
|
307
673
|
}
|
|
308
674
|
|
|
309
|
-
function shouldSkipWhitespace(parent:
|
|
675
|
+
function shouldSkipWhitespace(parent: any): boolean {
|
|
310
676
|
const skipWhitespaceIn = new Set([
|
|
311
677
|
'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
|
|
312
678
|
'ul', 'ol', 'dl', 'select', 'optgroup'
|